diff --git a/crawler/exploration.ipynb b/crawler/exploration.ipynb
index 93e7751..ab3dc60 100644
--- a/crawler/exploration.ipynb
+++ b/crawler/exploration.ipynb
@@ -10,14 +10,15 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "/Users/kadir/code/realestate/crawler/venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ "/home/wizard/.cache/pypoetry/virtualenvs/rec-g1fA4zXM-py3.13/lib/python3.13/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"from data_access import Listing\n",
- "import pandas as pd"
+ "import pandas as pd\n",
+ "\n"
]
},
{
@@ -33,7 +34,18 @@
"execution_count": 2,
"id": "424501ab-ecc6-42f5-b87e-b0d2871bdc74",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_1627094/3290399543.py:3: FutureWarning: The behavior of 'to_datetime' with 'unit' when parsing strings is deprecated. In a future version, strings will be parsed as datetime strings, matching the behavior without a 'unit'. To retain the old behavior, explicitly cast ints or floats to numeric type before calling to_datetime.\n",
+ " decisions = pd.read_json(decisions_path)\n",
+ "/tmp/ipykernel_1627094/3290399543.py:3: FutureWarning: The behavior of 'to_datetime' with 'unit' when parsing strings is deprecated. In a future version, strings will be parsed as datetime strings, matching the behavior without a 'unit'. To retain the old behavior, explicitly cast ints or floats to numeric type before calling to_datetime.\n",
+ " decisions = pd.read_json(decisions_path)\n"
+ ]
+ }
+ ],
"source": [
"# read decisions on file\n",
"decisions_path = 'data/decisions.json'\n",
@@ -128,7 +140,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 5,
"id": "b1101088-9613-465f-81fd-79801e0202b8",
"metadata": {},
"outputs": [
@@ -136,7 +148,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "10574\n"
+ "2254\n"
]
}
],
@@ -153,7 +165,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 6,
"id": "63e61601-7e3f-4d58-89f6-1794e4868cc3",
"metadata": {},
"outputs": [],
@@ -163,7 +175,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 9,
"id": "1c222721-f426-42c0-9ac5-badc1f7a2034",
"metadata": {},
"outputs": [
@@ -209,97 +221,97 @@
"
\n",
" \n",
" | 0 | \n",
- " 105484772 | \n",
- " 45.7 | \n",
- " 325000.0 | \n",
- " 7111.597374 | \n",
- " https://www.rightmove.co.uk/properties/105484772 | \n",
- " 1 | \n",
- " {'duration': 1983, 'distance': 10095, 'duratio... | \n",
- " {'duration': 2043, 'distance': 10083, 'duratio... | \n",
- " 104.0 | \n",
- " 641.53 | \n",
- " False | \n",
- " Leasehold | \n",
- " 116 | \n",
+ " 111829454 | \n",
+ " 57.40 | \n",
+ " 500000.0 | \n",
+ " 8710.801394 | \n",
+ " https://www.rightmove.co.uk/properties/111829454 | \n",
+ " 2 | \n",
" None | \n",
- " 0 | \n",
+ " None | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " Share of Freehold | \n",
+ " 2 | \n",
+ " None | \n",
+ " 2 | \n",
" None | \n",
"
\n",
" \n",
" | 1 | \n",
- " 105827126 | \n",
- " 58.5 | \n",
- " 950000.0 | \n",
- " 16239.316239 | \n",
- " https://www.rightmove.co.uk/properties/105827126 | \n",
- " 1 | \n",
- " {'duration': 2478, 'distance': 9584, 'duration... | \n",
- " {'duration': 2478, 'distance': 9584, 'duration... | \n",
+ " 118624844 | \n",
" NaN | \n",
+ " 400000.0 | \n",
" NaN | \n",
- " True | \n",
- " Leasehold | \n",
- " 83 | \n",
+ " https://www.rightmove.co.uk/properties/118624844 | \n",
+ " 3 | \n",
" None | \n",
- " 0 | \n",
+ " None | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " False | \n",
+ " None | \n",
+ " 8 | \n",
+ " None | \n",
+ " 2 | \n",
" None | \n",
"
\n",
" \n",
" | 2 | \n",
- " 108102476 | \n",
- " 53.7 | \n",
- " 515000.0 | \n",
- " 9590.316574 | \n",
- " https://www.rightmove.co.uk/properties/108102476 | \n",
+ " 121349855 | \n",
+ " 45.23 | \n",
+ " 630000.0 | \n",
+ " 13928.808313 | \n",
+ " https://www.rightmove.co.uk/properties/121349855 | \n",
" 1 | \n",
- " {'duration': 1266, 'distance': 4042, 'duration... | \n",
- " {'duration': 1861, 'distance': 4548, 'duration... | \n",
- " 104.0 | \n",
+ " None | \n",
+ " None | \n",
+ " NaN | \n",
" NaN | \n",
" False | \n",
" Leasehold | \n",
- " 97 | \n",
+ " 24 | \n",
" None | \n",
- " 0 | \n",
+ " 2 | \n",
" None | \n",
"
\n",
" \n",
" | 3 | \n",
- " 108171770 | \n",
- " 45.0 | \n",
- " 650000.0 | \n",
- " 14444.444444 | \n",
- " https://www.rightmove.co.uk/properties/108171770 | \n",
- " 2 | \n",
- " {'duration': 1591, 'distance': 7827, 'duration... | \n",
- " {'duration': 1591, 'distance': 7827, 'duration... | \n",
- " 962.0 | \n",
- " 2000.00 | \n",
+ " 122409413 | \n",
+ " 51.59 | \n",
+ " 550000.0 | \n",
+ " 10660.980810 | \n",
+ " https://www.rightmove.co.uk/properties/122409413 | \n",
+ " 1 | \n",
+ " None | \n",
+ " None | \n",
+ " 259.0 | \n",
+ " NaN | \n",
" False | \n",
" Leasehold | \n",
- " 261 | \n",
+ " 241 | \n",
" None | \n",
- " 0 | \n",
+ " 2 | \n",
" None | \n",
"
\n",
" \n",
" | 4 | \n",
- " 109595123 | \n",
- " NaN | \n",
- " 1000000.0 | \n",
- " NaN | \n",
- " https://www.rightmove.co.uk/properties/109595123 | \n",
+ " 123219209 | \n",
+ " 59.34 | \n",
+ " 960000.0 | \n",
+ " 16177.957533 | \n",
+ " https://www.rightmove.co.uk/properties/123219209 | \n",
" 1 | \n",
- " {'duration': 2463, 'distance': 9565, 'duration... | \n",
- " {'duration': 2463, 'distance': 9565, 'duration... | \n",
- " NaN | \n",
- " NaN | \n",
- " True | \n",
- " Please confirm if this is a freehold or leaseh... | \n",
- " 96 | \n",
" None | \n",
- " 0 | \n",
+ " None | \n",
+ " 993.0 | \n",
+ " 6300.0 | \n",
+ " False | \n",
+ " Leasehold | \n",
+ " 9 | \n",
+ " None | \n",
+ " 2 | \n",
" None | \n",
"
\n",
" \n",
@@ -322,201 +334,162 @@
" | ... | \n",
"
\n",
" \n",
- " | 10569 | \n",
+ " 2249 | \n",
+ " 87149832 | \n",
+ " NaN | \n",
+ " 800000.0 | \n",
+ " NaN | \n",
+ " https://www.rightmove.co.uk/properties/87149832 | \n",
+ " 2 | \n",
+ " None | \n",
+ " None | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " False | \n",
+ " Leasehold | \n",
+ " 2 | \n",
+ " None | \n",
+ " 2 | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 2250 | \n",
+ " 87150201 | \n",
+ " 55.40 | \n",
+ " 500000.0 | \n",
+ " 9025.270758 | \n",
+ " https://www.rightmove.co.uk/properties/87150201 | \n",
+ " 1 | \n",
+ " None | \n",
+ " None | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " False | \n",
+ " Share of Freehold | \n",
+ " 8 | \n",
+ " None | \n",
+ " 2 | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 2251 | \n",
+ " 87150213 | \n",
+ " 55.48 | \n",
+ " 495000.0 | \n",
+ " 8922.134102 | \n",
+ " https://www.rightmove.co.uk/properties/87150213 | \n",
+ " 2 | \n",
+ " None | \n",
+ " None | \n",
+ " 106.0 | \n",
+ " 1773.0 | \n",
+ " False | \n",
+ " Leasehold | \n",
+ " 2 | \n",
+ " None | \n",
+ " 2 | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 2252 | \n",
" 88731877 | \n",
" NaN | \n",
" 570000.0 | \n",
" NaN | \n",
" https://www.rightmove.co.uk/properties/88731877 | \n",
" 1 | \n",
- " {'duration': 912, 'distance': 6329, 'duration_... | \n",
- " {'duration': 852, 'distance': 6329, 'duration_... | \n",
- " 998.0 | \n",
+ " None | \n",
+ " None | \n",
+ " 995.0 | \n",
" NaN | \n",
" False | \n",
" Leasehold | \n",
- " 407 | \n",
- " None | \n",
- " 0 | \n",
- " None | \n",
- "
\n",
- " \n",
- " | 10570 | \n",
- " 89825950 | \n",
- " 48.9 | \n",
- " 680000.0 | \n",
- " 13905.930470 | \n",
- " https://www.rightmove.co.uk/properties/89825950 | \n",
- " 1 | \n",
- " {'duration': 273, 'distance': 762, 'duration_s... | \n",
- " {'duration': 273, 'distance': 762, 'duration_s... | \n",
- " 112.0 | \n",
- " 1700.00 | \n",
- " False | \n",
- " Leasehold | \n",
- " 113 | \n",
- " None | \n",
- " 0 | \n",
- " None | \n",
- "
\n",
- " \n",
- " | 10571 | \n",
- " 94206080 | \n",
- " 49.6 | \n",
- " 899000.0 | \n",
- " 18125.000000 | \n",
- " https://www.rightmove.co.uk/properties/94206080 | \n",
- " 1 | \n",
- " {'duration': 1125, 'distance': 4637, 'duration... | \n",
- " {'duration': 1125, 'distance': 4641, 'duration... | \n",
- " NaN | \n",
- " NaN | \n",
- " True | \n",
- " Leasehold | \n",
- " 337 | \n",
- " None | \n",
- " 0 | \n",
- " None | \n",
- "
\n",
- " \n",
- " | 10572 | \n",
- " 94508306 | \n",
- " 94.0 | \n",
- " 1000000.0 | \n",
- " 10638.297872 | \n",
- " https://www.rightmove.co.uk/properties/94508306 | \n",
" 2 | \n",
- " {'duration': 1046, 'distance': 2193, 'duration... | \n",
- " {'duration': 1046, 'distance': 2193, 'duration... | \n",
- " 977.0 | \n",
- " NaN | \n",
- " False | \n",
- " Leasehold | \n",
- " 230 | \n",
" None | \n",
- " 9 | \n",
+ " 2 | \n",
" None | \n",
"
\n",
" \n",
- " | 10573 | \n",
+ " 2253 | \n",
" 95975483 | \n",
" NaN | \n",
- " 800000.0 | \n",
+ " 799950.0 | \n",
" NaN | \n",
" https://www.rightmove.co.uk/properties/95975483 | \n",
" 2 | \n",
- " {'duration': 2281, 'distance': 7262, 'duration... | \n",
- " {'duration': 2815, 'distance': 5607, 'duration... | \n",
+ " None | \n",
+ " None | \n",
" 999.0 | \n",
- " 0.00 | \n",
+ " 0.0 | \n",
" False | \n",
" Leasehold | \n",
- " 84 | \n",
+ " 25 | \n",
" None | \n",
- " 0 | \n",
+ " 2 | \n",
" None | \n",
"
\n",
" \n",
"\n",
- "10574 rows × 16 columns
\n",
+ "2254 rows × 16 columns
\n",
""
],
"text/plain": [
- " identifier sqm_ocr price price_per_sqm \\\n",
- "0 105484772 45.7 325000.0 7111.597374 \n",
- "1 105827126 58.5 950000.0 16239.316239 \n",
- "2 108102476 53.7 515000.0 9590.316574 \n",
- "3 108171770 45.0 650000.0 14444.444444 \n",
- "4 109595123 NaN 1000000.0 NaN \n",
- "... ... ... ... ... \n",
- "10569 88731877 NaN 570000.0 NaN \n",
- "10570 89825950 48.9 680000.0 13905.930470 \n",
- "10571 94206080 49.6 899000.0 18125.000000 \n",
- "10572 94508306 94.0 1000000.0 10638.297872 \n",
- "10573 95975483 NaN 800000.0 NaN \n",
+ " identifier sqm_ocr price price_per_sqm \\\n",
+ "0 111829454 57.40 500000.0 8710.801394 \n",
+ "1 118624844 NaN 400000.0 NaN \n",
+ "2 121349855 45.23 630000.0 13928.808313 \n",
+ "3 122409413 51.59 550000.0 10660.980810 \n",
+ "4 123219209 59.34 960000.0 16177.957533 \n",
+ "... ... ... ... ... \n",
+ "2249 87149832 NaN 800000.0 NaN \n",
+ "2250 87150201 55.40 500000.0 9025.270758 \n",
+ "2251 87150213 55.48 495000.0 8922.134102 \n",
+ "2252 88731877 NaN 570000.0 NaN \n",
+ "2253 95975483 NaN 799950.0 NaN \n",
"\n",
- " url bedrooms \\\n",
- "0 https://www.rightmove.co.uk/properties/105484772 1 \n",
- "1 https://www.rightmove.co.uk/properties/105827126 1 \n",
- "2 https://www.rightmove.co.uk/properties/108102476 1 \n",
- "3 https://www.rightmove.co.uk/properties/108171770 2 \n",
- "4 https://www.rightmove.co.uk/properties/109595123 1 \n",
- "... ... ... \n",
- "10569 https://www.rightmove.co.uk/properties/88731877 1 \n",
- "10570 https://www.rightmove.co.uk/properties/89825950 1 \n",
- "10571 https://www.rightmove.co.uk/properties/94206080 1 \n",
- "10572 https://www.rightmove.co.uk/properties/94508306 2 \n",
- "10573 https://www.rightmove.co.uk/properties/95975483 2 \n",
+ " url bedrooms \\\n",
+ "0 https://www.rightmove.co.uk/properties/111829454 2 \n",
+ "1 https://www.rightmove.co.uk/properties/118624844 3 \n",
+ "2 https://www.rightmove.co.uk/properties/121349855 1 \n",
+ "3 https://www.rightmove.co.uk/properties/122409413 1 \n",
+ "4 https://www.rightmove.co.uk/properties/123219209 1 \n",
+ "... ... ... \n",
+ "2249 https://www.rightmove.co.uk/properties/87149832 2 \n",
+ "2250 https://www.rightmove.co.uk/properties/87150201 1 \n",
+ "2251 https://www.rightmove.co.uk/properties/87150213 2 \n",
+ "2252 https://www.rightmove.co.uk/properties/88731877 1 \n",
+ "2253 https://www.rightmove.co.uk/properties/95975483 2 \n",
"\n",
- " travel_time_fastest \\\n",
- "0 {'duration': 1983, 'distance': 10095, 'duratio... \n",
- "1 {'duration': 2478, 'distance': 9584, 'duration... \n",
- "2 {'duration': 1266, 'distance': 4042, 'duration... \n",
- "3 {'duration': 1591, 'distance': 7827, 'duration... \n",
- "4 {'duration': 2463, 'distance': 9565, 'duration... \n",
- "... ... \n",
- "10569 {'duration': 912, 'distance': 6329, 'duration_... \n",
- "10570 {'duration': 273, 'distance': 762, 'duration_s... \n",
- "10571 {'duration': 1125, 'distance': 4637, 'duration... \n",
- "10572 {'duration': 1046, 'distance': 2193, 'duration... \n",
- "10573 {'duration': 2281, 'distance': 7262, 'duration... \n",
+ " travel_time_fastest travel_time_second lease_left service_charge \\\n",
+ "0 None None 0.0 0.0 \n",
+ "1 None None NaN NaN \n",
+ "2 None None NaN NaN \n",
+ "3 None None 259.0 NaN \n",
+ "4 None None 993.0 6300.0 \n",
+ "... ... ... ... ... \n",
+ "2249 None None NaN NaN \n",
+ "2250 None None NaN NaN \n",
+ "2251 None None 106.0 1773.0 \n",
+ "2252 None None 995.0 NaN \n",
+ "2253 None None 999.0 0.0 \n",
"\n",
- " travel_time_second lease_left \\\n",
- "0 {'duration': 2043, 'distance': 10083, 'duratio... 104.0 \n",
- "1 {'duration': 2478, 'distance': 9584, 'duration... NaN \n",
- "2 {'duration': 1861, 'distance': 4548, 'duration... 104.0 \n",
- "3 {'duration': 1591, 'distance': 7827, 'duration... 962.0 \n",
- "4 {'duration': 2463, 'distance': 9565, 'duration... NaN \n",
- "... ... ... \n",
- "10569 {'duration': 852, 'distance': 6329, 'duration_... 998.0 \n",
- "10570 {'duration': 273, 'distance': 762, 'duration_s... 112.0 \n",
- "10571 {'duration': 1125, 'distance': 4641, 'duration... NaN \n",
- "10572 {'duration': 1046, 'distance': 2193, 'duration... 977.0 \n",
- "10573 {'duration': 2815, 'distance': 5607, 'duration... 999.0 \n",
+ " development tenure_type updated_days status last_seen decision \n",
+ "0 False Share of Freehold 2 None 2 None \n",
+ "1 False None 8 None 2 None \n",
+ "2 False Leasehold 24 None 2 None \n",
+ "3 False Leasehold 241 None 2 None \n",
+ "4 False Leasehold 9 None 2 None \n",
+ "... ... ... ... ... ... ... \n",
+ "2249 False Leasehold 2 None 2 None \n",
+ "2250 False Share of Freehold 8 None 2 None \n",
+ "2251 False Leasehold 2 None 2 None \n",
+ "2252 False Leasehold 2 None 2 None \n",
+ "2253 False Leasehold 25 None 2 None \n",
"\n",
- " service_charge development \\\n",
- "0 641.53 False \n",
- "1 NaN True \n",
- "2 NaN False \n",
- "3 2000.00 False \n",
- "4 NaN True \n",
- "... ... ... \n",
- "10569 NaN False \n",
- "10570 1700.00 False \n",
- "10571 NaN True \n",
- "10572 NaN False \n",
- "10573 0.00 False \n",
- "\n",
- " tenure_type updated_days status \\\n",
- "0 Leasehold 116 None \n",
- "1 Leasehold 83 None \n",
- "2 Leasehold 97 None \n",
- "3 Leasehold 261 None \n",
- "4 Please confirm if this is a freehold or leaseh... 96 None \n",
- "... ... ... ... \n",
- "10569 Leasehold 407 None \n",
- "10570 Leasehold 113 None \n",
- "10571 Leasehold 337 None \n",
- "10572 Leasehold 230 None \n",
- "10573 Leasehold 84 None \n",
- "\n",
- " last_seen decision \n",
- "0 0 None \n",
- "1 0 None \n",
- "2 0 None \n",
- "3 0 None \n",
- "4 0 None \n",
- "... ... ... \n",
- "10569 0 None \n",
- "10570 0 None \n",
- "10571 0 None \n",
- "10572 9 None \n",
- "10573 0 None \n",
- "\n",
- "[10574 rows x 16 columns]"
+ "[2254 rows x 16 columns]"
]
},
- "execution_count": 8,
+ "execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -536,7 +509,7 @@
{
"data": {
"text/plain": [
- "(10574, 16)"
+ "(2254, 16)"
]
},
"execution_count": 9,
@@ -548,22 +521,6 @@
"df.shape"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "51e2770c-7633-4bd3-9a63-11ea705f0694",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "127c5377-594d-450f-81c5-235acd8ca863",
- "metadata": {},
- "outputs": [],
- "source": []
- },
{
"cell_type": "code",
"execution_count": 10,
@@ -595,28 +552,30 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 14,
"id": "90500b06-9eb9-49e9-a0e0-6adef0b8effd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "(9494, 18)"
+ "(2254, 16)"
]
},
- "execution_count": 12,
+ "execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# remove all entries where we didnt calculate transit time (probably due to a too far distance)\n",
- "df2 = df[df.travel_time_fastest.notna()]\n",
+ "# df2 = df[df.travel_time_fastest.notna()]\n",
+ "df2 = df\n",
"\n",
"# drop columns\n",
"dropcolumns = ['distance_per_transit', 'duration_static', 'distance']\n",
- "s1 = df2['travel_time_fastest'].apply(pd.Series).drop(dropcolumns, axis=1)\n",
+ "# s1 = df2['travel_time_fastest'].apply(pd.Series).drop(dropcolumns, axis=1)\n",
+ "s1 = df2\n",
"\n",
"# fill in gap values for service charge and lease left. This is for excel so we can use filters better there\n",
"df2.loc[:, 'service_charge'] = df2.service_charge.fillna(-1)\n",
@@ -624,43 +583,36 @@
"df2.loc[:, 'sqm_ocr'] = df2.sqm_ocr.fillna(-1)\n",
"\n",
"\n",
- "df3 = pd.concat([df2.drop(['travel_time_fastest', 'travel_time_second'], axis=1), s1], axis=1)\n",
- "df3.loc[:, 'duration'] = (df3.loc[:, ['duration']].min(axis=1) / 60).round()\n",
+ "df3 = df2\n",
+ "# df3 = pd.concat([df2.drop(['travel_time_fastest', 'travel_time_second'], axis=1), s1], axis=1)\n",
+ "# df3.loc[:, 'duration'] = (df3.loc[:, ['duration']].min(axis=1) / 60).round()\n",
"df3.shape"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "2735f38e-7f05-4b5d-b835-1f3f31745ec7",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": 13,
+ "execution_count": 15,
"id": "227f434a-7daf-4f9b-944b-b22ce0216b13",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "0 None\n",
- "1 None\n",
- "2 None\n",
- "3 None\n",
- "4 None\n",
- " ... \n",
- "10569 None\n",
- "10570 None\n",
- "10571 None\n",
- "10572 None\n",
- "10573 None\n",
- "Name: status, Length: 9494, dtype: object"
+ "0 None\n",
+ "1 None\n",
+ "2 None\n",
+ "3 None\n",
+ "4 None\n",
+ " ... \n",
+ "2249 None\n",
+ "2250 None\n",
+ "2251 None\n",
+ "2252 None\n",
+ "2253 None\n",
+ "Name: status, Length: 2254, dtype: object"
]
},
- "execution_count": 13,
+ "execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
@@ -671,33 +623,34 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 17,
"id": "cc96e017-1a6f-4e27-b128-e3ef9ff9cb27",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "(6578, 17)"
+ "(2254, 16)"
]
},
- "execution_count": 14,
+ "execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# filter out undesirable\n",
- "available = df3.status.isna()\n",
- "near = df3.duration < 40\n",
- "last_seen_under_30 = df3.last_seen < 30\n",
- "df4 = df3[available & near & last_seen_under_30].drop(axis=1, columns=[\"status\"])\n",
+ "df4 = df3\n",
+ "# available = df3.status.isna()\n",
+ "# near = df3.duration < 40\n",
+ "# last_seen_under_30 = df3.last_seen < 30\n",
+ "# df4 = df3[available & near & last_seen_under_30].drop(axis=1, columns=[\"status\"])\n",
"df4.shape"
]
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 19,
"id": "7c1e9779-b3f8-4098-b2a4-4547ac8ca8f9",
"metadata": {},
"outputs": [
@@ -728,59 +681,56 @@
" price_per_sqm | \n",
" url | \n",
" bedrooms | \n",
+ " travel_time_fastest | \n",
+ " travel_time_second | \n",
" lease_left | \n",
" service_charge | \n",
" development | \n",
" tenure_type | \n",
" updated_days | \n",
+ " status | \n",
" last_seen | \n",
" decision | \n",
- " duration | \n",
- " initial_walk_duration | \n",
- " duration_per_transit | \n",
- " number_of_transit_stops | \n",
" \n",
" \n",
" \n",
" \n",
" | 0 | \n",
- " 105484772 | \n",
- " 45.7 | \n",
- " 325000.0 | \n",
- " 7111.597374 | \n",
- " https://www.rightmove.co.uk/properties/105484772 | \n",
- " 1 | \n",
- " 104.0 | \n",
- " 641.53 | \n",
- " False | \n",
- " Leasehold | \n",
- " 116 | \n",
- " 0 | \n",
- " None | \n",
- " 33 | \n",
- " 372 | \n",
- " {'WALK': 609, 'TRANSIT': 1109} | \n",
+ " 111829454 | \n",
+ " 57.4 | \n",
+ " 500000.0 | \n",
+ " 8710.801394 | \n",
+ " https://www.rightmove.co.uk/properties/111829454 | \n",
" 2 | \n",
+ " None | \n",
+ " None | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " Share of Freehold | \n",
+ " 2 | \n",
+ " None | \n",
+ " 2 | \n",
+ " None | \n",
"
\n",
" \n",
- " | 2 | \n",
- " 108102476 | \n",
- " 53.7 | \n",
- " 515000.0 | \n",
- " 9590.316574 | \n",
- " https://www.rightmove.co.uk/properties/108102476 | \n",
- " 1 | \n",
- " 104.0 | \n",
- " -1.00 | \n",
- " False | \n",
- " Leasehold | \n",
- " 97 | \n",
- " 0 | \n",
+ " 1 | \n",
+ " 118624844 | \n",
+ " -1.0 | \n",
+ " 400000.0 | \n",
+ " NaN | \n",
+ " https://www.rightmove.co.uk/properties/118624844 | \n",
+ " 3 | \n",
+ " None | \n",
+ " None | \n",
+ " -1.0 | \n",
+ " -1.0 | \n",
+ " False | \n",
+ " None | \n",
+ " 8 | \n",
+ " None | \n",
+ " 2 | \n",
" None | \n",
- " 21 | \n",
- " 593 | \n",
- " {'WALK': 819, 'TRANSIT': 445} | \n",
- " 1 | \n",
"
\n",
" \n",
"\n",
@@ -788,33 +738,30 @@
],
"text/plain": [
" identifier sqm_ocr price price_per_sqm \\\n",
- "0 105484772 45.7 325000.0 7111.597374 \n",
- "2 108102476 53.7 515000.0 9590.316574 \n",
+ "0 111829454 57.4 500000.0 8710.801394 \n",
+ "1 118624844 -1.0 400000.0 NaN \n",
"\n",
- " url bedrooms lease_left \\\n",
- "0 https://www.rightmove.co.uk/properties/105484772 1 104.0 \n",
- "2 https://www.rightmove.co.uk/properties/108102476 1 104.0 \n",
+ " url bedrooms \\\n",
+ "0 https://www.rightmove.co.uk/properties/111829454 2 \n",
+ "1 https://www.rightmove.co.uk/properties/118624844 3 \n",
"\n",
- " service_charge development tenure_type updated_days last_seen decision \\\n",
- "0 641.53 False Leasehold 116 0 None \n",
- "2 -1.00 False Leasehold 97 0 None \n",
+ " travel_time_fastest travel_time_second lease_left service_charge \\\n",
+ "0 None None 0.0 0.0 \n",
+ "1 None None -1.0 -1.0 \n",
"\n",
- " duration initial_walk_duration duration_per_transit \\\n",
- "0 33 372 {'WALK': 609, 'TRANSIT': 1109} \n",
- "2 21 593 {'WALK': 819, 'TRANSIT': 445} \n",
- "\n",
- " number_of_transit_stops \n",
- "0 2 \n",
- "2 1 "
+ " development tenure_type updated_days status last_seen decision \n",
+ "0 False Share of Freehold 2 None 2 None \n",
+ "1 False None 8 None 2 None "
]
},
- "execution_count": 15,
+ "execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "df4.to_clipboard()\n",
+ "# df4.to_clipboard()\n",
+ "df4.to_csv('data/filtered_listings.csv', index=False)\n",
"df4.head(2)"
]
},
@@ -1231,32 +1178,6 @@
" return response.json()"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "d185e013-9beb-4e57-9d8b-a830e14339c3",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "cabe5d3f-ad0f-49c2-9fbf-686539a05bd1",
- "metadata": {},
- "outputs": [],
- "source": [
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "id": "ba500fe6-fb18-466e-a697-403d28181674",
- "metadata": {},
- "outputs": [],
- "source": []
- },
{
"cell_type": "code",
"execution_count": 46,
@@ -1324,86 +1245,6 @@
"print(bor_to_locid)"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "157e20a2-d137-45b8-802b-948fa8e04ba3",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "386578dc-1ad5-4b8a-8905-29b0c47a6174",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "10c17fdf-f424-40cb-9d8c-9218f8d4ab53",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "88d99eb7-8c92-4817-86ce-ba0738331dba",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "6c8b4488-ae2b-41ab-9c95-e3c85f9fb77e",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "c3072907-7ad8-4618-92ab-818e392218d9",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "b20f6f16-3236-4772-b1a3-2d4a3b1925a6",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "1a7230dc-1a0f-43e2-bd15-0c85ea445733",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "aeaf84bf-8514-48c6-88ce-2c6828bdcdf2",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "c888d4e6-d192-45df-b9b6-5e2d39bca344",
- "metadata": {},
- "outputs": [],
- "source": []
- },
{
"cell_type": "code",
"execution_count": 1,
@@ -1521,22 +1362,6 @@
"js"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "99d1a790-19d4-4686-b916-ee74fb3b2411",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "a4a2937c-1509-4c52-8aec-7bc86372092f",
- "metadata": {},
- "outputs": [],
- "source": []
- },
{
"cell_type": "code",
"execution_count": 33,
@@ -1568,14 +1393,6 @@
"l.last_seen"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "a437a351-e3aa-458a-b692-3b630e598fa0",
- "metadata": {},
- "outputs": [],
- "source": []
- },
{
"cell_type": "code",
"execution_count": 24,
@@ -1609,39 +1426,13 @@
"with open('/tmp/la.json', 'w') as f:\n",
" json.dump(datetime.datetime.now().isoformat(), f)"
]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "id": "8b8bc05a-46a8-4f39-8b68-22ad62d13fe1",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\"2025-01-26T21:37:07.744971\""
- ]
- }
- ],
- "source": [
- "!cat /tmp/la.json"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "341fe82c-fda8-4fc6-8bc0-081b285a5330",
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3 (ipykernel)",
+ "display_name": "venv",
"language": "python",
- "name": "python3"
+ "name": "venv"
},
"language_info": {
"codemirror_mode": {
@@ -1653,7 +1444,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.12.0"
+ "version": "3.13.3"
}
},
"nbformat": 4,