From 91d3237516509e5fc4593193cea8e65a2b4ca4e0 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 17 May 2025 23:24:40 +0000 Subject: [PATCH] remove some empty fields in exploration notebook --- crawler/exploration.ipynb | 779 ++++++++++++++------------------------ 1 file changed, 285 insertions(+), 494 deletions(-) diff --git a/crawler/exploration.ipynb b/crawler/exploration.ipynb index 93e7751..ab3dc60 100644 --- a/crawler/exploration.ipynb +++ b/crawler/exploration.ipynb @@ -10,14 +10,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "/Users/kadir/code/realestate/crawler/venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + "/home/wizard/.cache/pypoetry/virtualenvs/rec-g1fA4zXM-py3.13/lib/python3.13/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "from data_access import Listing\n", - "import pandas as pd" + "import pandas as pd\n", + "\n" ] }, { @@ -33,7 +34,18 @@ "execution_count": 2, "id": "424501ab-ecc6-42f5-b87e-b0d2871bdc74", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_1627094/3290399543.py:3: FutureWarning: The behavior of 'to_datetime' with 'unit' when parsing strings is deprecated. In a future version, strings will be parsed as datetime strings, matching the behavior without a 'unit'. To retain the old behavior, explicitly cast ints or floats to numeric type before calling to_datetime.\n", + " decisions = pd.read_json(decisions_path)\n", + "/tmp/ipykernel_1627094/3290399543.py:3: FutureWarning: The behavior of 'to_datetime' with 'unit' when parsing strings is deprecated. In a future version, strings will be parsed as datetime strings, matching the behavior without a 'unit'. To retain the old behavior, explicitly cast ints or floats to numeric type before calling to_datetime.\n", + " decisions = pd.read_json(decisions_path)\n" + ] + } + ], "source": [ "# read decisions on file\n", "decisions_path = 'data/decisions.json'\n", @@ -128,7 +140,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "b1101088-9613-465f-81fd-79801e0202b8", "metadata": {}, "outputs": [ @@ -136,7 +148,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "10574\n" + "2254\n" ] } ], @@ -153,7 +165,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "63e61601-7e3f-4d58-89f6-1794e4868cc3", "metadata": {}, "outputs": [], @@ -163,7 +175,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "1c222721-f426-42c0-9ac5-badc1f7a2034", "metadata": {}, "outputs": [ @@ -209,97 +221,97 @@ " \n", " \n", " 0\n", - " 105484772\n", - " 45.7\n", - " 325000.0\n", - " 7111.597374\n", - " https://www.rightmove.co.uk/properties/105484772\n", - " 1\n", - " {'duration': 1983, 'distance': 10095, 'duratio...\n", - " {'duration': 2043, 'distance': 10083, 'duratio...\n", - " 104.0\n", - " 641.53\n", - " False\n", - " Leasehold\n", - " 116\n", + " 111829454\n", + " 57.40\n", + " 500000.0\n", + " 8710.801394\n", + " https://www.rightmove.co.uk/properties/111829454\n", + " 2\n", " None\n", - " 0\n", + " None\n", + " 0.0\n", + " 0.0\n", + " False\n", + " Share of Freehold\n", + " 2\n", + " None\n", + " 2\n", " None\n", " \n", " \n", " 1\n", - " 105827126\n", - " 58.5\n", - " 950000.0\n", - " 16239.316239\n", - " https://www.rightmove.co.uk/properties/105827126\n", - " 1\n", - " {'duration': 2478, 'distance': 9584, 'duration...\n", - " {'duration': 2478, 'distance': 9584, 'duration...\n", + " 118624844\n", " NaN\n", + " 400000.0\n", " NaN\n", - " True\n", - " Leasehold\n", - " 83\n", + " https://www.rightmove.co.uk/properties/118624844\n", + " 3\n", " None\n", - " 0\n", + " None\n", + " NaN\n", + " NaN\n", + " False\n", + " None\n", + " 8\n", + " None\n", + " 2\n", " None\n", " \n", " \n", " 2\n", - " 108102476\n", - " 53.7\n", - " 515000.0\n", - " 9590.316574\n", - " https://www.rightmove.co.uk/properties/108102476\n", + " 121349855\n", + " 45.23\n", + " 630000.0\n", + " 13928.808313\n", + " https://www.rightmove.co.uk/properties/121349855\n", " 1\n", - " {'duration': 1266, 'distance': 4042, 'duration...\n", - " {'duration': 1861, 'distance': 4548, 'duration...\n", - " 104.0\n", + " None\n", + " None\n", + " NaN\n", " NaN\n", " False\n", " Leasehold\n", - " 97\n", + " 24\n", " None\n", - " 0\n", + " 2\n", " None\n", " \n", " \n", " 3\n", - " 108171770\n", - " 45.0\n", - " 650000.0\n", - " 14444.444444\n", - " https://www.rightmove.co.uk/properties/108171770\n", - " 2\n", - " {'duration': 1591, 'distance': 7827, 'duration...\n", - " {'duration': 1591, 'distance': 7827, 'duration...\n", - " 962.0\n", - " 2000.00\n", + " 122409413\n", + " 51.59\n", + " 550000.0\n", + " 10660.980810\n", + " https://www.rightmove.co.uk/properties/122409413\n", + " 1\n", + " None\n", + " None\n", + " 259.0\n", + " NaN\n", " False\n", " Leasehold\n", - " 261\n", + " 241\n", " None\n", - " 0\n", + " 2\n", " None\n", " \n", " \n", " 4\n", - " 109595123\n", - " NaN\n", - " 1000000.0\n", - " NaN\n", - " https://www.rightmove.co.uk/properties/109595123\n", + " 123219209\n", + " 59.34\n", + " 960000.0\n", + " 16177.957533\n", + " https://www.rightmove.co.uk/properties/123219209\n", " 1\n", - " {'duration': 2463, 'distance': 9565, 'duration...\n", - " {'duration': 2463, 'distance': 9565, 'duration...\n", - " NaN\n", - " NaN\n", - " True\n", - " Please confirm if this is a freehold or leaseh...\n", - " 96\n", " None\n", - " 0\n", + " None\n", + " 993.0\n", + " 6300.0\n", + " False\n", + " Leasehold\n", + " 9\n", + " None\n", + " 2\n", " None\n", " \n", " \n", @@ -322,201 +334,162 @@ " ...\n", " \n", " \n", - " 10569\n", + " 2249\n", + " 87149832\n", + " NaN\n", + " 800000.0\n", + " NaN\n", + " https://www.rightmove.co.uk/properties/87149832\n", + " 2\n", + " None\n", + " None\n", + " NaN\n", + " NaN\n", + " False\n", + " Leasehold\n", + " 2\n", + " None\n", + " 2\n", + " None\n", + " \n", + " \n", + " 2250\n", + " 87150201\n", + " 55.40\n", + " 500000.0\n", + " 9025.270758\n", + " https://www.rightmove.co.uk/properties/87150201\n", + " 1\n", + " None\n", + " None\n", + " NaN\n", + " NaN\n", + " False\n", + " Share of Freehold\n", + " 8\n", + " None\n", + " 2\n", + " None\n", + " \n", + " \n", + " 2251\n", + " 87150213\n", + " 55.48\n", + " 495000.0\n", + " 8922.134102\n", + " https://www.rightmove.co.uk/properties/87150213\n", + " 2\n", + " None\n", + " None\n", + " 106.0\n", + " 1773.0\n", + " False\n", + " Leasehold\n", + " 2\n", + " None\n", + " 2\n", + " None\n", + " \n", + " \n", + " 2252\n", " 88731877\n", " NaN\n", " 570000.0\n", " NaN\n", " https://www.rightmove.co.uk/properties/88731877\n", " 1\n", - " {'duration': 912, 'distance': 6329, 'duration_...\n", - " {'duration': 852, 'distance': 6329, 'duration_...\n", - " 998.0\n", + " None\n", + " None\n", + " 995.0\n", " NaN\n", " False\n", " Leasehold\n", - " 407\n", - " None\n", - " 0\n", - " None\n", - " \n", - " \n", - " 10570\n", - " 89825950\n", - " 48.9\n", - " 680000.0\n", - " 13905.930470\n", - " https://www.rightmove.co.uk/properties/89825950\n", - " 1\n", - " {'duration': 273, 'distance': 762, 'duration_s...\n", - " {'duration': 273, 'distance': 762, 'duration_s...\n", - " 112.0\n", - " 1700.00\n", - " False\n", - " Leasehold\n", - " 113\n", - " None\n", - " 0\n", - " None\n", - " \n", - " \n", - " 10571\n", - " 94206080\n", - " 49.6\n", - " 899000.0\n", - " 18125.000000\n", - " https://www.rightmove.co.uk/properties/94206080\n", - " 1\n", - " {'duration': 1125, 'distance': 4637, 'duration...\n", - " {'duration': 1125, 'distance': 4641, 'duration...\n", - " NaN\n", - " NaN\n", - " True\n", - " Leasehold\n", - " 337\n", - " None\n", - " 0\n", - " None\n", - " \n", - " \n", - " 10572\n", - " 94508306\n", - " 94.0\n", - " 1000000.0\n", - " 10638.297872\n", - " https://www.rightmove.co.uk/properties/94508306\n", " 2\n", - " {'duration': 1046, 'distance': 2193, 'duration...\n", - " {'duration': 1046, 'distance': 2193, 'duration...\n", - " 977.0\n", - " NaN\n", - " False\n", - " Leasehold\n", - " 230\n", " None\n", - " 9\n", + " 2\n", " None\n", " \n", " \n", - " 10573\n", + " 2253\n", " 95975483\n", " NaN\n", - " 800000.0\n", + " 799950.0\n", " NaN\n", " https://www.rightmove.co.uk/properties/95975483\n", " 2\n", - " {'duration': 2281, 'distance': 7262, 'duration...\n", - " {'duration': 2815, 'distance': 5607, 'duration...\n", + " None\n", + " None\n", " 999.0\n", - " 0.00\n", + " 0.0\n", " False\n", " Leasehold\n", - " 84\n", + " 25\n", " None\n", - " 0\n", + " 2\n", " None\n", " \n", " \n", "\n", - "

10574 rows × 16 columns

\n", + "

2254 rows × 16 columns

\n", "" ], "text/plain": [ - " identifier sqm_ocr price price_per_sqm \\\n", - "0 105484772 45.7 325000.0 7111.597374 \n", - "1 105827126 58.5 950000.0 16239.316239 \n", - "2 108102476 53.7 515000.0 9590.316574 \n", - "3 108171770 45.0 650000.0 14444.444444 \n", - "4 109595123 NaN 1000000.0 NaN \n", - "... ... ... ... ... \n", - "10569 88731877 NaN 570000.0 NaN \n", - "10570 89825950 48.9 680000.0 13905.930470 \n", - "10571 94206080 49.6 899000.0 18125.000000 \n", - "10572 94508306 94.0 1000000.0 10638.297872 \n", - "10573 95975483 NaN 800000.0 NaN \n", + " identifier sqm_ocr price price_per_sqm \\\n", + "0 111829454 57.40 500000.0 8710.801394 \n", + "1 118624844 NaN 400000.0 NaN \n", + "2 121349855 45.23 630000.0 13928.808313 \n", + "3 122409413 51.59 550000.0 10660.980810 \n", + "4 123219209 59.34 960000.0 16177.957533 \n", + "... ... ... ... ... \n", + "2249 87149832 NaN 800000.0 NaN \n", + "2250 87150201 55.40 500000.0 9025.270758 \n", + "2251 87150213 55.48 495000.0 8922.134102 \n", + "2252 88731877 NaN 570000.0 NaN \n", + "2253 95975483 NaN 799950.0 NaN \n", "\n", - " url bedrooms \\\n", - "0 https://www.rightmove.co.uk/properties/105484772 1 \n", - "1 https://www.rightmove.co.uk/properties/105827126 1 \n", - "2 https://www.rightmove.co.uk/properties/108102476 1 \n", - "3 https://www.rightmove.co.uk/properties/108171770 2 \n", - "4 https://www.rightmove.co.uk/properties/109595123 1 \n", - "... ... ... \n", - "10569 https://www.rightmove.co.uk/properties/88731877 1 \n", - "10570 https://www.rightmove.co.uk/properties/89825950 1 \n", - "10571 https://www.rightmove.co.uk/properties/94206080 1 \n", - "10572 https://www.rightmove.co.uk/properties/94508306 2 \n", - "10573 https://www.rightmove.co.uk/properties/95975483 2 \n", + " url bedrooms \\\n", + "0 https://www.rightmove.co.uk/properties/111829454 2 \n", + "1 https://www.rightmove.co.uk/properties/118624844 3 \n", + "2 https://www.rightmove.co.uk/properties/121349855 1 \n", + "3 https://www.rightmove.co.uk/properties/122409413 1 \n", + "4 https://www.rightmove.co.uk/properties/123219209 1 \n", + "... ... ... \n", + "2249 https://www.rightmove.co.uk/properties/87149832 2 \n", + "2250 https://www.rightmove.co.uk/properties/87150201 1 \n", + "2251 https://www.rightmove.co.uk/properties/87150213 2 \n", + "2252 https://www.rightmove.co.uk/properties/88731877 1 \n", + "2253 https://www.rightmove.co.uk/properties/95975483 2 \n", "\n", - " travel_time_fastest \\\n", - "0 {'duration': 1983, 'distance': 10095, 'duratio... \n", - "1 {'duration': 2478, 'distance': 9584, 'duration... \n", - "2 {'duration': 1266, 'distance': 4042, 'duration... \n", - "3 {'duration': 1591, 'distance': 7827, 'duration... \n", - "4 {'duration': 2463, 'distance': 9565, 'duration... \n", - "... ... \n", - "10569 {'duration': 912, 'distance': 6329, 'duration_... \n", - "10570 {'duration': 273, 'distance': 762, 'duration_s... \n", - "10571 {'duration': 1125, 'distance': 4637, 'duration... \n", - "10572 {'duration': 1046, 'distance': 2193, 'duration... \n", - "10573 {'duration': 2281, 'distance': 7262, 'duration... \n", + " travel_time_fastest travel_time_second lease_left service_charge \\\n", + "0 None None 0.0 0.0 \n", + "1 None None NaN NaN \n", + "2 None None NaN NaN \n", + "3 None None 259.0 NaN \n", + "4 None None 993.0 6300.0 \n", + "... ... ... ... ... \n", + "2249 None None NaN NaN \n", + "2250 None None NaN NaN \n", + "2251 None None 106.0 1773.0 \n", + "2252 None None 995.0 NaN \n", + "2253 None None 999.0 0.0 \n", "\n", - " travel_time_second lease_left \\\n", - "0 {'duration': 2043, 'distance': 10083, 'duratio... 104.0 \n", - "1 {'duration': 2478, 'distance': 9584, 'duration... NaN \n", - "2 {'duration': 1861, 'distance': 4548, 'duration... 104.0 \n", - "3 {'duration': 1591, 'distance': 7827, 'duration... 962.0 \n", - "4 {'duration': 2463, 'distance': 9565, 'duration... NaN \n", - "... ... ... \n", - "10569 {'duration': 852, 'distance': 6329, 'duration_... 998.0 \n", - "10570 {'duration': 273, 'distance': 762, 'duration_s... 112.0 \n", - "10571 {'duration': 1125, 'distance': 4641, 'duration... NaN \n", - "10572 {'duration': 1046, 'distance': 2193, 'duration... 977.0 \n", - "10573 {'duration': 2815, 'distance': 5607, 'duration... 999.0 \n", + " development tenure_type updated_days status last_seen decision \n", + "0 False Share of Freehold 2 None 2 None \n", + "1 False None 8 None 2 None \n", + "2 False Leasehold 24 None 2 None \n", + "3 False Leasehold 241 None 2 None \n", + "4 False Leasehold 9 None 2 None \n", + "... ... ... ... ... ... ... \n", + "2249 False Leasehold 2 None 2 None \n", + "2250 False Share of Freehold 8 None 2 None \n", + "2251 False Leasehold 2 None 2 None \n", + "2252 False Leasehold 2 None 2 None \n", + "2253 False Leasehold 25 None 2 None \n", "\n", - " service_charge development \\\n", - "0 641.53 False \n", - "1 NaN True \n", - "2 NaN False \n", - "3 2000.00 False \n", - "4 NaN True \n", - "... ... ... \n", - "10569 NaN False \n", - "10570 1700.00 False \n", - "10571 NaN True \n", - "10572 NaN False \n", - "10573 0.00 False \n", - "\n", - " tenure_type updated_days status \\\n", - "0 Leasehold 116 None \n", - "1 Leasehold 83 None \n", - "2 Leasehold 97 None \n", - "3 Leasehold 261 None \n", - "4 Please confirm if this is a freehold or leaseh... 96 None \n", - "... ... ... ... \n", - "10569 Leasehold 407 None \n", - "10570 Leasehold 113 None \n", - "10571 Leasehold 337 None \n", - "10572 Leasehold 230 None \n", - "10573 Leasehold 84 None \n", - "\n", - " last_seen decision \n", - "0 0 None \n", - "1 0 None \n", - "2 0 None \n", - "3 0 None \n", - "4 0 None \n", - "... ... ... \n", - "10569 0 None \n", - "10570 0 None \n", - "10571 0 None \n", - "10572 9 None \n", - "10573 0 None \n", - "\n", - "[10574 rows x 16 columns]" + "[2254 rows x 16 columns]" ] }, - "execution_count": 8, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -536,7 +509,7 @@ { "data": { "text/plain": [ - "(10574, 16)" + "(2254, 16)" ] }, "execution_count": 9, @@ -548,22 +521,6 @@ "df.shape" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "51e2770c-7633-4bd3-9a63-11ea705f0694", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "127c5377-594d-450f-81c5-235acd8ca863", - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": 10, @@ -595,28 +552,30 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "id": "90500b06-9eb9-49e9-a0e0-6adef0b8effd", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(9494, 18)" + "(2254, 16)" ] }, - "execution_count": 12, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# remove all entries where we didnt calculate transit time (probably due to a too far distance)\n", - "df2 = df[df.travel_time_fastest.notna()]\n", + "# df2 = df[df.travel_time_fastest.notna()]\n", + "df2 = df\n", "\n", "# drop columns\n", "dropcolumns = ['distance_per_transit', 'duration_static', 'distance']\n", - "s1 = df2['travel_time_fastest'].apply(pd.Series).drop(dropcolumns, axis=1)\n", + "# s1 = df2['travel_time_fastest'].apply(pd.Series).drop(dropcolumns, axis=1)\n", + "s1 = df2\n", "\n", "# fill in gap values for service charge and lease left. This is for excel so we can use filters better there\n", "df2.loc[:, 'service_charge'] = df2.service_charge.fillna(-1)\n", @@ -624,43 +583,36 @@ "df2.loc[:, 'sqm_ocr'] = df2.sqm_ocr.fillna(-1)\n", "\n", "\n", - "df3 = pd.concat([df2.drop(['travel_time_fastest', 'travel_time_second'], axis=1), s1], axis=1)\n", - "df3.loc[:, 'duration'] = (df3.loc[:, ['duration']].min(axis=1) / 60).round()\n", + "df3 = df2\n", + "# df3 = pd.concat([df2.drop(['travel_time_fastest', 'travel_time_second'], axis=1), s1], axis=1)\n", + "# df3.loc[:, 'duration'] = (df3.loc[:, ['duration']].min(axis=1) / 60).round()\n", "df3.shape" ] }, { "cell_type": "code", - "execution_count": null, - "id": "2735f38e-7f05-4b5d-b835-1f3f31745ec7", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "id": "227f434a-7daf-4f9b-944b-b22ce0216b13", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 None\n", - "1 None\n", - "2 None\n", - "3 None\n", - "4 None\n", - " ... \n", - "10569 None\n", - "10570 None\n", - "10571 None\n", - "10572 None\n", - "10573 None\n", - "Name: status, Length: 9494, dtype: object" + "0 None\n", + "1 None\n", + "2 None\n", + "3 None\n", + "4 None\n", + " ... \n", + "2249 None\n", + "2250 None\n", + "2251 None\n", + "2252 None\n", + "2253 None\n", + "Name: status, Length: 2254, dtype: object" ] }, - "execution_count": 13, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -671,33 +623,34 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 17, "id": "cc96e017-1a6f-4e27-b128-e3ef9ff9cb27", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(6578, 17)" + "(2254, 16)" ] }, - "execution_count": 14, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# filter out undesirable\n", - "available = df3.status.isna()\n", - "near = df3.duration < 40\n", - "last_seen_under_30 = df3.last_seen < 30\n", - "df4 = df3[available & near & last_seen_under_30].drop(axis=1, columns=[\"status\"])\n", + "df4 = df3\n", + "# available = df3.status.isna()\n", + "# near = df3.duration < 40\n", + "# last_seen_under_30 = df3.last_seen < 30\n", + "# df4 = df3[available & near & last_seen_under_30].drop(axis=1, columns=[\"status\"])\n", "df4.shape" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 19, "id": "7c1e9779-b3f8-4098-b2a4-4547ac8ca8f9", "metadata": {}, "outputs": [ @@ -728,59 +681,56 @@ " price_per_sqm\n", " url\n", " bedrooms\n", + " travel_time_fastest\n", + " travel_time_second\n", " lease_left\n", " service_charge\n", " development\n", " tenure_type\n", " updated_days\n", + " status\n", " last_seen\n", " decision\n", - " duration\n", - " initial_walk_duration\n", - " duration_per_transit\n", - " number_of_transit_stops\n", " \n", " \n", " \n", " \n", " 0\n", - " 105484772\n", - " 45.7\n", - " 325000.0\n", - " 7111.597374\n", - " https://www.rightmove.co.uk/properties/105484772\n", - " 1\n", - " 104.0\n", - " 641.53\n", - " False\n", - " Leasehold\n", - " 116\n", - " 0\n", - " None\n", - " 33\n", - " 372\n", - " {'WALK': 609, 'TRANSIT': 1109}\n", + " 111829454\n", + " 57.4\n", + " 500000.0\n", + " 8710.801394\n", + " https://www.rightmove.co.uk/properties/111829454\n", " 2\n", + " None\n", + " None\n", + " 0.0\n", + " 0.0\n", + " False\n", + " Share of Freehold\n", + " 2\n", + " None\n", + " 2\n", + " None\n", " \n", " \n", - " 2\n", - " 108102476\n", - " 53.7\n", - " 515000.0\n", - " 9590.316574\n", - " https://www.rightmove.co.uk/properties/108102476\n", - " 1\n", - " 104.0\n", - " -1.00\n", - " False\n", - " Leasehold\n", - " 97\n", - " 0\n", + " 1\n", + " 118624844\n", + " -1.0\n", + " 400000.0\n", + " NaN\n", + " https://www.rightmove.co.uk/properties/118624844\n", + " 3\n", + " None\n", + " None\n", + " -1.0\n", + " -1.0\n", + " False\n", + " None\n", + " 8\n", + " None\n", + " 2\n", " None\n", - " 21\n", - " 593\n", - " {'WALK': 819, 'TRANSIT': 445}\n", - " 1\n", " \n", " \n", "\n", @@ -788,33 +738,30 @@ ], "text/plain": [ " identifier sqm_ocr price price_per_sqm \\\n", - "0 105484772 45.7 325000.0 7111.597374 \n", - "2 108102476 53.7 515000.0 9590.316574 \n", + "0 111829454 57.4 500000.0 8710.801394 \n", + "1 118624844 -1.0 400000.0 NaN \n", "\n", - " url bedrooms lease_left \\\n", - "0 https://www.rightmove.co.uk/properties/105484772 1 104.0 \n", - "2 https://www.rightmove.co.uk/properties/108102476 1 104.0 \n", + " url bedrooms \\\n", + "0 https://www.rightmove.co.uk/properties/111829454 2 \n", + "1 https://www.rightmove.co.uk/properties/118624844 3 \n", "\n", - " service_charge development tenure_type updated_days last_seen decision \\\n", - "0 641.53 False Leasehold 116 0 None \n", - "2 -1.00 False Leasehold 97 0 None \n", + " travel_time_fastest travel_time_second lease_left service_charge \\\n", + "0 None None 0.0 0.0 \n", + "1 None None -1.0 -1.0 \n", "\n", - " duration initial_walk_duration duration_per_transit \\\n", - "0 33 372 {'WALK': 609, 'TRANSIT': 1109} \n", - "2 21 593 {'WALK': 819, 'TRANSIT': 445} \n", - "\n", - " number_of_transit_stops \n", - "0 2 \n", - "2 1 " + " development tenure_type updated_days status last_seen decision \n", + "0 False Share of Freehold 2 None 2 None \n", + "1 False None 8 None 2 None " ] }, - "execution_count": 15, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df4.to_clipboard()\n", + "# df4.to_clipboard()\n", + "df4.to_csv('data/filtered_listings.csv', index=False)\n", "df4.head(2)" ] }, @@ -1231,32 +1178,6 @@ " return response.json()" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "d185e013-9beb-4e57-9d8b-a830e14339c3", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cabe5d3f-ad0f-49c2-9fbf-686539a05bd1", - "metadata": {}, - "outputs": [], - "source": [ - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "ba500fe6-fb18-466e-a697-403d28181674", - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": 46, @@ -1324,86 +1245,6 @@ "print(bor_to_locid)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "157e20a2-d137-45b8-802b-948fa8e04ba3", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "386578dc-1ad5-4b8a-8905-29b0c47a6174", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "10c17fdf-f424-40cb-9d8c-9218f8d4ab53", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "88d99eb7-8c92-4817-86ce-ba0738331dba", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6c8b4488-ae2b-41ab-9c95-e3c85f9fb77e", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c3072907-7ad8-4618-92ab-818e392218d9", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b20f6f16-3236-4772-b1a3-2d4a3b1925a6", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1a7230dc-1a0f-43e2-bd15-0c85ea445733", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aeaf84bf-8514-48c6-88ce-2c6828bdcdf2", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c888d4e6-d192-45df-b9b6-5e2d39bca344", - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": 1, @@ -1521,22 +1362,6 @@ "js" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "99d1a790-19d4-4686-b916-ee74fb3b2411", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a4a2937c-1509-4c52-8aec-7bc86372092f", - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": 33, @@ -1568,14 +1393,6 @@ "l.last_seen" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "a437a351-e3aa-458a-b692-3b630e598fa0", - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": 24, @@ -1609,39 +1426,13 @@ "with open('/tmp/la.json', 'w') as f:\n", " json.dump(datetime.datetime.now().isoformat(), f)" ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "8b8bc05a-46a8-4f39-8b68-22ad62d13fe1", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\"2025-01-26T21:37:07.744971\"" - ] - } - ], - "source": [ - "!cat /tmp/la.json" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "341fe82c-fda8-4fc6-8bc0-081b285a5330", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "venv", "language": "python", - "name": "python3" + "name": "venv" }, "language_info": { "codemirror_mode": { @@ -1653,7 +1444,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.0" + "version": "3.13.3" } }, "nbformat": 4,