diff --git a/crawler/2_dump_detail.py b/crawler/2_dump_detail.py index 49ed4f3..62cad80 100644 --- a/crawler/2_dump_detail.py +++ b/crawler/2_dump_detail.py @@ -9,7 +9,7 @@ filtered_listings = [] for listing in listings: if not listing.path_detail_json().exists(): filtered_listings.append(listing) - + for listing in tqdm(filtered_listings): try: d = detail_query(listing.identifier) diff --git a/crawler/exploration.ipynb b/crawler/exploration.ipynb index 3a63d40..5972562 100644 --- a/crawler/exploration.ipynb +++ b/crawler/exploration.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "f20bddee-1e7c-4c46-a17a-c7bb6c13f30c", + "id": "38e8690a-f6f7-4e14-a657-f20605477afd", "metadata": {}, "outputs": [ { @@ -20,51 +20,172 @@ "import pandas as pd" ] }, + { + "cell_type": "markdown", + "id": "cfe2ab03-3204-4fd8-b76a-a734f6b87d75", + "metadata": {}, + "source": [ + "### Fetch previous decisions" + ] + }, { "cell_type": "code", "execution_count": 2, - "id": "b1101088-9613-465f-81fd-79801e0202b8", + "id": "db55b615-698c-4f5d-881a-ea1d3b6d6205", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(93, 2)\n" + ] + }, + { + "data": { + "text/html": [ + "
| \n", + " | identifier | \n", + "decision | \n", + "
|---|---|---|
| 2047 | \n", + "145699277 | \n", + "n | \n", + "
| 1442 | \n", + "144642851 | \n", + "n | \n", + "
| 1839 | \n", + "145394765 | \n", + "n | \n", + "
| 1853 | \n", + "145418669 | \n", + "removed | \n", + "
| 930 | \n", + "143205230 | \n", + "n | \n", + "
2186 rows × 11 columns
\n", + "34542 rows × 11 columns
\n", "" ], "text/plain": [ - " identifier sqm_ocr price price_per_sqm \\\n", - "0 102360773 NaN 350000.0 NaN \n", - "1 105836849 NaN 400000.0 NaN \n", - "2 107233214 76.91 400000.0 5200.884150 \n", - "3 107976896 112.40 800000.0 7117.437722 \n", - "4 115499441 115.60 775000.0 6704.152249 \n", - "... ... ... ... ... \n", - "2181 86813508 NaN 750000.0 NaN \n", - "2182 86813523 NaN 655000.0 NaN \n", - "2183 86814669 82.80 550000.0 6642.512077 \n", - "2184 86955958 90.00 300000.0 3333.333333 \n", - "2185 86972726 81.60 790000.0 9681.372549 \n", + " identifier sqm_ocr price price_per_sqm \\\n", + "0 100506851 58.4 525000.0 8989.726027 \n", + "1 100938761 NaN 390000.0 NaN \n", + "2 101817179 53.2 495000.0 9304.511278 \n", + "3 101939660 56.5 238000.0 4212.389381 \n", + "4 102103157 NaN 425000.0 NaN \n", + "... ... ... ... ... \n", + "34537 97023443 8.3 699999.0 84337.228916 \n", + "34538 97124237 53.4 300000.0 5617.977528 \n", + "34539 97335680 48.0 315000.0 6562.500000 \n", + "34540 97522346 NaN 400000.0 NaN \n", + "34541 98352914 NaN 399950.0 NaN \n", "\n", - " url bedrooms \\\n", - "0 https://www.rightmove.co.uk/properties/102360773 1 \n", - "1 https://www.rightmove.co.uk/properties/105836849 3 \n", - "2 https://www.rightmove.co.uk/properties/107233214 3 \n", - "3 https://www.rightmove.co.uk/properties/107976896 3 \n", - "4 https://www.rightmove.co.uk/properties/115499441 3 \n", - "... ... ... \n", - "2181 https://www.rightmove.co.uk/properties/86813508 3 \n", - "2182 https://www.rightmove.co.uk/properties/86813523 3 \n", - "2183 https://www.rightmove.co.uk/properties/86814669 3 \n", - "2184 https://www.rightmove.co.uk/properties/86955958 3 \n", - "2185 https://www.rightmove.co.uk/properties/86972726 2 \n", + " url bedrooms \\\n", + "0 https://www.rightmove.co.uk/properties/100506851 2 \n", + "1 https://www.rightmove.co.uk/properties/100938761 1 \n", + "2 https://www.rightmove.co.uk/properties/101817179 1 \n", + "3 https://www.rightmove.co.uk/properties/101939660 2 \n", + "4 https://www.rightmove.co.uk/properties/102103157 1 \n", + "... ... ... \n", + "34537 https://www.rightmove.co.uk/properties/97023443 1 \n", + "34538 https://www.rightmove.co.uk/properties/97124237 1 \n", + "34539 https://www.rightmove.co.uk/properties/97335680 2 \n", + "34540 https://www.rightmove.co.uk/properties/97522346 2 \n", + "34541 https://www.rightmove.co.uk/properties/98352914 2 \n", "\n", - " travel_time_fastest \\\n", - "0 {'duration': 2695, 'distance': 6467, 'duration... \n", - "1 {'duration': 2565, 'distance': 14070, 'duratio... \n", - "2 {'duration': 1714, 'distance': 9570, 'duration... \n", - "3 {'duration': 1862, 'distance': 8278, 'duration... \n", - "4 {'duration': 2943, 'distance': 7437, 'duration... \n", - "... ... \n", - "2181 {'duration': 2400, 'distance': 13983, 'duratio... \n", - "2182 {'duration': 2400, 'distance': 13983, 'duratio... \n", - "2183 {'duration': 2310, 'distance': 12972, 'duratio... \n", - "2184 {'duration': 2332, 'distance': 6898, 'duration... \n", - "2185 {'duration': 1393, 'distance': 6390, 'duration... \n", + " travel_time_fastest \\\n", + "0 {'duration': 1948, 'distance': 10927, 'duratio... \n", + "1 None \n", + "2 {'duration': 2702, 'distance': 8637, 'duration... \n", + "3 {'duration': 2262, 'distance': 13512, 'duratio... \n", + "4 None \n", + "... ... \n", + "34537 {'duration': 1704, 'distance': 8729, 'duration... \n", + "34538 None \n", + "34539 None \n", + "34540 None \n", + "34541 None \n", "\n", - " travel_time_second lease_left \\\n", - "0 {'duration': 1682, 'distance': 6810, 'duration... 119.0 \n", - "1 {'duration': 2565, 'distance': 14070, 'duratio... NaN \n", - "2 {'duration': 1774, 'distance': 9570, 'duration... 91.0 \n", - "3 {'duration': 1862, 'distance': 8278, 'duration... NaN \n", - "4 {'duration': 2167, 'distance': 9920, 'duration... NaN \n", - "... ... ... \n", - "2181 {'duration': 2605, 'distance': 14702, 'duratio... NaN \n", - "2182 {'duration': 2605, 'distance': 14702, 'duratio... NaN \n", - "2183 {'duration': 2322, 'distance': 12104, 'duratio... 0.0 \n", - "2184 {'duration': 2248, 'distance': 6893, 'duration... 105.0 \n", - "2185 {'duration': 2733, 'distance': 6490, 'duration... 993.0 \n", + " travel_time_second lease_left \\\n", + "0 {'duration': 1948, 'distance': 10927, 'duratio... NaN \n", + "1 None 996.0 \n", + "2 {'duration': 3333, 'distance': 10013, 'duratio... 91.0 \n", + "3 {'duration': 2322, 'distance': 13491, 'duratio... 0.0 \n", + "4 None NaN \n", + "... ... ... \n", + "34537 {'duration': 1713, 'distance': 6755, 'duration... 993.0 \n", + "34538 None NaN \n", + "34539 None NaN \n", + "34540 None NaN \n", + "34541 None 125.0 \n", "\n", - " development decision \n", - "0 False None \n", - "1 False None \n", - "2 False None \n", - "3 False None \n", - "4 False None \n", - "... ... ... \n", - "2181 True None \n", - "2182 True None \n", - "2183 False None \n", - "2184 False None \n", - "2185 False None \n", + " development decision \n", + "0 False None \n", + "1 False None \n", + "2 False None \n", + "3 False None \n", + "4 False None \n", + "... ... ... \n", + "34537 True None \n", + "34538 False None \n", + "34539 False None \n", + "34540 False None \n", + "34541 True None \n", "\n", - "[2186 rows x 11 columns]" + "[34542 rows x 11 columns]" ] }, - "execution_count": 4, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -354,17 +475,17 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "id": "d80d9911-9a6d-4608-a6da-11dc864ee32b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(2186, 11)" + "(34542, 11)" ] }, - "execution_count": 5, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -375,7 +496,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 9, "id": "7b37ad6b-9b0a-444e-b8c3-6fe4e43e42cb", "metadata": {}, "outputs": [ @@ -423,45 +544,45 @@ " \n", "| \n", - " | sqm_ocr | \n", - "price | \n", - "price_per_sqm | \n", - "url | \n", - "bedrooms | \n", - "
|---|---|---|---|---|---|
| 953 | \n", - "74142.40 | \n", - "550000.0 | \n", - "7.418157 | \n", - "https://www.rightmove.co.uk/properties/145546538 | \n", - "2 | \n", - "
| 249 | \n", - "21850.10 | \n", - "725000.0 | \n", - "33.180626 | \n", - "https://www.rightmove.co.uk/properties/140821736 | \n", - "2 | \n", - "
| 177 | \n", - "137.50 | \n", - "695000.0 | \n", - "5054.545455 | \n", - "https://www.rightmove.co.uk/properties/139354259 | \n", - "2 | \n", - "
| 352 | \n", - "85.50 | \n", - "475000.0 | \n", - "5555.555556 | \n", - "https://www.rightmove.co.uk/properties/142142348 | \n", - "2 | \n", - "
| 36 | \n", - "82.90 | \n", - "475000.0 | \n", - "5729.794934 | \n", - "https://www.rightmove.co.uk/properties/128925950 | \n", - "2 | \n", - "
| 700 | \n", - "101.00 | \n", - "650000.0 | \n", - "6435.643564 | \n", - "https://www.rightmove.co.uk/properties/144591572 | \n", - "1 | \n", - "
| 823 | \n", - "80.40 | \n", - "525000.0 | \n", - "6529.850746 | \n", - "https://www.rightmove.co.uk/properties/145051769 | \n", - "2 | \n", - "
| 214 | \n", - "91.51 | \n", - "650000.0 | \n", - "7103.048847 | \n", - "https://www.rightmove.co.uk/properties/140326307 | \n", - "2 | \n", - "
| 171 | \n", - "91.10 | \n", - "650000.0 | \n", - "7135.016465 | \n", - "https://www.rightmove.co.uk/properties/139245428 | \n", - "2 | \n", - "
| 598 | \n", - "109.40 | \n", - "795000.0 | \n", - "7266.910420 | \n", - "https://www.rightmove.co.uk/properties/144034655 | \n", - "2 | \n", - "
| 146 | \n", - "93.78 | \n", - "700000.0 | \n", - "7464.278098 | \n", - "https://www.rightmove.co.uk/properties/138510395 | \n", - "2 | \n", - "
| 711 | \n", - "100.31 | \n", - "750000.0 | \n", - "7476.821852 | \n", - "https://www.rightmove.co.uk/properties/144620303 | \n", - "2 | \n", - "
| 592 | \n", - "86.49 | \n", - "650000.0 | \n", - "7515.319690 | \n", - "https://www.rightmove.co.uk/properties/143987669 | \n", - "2 | \n", - "
| 624 | \n", - "101.35 | \n", - "775000.0 | \n", - "7646.768624 | \n", - "https://www.rightmove.co.uk/properties/144217922 | \n", - "1 | \n", - "
| 55 | \n", - "97.60 | \n", - "750000.0 | \n", - "7684.426230 | \n", - "https://www.rightmove.co.uk/properties/132564737 | \n", - "2 | \n", - "
| 851 | \n", - "81.01 | \n", - "625000.0 | \n", - "7715.096902 | \n", - "https://www.rightmove.co.uk/properties/145172504 | \n", - "2 | \n", - "
| 356 | \n", - "89.00 | \n", - "695000.0 | \n", - "7808.988764 | \n", - "https://www.rightmove.co.uk/properties/142185623 | \n", - "2 | \n", - "
| 748 | \n", - "85.90 | \n", - "675000.0 | \n", - "7857.974389 | \n", - "https://www.rightmove.co.uk/properties/144771281 | \n", - "2 | \n", - "
| 204 | \n", - "82.40 | \n", - "650000.0 | \n", - "7888.349515 | \n", - "https://www.rightmove.co.uk/properties/140173319 | \n", - "2 | \n", - "
| 654 | \n", - "83.33 | \n", - "665000.0 | \n", - "7980.319213 | \n", - "https://www.rightmove.co.uk/properties/144361100 | \n", - "2 | \n", - "
| 514 | \n", - "87.50 | \n", - "700000.0 | \n", - "8000.000000 | \n", - "https://www.rightmove.co.uk/properties/143460365 | \n", - "1 | \n", - "
| 762 | \n", - "92.90 | \n", - "750000.0 | \n", - "8073.196986 | \n", - "https://www.rightmove.co.uk/properties/144846725 | \n", - "1 | \n", - "
| 963 | \n", - "92.90 | \n", - "750000.0 | \n", - "8073.196986 | \n", - "https://www.rightmove.co.uk/properties/145565252 | \n", - "2 | \n", - "
| 938 | \n", - "86.10 | \n", - "699000.0 | \n", - "8118.466899 | \n", - "https://www.rightmove.co.uk/properties/145491137 | \n", - "2 | \n", - "
| 403 | \n", - "90.60 | \n", - "750000.0 | \n", - "8278.145695 | \n", - "https://www.rightmove.co.uk/properties/142704416 | \n", - "2 | \n", - "
| 302 | \n", - "93.30 | \n", - "795000.0 | \n", - "8520.900322 | \n", - "https://www.rightmove.co.uk/properties/141595433 | \n", - "2 | \n", - "
| 671 | \n", - "93.27 | \n", - "800000.0 | \n", - "8577.248847 | \n", - "https://www.rightmove.co.uk/properties/144429140 | \n", - "2 | \n", - "
| 349 | \n", - "80.57 | \n", - "695000.0 | \n", - "8626.039469 | \n", - "https://www.rightmove.co.uk/properties/142115918 | \n", - "2 | \n", - "
| 224 | \n", - "90.10 | \n", - "800000.0 | \n", - "8879.023307 | \n", - "https://www.rightmove.co.uk/properties/140464481 | \n", - "2 | \n", - "
| 740 | \n", - "81.70 | \n", - "735000.0 | \n", - "8996.328029 | \n", - "https://www.rightmove.co.uk/properties/144722414 | \n", - "2 | \n", - "
| 326 | \n", - "87.30 | \n", - "800000.0 | \n", - "9163.802978 | \n", - "https://www.rightmove.co.uk/properties/141846023 | \n", - "2 | \n", - "
| 319 | \n", - "81.57 | \n", - "750000.0 | \n", - "9194.556822 | \n", - "https://www.rightmove.co.uk/properties/141797357 | \n", - "2 | \n", - "
| 558 | \n", - "86.77 | \n", - "800000.0 | \n", - "9219.776420 | \n", - "https://www.rightmove.co.uk/properties/143758763 | \n", - "2 | \n", - "
| 712 | \n", - "86.30 | \n", - "800000.0 | \n", - "9269.988413 | \n", - "https://www.rightmove.co.uk/properties/144622157 | \n", - "2 | \n", - "
| 473 | \n", - "86.00 | \n", - "800000.0 | \n", - "9302.325581 | \n", - "https://www.rightmove.co.uk/properties/143210102 | \n", - "2 | \n", - "
| 105 | \n", - "80.40 | \n", - "750000.0 | \n", - "9328.358209 | \n", - "https://www.rightmove.co.uk/properties/136988726 | \n", - "2 | \n", - "
| 1003 | \n", - "80.27 | \n", - "750000.0 | \n", - "9343.465803 | \n", - "https://www.rightmove.co.uk/properties/86775291 | \n", - "2 | \n", - "
| 235 | \n", - "82.80 | \n", - "775000.0 | \n", - "9359.903382 | \n", - "https://www.rightmove.co.uk/properties/140611055 | \n", - "2 | \n", - "
| 65 | \n", - "85.10 | \n", - "800000.0 | \n", - "9400.705053 | \n", - "https://www.rightmove.co.uk/properties/134116232 | \n", - "2 | \n", - "
| 30 | \n", - "83.70 | \n", - "795000.0 | \n", - "9498.207885 | \n", - "https://www.rightmove.co.uk/properties/127787960 | \n", - "1 | \n", - "
| 1025 | \n", - "81.60 | \n", - "790000.0 | \n", - "9681.372549 | \n", - "https://www.rightmove.co.uk/properties/86972726 | \n", - "2 | \n", - "
| 88 | \n", - "81.75 | \n", - "800000.0 | \n", - "9785.932722 | \n", - "https://www.rightmove.co.uk/properties/136012193 | \n", - "1 | \n", - "
| 454 | \n", - "80.80 | \n", - "800000.0 | \n", - "9900.990099 | \n", - "https://www.rightmove.co.uk/properties/143138867 | \n", - "2 | \n", - "
| 343 | \n", - "80.64 | \n", - "800000.0 | \n", - "9920.634921 | \n", - "https://www.rightmove.co.uk/properties/142032935 | \n", - "2 | \n", - "
| \n", - " | sqm_ocr | \n", - "price | \n", - "price_per_sqm | \n", - "url | \n", - "bedrooms | \n", - "
|---|---|---|---|---|---|
| 359 | \n", - "73.40 | \n", - "400000.0 | \n", - "5449.591281 | \n", - "https://www.rightmove.co.uk/properties/142186991 | \n", - "2 | \n", - "
| 293 | \n", - "77.00 | \n", - "425000.0 | \n", - "5519.480519 | \n", - "https://www.rightmove.co.uk/properties/141437783 | \n", - "2 | \n", - "
| 352 | \n", - "85.50 | \n", - "475000.0 | \n", - "5555.555556 | \n", - "https://www.rightmove.co.uk/properties/142142348 | \n", - "2 | \n", - "
| 685 | \n", - "76.03 | \n", - "425000.0 | \n", - "5589.898724 | \n", - "https://www.rightmove.co.uk/properties/144494012 | \n", - "2 | \n", - "
| 36 | \n", - "82.90 | \n", - "475000.0 | \n", - "5729.794934 | \n", - "https://www.rightmove.co.uk/properties/128925950 | \n", - "2 | \n", - "
| 491 | \n", - "71.68 | \n", - "450000.0 | \n", - "6277.901786 | \n", - "https://www.rightmove.co.uk/properties/143315840 | \n", - "2 | \n", - "
| 1020 | \n", - "73.67 | \n", - "495000.0 | \n", - "6719.152980 | \n", - "https://www.rightmove.co.uk/properties/86807916 | \n", - "2 | \n", - "
| \n", - " | sqm_ocr | \n", - "price | \n", - "price_per_sqm | \n", - "url | \n", - "bedrooms | \n", - "
|---|---|---|---|---|---|
| 953 | \n", - "74142.40 | \n", - "550000.0 | \n", - "7.418157 | \n", - "https://www.rightmove.co.uk/properties/145546538 | \n", - "2 | \n", - "
| 823 | \n", - "80.40 | \n", - "525000.0 | \n", - "6529.850746 | \n", - "https://www.rightmove.co.uk/properties/145051769 | \n", - "2 | \n", - "
| 492 | \n", - "73.90 | \n", - "525000.0 | \n", - "7104.194858 | \n", - "https://www.rightmove.co.uk/properties/143317361 | \n", - "2 | \n", - "
| 561 | \n", - "76.70 | \n", - "550000.0 | \n", - "7170.795306 | \n", - "https://www.rightmove.co.uk/properties/143780789 | \n", - "2 | \n", - "
| 69 | \n", - "73.10 | \n", - "525000.0 | \n", - "7181.942544 | \n", - "https://www.rightmove.co.uk/properties/134574563 | \n", - "2 | \n", - "
| 988 | \n", - "72.40 | \n", - "525000.0 | \n", - "7251.381215 | \n", - "https://www.rightmove.co.uk/properties/86648925 | \n", - "2 | \n", - "
| 272 | \n", - "77.89 | \n", - "575000.0 | \n", - "7382.205675 | \n", - "https://www.rightmove.co.uk/properties/141131297 | \n", - "2 | \n", - "
| 521 | \n", - "70.88 | \n", - "525000.0 | \n", - "7406.884876 | \n", - "https://www.rightmove.co.uk/properties/143514149 | \n", - "2 | \n", - "
| 324 | \n", - "76.10 | \n", - "575000.0 | \n", - "7555.847569 | \n", - "https://www.rightmove.co.uk/properties/141831353 | \n", - "2 | \n", - "
| 1018 | \n", - "71.80 | \n", - "550000.0 | \n", - "7660.167131 | \n", - "https://www.rightmove.co.uk/properties/86804832 | \n", - "2 | \n", - "
| 526 | \n", - "78.00 | \n", - "600000.0 | \n", - "7692.307692 | \n", - "https://www.rightmove.co.uk/properties/143552156 | \n", - "1 | \n", - "
| 817 | \n", - "71.17 | \n", - "550000.0 | \n", - "7727.975270 | \n", - "https://www.rightmove.co.uk/properties/145035929 | \n", - "2 | \n", - "
| 843 | \n", - "77.50 | \n", - "600000.0 | \n", - "7741.935484 | \n", - "https://www.rightmove.co.uk/properties/145144988 | \n", - "2 | \n", - "
| 337 | \n", - "70.60 | \n", - "550000.0 | \n", - "7790.368272 | \n", - "https://www.rightmove.co.uk/properties/141904286 | \n", - "2 | \n", - "
| 233 | \n", - "75.70 | \n", - "600000.0 | \n", - "7926.023778 | \n", - "https://www.rightmove.co.uk/properties/140582213 | \n", - "2 | \n", - "
| 763 | \n", - "75.00 | \n", - "600000.0 | \n", - "8000.000000 | \n", - "https://www.rightmove.co.uk/properties/144862070 | \n", - "2 | \n", - "
| 315 | \n", - "71.30 | \n", - "590000.0 | \n", - "8274.894811 | \n", - "https://www.rightmove.co.uk/properties/141669686 | \n", - "2 | \n", - "
| 899 | \n", - "71.47 | \n", - "595000.0 | \n", - "8325.171401 | \n", - "https://www.rightmove.co.uk/properties/145362911 | \n", - "2 | \n", - "
| 147 | \n", - "71.50 | \n", - "600000.0 | \n", - "8391.608392 | \n", - "https://www.rightmove.co.uk/properties/138537527 | \n", - "1 | \n", - "
| 973 | \n", - "70.89 | \n", - "595000.0 | \n", - "8393.285372 | \n", - "https://www.rightmove.co.uk/properties/86296491 | \n", - "2 | \n", - "