diff --git a/crawler/exploration.ipynb b/crawler/exploration.ipynb
index 5912de3..df1160f 100644
--- a/crawler/exploration.ipynb
+++ b/crawler/exploration.ipynb
@@ -27,7 +27,39 @@
"metadata": {},
"outputs": [],
"source": [
- "ls = Listing.get_all_listings()"
+ "ls = Listing.get_all_listings()\n",
+ "\n",
+ "decisions = {\n",
+ " 142789514: 'n',\n",
+ " 136010102: 'n',\n",
+ " 141457334: 'y',\n",
+ " 86778015: 'n',\n",
+ " 134574563: 'n',\n",
+ " 86648925: 'n',\n",
+ " 143319068: 'n',\n",
+ " 135668207: 'n',\n",
+ " 142063949: 'n',\n",
+ " 145051769: 'n',\n",
+ " 138945719: 'n',\n",
+ " 135714833: 'n',\n",
+ " 144983192: 'n',\n",
+ " 144666920: 'n',\n",
+ " 143895080: 'n',\n",
+ " 141114200: 'n',\n",
+ " 145407389: 'n',\n",
+ " 145047533: 'n',\n",
+ " 145161722: 'n',\n",
+ " 145130066: 'n',\n",
+ " 142110470: 'n',\n",
+ " 133667606: 'n',\n",
+ " 145005536: 'n',\n",
+ " 143458961: 'n',\n",
+ " 141412010: 'y',\n",
+ " 138683339: 'n',\n",
+ " 138490370: 'n',\n",
+ " 137805509: 'n',\n",
+ " 135854261: 'n',\n",
+ "}"
]
},
{
@@ -67,6 +99,7 @@
" \n",
" \n",
" \n",
" \n",
" \n",
+ " identifier \n",
" sqm_ocr \n",
" price \n",
" price_per_sqm \n",
@@ -76,23 +109,27 @@
" travel_time_second \n",
" lease_left \n",
" development \n",
+ " decision \n",
"
2134 rows × 9 columns
\n", + "2186 rows × 11 columns
\n", "" ], "text/plain": [ - " sqm_ocr price price_per_sqm \\\n", - "0 7.81 350000.0 44814.340589 \n", - "1 NaN 400000.0 NaN \n", - "2 76.91 400000.0 5200.884150 \n", - "3 112.40 800000.0 7117.437722 \n", - "4 115.60 775000.0 6704.152249 \n", - "... ... ... ... \n", - "2129 NaN 750000.0 NaN \n", - "2130 NaN 655000.0 NaN \n", - "2131 82.80 550000.0 6642.512077 \n", - "2132 5.52 300000.0 54347.826087 \n", - "2133 81.60 790000.0 9681.372549 \n", + " identifier sqm_ocr price price_per_sqm \\\n", + "0 102360773 NaN 350000.0 NaN \n", + "1 105836849 NaN 400000.0 NaN \n", + "2 107233214 76.91 400000.0 5200.884150 \n", + "3 107976896 112.40 800000.0 7117.437722 \n", + "4 115499441 115.60 775000.0 6704.152249 \n", + "... ... ... ... ... \n", + "2181 86813508 NaN 750000.0 NaN \n", + "2182 86813523 NaN 655000.0 NaN \n", + "2183 86814669 82.80 550000.0 6642.512077 \n", + "2184 86955958 90.00 300000.0 3333.333333 \n", + "2185 86972726 81.60 790000.0 9681.372549 \n", "\n", " url bedrooms \\\n", "0 https://www.rightmove.co.uk/properties/102360773 1 \n", @@ -237,11 +293,11 @@ "3 https://www.rightmove.co.uk/properties/107976896 3 \n", "4 https://www.rightmove.co.uk/properties/115499441 3 \n", "... ... ... \n", - "2129 https://www.rightmove.co.uk/properties/86813508 3 \n", - "2130 https://www.rightmove.co.uk/properties/86813523 3 \n", - "2131 https://www.rightmove.co.uk/properties/86814669 3 \n", - "2132 https://www.rightmove.co.uk/properties/86955958 3 \n", - "2133 https://www.rightmove.co.uk/properties/86972726 2 \n", + "2181 https://www.rightmove.co.uk/properties/86813508 3 \n", + "2182 https://www.rightmove.co.uk/properties/86813523 3 \n", + "2183 https://www.rightmove.co.uk/properties/86814669 3 \n", + "2184 https://www.rightmove.co.uk/properties/86955958 3 \n", + "2185 https://www.rightmove.co.uk/properties/86972726 2 \n", "\n", " travel_time_fastest \\\n", "0 {'duration': 2695, 'distance': 6467, 'duration... \n", @@ -250,11 +306,11 @@ "3 {'duration': 1862, 'distance': 8278, 'duration... \n", "4 {'duration': 2943, 'distance': 7437, 'duration... \n", "... ... \n", - "2129 {'duration': 2400, 'distance': 13983, 'duratio... \n", - "2130 {'duration': 2400, 'distance': 13983, 'duratio... \n", - "2131 {'duration': 2310, 'distance': 12972, 'duratio... \n", - "2132 {'duration': 2332, 'distance': 6898, 'duration... \n", - "2133 {'duration': 1393, 'distance': 6390, 'duration... \n", + "2181 {'duration': 2400, 'distance': 13983, 'duratio... \n", + "2182 {'duration': 2400, 'distance': 13983, 'duratio... \n", + "2183 {'duration': 2310, 'distance': 12972, 'duratio... \n", + "2184 {'duration': 2332, 'distance': 6898, 'duration... \n", + "2185 {'duration': 1393, 'distance': 6390, 'duration... \n", "\n", " travel_time_second lease_left \\\n", "0 {'duration': 1682, 'distance': 6810, 'duration... 119.0 \n", @@ -263,26 +319,26 @@ "3 {'duration': 1862, 'distance': 8278, 'duration... NaN \n", "4 {'duration': 2167, 'distance': 9920, 'duration... NaN \n", "... ... ... \n", - "2129 {'duration': 2605, 'distance': 14702, 'duratio... NaN \n", - "2130 {'duration': 2605, 'distance': 14702, 'duratio... NaN \n", - "2131 {'duration': 2322, 'distance': 12104, 'duratio... 0.0 \n", - "2132 {'duration': 2248, 'distance': 6893, 'duration... 105.0 \n", - "2133 {'duration': 2733, 'distance': 6490, 'duration... 993.0 \n", + "2181 {'duration': 2605, 'distance': 14702, 'duratio... NaN \n", + "2182 {'duration': 2605, 'distance': 14702, 'duratio... NaN \n", + "2183 {'duration': 2322, 'distance': 12104, 'duratio... 0.0 \n", + "2184 {'duration': 2248, 'distance': 6893, 'duration... 105.0 \n", + "2185 {'duration': 2733, 'distance': 6490, 'duration... 993.0 \n", "\n", - " development \n", - "0 False \n", - "1 False \n", - "2 False \n", - "3 False \n", - "4 False \n", - "... ... \n", - "2129 True \n", - "2130 True \n", - "2131 False \n", - "2132 False \n", - "2133 False \n", + " development decision \n", + "0 False None \n", + "1 False None \n", + "2 False None \n", + "3 False None \n", + "4 False None \n", + "... ... ... \n", + "2181 True None \n", + "2182 True None \n", + "2183 False None \n", + "2184 False None \n", + "2185 False None \n", "\n", - "[2134 rows x 9 columns]" + "[2186 rows x 11 columns]" ] }, "execution_count": 4, @@ -292,42 +348,34 @@ ], "source": [ "df = pd.DataFrame(ds)\n", + "df.loc[:, 'decision'] = df.identifier.apply(lambda x: decisions.get(x))\n", "df" ] }, { "cell_type": "code", - "execution_count": 14, - "id": "387c48d9-26c6-4bed-8201-352735c06acb", + "execution_count": 5, + "id": "d80d9911-9a6d-4608-a6da-11dc864ee32b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 1682.0\n", - "1 2565.0\n", - "2 1714.0\n", - "3 1862.0\n", - "4 2167.0\n", - " ... \n", - "2129 2400.0\n", - "2130 2400.0\n", - "2131 2310.0\n", - "2132 2248.0\n", - "2133 1393.0\n", - "Length: 2134, dtype: float64" + "(2186, 11)" ] }, - "execution_count": 14, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], - "source": [] + "source": [ + "df.shape" + ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 6, "id": "7b37ad6b-9b0a-444e-b8c3-6fe4e43e42cb", "metadata": {}, "outputs": [ @@ -352,6 +400,7 @@ " \n", "2134 rows × 14 columns
\n", "" ], "text/plain": [ - " sqm_ocr price price_per_sqm \\\n", - "0 7.81 350000.0 44814.340589 \n", - "1 NaN 400000.0 NaN \n", - "2 76.91 400000.0 5200.884150 \n", - "3 112.40 800000.0 7117.437722 \n", - "4 115.60 775000.0 6704.152249 \n", - "... ... ... ... \n", - "2129 NaN 750000.0 NaN \n", - "2130 NaN 655000.0 NaN \n", - "2131 82.80 550000.0 6642.512077 \n", - "2132 5.52 300000.0 54347.826087 \n", - "2133 81.60 790000.0 9681.372549 \n", + " identifier sqm_ocr price price_per_sqm \\\n", + "0 102360773 NaN 350000.0 NaN \n", + "1 105836849 NaN 400000.0 NaN \n", "\n", - " url bedrooms lease_left \\\n", - "0 https://www.rightmove.co.uk/properties/102360773 1 119.0 \n", - "1 https://www.rightmove.co.uk/properties/105836849 3 NaN \n", - "2 https://www.rightmove.co.uk/properties/107233214 3 91.0 \n", - "3 https://www.rightmove.co.uk/properties/107976896 3 NaN \n", - "4 https://www.rightmove.co.uk/properties/115499441 3 NaN \n", - "... ... ... ... \n", - "2129 https://www.rightmove.co.uk/properties/86813508 3 NaN \n", - "2130 https://www.rightmove.co.uk/properties/86813523 3 NaN \n", - "2131 https://www.rightmove.co.uk/properties/86814669 3 0.0 \n", - "2132 https://www.rightmove.co.uk/properties/86955958 3 105.0 \n", - "2133 https://www.rightmove.co.uk/properties/86972726 2 993.0 \n", + " url bedrooms lease_left \\\n", + "0 https://www.rightmove.co.uk/properties/102360773 1 119.0 \n", + "1 https://www.rightmove.co.uk/properties/105836849 3 NaN \n", "\n", - " development a_duration a_duration_per_transit \\\n", - "0 False 2695 {'WALK': 414, 'TRANSIT': 2280} \n", - "1 False 2565 {'WALK': 750, 'TRANSIT': 1800} \n", - "2 False 1714 {'WALK': 903, 'TRANSIT': 780} \n", - "3 False 1862 {'WALK': 635, 'TRANSIT': 900} \n", - "4 False 2943 {'WALK': 242, 'TRANSIT': 2700} \n", - "... ... ... ... \n", - "2129 True 2400 {'WALK': 603, 'TRANSIT': 1524} \n", - "2130 True 2400 {'WALK': 603, 'TRANSIT': 1524} \n", - "2131 False 2310 {'WALK': 786, 'TRANSIT': 1277} \n", - "2132 False 2332 {'WALK': 1671, 'TRANSIT': 660} \n", - "2133 False 1393 {'WALK': 518, 'TRANSIT': 564} \n", + " development decision a_duration a_initial_walk_duration \\\n", + "0 False None 2695 174 \n", + "1 False None 2565 340 \n", "\n", - " a_number_of_transit_stops b_duration b_duration_per_transit \\\n", - "0 1 1682.0 {'WALK': 608, 'TRANSIT': 804} \n", - "1 1 2565.0 {'WALK': 750, 'TRANSIT': 1800} \n", - "2 1 1774.0 {'WALK': 903, 'TRANSIT': 840} \n", - "3 2 1862.0 {'WALK': 635, 'TRANSIT': 960} \n", - "4 1 2167.0 {'WALK': 658, 'TRANSIT': 1200} \n", - "... ... ... ... \n", - "2129 2 2605.0 {'WALK': 1467, 'TRANSIT': 1132} \n", - "2130 2 2605.0 {'WALK': 1467, 'TRANSIT': 1132} \n", - "2131 2 2322.0 {'WALK': 991, 'TRANSIT': 1080} \n", - "2132 1 2248.0 {'WALK': 1671, 'TRANSIT': 576} \n", - "2133 2 2733.0 {'WALK': 451, 'TRANSIT': 2280} \n", + " a_duration_per_transit a_number_of_transit_stops b_duration \\\n", + "0 {'WALK': 414, 'TRANSIT': 2280} 1 1682.0 \n", + "1 {'WALK': 750, 'TRANSIT': 1800} 1 2565.0 \n", "\n", - " b_number_of_transit_stops min_duration \n", - "0 2.0 28.0 \n", - "1 1.0 43.0 \n", - "2 1.0 29.0 \n", - "3 2.0 31.0 \n", - "4 2.0 36.0 \n", - "... ... ... \n", - "2129 1.0 40.0 \n", - "2130 1.0 40.0 \n", - "2131 2.0 38.0 \n", - "2132 1.0 37.0 \n", - "2133 1.0 23.0 \n", + " b_initial_walk_duration b_duration_per_transit \\\n", + "0 204.0 {'WALK': 608, 'TRANSIT': 804} \n", + "1 340.0 {'WALK': 750, 'TRANSIT': 1800} \n", "\n", - "[2134 rows x 14 columns]" + " b_number_of_transit_stops min_duration \n", + "0 2.0 28.0 \n", + "1 1.0 43.0 " ] }, - "execution_count": 19, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -645,12 +508,12 @@ "\n", "df2 = pd.concat([df.drop(['travel_time_fastest', 'travel_time_second'], axis=1), s1, s2], axis=1)\n", "df2.loc[:, 'min_duration'] = (df2.loc[:, ['a_duration', 'b_duration']].min(axis=1) / 60).round()\n", - "df2" + "df2.head(2)" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 7, "id": "8c75aaa6-6113-482f-809b-11e405510184", "metadata": {}, "outputs": [], @@ -660,7 +523,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 8, "id": "79f99692-91e8-4915-9b57-7b3a1efd7d3a", "metadata": {}, "outputs": [ @@ -685,342 +548,89 @@ " \n", "2051 rows × 17 columns
\n", "" ], "text/plain": [ - " sqm_ocr price price_per_sqm \\\n", - "0 7.81 350000.0 44814.340589 \n", - "1 NaN 400000.0 NaN \n", - "2 76.91 400000.0 5200.884150 \n", - "3 112.40 800000.0 7117.437722 \n", - "4 115.60 775000.0 6704.152249 \n", - "... ... ... ... \n", - "2046 NaN 750000.0 NaN \n", - "2047 NaN 655000.0 NaN \n", - "2048 82.80 550000.0 6642.512077 \n", - "2049 5.52 300000.0 54347.826087 \n", - "2050 81.60 790000.0 9681.372549 \n", + " identifier sqm_ocr price price_per_sqm \\\n", + "0 102360773 NaN 350000.0 NaN \n", + "1 105836849 NaN 400000.0 NaN \n", "\n", - " url bedrooms duration \\\n", - "0 https://www.rightmove.co.uk/properties/102360773 1 2695 \n", - "1 https://www.rightmove.co.uk/properties/105836849 3 2565 \n", - "2 https://www.rightmove.co.uk/properties/107233214 3 1714 \n", - "3 https://www.rightmove.co.uk/properties/107976896 3 1862 \n", - "4 https://www.rightmove.co.uk/properties/115499441 3 2943 \n", - "... ... ... ... \n", - "2046 https://www.rightmove.co.uk/properties/86813508 3 2400 \n", - "2047 https://www.rightmove.co.uk/properties/86813523 3 2400 \n", - "2048 https://www.rightmove.co.uk/properties/86814669 3 2310 \n", - "2049 https://www.rightmove.co.uk/properties/86955958 3 2332 \n", - "2050 https://www.rightmove.co.uk/properties/86972726 2 1393 \n", + " url bedrooms lease_left \\\n", + "0 https://www.rightmove.co.uk/properties/102360773 1 119.0 \n", + "1 https://www.rightmove.co.uk/properties/105836849 3 NaN \n", "\n", - " distance duration_static duration_per_transit \\\n", - "0 6467 2695 {'WALK': 414, 'TRANSIT': 2280} \n", - "1 14070 2565 {'WALK': 750, 'TRANSIT': 1800} \n", - "2 9570 1714 {'WALK': 903, 'TRANSIT': 780} \n", - "3 8278 1862 {'WALK': 635, 'TRANSIT': 900} \n", - "4 7437 2943 {'WALK': 242, 'TRANSIT': 2700} \n", - "... ... ... ... \n", - "2046 13983 2400 {'WALK': 603, 'TRANSIT': 1524} \n", - "2047 13983 2400 {'WALK': 603, 'TRANSIT': 1524} \n", - "2048 12972 2310 {'WALK': 786, 'TRANSIT': 1277} \n", - "2049 6898 2332 {'WALK': 1671, 'TRANSIT': 660} \n", - "2050 6390 1393 {'WALK': 518, 'TRANSIT': 564} \n", + " development decision duration initial_walk_duration \\\n", + "0 False None 45 174 \n", + "1 False None 43 340 \n", "\n", - " distance_per_transit number_of_transit_stops duration \\\n", - "0 {'WALK': 465, 'TRANSIT': 6002} 1 1682.0 \n", - "1 {'WALK': 856, 'TRANSIT': 13214} 1 2565.0 \n", - "2 {'WALK': 1035, 'TRANSIT': 8535} 1 1774.0 \n", - "3 {'WALK': 710, 'TRANSIT': 7568} 2 1862.0 \n", - "4 {'WALK': 276, 'TRANSIT': 7161} 1 2167.0 \n", - "... ... ... ... \n", - "2046 {'WALK': 671, 'TRANSIT': 13312} 2 2605.0 \n", - "2047 {'WALK': 671, 'TRANSIT': 13312} 2 2605.0 \n", - "2048 {'WALK': 885, 'TRANSIT': 12087} 2 2322.0 \n", - "2049 {'WALK': 1945, 'TRANSIT': 4953} 1 2248.0 \n", - "2050 {'WALK': 441, 'TRANSIT': 5949} 2 2733.0 \n", - "\n", - " distance duration_static duration_per_transit \\\n", - "0 6810.0 1682.0 {'WALK': 608, 'TRANSIT': 804} \n", - "1 14070.0 2565.0 {'WALK': 750, 'TRANSIT': 1800} \n", - "2 9570.0 1774.0 {'WALK': 903, 'TRANSIT': 840} \n", - "3 8278.0 1862.0 {'WALK': 635, 'TRANSIT': 960} \n", - "4 9920.0 2167.0 {'WALK': 658, 'TRANSIT': 1200} \n", - "... ... ... ... \n", - "2046 14702.0 2605.0 {'WALK': 1467, 'TRANSIT': 1132} \n", - "2047 14702.0 2605.0 {'WALK': 1467, 'TRANSIT': 1132} \n", - "2048 12104.0 2322.0 {'WALK': 991, 'TRANSIT': 1080} \n", - "2049 6893.0 2248.0 {'WALK': 1671, 'TRANSIT': 576} \n", - "2050 6490.0 2733.0 {'WALK': 451, 'TRANSIT': 2280} \n", - "\n", - " distance_per_transit number_of_transit_stops \n", - "0 {'WALK': 582, 'TRANSIT': 6228} 2.0 \n", - "1 {'WALK': 856, 'TRANSIT': 13214} 1.0 \n", - "2 {'WALK': 1035, 'TRANSIT': 8535} 1.0 \n", - "3 {'WALK': 710, 'TRANSIT': 7568} 2.0 \n", - "4 {'WALK': 720, 'TRANSIT': 9200} 2.0 \n", - "... ... ... \n", - "2046 {'WALK': 1698, 'TRANSIT': 13004} 1.0 \n", - "2047 {'WALK': 1698, 'TRANSIT': 13004} 1.0 \n", - "2048 {'WALK': 1089, 'TRANSIT': 11015} 2.0 \n", - "2049 {'WALK': 1945, 'TRANSIT': 4948} 1.0 \n", - "2050 {'WALK': 488, 'TRANSIT': 6002} 1.0 \n", - "\n", - "[2051 rows x 17 columns]" + " duration_per_transit number_of_transit_stops \n", + "0 {'WALK': 414, 'TRANSIT': 2280} 1 \n", + "1 {'WALK': 750, 'TRANSIT': 1800} 1 " ] }, - "execution_count": 12, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df = pd.concat([df.drop(['travel_time_fastest'], axis=1), df['travel_time_fastest'].apply(pd.Series)], axis=1)\n", - "df = pd.concat([df.drop(['travel_time_second'], axis=1), df['travel_time_second'].apply(pd.Series)], axis=1)\n", - "df" + "dropcolumns = ['distance_per_transit', 'duration_static', 'distance']\n", + "s1 = df['travel_time_fastest'].apply(pd.Series).drop(dropcolumns, axis=1)\n", + "\n", + "df3 = pd.concat([df.drop(['travel_time_fastest', 'travel_time_second'], axis=1), s1], axis=1)\n", + "df3.loc[:, 'duration'] = (df3.loc[:, ['duration']].min(axis=1) / 60).round()\n", + "df3.to_clipboard()\n", + "df3.head(2)" ] }, { @@ -1918,62 +1528,85 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 15, "id": "edd9fa24-cad2-4448-9b17-c6d514564f41", "metadata": {}, + "outputs": [], + "source": [ + "from data_access import Listing\n", + "import pytesseract\n", + "from PIL import Image\n", + "import re" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "002b2a3a-3ecc-45c1-8c2f-c143380ee0d5", + "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "| \n", - " | sqm_ocr | \n", - "price | \n", - "price_per_sqm | \n", - "url | \n", - "bedrooms | \n", - "
|---|