diff --git a/crawler/exploration.ipynb b/crawler/exploration.ipynb index 5912de3..df1160f 100644 --- a/crawler/exploration.ipynb +++ b/crawler/exploration.ipynb @@ -27,7 +27,39 @@ "metadata": {}, "outputs": [], "source": [ - "ls = Listing.get_all_listings()" + "ls = Listing.get_all_listings()\n", + "\n", + "decisions = {\n", + " 142789514: 'n',\n", + " 136010102: 'n',\n", + " 141457334: 'y',\n", + " 86778015: 'n',\n", + " 134574563: 'n',\n", + " 86648925: 'n',\n", + " 143319068: 'n',\n", + " 135668207: 'n',\n", + " 142063949: 'n',\n", + " 145051769: 'n',\n", + " 138945719: 'n',\n", + " 135714833: 'n',\n", + " 144983192: 'n',\n", + " 144666920: 'n',\n", + " 143895080: 'n',\n", + " 141114200: 'n',\n", + " 145407389: 'n',\n", + " 145047533: 'n',\n", + " 145161722: 'n',\n", + " 145130066: 'n',\n", + " 142110470: 'n',\n", + " 133667606: 'n',\n", + " 145005536: 'n',\n", + " 143458961: 'n',\n", + " 141412010: 'y',\n", + " 138683339: 'n',\n", + " 138490370: 'n',\n", + " 137805509: 'n',\n", + " 135854261: 'n',\n", + "}" ] }, { @@ -67,6 +99,7 @@ " \n", " \n", " \n", + " identifier\n", " sqm_ocr\n", " price\n", " price_per_sqm\n", @@ -76,23 +109,27 @@ " travel_time_second\n", " lease_left\n", " development\n", + " decision\n", " \n", " \n", " \n", " \n", " 0\n", - " 7.81\n", + " 102360773\n", + " NaN\n", " 350000.0\n", - " 44814.340589\n", + " NaN\n", " https://www.rightmove.co.uk/properties/102360773\n", " 1\n", " {'duration': 2695, 'distance': 6467, 'duration...\n", " {'duration': 1682, 'distance': 6810, 'duration...\n", " 119.0\n", " False\n", + " None\n", " \n", " \n", " 1\n", + " 105836849\n", " NaN\n", " 400000.0\n", " NaN\n", @@ -102,9 +139,11 @@ " {'duration': 2565, 'distance': 14070, 'duratio...\n", " NaN\n", " False\n", + " None\n", " \n", " \n", " 2\n", + " 107233214\n", " 76.91\n", " 400000.0\n", " 5200.884150\n", @@ -114,9 +153,11 @@ " {'duration': 1774, 'distance': 9570, 'duration...\n", " 91.0\n", " False\n", + " None\n", " \n", " \n", " 3\n", + " 107976896\n", " 112.40\n", " 800000.0\n", " 7117.437722\n", @@ -126,9 +167,11 @@ " {'duration': 1862, 'distance': 8278, 'duration...\n", " NaN\n", " False\n", + " None\n", " \n", " \n", " 4\n", + " 115499441\n", " 115.60\n", " 775000.0\n", " 6704.152249\n", @@ -138,6 +181,7 @@ " {'duration': 2167, 'distance': 9920, 'duration...\n", " NaN\n", " False\n", + " None\n", " \n", " \n", " ...\n", @@ -150,9 +194,12 @@ " ...\n", " ...\n", " ...\n", + " ...\n", + " ...\n", " \n", " \n", - " 2129\n", + " 2181\n", + " 86813508\n", " NaN\n", " 750000.0\n", " NaN\n", @@ -162,9 +209,11 @@ " {'duration': 2605, 'distance': 14702, 'duratio...\n", " NaN\n", " True\n", + " None\n", " \n", " \n", - " 2130\n", + " 2182\n", + " 86813523\n", " NaN\n", " 655000.0\n", " NaN\n", @@ -174,9 +223,11 @@ " {'duration': 2605, 'distance': 14702, 'duratio...\n", " NaN\n", " True\n", + " None\n", " \n", " \n", - " 2131\n", + " 2183\n", + " 86814669\n", " 82.80\n", " 550000.0\n", " 6642.512077\n", @@ -186,21 +237,25 @@ " {'duration': 2322, 'distance': 12104, 'duratio...\n", " 0.0\n", " False\n", + " None\n", " \n", " \n", - " 2132\n", - " 5.52\n", + " 2184\n", + " 86955958\n", + " 90.00\n", " 300000.0\n", - " 54347.826087\n", + " 3333.333333\n", " https://www.rightmove.co.uk/properties/86955958\n", " 3\n", " {'duration': 2332, 'distance': 6898, 'duration...\n", " {'duration': 2248, 'distance': 6893, 'duration...\n", " 105.0\n", " False\n", + " None\n", " \n", " \n", - " 2133\n", + " 2185\n", + " 86972726\n", " 81.60\n", " 790000.0\n", " 9681.372549\n", @@ -210,25 +265,26 @@ " {'duration': 2733, 'distance': 6490, 'duration...\n", " 993.0\n", " False\n", + " None\n", " \n", " \n", "\n", - "

2134 rows × 9 columns

\n", + "

2186 rows × 11 columns

\n", "" ], "text/plain": [ - " sqm_ocr price price_per_sqm \\\n", - "0 7.81 350000.0 44814.340589 \n", - "1 NaN 400000.0 NaN \n", - "2 76.91 400000.0 5200.884150 \n", - "3 112.40 800000.0 7117.437722 \n", - "4 115.60 775000.0 6704.152249 \n", - "... ... ... ... \n", - "2129 NaN 750000.0 NaN \n", - "2130 NaN 655000.0 NaN \n", - "2131 82.80 550000.0 6642.512077 \n", - "2132 5.52 300000.0 54347.826087 \n", - "2133 81.60 790000.0 9681.372549 \n", + " identifier sqm_ocr price price_per_sqm \\\n", + "0 102360773 NaN 350000.0 NaN \n", + "1 105836849 NaN 400000.0 NaN \n", + "2 107233214 76.91 400000.0 5200.884150 \n", + "3 107976896 112.40 800000.0 7117.437722 \n", + "4 115499441 115.60 775000.0 6704.152249 \n", + "... ... ... ... ... \n", + "2181 86813508 NaN 750000.0 NaN \n", + "2182 86813523 NaN 655000.0 NaN \n", + "2183 86814669 82.80 550000.0 6642.512077 \n", + "2184 86955958 90.00 300000.0 3333.333333 \n", + "2185 86972726 81.60 790000.0 9681.372549 \n", "\n", " url bedrooms \\\n", "0 https://www.rightmove.co.uk/properties/102360773 1 \n", @@ -237,11 +293,11 @@ "3 https://www.rightmove.co.uk/properties/107976896 3 \n", "4 https://www.rightmove.co.uk/properties/115499441 3 \n", "... ... ... \n", - "2129 https://www.rightmove.co.uk/properties/86813508 3 \n", - "2130 https://www.rightmove.co.uk/properties/86813523 3 \n", - "2131 https://www.rightmove.co.uk/properties/86814669 3 \n", - "2132 https://www.rightmove.co.uk/properties/86955958 3 \n", - "2133 https://www.rightmove.co.uk/properties/86972726 2 \n", + "2181 https://www.rightmove.co.uk/properties/86813508 3 \n", + "2182 https://www.rightmove.co.uk/properties/86813523 3 \n", + "2183 https://www.rightmove.co.uk/properties/86814669 3 \n", + "2184 https://www.rightmove.co.uk/properties/86955958 3 \n", + "2185 https://www.rightmove.co.uk/properties/86972726 2 \n", "\n", " travel_time_fastest \\\n", "0 {'duration': 2695, 'distance': 6467, 'duration... \n", @@ -250,11 +306,11 @@ "3 {'duration': 1862, 'distance': 8278, 'duration... \n", "4 {'duration': 2943, 'distance': 7437, 'duration... \n", "... ... \n", - "2129 {'duration': 2400, 'distance': 13983, 'duratio... \n", - "2130 {'duration': 2400, 'distance': 13983, 'duratio... \n", - "2131 {'duration': 2310, 'distance': 12972, 'duratio... \n", - "2132 {'duration': 2332, 'distance': 6898, 'duration... \n", - "2133 {'duration': 1393, 'distance': 6390, 'duration... \n", + "2181 {'duration': 2400, 'distance': 13983, 'duratio... \n", + "2182 {'duration': 2400, 'distance': 13983, 'duratio... \n", + "2183 {'duration': 2310, 'distance': 12972, 'duratio... \n", + "2184 {'duration': 2332, 'distance': 6898, 'duration... \n", + "2185 {'duration': 1393, 'distance': 6390, 'duration... \n", "\n", " travel_time_second lease_left \\\n", "0 {'duration': 1682, 'distance': 6810, 'duration... 119.0 \n", @@ -263,26 +319,26 @@ "3 {'duration': 1862, 'distance': 8278, 'duration... NaN \n", "4 {'duration': 2167, 'distance': 9920, 'duration... NaN \n", "... ... ... \n", - "2129 {'duration': 2605, 'distance': 14702, 'duratio... NaN \n", - "2130 {'duration': 2605, 'distance': 14702, 'duratio... NaN \n", - "2131 {'duration': 2322, 'distance': 12104, 'duratio... 0.0 \n", - "2132 {'duration': 2248, 'distance': 6893, 'duration... 105.0 \n", - "2133 {'duration': 2733, 'distance': 6490, 'duration... 993.0 \n", + "2181 {'duration': 2605, 'distance': 14702, 'duratio... NaN \n", + "2182 {'duration': 2605, 'distance': 14702, 'duratio... NaN \n", + "2183 {'duration': 2322, 'distance': 12104, 'duratio... 0.0 \n", + "2184 {'duration': 2248, 'distance': 6893, 'duration... 105.0 \n", + "2185 {'duration': 2733, 'distance': 6490, 'duration... 993.0 \n", "\n", - " development \n", - "0 False \n", - "1 False \n", - "2 False \n", - "3 False \n", - "4 False \n", - "... ... \n", - "2129 True \n", - "2130 True \n", - "2131 False \n", - "2132 False \n", - "2133 False \n", + " development decision \n", + "0 False None \n", + "1 False None \n", + "2 False None \n", + "3 False None \n", + "4 False None \n", + "... ... ... \n", + "2181 True None \n", + "2182 True None \n", + "2183 False None \n", + "2184 False None \n", + "2185 False None \n", "\n", - "[2134 rows x 9 columns]" + "[2186 rows x 11 columns]" ] }, "execution_count": 4, @@ -292,42 +348,34 @@ ], "source": [ "df = pd.DataFrame(ds)\n", + "df.loc[:, 'decision'] = df.identifier.apply(lambda x: decisions.get(x))\n", "df" ] }, { "cell_type": "code", - "execution_count": 14, - "id": "387c48d9-26c6-4bed-8201-352735c06acb", + "execution_count": 5, + "id": "d80d9911-9a6d-4608-a6da-11dc864ee32b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 1682.0\n", - "1 2565.0\n", - "2 1714.0\n", - "3 1862.0\n", - "4 2167.0\n", - " ... \n", - "2129 2400.0\n", - "2130 2400.0\n", - "2131 2310.0\n", - "2132 2248.0\n", - "2133 1393.0\n", - "Length: 2134, dtype: float64" + "(2186, 11)" ] }, - "execution_count": 14, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], - "source": [] + "source": [ + "df.shape" + ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 6, "id": "7b37ad6b-9b0a-444e-b8c3-6fe4e43e42cb", "metadata": {}, "outputs": [ @@ -352,6 +400,7 @@ " \n", " \n", " \n", + " identifier\n", " sqm_ocr\n", " price\n", " price_per_sqm\n", @@ -359,10 +408,13 @@ " bedrooms\n", " lease_left\n", " development\n", + " decision\n", " a_duration\n", + " a_initial_walk_duration\n", " a_duration_per_transit\n", " a_number_of_transit_stops\n", " b_duration\n", + " b_initial_walk_duration\n", " b_duration_per_transit\n", " b_number_of_transit_stops\n", " min_duration\n", @@ -371,23 +423,28 @@ " \n", " \n", " 0\n", - " 7.81\n", + " 102360773\n", + " NaN\n", " 350000.0\n", - " 44814.340589\n", + " NaN\n", " https://www.rightmove.co.uk/properties/102360773\n", " 1\n", " 119.0\n", " False\n", + " None\n", " 2695\n", + " 174\n", " {'WALK': 414, 'TRANSIT': 2280}\n", " 1\n", " 1682.0\n", + " 204.0\n", " {'WALK': 608, 'TRANSIT': 804}\n", " 2.0\n", " 28.0\n", " \n", " \n", " 1\n", + " 105836849\n", " NaN\n", " 400000.0\n", " NaN\n", @@ -395,242 +452,48 @@ " 3\n", " NaN\n", " False\n", + " None\n", " 2565\n", + " 340\n", " {'WALK': 750, 'TRANSIT': 1800}\n", " 1\n", " 2565.0\n", + " 340.0\n", " {'WALK': 750, 'TRANSIT': 1800}\n", " 1.0\n", " 43.0\n", " \n", - " \n", - " 2\n", - " 76.91\n", - " 400000.0\n", - " 5200.884150\n", - " https://www.rightmove.co.uk/properties/107233214\n", - " 3\n", - " 91.0\n", - " False\n", - " 1714\n", - " {'WALK': 903, 'TRANSIT': 780}\n", - " 1\n", - " 1774.0\n", - " {'WALK': 903, 'TRANSIT': 840}\n", - " 1.0\n", - " 29.0\n", - " \n", - " \n", - " 3\n", - " 112.40\n", - " 800000.0\n", - " 7117.437722\n", - " https://www.rightmove.co.uk/properties/107976896\n", - " 3\n", - " NaN\n", - " False\n", - " 1862\n", - " {'WALK': 635, 'TRANSIT': 900}\n", - " 2\n", - " 1862.0\n", - " {'WALK': 635, 'TRANSIT': 960}\n", - " 2.0\n", - " 31.0\n", - " \n", - " \n", - " 4\n", - " 115.60\n", - " 775000.0\n", - " 6704.152249\n", - " https://www.rightmove.co.uk/properties/115499441\n", - " 3\n", - " NaN\n", - " False\n", - " 2943\n", - " {'WALK': 242, 'TRANSIT': 2700}\n", - " 1\n", - " 2167.0\n", - " {'WALK': 658, 'TRANSIT': 1200}\n", - " 2.0\n", - " 36.0\n", - " \n", - " \n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " \n", - " \n", - " 2129\n", - " NaN\n", - " 750000.0\n", - " NaN\n", - " https://www.rightmove.co.uk/properties/86813508\n", - " 3\n", - " NaN\n", - " True\n", - " 2400\n", - " {'WALK': 603, 'TRANSIT': 1524}\n", - " 2\n", - " 2605.0\n", - " {'WALK': 1467, 'TRANSIT': 1132}\n", - " 1.0\n", - " 40.0\n", - " \n", - " \n", - " 2130\n", - " NaN\n", - " 655000.0\n", - " NaN\n", - " https://www.rightmove.co.uk/properties/86813523\n", - " 3\n", - " NaN\n", - " True\n", - " 2400\n", - " {'WALK': 603, 'TRANSIT': 1524}\n", - " 2\n", - " 2605.0\n", - " {'WALK': 1467, 'TRANSIT': 1132}\n", - " 1.0\n", - " 40.0\n", - " \n", - " \n", - " 2131\n", - " 82.80\n", - " 550000.0\n", - " 6642.512077\n", - " https://www.rightmove.co.uk/properties/86814669\n", - " 3\n", - " 0.0\n", - " False\n", - " 2310\n", - " {'WALK': 786, 'TRANSIT': 1277}\n", - " 2\n", - " 2322.0\n", - " {'WALK': 991, 'TRANSIT': 1080}\n", - " 2.0\n", - " 38.0\n", - " \n", - " \n", - " 2132\n", - " 5.52\n", - " 300000.0\n", - " 54347.826087\n", - " https://www.rightmove.co.uk/properties/86955958\n", - " 3\n", - " 105.0\n", - " False\n", - " 2332\n", - " {'WALK': 1671, 'TRANSIT': 660}\n", - " 1\n", - " 2248.0\n", - " {'WALK': 1671, 'TRANSIT': 576}\n", - " 1.0\n", - " 37.0\n", - " \n", - " \n", - " 2133\n", - " 81.60\n", - " 790000.0\n", - " 9681.372549\n", - " https://www.rightmove.co.uk/properties/86972726\n", - " 2\n", - " 993.0\n", - " False\n", - " 1393\n", - " {'WALK': 518, 'TRANSIT': 564}\n", - " 2\n", - " 2733.0\n", - " {'WALK': 451, 'TRANSIT': 2280}\n", - " 1.0\n", - " 23.0\n", - " \n", " \n", "\n", - "

2134 rows × 14 columns

\n", "" ], "text/plain": [ - " sqm_ocr price price_per_sqm \\\n", - "0 7.81 350000.0 44814.340589 \n", - "1 NaN 400000.0 NaN \n", - "2 76.91 400000.0 5200.884150 \n", - "3 112.40 800000.0 7117.437722 \n", - "4 115.60 775000.0 6704.152249 \n", - "... ... ... ... \n", - "2129 NaN 750000.0 NaN \n", - "2130 NaN 655000.0 NaN \n", - "2131 82.80 550000.0 6642.512077 \n", - "2132 5.52 300000.0 54347.826087 \n", - "2133 81.60 790000.0 9681.372549 \n", + " identifier sqm_ocr price price_per_sqm \\\n", + "0 102360773 NaN 350000.0 NaN \n", + "1 105836849 NaN 400000.0 NaN \n", "\n", - " url bedrooms lease_left \\\n", - "0 https://www.rightmove.co.uk/properties/102360773 1 119.0 \n", - "1 https://www.rightmove.co.uk/properties/105836849 3 NaN \n", - "2 https://www.rightmove.co.uk/properties/107233214 3 91.0 \n", - "3 https://www.rightmove.co.uk/properties/107976896 3 NaN \n", - "4 https://www.rightmove.co.uk/properties/115499441 3 NaN \n", - "... ... ... ... \n", - "2129 https://www.rightmove.co.uk/properties/86813508 3 NaN \n", - "2130 https://www.rightmove.co.uk/properties/86813523 3 NaN \n", - "2131 https://www.rightmove.co.uk/properties/86814669 3 0.0 \n", - "2132 https://www.rightmove.co.uk/properties/86955958 3 105.0 \n", - "2133 https://www.rightmove.co.uk/properties/86972726 2 993.0 \n", + " url bedrooms lease_left \\\n", + "0 https://www.rightmove.co.uk/properties/102360773 1 119.0 \n", + "1 https://www.rightmove.co.uk/properties/105836849 3 NaN \n", "\n", - " development a_duration a_duration_per_transit \\\n", - "0 False 2695 {'WALK': 414, 'TRANSIT': 2280} \n", - "1 False 2565 {'WALK': 750, 'TRANSIT': 1800} \n", - "2 False 1714 {'WALK': 903, 'TRANSIT': 780} \n", - "3 False 1862 {'WALK': 635, 'TRANSIT': 900} \n", - "4 False 2943 {'WALK': 242, 'TRANSIT': 2700} \n", - "... ... ... ... \n", - "2129 True 2400 {'WALK': 603, 'TRANSIT': 1524} \n", - "2130 True 2400 {'WALK': 603, 'TRANSIT': 1524} \n", - "2131 False 2310 {'WALK': 786, 'TRANSIT': 1277} \n", - "2132 False 2332 {'WALK': 1671, 'TRANSIT': 660} \n", - "2133 False 1393 {'WALK': 518, 'TRANSIT': 564} \n", + " development decision a_duration a_initial_walk_duration \\\n", + "0 False None 2695 174 \n", + "1 False None 2565 340 \n", "\n", - " a_number_of_transit_stops b_duration b_duration_per_transit \\\n", - "0 1 1682.0 {'WALK': 608, 'TRANSIT': 804} \n", - "1 1 2565.0 {'WALK': 750, 'TRANSIT': 1800} \n", - "2 1 1774.0 {'WALK': 903, 'TRANSIT': 840} \n", - "3 2 1862.0 {'WALK': 635, 'TRANSIT': 960} \n", - "4 1 2167.0 {'WALK': 658, 'TRANSIT': 1200} \n", - "... ... ... ... \n", - "2129 2 2605.0 {'WALK': 1467, 'TRANSIT': 1132} \n", - "2130 2 2605.0 {'WALK': 1467, 'TRANSIT': 1132} \n", - "2131 2 2322.0 {'WALK': 991, 'TRANSIT': 1080} \n", - "2132 1 2248.0 {'WALK': 1671, 'TRANSIT': 576} \n", - "2133 2 2733.0 {'WALK': 451, 'TRANSIT': 2280} \n", + " a_duration_per_transit a_number_of_transit_stops b_duration \\\n", + "0 {'WALK': 414, 'TRANSIT': 2280} 1 1682.0 \n", + "1 {'WALK': 750, 'TRANSIT': 1800} 1 2565.0 \n", "\n", - " b_number_of_transit_stops min_duration \n", - "0 2.0 28.0 \n", - "1 1.0 43.0 \n", - "2 1.0 29.0 \n", - "3 2.0 31.0 \n", - "4 2.0 36.0 \n", - "... ... ... \n", - "2129 1.0 40.0 \n", - "2130 1.0 40.0 \n", - "2131 2.0 38.0 \n", - "2132 1.0 37.0 \n", - "2133 1.0 23.0 \n", + " b_initial_walk_duration b_duration_per_transit \\\n", + "0 204.0 {'WALK': 608, 'TRANSIT': 804} \n", + "1 340.0 {'WALK': 750, 'TRANSIT': 1800} \n", "\n", - "[2134 rows x 14 columns]" + " b_number_of_transit_stops min_duration \n", + "0 2.0 28.0 \n", + "1 1.0 43.0 " ] }, - "execution_count": 19, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -645,12 +508,12 @@ "\n", "df2 = pd.concat([df.drop(['travel_time_fastest', 'travel_time_second'], axis=1), s1, s2], axis=1)\n", "df2.loc[:, 'min_duration'] = (df2.loc[:, ['a_duration', 'b_duration']].min(axis=1) / 60).round()\n", - "df2" + "df2.head(2)" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 7, "id": "8c75aaa6-6113-482f-809b-11e405510184", "metadata": {}, "outputs": [], @@ -660,7 +523,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 8, "id": "79f99692-91e8-4915-9b57-7b3a1efd7d3a", "metadata": {}, "outputs": [ @@ -685,342 +548,89 @@ " \n", " \n", " \n", + " identifier\n", " sqm_ocr\n", " price\n", " price_per_sqm\n", " url\n", " bedrooms\n", + " lease_left\n", + " development\n", + " decision\n", " duration\n", - " distance\n", - " duration_static\n", + " initial_walk_duration\n", " duration_per_transit\n", - " distance_per_transit\n", - " number_of_transit_stops\n", - " duration\n", - " distance\n", - " duration_static\n", - " duration_per_transit\n", - " distance_per_transit\n", " number_of_transit_stops\n", " \n", " \n", " \n", " \n", " 0\n", - " 7.81\n", + " 102360773\n", + " NaN\n", " 350000.0\n", - " 44814.340589\n", + " NaN\n", " https://www.rightmove.co.uk/properties/102360773\n", " 1\n", - " 2695\n", - " 6467\n", - " 2695\n", + " 119.0\n", + " False\n", + " None\n", + " 45\n", + " 174\n", " {'WALK': 414, 'TRANSIT': 2280}\n", - " {'WALK': 465, 'TRANSIT': 6002}\n", " 1\n", - " 1682.0\n", - " 6810.0\n", - " 1682.0\n", - " {'WALK': 608, 'TRANSIT': 804}\n", - " {'WALK': 582, 'TRANSIT': 6228}\n", - " 2.0\n", " \n", " \n", " 1\n", + " 105836849\n", " NaN\n", " 400000.0\n", " NaN\n", " https://www.rightmove.co.uk/properties/105836849\n", " 3\n", - " 2565\n", - " 14070\n", - " 2565\n", + " NaN\n", + " False\n", + " None\n", + " 43\n", + " 340\n", " {'WALK': 750, 'TRANSIT': 1800}\n", - " {'WALK': 856, 'TRANSIT': 13214}\n", " 1\n", - " 2565.0\n", - " 14070.0\n", - " 2565.0\n", - " {'WALK': 750, 'TRANSIT': 1800}\n", - " {'WALK': 856, 'TRANSIT': 13214}\n", - " 1.0\n", - " \n", - " \n", - " 2\n", - " 76.91\n", - " 400000.0\n", - " 5200.884150\n", - " https://www.rightmove.co.uk/properties/107233214\n", - " 3\n", - " 1714\n", - " 9570\n", - " 1714\n", - " {'WALK': 903, 'TRANSIT': 780}\n", - " {'WALK': 1035, 'TRANSIT': 8535}\n", - " 1\n", - " 1774.0\n", - " 9570.0\n", - " 1774.0\n", - " {'WALK': 903, 'TRANSIT': 840}\n", - " {'WALK': 1035, 'TRANSIT': 8535}\n", - " 1.0\n", - " \n", - " \n", - " 3\n", - " 112.40\n", - " 800000.0\n", - " 7117.437722\n", - " https://www.rightmove.co.uk/properties/107976896\n", - " 3\n", - " 1862\n", - " 8278\n", - " 1862\n", - " {'WALK': 635, 'TRANSIT': 900}\n", - " {'WALK': 710, 'TRANSIT': 7568}\n", - " 2\n", - " 1862.0\n", - " 8278.0\n", - " 1862.0\n", - " {'WALK': 635, 'TRANSIT': 960}\n", - " {'WALK': 710, 'TRANSIT': 7568}\n", - " 2.0\n", - " \n", - " \n", - " 4\n", - " 115.60\n", - " 775000.0\n", - " 6704.152249\n", - " https://www.rightmove.co.uk/properties/115499441\n", - " 3\n", - " 2943\n", - " 7437\n", - " 2943\n", - " {'WALK': 242, 'TRANSIT': 2700}\n", - " {'WALK': 276, 'TRANSIT': 7161}\n", - " 1\n", - " 2167.0\n", - " 9920.0\n", - " 2167.0\n", - " {'WALK': 658, 'TRANSIT': 1200}\n", - " {'WALK': 720, 'TRANSIT': 9200}\n", - " 2.0\n", - " \n", - " \n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " \n", - " \n", - " 2046\n", - " NaN\n", - " 750000.0\n", - " NaN\n", - " https://www.rightmove.co.uk/properties/86813508\n", - " 3\n", - " 2400\n", - " 13983\n", - " 2400\n", - " {'WALK': 603, 'TRANSIT': 1524}\n", - " {'WALK': 671, 'TRANSIT': 13312}\n", - " 2\n", - " 2605.0\n", - " 14702.0\n", - " 2605.0\n", - " {'WALK': 1467, 'TRANSIT': 1132}\n", - " {'WALK': 1698, 'TRANSIT': 13004}\n", - " 1.0\n", - " \n", - " \n", - " 2047\n", - " NaN\n", - " 655000.0\n", - " NaN\n", - " https://www.rightmove.co.uk/properties/86813523\n", - " 3\n", - " 2400\n", - " 13983\n", - " 2400\n", - " {'WALK': 603, 'TRANSIT': 1524}\n", - " {'WALK': 671, 'TRANSIT': 13312}\n", - " 2\n", - " 2605.0\n", - " 14702.0\n", - " 2605.0\n", - " {'WALK': 1467, 'TRANSIT': 1132}\n", - " {'WALK': 1698, 'TRANSIT': 13004}\n", - " 1.0\n", - " \n", - " \n", - " 2048\n", - " 82.80\n", - " 550000.0\n", - " 6642.512077\n", - " https://www.rightmove.co.uk/properties/86814669\n", - " 3\n", - " 2310\n", - " 12972\n", - " 2310\n", - " {'WALK': 786, 'TRANSIT': 1277}\n", - " {'WALK': 885, 'TRANSIT': 12087}\n", - " 2\n", - " 2322.0\n", - " 12104.0\n", - " 2322.0\n", - " {'WALK': 991, 'TRANSIT': 1080}\n", - " {'WALK': 1089, 'TRANSIT': 11015}\n", - " 2.0\n", - " \n", - " \n", - " 2049\n", - " 5.52\n", - " 300000.0\n", - " 54347.826087\n", - " https://www.rightmove.co.uk/properties/86955958\n", - " 3\n", - " 2332\n", - " 6898\n", - " 2332\n", - " {'WALK': 1671, 'TRANSIT': 660}\n", - " {'WALK': 1945, 'TRANSIT': 4953}\n", - " 1\n", - " 2248.0\n", - " 6893.0\n", - " 2248.0\n", - " {'WALK': 1671, 'TRANSIT': 576}\n", - " {'WALK': 1945, 'TRANSIT': 4948}\n", - " 1.0\n", - " \n", - " \n", - " 2050\n", - " 81.60\n", - " 790000.0\n", - " 9681.372549\n", - " https://www.rightmove.co.uk/properties/86972726\n", - " 2\n", - " 1393\n", - " 6390\n", - " 1393\n", - " {'WALK': 518, 'TRANSIT': 564}\n", - " {'WALK': 441, 'TRANSIT': 5949}\n", - " 2\n", - " 2733.0\n", - " 6490.0\n", - " 2733.0\n", - " {'WALK': 451, 'TRANSIT': 2280}\n", - " {'WALK': 488, 'TRANSIT': 6002}\n", - " 1.0\n", " \n", " \n", "\n", - "

2051 rows × 17 columns

\n", "" ], "text/plain": [ - " sqm_ocr price price_per_sqm \\\n", - "0 7.81 350000.0 44814.340589 \n", - "1 NaN 400000.0 NaN \n", - "2 76.91 400000.0 5200.884150 \n", - "3 112.40 800000.0 7117.437722 \n", - "4 115.60 775000.0 6704.152249 \n", - "... ... ... ... \n", - "2046 NaN 750000.0 NaN \n", - "2047 NaN 655000.0 NaN \n", - "2048 82.80 550000.0 6642.512077 \n", - "2049 5.52 300000.0 54347.826087 \n", - "2050 81.60 790000.0 9681.372549 \n", + " identifier sqm_ocr price price_per_sqm \\\n", + "0 102360773 NaN 350000.0 NaN \n", + "1 105836849 NaN 400000.0 NaN \n", "\n", - " url bedrooms duration \\\n", - "0 https://www.rightmove.co.uk/properties/102360773 1 2695 \n", - "1 https://www.rightmove.co.uk/properties/105836849 3 2565 \n", - "2 https://www.rightmove.co.uk/properties/107233214 3 1714 \n", - "3 https://www.rightmove.co.uk/properties/107976896 3 1862 \n", - "4 https://www.rightmove.co.uk/properties/115499441 3 2943 \n", - "... ... ... ... \n", - "2046 https://www.rightmove.co.uk/properties/86813508 3 2400 \n", - "2047 https://www.rightmove.co.uk/properties/86813523 3 2400 \n", - "2048 https://www.rightmove.co.uk/properties/86814669 3 2310 \n", - "2049 https://www.rightmove.co.uk/properties/86955958 3 2332 \n", - "2050 https://www.rightmove.co.uk/properties/86972726 2 1393 \n", + " url bedrooms lease_left \\\n", + "0 https://www.rightmove.co.uk/properties/102360773 1 119.0 \n", + "1 https://www.rightmove.co.uk/properties/105836849 3 NaN \n", "\n", - " distance duration_static duration_per_transit \\\n", - "0 6467 2695 {'WALK': 414, 'TRANSIT': 2280} \n", - "1 14070 2565 {'WALK': 750, 'TRANSIT': 1800} \n", - "2 9570 1714 {'WALK': 903, 'TRANSIT': 780} \n", - "3 8278 1862 {'WALK': 635, 'TRANSIT': 900} \n", - "4 7437 2943 {'WALK': 242, 'TRANSIT': 2700} \n", - "... ... ... ... \n", - "2046 13983 2400 {'WALK': 603, 'TRANSIT': 1524} \n", - "2047 13983 2400 {'WALK': 603, 'TRANSIT': 1524} \n", - "2048 12972 2310 {'WALK': 786, 'TRANSIT': 1277} \n", - "2049 6898 2332 {'WALK': 1671, 'TRANSIT': 660} \n", - "2050 6390 1393 {'WALK': 518, 'TRANSIT': 564} \n", + " development decision duration initial_walk_duration \\\n", + "0 False None 45 174 \n", + "1 False None 43 340 \n", "\n", - " distance_per_transit number_of_transit_stops duration \\\n", - "0 {'WALK': 465, 'TRANSIT': 6002} 1 1682.0 \n", - "1 {'WALK': 856, 'TRANSIT': 13214} 1 2565.0 \n", - "2 {'WALK': 1035, 'TRANSIT': 8535} 1 1774.0 \n", - "3 {'WALK': 710, 'TRANSIT': 7568} 2 1862.0 \n", - "4 {'WALK': 276, 'TRANSIT': 7161} 1 2167.0 \n", - "... ... ... ... \n", - "2046 {'WALK': 671, 'TRANSIT': 13312} 2 2605.0 \n", - "2047 {'WALK': 671, 'TRANSIT': 13312} 2 2605.0 \n", - "2048 {'WALK': 885, 'TRANSIT': 12087} 2 2322.0 \n", - "2049 {'WALK': 1945, 'TRANSIT': 4953} 1 2248.0 \n", - "2050 {'WALK': 441, 'TRANSIT': 5949} 2 2733.0 \n", - "\n", - " distance duration_static duration_per_transit \\\n", - "0 6810.0 1682.0 {'WALK': 608, 'TRANSIT': 804} \n", - "1 14070.0 2565.0 {'WALK': 750, 'TRANSIT': 1800} \n", - "2 9570.0 1774.0 {'WALK': 903, 'TRANSIT': 840} \n", - "3 8278.0 1862.0 {'WALK': 635, 'TRANSIT': 960} \n", - "4 9920.0 2167.0 {'WALK': 658, 'TRANSIT': 1200} \n", - "... ... ... ... \n", - "2046 14702.0 2605.0 {'WALK': 1467, 'TRANSIT': 1132} \n", - "2047 14702.0 2605.0 {'WALK': 1467, 'TRANSIT': 1132} \n", - "2048 12104.0 2322.0 {'WALK': 991, 'TRANSIT': 1080} \n", - "2049 6893.0 2248.0 {'WALK': 1671, 'TRANSIT': 576} \n", - "2050 6490.0 2733.0 {'WALK': 451, 'TRANSIT': 2280} \n", - "\n", - " distance_per_transit number_of_transit_stops \n", - "0 {'WALK': 582, 'TRANSIT': 6228} 2.0 \n", - "1 {'WALK': 856, 'TRANSIT': 13214} 1.0 \n", - "2 {'WALK': 1035, 'TRANSIT': 8535} 1.0 \n", - "3 {'WALK': 710, 'TRANSIT': 7568} 2.0 \n", - "4 {'WALK': 720, 'TRANSIT': 9200} 2.0 \n", - "... ... ... \n", - "2046 {'WALK': 1698, 'TRANSIT': 13004} 1.0 \n", - "2047 {'WALK': 1698, 'TRANSIT': 13004} 1.0 \n", - "2048 {'WALK': 1089, 'TRANSIT': 11015} 2.0 \n", - "2049 {'WALK': 1945, 'TRANSIT': 4948} 1.0 \n", - "2050 {'WALK': 488, 'TRANSIT': 6002} 1.0 \n", - "\n", - "[2051 rows x 17 columns]" + " duration_per_transit number_of_transit_stops \n", + "0 {'WALK': 414, 'TRANSIT': 2280} 1 \n", + "1 {'WALK': 750, 'TRANSIT': 1800} 1 " ] }, - "execution_count": 12, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df = pd.concat([df.drop(['travel_time_fastest'], axis=1), df['travel_time_fastest'].apply(pd.Series)], axis=1)\n", - "df = pd.concat([df.drop(['travel_time_second'], axis=1), df['travel_time_second'].apply(pd.Series)], axis=1)\n", - "df" + "dropcolumns = ['distance_per_transit', 'duration_static', 'distance']\n", + "s1 = df['travel_time_fastest'].apply(pd.Series).drop(dropcolumns, axis=1)\n", + "\n", + "df3 = pd.concat([df.drop(['travel_time_fastest', 'travel_time_second'], axis=1), s1], axis=1)\n", + "df3.loc[:, 'duration'] = (df3.loc[:, ['duration']].min(axis=1) / 60).round()\n", + "df3.to_clipboard()\n", + "df3.head(2)" ] }, { @@ -1918,62 +1528,85 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 15, "id": "edd9fa24-cad2-4448-9b17-c6d514564f41", "metadata": {}, + "outputs": [], + "source": [ + "from data_access import Listing\n", + "import pytesseract\n", + "from PIL import Image\n", + "import re" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "002b2a3a-3ecc-45c1-8c2f-c143380ee0d5", + "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sqm_ocrpriceprice_per_sqmurlbedrooms
\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: [sqm_ocr, price, price_per_sqm, url, bedrooms]\n", - "Index: []" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "[('3.14', 'm'), ('3.43', 'm'), ('3.89', 'm'), ('3.62', 'm'), ('3.89', 'm'), ('2.88', 'm'), ('75.3', 'sq. m'), ('4.07', 'm'), ('6.18', 'm'), ('2.79', 'm'), ('3.34', 'm'), ('2.79', 'm'), ('4.34', 'm'), ('1.76', 'm'), ('2.29', 'm'), ('1.76', 'm'), ('1.92', 'm'), ('75.3', 'sq. m')]\n", + "-----\n", + "Bedroom 2\n", + "Bedroom 1 3.14m x 3.43m\n", + "3.89m x 3.62m (10'4\" x 11'3\")\n", + "(12'9\" x 11'10\")\n", + "\n", + "Bedroom 3\n", + "3.89m x 2.88m\n", + "(12'9\" x 9'5\")\n", + "\n", + "xt\n", + "\n", + "v\n", + "\n", + "Levita House NW1\n", + "\n", + "Approx. 75.3 sq. metres (810.2 sq. feet)\n", + "\n", + "Storage\n", + "\n", + ". Hall\n", + "Kitchen 4.07m x 6.18m\n", + "2.79m x 3.34m (13'4\" x 20'3\") Bedroom 4\n", + "\n", + "(9'2\" x 10'11\") 2.79m x 4.34m\n", + "\n", + "(9'2\" x 14'3\")\n", + "\n", + "Bathroom 1\n", + "1.76m x 2.29m\n", + "(5'9\" x 76\")\n", + "\n", + "Bathroom 2\n", + "1.76m x 1.92m\n", + "(5'9\" x 64\")\n", + "\n", + "Total area: approx. 75.3 sq. metres (810.2 sq. feet)\n", + "\n" + ] } ], "source": [ - "df[df.bedrooms > 2]" + "l = Listing(144497822)\n", + "for path in l.path_floorplan_folder().iterdir():\n", + " img = Image.open(path)\n", + " text = pytesseract.image_to_string(img)\n", + " sqmregex = r'(\\d+\\.\\d*) ?(sq ?m|sq. ?m|m)'\n", + " matches = re.findall(sqmregex, text.lower())\n", + " print(matches)\n", + " print(\"-----\")\n", + " print(text)" ] }, { "cell_type": "code", "execution_count": null, - "id": "002b2a3a-3ecc-45c1-8c2f-c143380ee0d5", + "id": "015e870d-0cf0-4d07-a9ae-4e80d128b26c", "metadata": {}, "outputs": [], "source": [