decisions + logger

This commit is contained in:
Kadir 2025-05-12 01:01:19 +01:00
parent 962c9a2f38
commit 3f4be8b7ff
3 changed files with 217 additions and 203 deletions

File diff suppressed because one or more lines are too long

View file

@ -33,18 +33,7 @@
"execution_count": 2,
"id": "424501ab-ecc6-42f5-b87e-b0d2871bdc74",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/wl/kx43lvyn6yv7lq988gwrkq_m0000gn/T/ipykernel_85865/3290399543.py:3: FutureWarning: The behavior of 'to_datetime' with 'unit' when parsing strings is deprecated. In a future version, strings will be parsed as datetime strings, matching the behavior without a 'unit'. To retain the old behavior, explicitly cast ints or floats to numeric type before calling to_datetime.\n",
" decisions = pd.read_json(decisions_path)\n",
"/var/folders/wl/kx43lvyn6yv7lq988gwrkq_m0000gn/T/ipykernel_85865/3290399543.py:3: FutureWarning: The behavior of 'to_datetime' with 'unit' when parsing strings is deprecated. In a future version, strings will be parsed as datetime strings, matching the behavior without a 'unit'. To retain the old behavior, explicitly cast ints or floats to numeric type before calling to_datetime.\n",
" decisions = pd.read_json(decisions_path)\n"
]
}
],
"outputs": [],
"source": [
"# read decisions on file\n",
"decisions_path = 'data/decisions.json'\n",
@ -147,7 +136,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"18508\n"
"10574\n"
]
}
],
@ -220,25 +209,6 @@
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>101369066</td>\n",
" <td>NaN</td>\n",
" <td>875000.0</td>\n",
" <td>NaN</td>\n",
" <td>https://www.rightmove.co.uk/properties/101369066</td>\n",
" <td>3</td>\n",
" <td>{'duration': 2252, 'distance': 7140, 'duration...</td>\n",
" <td>{'duration': 2465, 'distance': 7502, 'duration...</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>Share of Freehold</td>\n",
" <td>12</td>\n",
" <td>None</td>\n",
" <td>0</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>105484772</td>\n",
" <td>45.7</td>\n",
" <td>325000.0</td>\n",
@ -251,13 +221,13 @@
" <td>641.53</td>\n",
" <td>False</td>\n",
" <td>Leasehold</td>\n",
" <td>36</td>\n",
" <td>116</td>\n",
" <td>None</td>\n",
" <td>0</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <th>1</th>\n",
" <td>105827126</td>\n",
" <td>58.5</td>\n",
" <td>950000.0</td>\n",
@ -270,32 +240,13 @@
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>Leasehold</td>\n",
" <td>2</td>\n",
" <td>83</td>\n",
" <td>None</td>\n",
" <td>0</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>105836849</td>\n",
" <td>NaN</td>\n",
" <td>400000.0</td>\n",
" <td>NaN</td>\n",
" <td>https://www.rightmove.co.uk/properties/105836849</td>\n",
" <td>3</td>\n",
" <td>{'duration': 2565, 'distance': 14070, 'duratio...</td>\n",
" <td>{'duration': 2565, 'distance': 14070, 'duratio...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>Leasehold</td>\n",
" <td>393</td>\n",
" <td>None</td>\n",
" <td>20</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <th>2</th>\n",
" <td>108102476</td>\n",
" <td>53.7</td>\n",
" <td>515000.0</td>\n",
@ -308,7 +259,45 @@
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>Leasehold</td>\n",
" <td>16</td>\n",
" <td>97</td>\n",
" <td>None</td>\n",
" <td>0</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>108171770</td>\n",
" <td>45.0</td>\n",
" <td>650000.0</td>\n",
" <td>14444.444444</td>\n",
" <td>https://www.rightmove.co.uk/properties/108171770</td>\n",
" <td>2</td>\n",
" <td>{'duration': 1591, 'distance': 7827, 'duration...</td>\n",
" <td>{'duration': 1591, 'distance': 7827, 'duration...</td>\n",
" <td>962.0</td>\n",
" <td>2000.00</td>\n",
" <td>False</td>\n",
" <td>Leasehold</td>\n",
" <td>261</td>\n",
" <td>None</td>\n",
" <td>0</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>109595123</td>\n",
" <td>NaN</td>\n",
" <td>1000000.0</td>\n",
" <td>NaN</td>\n",
" <td>https://www.rightmove.co.uk/properties/109595123</td>\n",
" <td>1</td>\n",
" <td>{'duration': 2463, 'distance': 9565, 'duration...</td>\n",
" <td>{'duration': 2463, 'distance': 9565, 'duration...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>Please confirm if this is a freehold or leaseh...</td>\n",
" <td>96</td>\n",
" <td>None</td>\n",
" <td>0</td>\n",
" <td>None</td>\n",
@ -333,7 +322,45 @@
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18503</th>\n",
" <th>10569</th>\n",
" <td>88731877</td>\n",
" <td>NaN</td>\n",
" <td>570000.0</td>\n",
" <td>NaN</td>\n",
" <td>https://www.rightmove.co.uk/properties/88731877</td>\n",
" <td>1</td>\n",
" <td>{'duration': 912, 'distance': 6329, 'duration_...</td>\n",
" <td>{'duration': 852, 'distance': 6329, 'duration_...</td>\n",
" <td>998.0</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>Leasehold</td>\n",
" <td>407</td>\n",
" <td>None</td>\n",
" <td>0</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10570</th>\n",
" <td>89825950</td>\n",
" <td>48.9</td>\n",
" <td>680000.0</td>\n",
" <td>13905.930470</td>\n",
" <td>https://www.rightmove.co.uk/properties/89825950</td>\n",
" <td>1</td>\n",
" <td>{'duration': 273, 'distance': 762, 'duration_s...</td>\n",
" <td>{'duration': 273, 'distance': 762, 'duration_s...</td>\n",
" <td>112.0</td>\n",
" <td>1700.00</td>\n",
" <td>False</td>\n",
" <td>Leasehold</td>\n",
" <td>113</td>\n",
" <td>None</td>\n",
" <td>0</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10571</th>\n",
" <td>94206080</td>\n",
" <td>49.6</td>\n",
" <td>899000.0</td>\n",
@ -346,32 +373,13 @@
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>Leasehold</td>\n",
" <td>256</td>\n",
" <td>337</td>\n",
" <td>None</td>\n",
" <td>0</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18504</th>\n",
" <td>94206329</td>\n",
" <td>NaN</td>\n",
" <td>700000.0</td>\n",
" <td>NaN</td>\n",
" <td>https://www.rightmove.co.uk/properties/94206329</td>\n",
" <td>1</td>\n",
" <td>{'duration': 2172, 'distance': 12497, 'duratio...</td>\n",
" <td>{'duration': 2112, 'distance': 12497, 'duratio...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>Leasehold</td>\n",
" <td>256</td>\n",
" <td>None</td>\n",
" <td>20</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18505</th>\n",
" <th>10572</th>\n",
" <td>94508306</td>\n",
" <td>94.0</td>\n",
" <td>1000000.0</td>\n",
@ -384,13 +392,13 @@
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>Leasehold</td>\n",
" <td>149</td>\n",
" <td>230</td>\n",
" <td>None</td>\n",
" <td>0</td>\n",
" <td>9</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18506</th>\n",
" <th>10573</th>\n",
" <td>95975483</td>\n",
" <td>NaN</td>\n",
" <td>800000.0</td>\n",
@ -403,115 +411,109 @@
" <td>0.00</td>\n",
" <td>False</td>\n",
" <td>Leasehold</td>\n",
" <td>3</td>\n",
" <td>84</td>\n",
" <td>None</td>\n",
" <td>0</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18507</th>\n",
" <td>96773996</td>\n",
" <td>70.8</td>\n",
" <td>1000000.0</td>\n",
" <td>14124.293785</td>\n",
" <td>https://www.rightmove.co.uk/properties/96773996</td>\n",
" <td>2</td>\n",
" <td>{'duration': 1608, 'distance': 8301, 'duration...</td>\n",
" <td>{'duration': 1608, 'distance': 8301, 'duration...</td>\n",
" <td>992.0</td>\n",
" <td>4716.36</td>\n",
" <td>True</td>\n",
" <td>Leasehold</td>\n",
" <td>227</td>\n",
" <td>None</td>\n",
" <td>20</td>\n",
" <td>None</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>18508 rows × 16 columns</p>\n",
"<p>10574 rows × 16 columns</p>\n",
"</div>"
],
"text/plain": [
" identifier sqm_ocr price price_per_sqm \\\n",
"0 101369066 NaN 875000.0 NaN \n",
"1 105484772 45.7 325000.0 7111.597374 \n",
"2 105827126 58.5 950000.0 16239.316239 \n",
"3 105836849 NaN 400000.0 NaN \n",
"4 108102476 53.7 515000.0 9590.316574 \n",
"0 105484772 45.7 325000.0 7111.597374 \n",
"1 105827126 58.5 950000.0 16239.316239 \n",
"2 108102476 53.7 515000.0 9590.316574 \n",
"3 108171770 45.0 650000.0 14444.444444 \n",
"4 109595123 NaN 1000000.0 NaN \n",
"... ... ... ... ... \n",
"18503 94206080 49.6 899000.0 18125.000000 \n",
"18504 94206329 NaN 700000.0 NaN \n",
"18505 94508306 94.0 1000000.0 10638.297872 \n",
"18506 95975483 NaN 800000.0 NaN \n",
"18507 96773996 70.8 1000000.0 14124.293785 \n",
"10569 88731877 NaN 570000.0 NaN \n",
"10570 89825950 48.9 680000.0 13905.930470 \n",
"10571 94206080 49.6 899000.0 18125.000000 \n",
"10572 94508306 94.0 1000000.0 10638.297872 \n",
"10573 95975483 NaN 800000.0 NaN \n",
"\n",
" url bedrooms \\\n",
"0 https://www.rightmove.co.uk/properties/101369066 3 \n",
"1 https://www.rightmove.co.uk/properties/105484772 1 \n",
"2 https://www.rightmove.co.uk/properties/105827126 1 \n",
"3 https://www.rightmove.co.uk/properties/105836849 3 \n",
"4 https://www.rightmove.co.uk/properties/108102476 1 \n",
"0 https://www.rightmove.co.uk/properties/105484772 1 \n",
"1 https://www.rightmove.co.uk/properties/105827126 1 \n",
"2 https://www.rightmove.co.uk/properties/108102476 1 \n",
"3 https://www.rightmove.co.uk/properties/108171770 2 \n",
"4 https://www.rightmove.co.uk/properties/109595123 1 \n",
"... ... ... \n",
"18503 https://www.rightmove.co.uk/properties/94206080 1 \n",
"18504 https://www.rightmove.co.uk/properties/94206329 1 \n",
"18505 https://www.rightmove.co.uk/properties/94508306 2 \n",
"18506 https://www.rightmove.co.uk/properties/95975483 2 \n",
"18507 https://www.rightmove.co.uk/properties/96773996 2 \n",
"10569 https://www.rightmove.co.uk/properties/88731877 1 \n",
"10570 https://www.rightmove.co.uk/properties/89825950 1 \n",
"10571 https://www.rightmove.co.uk/properties/94206080 1 \n",
"10572 https://www.rightmove.co.uk/properties/94508306 2 \n",
"10573 https://www.rightmove.co.uk/properties/95975483 2 \n",
"\n",
" travel_time_fastest \\\n",
"0 {'duration': 2252, 'distance': 7140, 'duration... \n",
"1 {'duration': 1983, 'distance': 10095, 'duratio... \n",
"2 {'duration': 2478, 'distance': 9584, 'duration... \n",
"3 {'duration': 2565, 'distance': 14070, 'duratio... \n",
"4 {'duration': 1266, 'distance': 4042, 'duration... \n",
"0 {'duration': 1983, 'distance': 10095, 'duratio... \n",
"1 {'duration': 2478, 'distance': 9584, 'duration... \n",
"2 {'duration': 1266, 'distance': 4042, 'duration... \n",
"3 {'duration': 1591, 'distance': 7827, 'duration... \n",
"4 {'duration': 2463, 'distance': 9565, 'duration... \n",
"... ... \n",
"18503 {'duration': 1125, 'distance': 4637, 'duration... \n",
"18504 {'duration': 2172, 'distance': 12497, 'duratio... \n",
"18505 {'duration': 1046, 'distance': 2193, 'duration... \n",
"18506 {'duration': 2281, 'distance': 7262, 'duration... \n",
"18507 {'duration': 1608, 'distance': 8301, 'duration... \n",
"10569 {'duration': 912, 'distance': 6329, 'duration_... \n",
"10570 {'duration': 273, 'distance': 762, 'duration_s... \n",
"10571 {'duration': 1125, 'distance': 4637, 'duration... \n",
"10572 {'duration': 1046, 'distance': 2193, 'duration... \n",
"10573 {'duration': 2281, 'distance': 7262, 'duration... \n",
"\n",
" travel_time_second lease_left \\\n",
"0 {'duration': 2465, 'distance': 7502, 'duration... 0.0 \n",
"1 {'duration': 2043, 'distance': 10083, 'duratio... 104.0 \n",
"2 {'duration': 2478, 'distance': 9584, 'duration... NaN \n",
"3 {'duration': 2565, 'distance': 14070, 'duratio... NaN \n",
"4 {'duration': 1861, 'distance': 4548, 'duration... 104.0 \n",
"0 {'duration': 2043, 'distance': 10083, 'duratio... 104.0 \n",
"1 {'duration': 2478, 'distance': 9584, 'duration... NaN \n",
"2 {'duration': 1861, 'distance': 4548, 'duration... 104.0 \n",
"3 {'duration': 1591, 'distance': 7827, 'duration... 962.0 \n",
"4 {'duration': 2463, 'distance': 9565, 'duration... NaN \n",
"... ... ... \n",
"18503 {'duration': 1125, 'distance': 4641, 'duration... NaN \n",
"18504 {'duration': 2112, 'distance': 12497, 'duratio... NaN \n",
"18505 {'duration': 1046, 'distance': 2193, 'duration... 977.0 \n",
"18506 {'duration': 2815, 'distance': 5607, 'duration... 999.0 \n",
"18507 {'duration': 1608, 'distance': 8301, 'duration... 992.0 \n",
"10569 {'duration': 852, 'distance': 6329, 'duration_... 998.0 \n",
"10570 {'duration': 273, 'distance': 762, 'duration_s... 112.0 \n",
"10571 {'duration': 1125, 'distance': 4641, 'duration... NaN \n",
"10572 {'duration': 1046, 'distance': 2193, 'duration... 977.0 \n",
"10573 {'duration': 2815, 'distance': 5607, 'duration... 999.0 \n",
"\n",
" service_charge development tenure_type updated_days status \\\n",
"0 NaN False Share of Freehold 12 None \n",
"1 641.53 False Leasehold 36 None \n",
"2 NaN True Leasehold 2 None \n",
"3 NaN False Leasehold 393 None \n",
"4 NaN False Leasehold 16 None \n",
"... ... ... ... ... ... \n",
"18503 NaN True Leasehold 256 None \n",
"18504 NaN False Leasehold 256 None \n",
"18505 NaN False Leasehold 149 None \n",
"18506 0.00 False Leasehold 3 None \n",
"18507 4716.36 True Leasehold 227 None \n",
" service_charge development \\\n",
"0 641.53 False \n",
"1 NaN True \n",
"2 NaN False \n",
"3 2000.00 False \n",
"4 NaN True \n",
"... ... ... \n",
"10569 NaN False \n",
"10570 1700.00 False \n",
"10571 NaN True \n",
"10572 NaN False \n",
"10573 0.00 False \n",
"\n",
" tenure_type updated_days status \\\n",
"0 Leasehold 116 None \n",
"1 Leasehold 83 None \n",
"2 Leasehold 97 None \n",
"3 Leasehold 261 None \n",
"4 Please confirm if this is a freehold or leaseh... 96 None \n",
"... ... ... ... \n",
"10569 Leasehold 407 None \n",
"10570 Leasehold 113 None \n",
"10571 Leasehold 337 None \n",
"10572 Leasehold 230 None \n",
"10573 Leasehold 84 None \n",
"\n",
" last_seen decision \n",
"0 0 None \n",
"1 0 None \n",
"2 0 None \n",
"3 20 None \n",
"3 0 None \n",
"4 0 None \n",
"... ... ... \n",
"18503 0 None \n",
"18504 20 None \n",
"18505 0 None \n",
"18506 0 None \n",
"18507 20 None \n",
"10569 0 None \n",
"10570 0 None \n",
"10571 0 None \n",
"10572 9 None \n",
"10573 0 None \n",
"\n",
"[18508 rows x 16 columns]"
"[10574 rows x 16 columns]"
]
},
"execution_count": 8,
@ -534,7 +536,7 @@
{
"data": {
"text/plain": [
"(18508, 16)"
"(10574, 16)"
]
},
"execution_count": 9,
@ -600,7 +602,7 @@
{
"data": {
"text/plain": [
"(17217, 18)"
"(9494, 18)"
]
},
"execution_count": 12,
@ -650,12 +652,12 @@
"3 None\n",
"4 None\n",
" ... \n",
"18503 None\n",
"18504 None\n",
"18505 None\n",
"18506 None\n",
"18507 None\n",
"Name: status, Length: 17217, dtype: object"
"10569 None\n",
"10570 None\n",
"10571 None\n",
"10572 None\n",
"10573 None\n",
"Name: status, Length: 9494, dtype: object"
]
},
"execution_count": 13,
@ -676,7 +678,7 @@
{
"data": {
"text/plain": [
"(10396, 17)"
"(6578, 17)"
]
},
"execution_count": 14,
@ -742,26 +744,6 @@
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>101369066</td>\n",
" <td>-1.0</td>\n",
" <td>875000.0</td>\n",
" <td>NaN</td>\n",
" <td>https://www.rightmove.co.uk/properties/101369066</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>-1.00</td>\n",
" <td>False</td>\n",
" <td>Share of Freehold</td>\n",
" <td>12</td>\n",
" <td>0</td>\n",
" <td>None</td>\n",
" <td>38</td>\n",
" <td>142</td>\n",
" <td>{'WALK': 797, 'TRANSIT': 1227}</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>105484772</td>\n",
" <td>45.7</td>\n",
" <td>325000.0</td>\n",
@ -772,7 +754,7 @@
" <td>641.53</td>\n",
" <td>False</td>\n",
" <td>Leasehold</td>\n",
" <td>36</td>\n",
" <td>116</td>\n",
" <td>0</td>\n",
" <td>None</td>\n",
" <td>33</td>\n",
@ -780,30 +762,50 @@
" <td>{'WALK': 609, 'TRANSIT': 1109}</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>108102476</td>\n",
" <td>53.7</td>\n",
" <td>515000.0</td>\n",
" <td>9590.316574</td>\n",
" <td>https://www.rightmove.co.uk/properties/108102476</td>\n",
" <td>1</td>\n",
" <td>104.0</td>\n",
" <td>-1.00</td>\n",
" <td>False</td>\n",
" <td>Leasehold</td>\n",
" <td>97</td>\n",
" <td>0</td>\n",
" <td>None</td>\n",
" <td>21</td>\n",
" <td>593</td>\n",
" <td>{'WALK': 819, 'TRANSIT': 445}</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" identifier sqm_ocr price price_per_sqm \\\n",
"0 101369066 -1.0 875000.0 NaN \n",
"1 105484772 45.7 325000.0 7111.597374 \n",
"0 105484772 45.7 325000.0 7111.597374 \n",
"2 108102476 53.7 515000.0 9590.316574 \n",
"\n",
" url bedrooms lease_left \\\n",
"0 https://www.rightmove.co.uk/properties/101369066 3 0.0 \n",
"1 https://www.rightmove.co.uk/properties/105484772 1 104.0 \n",
"0 https://www.rightmove.co.uk/properties/105484772 1 104.0 \n",
"2 https://www.rightmove.co.uk/properties/108102476 1 104.0 \n",
"\n",
" service_charge development tenure_type updated_days last_seen \\\n",
"0 -1.00 False Share of Freehold 12 0 \n",
"1 641.53 False Leasehold 36 0 \n",
" service_charge development tenure_type updated_days last_seen decision \\\n",
"0 641.53 False Leasehold 116 0 None \n",
"2 -1.00 False Leasehold 97 0 None \n",
"\n",
" decision duration initial_walk_duration duration_per_transit \\\n",
"0 None 38 142 {'WALK': 797, 'TRANSIT': 1227} \n",
"1 None 33 372 {'WALK': 609, 'TRANSIT': 1109} \n",
" duration initial_walk_duration duration_per_transit \\\n",
"0 33 372 {'WALK': 609, 'TRANSIT': 1109} \n",
"2 21 593 {'WALK': 819, 'TRANSIT': 445} \n",
"\n",
" number_of_transit_stops \n",
"0 2 \n",
"1 2 "
"2 1 "
]
},
"execution_count": 15,

12
crawler/logger.py Normal file
View file

@ -0,0 +1,12 @@
import logging
def createLogger(name):
    """Return the logger called *name*, configuring root logging on first use.

    The first call (while the root logger has no handlers) installs a
    file handler writing to 'app.log' in the current working directory and
    a stream handler, both at INFO level with a timestamped format. Later
    calls only look up and return the named logger.

    Args:
        name: Logger name, passed straight to ``logging.getLogger``.

    Returns:
        The ``logging.Logger`` registered under *name*.
    """
    # basicConfig() does nothing once the root logger has handlers, but the
    # handler objects in its argument list would still be constructed first,
    # and FileHandler opens 'app.log' on construction. Checking here avoids
    # leaking one open file handle per call.
    if not logging.getLogger().handlers:
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('app.log'),
                logging.StreamHandler(),
            ],
        )
    return logging.getLogger(name)