wrongmove/crawler/exploration.ipynb
2024-04-05 11:38:55 +01:00

1174 lines
40 KiB
Text
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "38e8690a-f6f7-4e14-a657-f20605477afd",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/kadir/code/realestate/crawler/venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"from data_access import Listing\n",
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"id": "cfe2ab03-3204-4fd8-b76a-a734f6b87d75",
"metadata": {},
"source": [
"### Fetch previous decisions"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "db55b615-698c-4f5d-881a-ea1d3b6d6205",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(93, 2)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>identifier</th>\n",
" <th>decision</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2047</th>\n",
" <td>145699277</td>\n",
" <td>n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1442</th>\n",
" <td>144642851</td>\n",
" <td>n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1839</th>\n",
" <td>145394765</td>\n",
" <td>n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1853</th>\n",
" <td>145418669</td>\n",
" <td>removed</td>\n",
" </tr>\n",
" <tr>\n",
" <th>930</th>\n",
" <td>143205230</td>\n",
" <td>n</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" identifier decision\n",
"2047 145699277 n\n",
"1442 144642851 n\n",
"1839 145394765 n\n",
"1853 145418669 removed\n",
"930 143205230 n"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"decisions = pd.read_clipboard()\n",
"decisions = decisions.loc[decisions.decision.notna(), ['identifier', 'decision']]\n",
"print(decisions.shape)\n",
"decisions.head()\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "86224a20-53e1-403c-8d9f-71b9a9df750c",
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
"output:\n",
"{145699277: 'n',\n",
" 144642851: 'n',\n",
" 145394765: 'n',\n",
" 145418669: 'removed',\n",
" 143205230: 'n',\n",
" 140628560: 'eigentlich geil',\n",
" ...\n",
"}\n",
"\"\"\"\n",
"decisions = decisions.set_index('identifier').decision.to_dict()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ec257220-f170-41b8-9f9d-b8ef61512acf",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 4,
"id": "6dbd25bd-802d-4953-83c3-f01640174353",
"metadata": {},
"outputs": [],
"source": [
"# Use if we want to skip at the bottom\n",
"# decisions = {}"
]
},
{
"cell_type": "markdown",
"id": "7c1ee5eb-1000-4ced-983c-df47fb6ceae8",
"metadata": {},
"source": [
"### Get all data prepped for sheets"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f20bddee-1e7c-4c46-a17a-c7bb6c13f30c",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 5,
"id": "b1101088-9613-465f-81fd-79801e0202b8",
"metadata": {},
"outputs": [],
"source": [
"ls = Listing.get_all_listings()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "63e61601-7e3f-4d58-89f6-1794e4868cc3",
"metadata": {},
"outputs": [],
"source": [
"ds = [l.dict_nicely() for l in ls]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "1c222721-f426-42c0-9ac5-badc1f7a2034",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>identifier</th>\n",
" <th>sqm_ocr</th>\n",
" <th>price</th>\n",
" <th>price_per_sqm</th>\n",
" <th>url</th>\n",
" <th>bedrooms</th>\n",
" <th>travel_time_fastest</th>\n",
" <th>travel_time_second</th>\n",
" <th>lease_left</th>\n",
" <th>development</th>\n",
" <th>decision</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>100506851</td>\n",
" <td>58.4</td>\n",
" <td>525000.0</td>\n",
" <td>8989.726027</td>\n",
" <td>https://www.rightmove.co.uk/properties/100506851</td>\n",
" <td>2</td>\n",
" <td>{'duration': 1948, 'distance': 10927, 'duratio...</td>\n",
" <td>{'duration': 1948, 'distance': 10927, 'duratio...</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>100938761</td>\n",
" <td>NaN</td>\n",
" <td>390000.0</td>\n",
" <td>NaN</td>\n",
" <td>https://www.rightmove.co.uk/properties/100938761</td>\n",
" <td>1</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>996.0</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>101817179</td>\n",
" <td>53.2</td>\n",
" <td>495000.0</td>\n",
" <td>9304.511278</td>\n",
" <td>https://www.rightmove.co.uk/properties/101817179</td>\n",
" <td>1</td>\n",
" <td>{'duration': 2702, 'distance': 8637, 'duration...</td>\n",
" <td>{'duration': 3333, 'distance': 10013, 'duratio...</td>\n",
" <td>91.0</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>101939660</td>\n",
" <td>56.5</td>\n",
" <td>238000.0</td>\n",
" <td>4212.389381</td>\n",
" <td>https://www.rightmove.co.uk/properties/101939660</td>\n",
" <td>2</td>\n",
" <td>{'duration': 2262, 'distance': 13512, 'duratio...</td>\n",
" <td>{'duration': 2322, 'distance': 13491, 'duratio...</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>102103157</td>\n",
" <td>NaN</td>\n",
" <td>425000.0</td>\n",
" <td>NaN</td>\n",
" <td>https://www.rightmove.co.uk/properties/102103157</td>\n",
" <td>1</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>34537</th>\n",
" <td>97023443</td>\n",
" <td>8.3</td>\n",
" <td>699999.0</td>\n",
" <td>84337.228916</td>\n",
" <td>https://www.rightmove.co.uk/properties/97023443</td>\n",
" <td>1</td>\n",
" <td>{'duration': 1704, 'distance': 8729, 'duration...</td>\n",
" <td>{'duration': 1713, 'distance': 6755, 'duration...</td>\n",
" <td>993.0</td>\n",
" <td>True</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>34538</th>\n",
" <td>97124237</td>\n",
" <td>53.4</td>\n",
" <td>300000.0</td>\n",
" <td>5617.977528</td>\n",
" <td>https://www.rightmove.co.uk/properties/97124237</td>\n",
" <td>1</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>34539</th>\n",
" <td>97335680</td>\n",
" <td>48.0</td>\n",
" <td>315000.0</td>\n",
" <td>6562.500000</td>\n",
" <td>https://www.rightmove.co.uk/properties/97335680</td>\n",
" <td>2</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>34540</th>\n",
" <td>97522346</td>\n",
" <td>NaN</td>\n",
" <td>400000.0</td>\n",
" <td>NaN</td>\n",
" <td>https://www.rightmove.co.uk/properties/97522346</td>\n",
" <td>2</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>34541</th>\n",
" <td>98352914</td>\n",
" <td>NaN</td>\n",
" <td>399950.0</td>\n",
" <td>NaN</td>\n",
" <td>https://www.rightmove.co.uk/properties/98352914</td>\n",
" <td>2</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>125.0</td>\n",
" <td>True</td>\n",
" <td>None</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>34542 rows × 11 columns</p>\n",
"</div>"
],
"text/plain": [
" identifier sqm_ocr price price_per_sqm \\\n",
"0 100506851 58.4 525000.0 8989.726027 \n",
"1 100938761 NaN 390000.0 NaN \n",
"2 101817179 53.2 495000.0 9304.511278 \n",
"3 101939660 56.5 238000.0 4212.389381 \n",
"4 102103157 NaN 425000.0 NaN \n",
"... ... ... ... ... \n",
"34537 97023443 8.3 699999.0 84337.228916 \n",
"34538 97124237 53.4 300000.0 5617.977528 \n",
"34539 97335680 48.0 315000.0 6562.500000 \n",
"34540 97522346 NaN 400000.0 NaN \n",
"34541 98352914 NaN 399950.0 NaN \n",
"\n",
" url bedrooms \\\n",
"0 https://www.rightmove.co.uk/properties/100506851 2 \n",
"1 https://www.rightmove.co.uk/properties/100938761 1 \n",
"2 https://www.rightmove.co.uk/properties/101817179 1 \n",
"3 https://www.rightmove.co.uk/properties/101939660 2 \n",
"4 https://www.rightmove.co.uk/properties/102103157 1 \n",
"... ... ... \n",
"34537 https://www.rightmove.co.uk/properties/97023443 1 \n",
"34538 https://www.rightmove.co.uk/properties/97124237 1 \n",
"34539 https://www.rightmove.co.uk/properties/97335680 2 \n",
"34540 https://www.rightmove.co.uk/properties/97522346 2 \n",
"34541 https://www.rightmove.co.uk/properties/98352914 2 \n",
"\n",
" travel_time_fastest \\\n",
"0 {'duration': 1948, 'distance': 10927, 'duratio... \n",
"1 None \n",
"2 {'duration': 2702, 'distance': 8637, 'duration... \n",
"3 {'duration': 2262, 'distance': 13512, 'duratio... \n",
"4 None \n",
"... ... \n",
"34537 {'duration': 1704, 'distance': 8729, 'duration... \n",
"34538 None \n",
"34539 None \n",
"34540 None \n",
"34541 None \n",
"\n",
" travel_time_second lease_left \\\n",
"0 {'duration': 1948, 'distance': 10927, 'duratio... NaN \n",
"1 None 996.0 \n",
"2 {'duration': 3333, 'distance': 10013, 'duratio... 91.0 \n",
"3 {'duration': 2322, 'distance': 13491, 'duratio... 0.0 \n",
"4 None NaN \n",
"... ... ... \n",
"34537 {'duration': 1713, 'distance': 6755, 'duration... 993.0 \n",
"34538 None NaN \n",
"34539 None NaN \n",
"34540 None NaN \n",
"34541 None 125.0 \n",
"\n",
" development decision \n",
"0 False None \n",
"1 False None \n",
"2 False None \n",
"3 False None \n",
"4 False None \n",
"... ... ... \n",
"34537 True None \n",
"34538 False None \n",
"34539 False None \n",
"34540 False None \n",
"34541 True None \n",
"\n",
"[34542 rows x 11 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(ds)\n",
"df.loc[:, 'decision'] = df.identifier.apply(lambda x: decisions.get(x))\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "d80d9911-9a6d-4608-a6da-11dc864ee32b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(34542, 11)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "7b37ad6b-9b0a-444e-b8c3-6fe4e43e42cb",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>identifier</th>\n",
" <th>sqm_ocr</th>\n",
" <th>price</th>\n",
" <th>price_per_sqm</th>\n",
" <th>url</th>\n",
" <th>bedrooms</th>\n",
" <th>lease_left</th>\n",
" <th>development</th>\n",
" <th>decision</th>\n",
" <th>a_duration</th>\n",
" <th>a_initial_walk_duration</th>\n",
" <th>a_duration_per_transit</th>\n",
" <th>a_number_of_transit_stops</th>\n",
" <th>b_duration</th>\n",
" <th>b_initial_walk_duration</th>\n",
" <th>b_duration_per_transit</th>\n",
" <th>b_number_of_transit_stops</th>\n",
" <th>min_duration</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>100506851</td>\n",
" <td>58.4</td>\n",
" <td>525000.0</td>\n",
" <td>8989.726027</td>\n",
" <td>https://www.rightmove.co.uk/properties/100506851</td>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" <td>1948.0</td>\n",
" <td>161.0</td>\n",
" <td>{'WALK': 481, 'TRANSIT': 1200}</td>\n",
" <td>2.0</td>\n",
" <td>1948.0</td>\n",
" <td>161.0</td>\n",
" <td>{'WALK': 481, 'TRANSIT': 1200}</td>\n",
" <td>2.0</td>\n",
" <td>32.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>100938761</td>\n",
" <td>NaN</td>\n",
" <td>390000.0</td>\n",
" <td>NaN</td>\n",
" <td>https://www.rightmove.co.uk/properties/100938761</td>\n",
" <td>1</td>\n",
" <td>996.0</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" identifier sqm_ocr price price_per_sqm \\\n",
"0 100506851 58.4 525000.0 8989.726027 \n",
"1 100938761 NaN 390000.0 NaN \n",
"\n",
" url bedrooms lease_left \\\n",
"0 https://www.rightmove.co.uk/properties/100506851 2 NaN \n",
"1 https://www.rightmove.co.uk/properties/100938761 1 996.0 \n",
"\n",
" development decision a_duration a_initial_walk_duration \\\n",
"0 False None 1948.0 161.0 \n",
"1 False None NaN NaN \n",
"\n",
" a_duration_per_transit a_number_of_transit_stops b_duration \\\n",
"0 {'WALK': 481, 'TRANSIT': 1200} 2.0 1948.0 \n",
"1 NaN NaN NaN \n",
"\n",
" b_initial_walk_duration b_duration_per_transit \\\n",
"0 161.0 {'WALK': 481, 'TRANSIT': 1200} \n",
"1 NaN NaN \n",
"\n",
" b_number_of_transit_stops min_duration \n",
"0 2.0 32.0 \n",
"1 NaN NaN "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# dropcolumns = ['distance_per_transit', 'duration_static', 'distance']\n",
"# s1 = df['travel_time_fastest'].apply(pd.Series).drop(dropcolumns, axis=1)\n",
"# s1.columns = ['a_' + c for c in s1.columns]\n",
"\n",
"# s2 = df['travel_time_second'].apply(pd.Series).drop(dropcolumns, axis=1)\n",
"# s2.columns = ['b_' + c for c in s2.columns]\n",
"\n",
"# df2 = pd.concat([df.drop(['travel_time_fastest', 'travel_time_second'], axis=1), s1, s2], axis=1)\n",
"# df2.loc[:, 'min_duration'] = (df2.loc[:, ['a_duration', 'b_duration']].min(axis=1) / 60).round()\n",
"# df2.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "8c75aaa6-6113-482f-809b-11e405510184",
"metadata": {},
"outputs": [],
"source": [
"# df2.to_clipboard()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "79f99692-91e8-4915-9b57-7b3a1efd7d3a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>identifier</th>\n",
" <th>sqm_ocr</th>\n",
" <th>price</th>\n",
" <th>price_per_sqm</th>\n",
" <th>url</th>\n",
" <th>bedrooms</th>\n",
" <th>lease_left</th>\n",
" <th>development</th>\n",
" <th>decision</th>\n",
" <th>duration</th>\n",
" <th>initial_walk_duration</th>\n",
" <th>duration_per_transit</th>\n",
" <th>number_of_transit_stops</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>100506851</td>\n",
" <td>58.4</td>\n",
" <td>525000.0</td>\n",
" <td>8989.726027</td>\n",
" <td>https://www.rightmove.co.uk/properties/100506851</td>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" <td>32.0</td>\n",
" <td>161.0</td>\n",
" <td>{'WALK': 481, 'TRANSIT': 1200}</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>100938761</td>\n",
" <td>NaN</td>\n",
" <td>390000.0</td>\n",
" <td>NaN</td>\n",
" <td>https://www.rightmove.co.uk/properties/100938761</td>\n",
" <td>1</td>\n",
" <td>996.0</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" identifier sqm_ocr price price_per_sqm \\\n",
"0 100506851 58.4 525000.0 8989.726027 \n",
"1 100938761 NaN 390000.0 NaN \n",
"\n",
" url bedrooms lease_left \\\n",
"0 https://www.rightmove.co.uk/properties/100506851 2 NaN \n",
"1 https://www.rightmove.co.uk/properties/100938761 1 996.0 \n",
"\n",
" development decision duration initial_walk_duration \\\n",
"0 False None 32.0 161.0 \n",
"1 False None NaN NaN \n",
"\n",
" duration_per_transit number_of_transit_stops \n",
"0 {'WALK': 481, 'TRANSIT': 1200} 2.0 \n",
"1 NaN NaN "
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dropcolumns = ['distance_per_transit', 'duration_static', 'distance']\n",
"s1 = df['travel_time_fastest'].apply(pd.Series).drop(dropcolumns, axis=1)\n",
"\n",
"df3 = pd.concat([df.drop(['travel_time_fastest', 'travel_time_second'], axis=1), s1], axis=1)\n",
"df3.loc[:, 'duration'] = (df3.loc[:, ['duration']].min(axis=1) / 60).round()\n",
"df3.to_clipboard()\n",
"df3.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "abcbde40-7432-4449-957a-79ce2ca126eb",
"metadata": {},
"outputs": [],
"source": [
"df3[df3.duration < 45].to_clipboard()\n"
]
},
{
"cell_type": "markdown",
"id": "98f8e950-2a3b-4856-aa62-3bc758e2fd42",
"metadata": {},
"source": [
"# Find out the proper radius we want to use"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "04bb61d5-cba7-4739-9568-b00342c1b636",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Filtered listings from 32271 to 15804\n"
]
}
],
"source": [
"from data_access import Listing\n",
"from geopy.distance import geodesic\n",
"\n",
"listings = Listing.get_all_listings()\n",
"BROCK_STREET_LAT_LONG = 51.52570434674584, -0.13956495005056113\n",
"\n",
"# reduce listings to everything within 7 miles\n",
"filtered_listings = []\n",
"for listing in listings:\n",
" miles = geodesic(BROCK_STREET_LAT_LONG, (listing.latitude, listing.longitude)).miles\n",
" if miles <= 7:\n",
" filtered_listings.append(listing)\n",
"\n",
"print(f\"Filtered listings from {len(listings)} to {len(filtered_listings)}\")"
]
},
{
"cell_type": "markdown",
"id": "a73fba2d-afeb-4194-8421-eff8e84a14e9",
"metadata": {},
"source": [
"# Typeahead / fetch all boroughs"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "61844fe2-408d-4b89-995f-c31110a850f6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'key': 'CAMD',\n",
" 'term': 'CAMD',\n",
" 'typeAheadLocations': [{'displayName': 'Camden, North West London',\n",
" 'locationIdentifier': 'REGION^85261',\n",
" 'normalisedSearchTerm': 'CAMDEN NORTH WEST LONDON'},\n",
" {'displayName': 'Camden (London Borough)',\n",
" 'locationIdentifier': 'REGION^93941',\n",
" 'normalisedSearchTerm': 'CAMDEN LONDON BOROUGH'},\n",
" {'displayName': 'Camden Town, North West London',\n",
" 'locationIdentifier': 'REGION^85262',\n",
" 'normalisedSearchTerm': 'CAMDEN TOWN NORTH WEST LONDON'},\n",
" {'displayName': 'Camden Town Station',\n",
" 'locationIdentifier': 'STATION^1712',\n",
" 'normalisedSearchTerm': 'CAMDEN TOWN STATION'},\n",
" {'displayName': 'Camden Road Station',\n",
" 'locationIdentifier': 'STATION^1709',\n",
" 'normalisedSearchTerm': 'CAMDEN ROAD STATION'},\n",
" {'displayName': 'Camden Town, Gosport, Hampshire',\n",
" 'locationIdentifier': 'REGION^76577',\n",
" 'normalisedSearchTerm': 'CAMDEN TOWN GOSPORT HAMPSHIRE'},\n",
" {'displayName': 'Camderry, Omagh, County Tyrone, Northern Ireland',\n",
" 'locationIdentifier': 'REGION^73327',\n",
" 'normalisedSearchTerm': 'CAMDERRY OMAGH COUNTY TYRONE NORTHERN IRELAND'}],\n",
" 'isComplete': True}"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import requests\n",
"\n",
"cookies = {\n",
" 'permuserid': '240330LSEXEOANG04Q2VA3OZCIQ8TTSQ',\n",
" 'TS019c0ed0': '012f990cd3494097746bc0b10b8d61bc6237319024e80701e0e8d735bd7d62a792529823c3164c771f223b0cade5ec9ae4b7fc001c',\n",
" 'beta_optin': 'N:36:-1',\n",
" 'RM_Register': 'C',\n",
" 'JSESSIONID': '0BE8E261D81387C9BC530DB1A5F28955',\n",
" 'svr': '3111',\n",
" 'permuserid': '240330LSEXEOANG04Q2VA3OZCIQ8TTSQ',\n",
" 'TS01ec61d1': '012f990cd3161dd68e4ed69b9d64f7d4de2356c651edef96201cc7facd11b28ef9338596a2022bc2ed7a56f09c5dee5aa1711de2d1',\n",
" 'rmsessionid': '7bc54ce6-da97-42cf-8719-4e3e9c53e276',\n",
" 'TS01821201': '012f990cd35255a563a541cfe06e4a774c129628165af71838ccdb7a17919672962514804459a0d9d9a90fe7b8feeec66145e30b98',\n",
" 'TS01826437': '012f990cd3161dd68e4ed69b9d64f7d4de2356c651edef96201cc7facd11b28ef9338596a2022bc2ed7a56f09c5dee5aa1711de2d1',\n",
" 'TPCmaxPrice': '800000',\n",
" 'TS01a07bd2': '012f990cd3161dd68e4ed69b9d64f7d4de2356c651edef96201cc7facd11b28ef9338596a2022bc2ed7a56f09c5dee5aa1711de2d1',\n",
" 'TPCminPrice': '700000',\n",
"}\n",
"\n",
"headers = {\n",
" 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:123.0) Gecko/20100101 Firefox/123.0',\n",
" 'Accept': 'application/json, text/javascript',\n",
" 'Accept-Language': 'en-GB,en;q=0.5',\n",
" # 'Accept-Encoding': 'gzip, deflate, br',\n",
" 'Referer': 'https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=REGION%5E87515&maxBedrooms=3&minBedrooms=1&maxPrice=800000&minPrice=750001&propertyTypes=&includeSSTC=false&mustHave=&dontShow=retirement%2CsharedOwnership&furnishTypes=&keywords=',\n",
" 'X-Correlation-Text-Val': 'source=search',\n",
" 'Content-Type': 'application/x-www-form-urlencoded',\n",
" 'traceparent': '00-d3020142d839bf1ef2b172fa596acea8-605693e1c4c5cbb5-00',\n",
" 'DNT': '1',\n",
" 'Sec-GPC': '1',\n",
" 'Connection': 'keep-alive',\n",
" # 'Cookie': 'permuserid=240330LSEXEOANG04Q2VA3OZCIQ8TTSQ; TS019c0ed0=012f990cd3494097746bc0b10b8d61bc6237319024e80701e0e8d735bd7d62a792529823c3164c771f223b0cade5ec9ae4b7fc001c; beta_optin=N:36:-1; RM_Register=C; JSESSIONID=0BE8E261D81387C9BC530DB1A5F28955; svr=3111; permuserid=240330LSEXEOANG04Q2VA3OZCIQ8TTSQ; TS01ec61d1=012f990cd3161dd68e4ed69b9d64f7d4de2356c651edef96201cc7facd11b28ef9338596a2022bc2ed7a56f09c5dee5aa1711de2d1; rmsessionid=7bc54ce6-da97-42cf-8719-4e3e9c53e276; TS01821201=012f990cd35255a563a541cfe06e4a774c129628165af71838ccdb7a17919672962514804459a0d9d9a90fe7b8feeec66145e30b98; TS01826437=012f990cd3161dd68e4ed69b9d64f7d4de2356c651edef96201cc7facd11b28ef9338596a2022bc2ed7a56f09c5dee5aa1711de2d1; TPCmaxPrice=800000; TS01a07bd2=012f990cd3161dd68e4ed69b9d64f7d4de2356c651edef96201cc7facd11b28ef9338596a2022bc2ed7a56f09c5dee5aa1711de2d1; TPCminPrice=700000',\n",
" 'Sec-Fetch-Dest': 'empty',\n",
" 'Sec-Fetch-Mode': 'cors',\n",
" 'Sec-Fetch-Site': 'same-origin',\n",
"}\n",
"\n",
"response = requests.get('https://www.rightmove.co.uk/typeAhead/uknostreet/CA/MD/EN/', cookies=cookies, headers=headers)\n",
"response.json()"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "eba907d0-680d-4374-a56e-95bd4abf0053",
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"from textwrap import wrap\n",
"\n",
"def query_loc_identifier(name: str)->str:\n",
" \"\"\"\n",
" \n",
" \"\"\"\n",
" name = name.upper()\n",
" name = '/'.join(wrap(name,2))\n",
" \n",
" headers = {\n",
" 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:123.0) Gecko/20100101 Firefox/123.0',\n",
" }\n",
" \n",
" response = requests.get(f'https://www.rightmove.co.uk/typeAhead/uknostreet/{name}', headers=headers)\n",
" return response.json()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d185e013-9beb-4e57-9d8b-a830e14339c3",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "cabe5d3f-ad0f-49c2-9fbf-686539a05bd1",
"metadata": {},
"outputs": [],
"source": [
"\n"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "ba500fe6-fb18-466e-a697-403d28181674",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 46,
"id": "56c49b50-ff31-4785-9088-45ff5a39545e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'Greenwich': 'REGION^61226', 'Hillingdon': 'REGION^93959', 'Ealing': 'REGION^93947', 'Richmond upon Thames': 'REGION^61415', 'Sutton': 'REGION^93974', 'Wandsworth': 'REGION^93977', 'Camden': 'REGION^93941', 'Enfield': 'REGION^93950', 'Croydon': 'REGION^93944', 'Hackney': 'REGION^93953', 'Kingston upon Thames': 'REGION^93968', 'Kensington and Chelsea': 'REGION^61229', 'Bromley': 'REGION^93938', 'Brent': 'REGION^93935', 'Waltham Forest': 'REGION^61232', 'Southwark': 'REGION^61518', 'Harrow': 'REGION^93956', 'Lewisham': 'REGION^61413', 'Barnet': 'REGION^93929', 'Islington': 'REGION^93965', 'Haringey': 'REGION^61227', 'Lambeth': 'REGION^93971', 'Westminster': '', 'Tower Hamlets': 'REGION^61417', 'Havering': 'REGION^61228', 'Barking and Dagenham': 'REGION^61400', 'Hammersmith and Fulham': 'REGION^61407', 'Bexley': 'REGION^93932', 'Redbridge': 'REGION^61537', 'Newham': 'REGION^61231', 'Merton': 'REGION^61414', 'Hounslow': 'REGION^93962'}\n"
]
}
],
"source": [
"{\n",
" \"City of London\": \"REGION^61224\",\n",
" \"Greenwich\": \"REGION^61226\",\n",
" \"Hillingdon\": \"REGION^93959\",\n",
" \"Ealing\": \"REGION^93947\",\n",
" \"Richmond upon Thames\": \"REGION^61415\",\n",
" \"Sutton\": \"REGION^93974\",\n",
" \"Wandsworth\": \"REGION^93977\",\n",
" \"Camden\": \"REGION^93941\",\n",
" \"Enfield\": \"REGION^93950\",\n",
" \"Croydon\": \"REGION^93944\",\n",
" \"Hackney\": \"REGION^93953\",\n",
" \"Kingston upon Thames\": \"REGION^93968\",\n",
" \"Kensington and Chelsea\": \"REGION^61229\",\n",
" \"Bromley\": \"REGION^93938\",\n",
" \"Brent\": \"REGION^93935\",\n",
" \"Waltham Forest\": \"REGION^61232\",\n",
" \"Southwark\": \"REGION^61518\",\n",
" \"Harrow\": \"REGION^93956\",\n",
" \"Lewisham\": \"REGION^61413\",\n",
" \"Barnet\": \"REGION^93929\",\n",
" \"Islington\": \"REGION^93965\",\n",
" \"Haringey\": \"REGION^61227\",\n",
" \"Lambeth\": \"REGION^93971\",\n",
" \"Westminster\": \"REGION^93980\",\n",
" \"Tower Hamlets\": \"REGION^61417\",\n",
" \"Havering\": \"REGION^61228\",\n",
" \"Barking and Dagenham\": \"REGION^61400\",\n",
" \"Hammersmith and Fulham\": \"REGION^61407\",\n",
" \"Bexley\": \"REGION^93932\",\n",
" \"Redbridge\": \"REGION^61537\",\n",
" \"Newham\": \"REGION^61231\",\n",
" \"Merton\": \"REGION^61414\",\n",
" \"Hounslow\": \"REGION^93962\",\n",
"}\n",
"\n",
"bor_to_locid = {}\n",
"\n",
"for borough in boroughs:\n",
" bor_to_locid[borough] = ''\n",
" d = query_loc_identifier(borough)\n",
" locs = d['typeAheadLocations']\n",
" filtered = [l for l in locs if 'Borough' in l['displayName']]\n",
" if len(filtered)>1:\n",
" print(f\"{borough} has more entries: {len(filtered)}!\")\n",
" if filtered:\n",
" bor_to_locid[borough] = filtered[0]['locationIdentifier']\n",
"\n",
"Westminster (City of)\n",
"print(bor_to_locid)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "157e20a2-d137-45b8-802b-948fa8e04ba3",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "386578dc-1ad5-4b8a-8905-29b0c47a6174",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "10c17fdf-f424-40cb-9d8c-9218f8d4ab53",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "88d99eb7-8c92-4817-86ce-ba0738331dba",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "6c8b4488-ae2b-41ab-9c95-e3c85f9fb77e",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "c3072907-7ad8-4618-92ab-818e392218d9",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "b20f6f16-3236-4772-b1a3-2d4a3b1925a6",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "1a7230dc-1a0f-43e2-bd15-0c85ea445733",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "aeaf84bf-8514-48c6-88ce-2c6828bdcdf2",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "c888d4e6-d192-45df-b9b6-5e2d39bca344",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 1,
"id": "cd71db7f-ba11-4d5d-a183-768ed4db23ba",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/kadir/code/realestate/crawler/venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"from data_access import Listing"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "d0ced84b-ee91-4642-b2ff-dd32d9f1e437",
"metadata": {},
"outputs": [],
"source": [
"l = Listing(133604363)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "7157f5f7-65b3-4232-bcae-26b93e5d93e6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"6395.76"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"l.serviceCharge"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3f453f9c-bdaa-4713-8220-c504f1a436ae",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}