fix types and format

This commit is contained in:
Viktor Barzin 2025-05-18 12:27:26 +00:00
parent 91d3237516
commit b873eaf203
No known key found for this signature in database
GPG key ID: 4056458DBDBF8863
8 changed files with 117 additions and 172 deletions

View file

@ -16,12 +16,13 @@ class QueryParameters:
district_names: set[str] district_names: set[str]
radius: float = 0 radius: float = 0
page_size: int = 500 # items per page page_size: int = 500 # items per page
max_days_since_added: int | None = None max_days_since_added: int = 30
# available from; furnished/unfurnished; council tax
async def dump_listings( async def dump_listings(
parameters: QueryParameters, parameters: QueryParameters,
data_dir: pathlib.Path = pathlib.Path("data/rs/"), data_dir: pathlib.Path = pathlib.Path("data/rs/"),
) -> list[Listing]: ) -> list[Listing]:
districts = { districts = {
district: locid district: locid
@ -31,29 +32,28 @@ async def dump_listings(
print("Valid districts to scrape:", districts.keys()) print("Valid districts to scrape:", districts.keys())
listings = [] listings = []
json_responses = await asyncio.gather( json_responses = await asyncio.gather(*[
*[ listing_query(
listing_query( page=i,
page=i, channel=parameters.listing_type,
channel=parameters.listing_type, min_bedrooms=parameters.min_bedrooms,
min_bedrooms=parameters.min_bedrooms, max_bedrooms=parameters.max_bedrooms,
max_bedrooms=parameters.max_bedrooms, radius=parameters.radius,
radius=parameters.radius, min_price=parameters.min_price,
min_price=parameters.min_price, max_price=parameters.max_price,
max_price=parameters.max_price, location_id=locid,
location_id=locid, page_size=parameters.page_size,
page_size=parameters.page_size, max_days_since_added=parameters.max_days_since_added,
max_days_since_added=parameters.max_days_since_added, ) for locid in districts.values() for i in [1, 2]
) for locid in districts.values() for i in [1, 2] ])
]
)
listings = [] listings = []
for response_json in json_responses: for response_json in json_responses:
if response_json["totalAvailableResults"] == 0: if response_json["totalAvailableResults"] == 0:
print("No results found") print("No results found")
continue continue
if response_json["totalAvailableResults"] > 0: if response_json["totalAvailableResults"] > 0:
print("totalAvailableResults: ", response_json["totalAvailableResults"]) print("totalAvailableResults: ",
response_json["totalAvailableResults"])
for property in response_json["properties"]: for property in response_json["properties"]:
identifier = property["identifier"] identifier = property["identifier"]
@ -62,11 +62,3 @@ async def dump_listings(
listings.append(listing) listings.append(listing)
return listings return listings
def main():
dump_listings()
if __name__ == "__main__":
main()

View file

@ -1,6 +1,5 @@
import asyncio import asyncio
import json import json
import pathlib
from rec.query import detail_query from rec.query import detail_query
from tqdm.asyncio import tqdm from tqdm.asyncio import tqdm
@ -13,8 +12,7 @@ semaphore = asyncio.Semaphore(10)
async def dump_detail(listing_paths: list[str]): async def dump_detail(listing_paths: list[str]):
listings = Listing.get_all_listings(listing_paths) listings = Listing.get_all_listings(listing_paths)
filtered_listings = await tqdm.gather( filtered_listings = await tqdm.gather(
*[_dump_detail_for_listing(listing) for listing in listings] *[_dump_detail_for_listing(listing) for listing in listings])
)
return filtered_listings return filtered_listings
@ -27,12 +25,3 @@ async def _dump_detail_for_listing(listing: Listing):
d = await detail_query(listing.identifier) d = await detail_query(listing.identifier)
with open(listing.path_detail_json(), "w") as f: with open(listing.path_detail_json(), "w") as f:
json.dump(d, f) json.dump(d, f)
def main():
listing_paths = sorted(list(pathlib.Path("data/rs").glob("*/listing.json")))
dump_detail(listing_paths)
if __name__ == "__main__":
main()

View file

@ -1,6 +1,5 @@
import asyncio import asyncio
import json import json
import pathlib
import aiohttp import aiohttp
from tqdm.asyncio import tqdm from tqdm.asyncio import tqdm
from data_access import Listing from data_access import Listing
@ -11,7 +10,8 @@ semaphore = asyncio.Semaphore(10)
async def dump_images(listing_paths: list[str]): async def dump_images(listing_paths: list[str]):
listings = Listing.get_all_listings(listing_paths) listings = Listing.get_all_listings(listing_paths)
await tqdm.gather(*[dump_images_for_listing(listing) for listing in listings]) await tqdm.gather(
*[dump_images_for_listing(listing) for listing in listings])
async def dump_images_for_listing(listing: Listing): async def dump_images_for_listing(listing: Listing):
@ -30,17 +30,9 @@ async def dump_images_for_listing(listing: Listing):
async with semaphore: async with semaphore:
async with session.get(url) as response: async with session.get(url) as response:
if response.status != 200: if response.status != 200:
raise Exception(f"Error for {url}: {response.status}") raise Exception(
f"Error for {url}: {response.status}")
with open(p, "wb") as f: with open(p, "wb") as f:
f.write(await response.read()) f.write(await response.read())
except Exception as e: except Exception as e:
tqdm.write(f"Error for {url}: {e}") tqdm.write(f"Error for {url}: {e}")
def main():
listing_paths = sorted(list(pathlib.Path("data/rs").glob("*/listing.json")))
dump_images(listing_paths)
if __name__ == "__main__":
main()

View file

@ -1,5 +1,4 @@
import asyncio import asyncio
import pathlib
from data_access import Listing from data_access import Listing
from tqdm.asyncio import tqdm from tqdm.asyncio import tqdm
import multiprocessing import multiprocessing
@ -7,25 +6,16 @@ import multiprocessing
async def detect_floorplan(listing_paths: list[str]): async def detect_floorplan(listing_paths: list[str]):
listings = Listing.get_all_listings(listing_paths) listings = Listing.get_all_listings(listing_paths)
cpu_count = multiprocessing.cpu_count() / 4 cpu_count = multiprocessing.cpu_count() // 4
semaphore = asyncio.Semaphore(cpu_count) semaphore = asyncio.Semaphore(cpu_count)
await tqdm.gather( await tqdm.gather(*[
*[_detect_floorplan_with_semaphore(listing, semaphore) for listing in listings] _detect_floorplan_with_semaphore(listing, semaphore)
) for listing in listings
])
async def _detect_floorplan_with_semaphore( async def _detect_floorplan_with_semaphore(listing: Listing,
listing: Listing, semaphore: asyncio.Semaphore semaphore: asyncio.Semaphore):
):
async with semaphore: async with semaphore:
return await listing.calculate_sqm_ocr(recalculate=False) return await listing.calculate_sqm_ocr(recalculate=False)
def main():
listing_paths = sorted(list(pathlib.Path("data/rs").glob("*/listing.json")))
detect_floorplan(listing_paths)
if __name__ == "__main__":
main()

View file

@ -1,4 +1,3 @@
import pathlib
from data_access import Listing from data_access import Listing
from tqdm import tqdm from tqdm import tqdm
from geopy.distance import geodesic from geopy.distance import geodesic
@ -15,30 +14,30 @@ def calculate_route(listing_paths: list[str]):
# reduce listings to everything within 7 miles # reduce listings to everything within 7 miles
filtered_listings = [] filtered_listings = []
for listing in listings: for listing in listings:
miles = geodesic( miles = geodesic(BROCK_STREET_LAT_LONG,
BROCK_STREET_LAT_LONG, (listing.latitude, listing.longitude) (listing.latitude, listing.longitude)).miles
).miles
if listing.isRemoved: if listing.isRemoved:
log.info(f"Removed-Skip: Skipping {listing.identifier} is already removed.") log.info(f"Removed-Skip: Skipping {listing.identifier} "
"is already removed.")
continue continue
if miles > 7: if miles > 7:
log.info( log.info(f"Miles-Skip: Skipping {listing.identifier} as it is "
f"Miles-Skip: Skipping {listing.identifier} as it is {miles} miles away" f"{miles} miles away")
)
continue continue
if listing.path_routing_json().exists(): if listing.path_routing_json().exists():
log.info( log.info(
f"Path-Skip: Skipping {listing.identifier} as path routing already exists" (f"Path-Skip: Skipping {listing.identifier} as path routing "
) "already exists"))
continue continue
if listing.sqm_ocr is None or listing.sqm_ocr < 30 or listing.sqm_ocr > 200: if (listing.sqm_ocr is None or listing.sqm_ocr < 30
log.info( or listing.sqm_ocr > 200):
f"Floorplan-Skip: Skipping {listing.identifier} as sqm_ocr is {listing.sqm_ocr}" log.info((f"Floorplan-Skip: Skipping {listing.identifier} as "
) f"sqm_ocr is {listing.sqm_ocr}"))
continue continue
filtered_listings.append(listing) filtered_listings.append(listing)
print(f"Filtered listings from {len(listings)} to {len(filtered_listings)}") print(
f"Filtered listings from {len(listings)} to {len(filtered_listings)}")
for listing in tqdm(filtered_listings): for listing in tqdm(filtered_listings):
lat, long = BROCK_STREET_LAT_LONG lat, long = BROCK_STREET_LAT_LONG
@ -47,12 +46,3 @@ def calculate_route(listing_paths: list[str]):
duration_minutes = traveltime["duration"] / 60.0 duration_minutes = traveltime["duration"] / 60.0
tqdm.write(f"{listing.identifier} {duration_minutes}") tqdm.write(f"{listing.identifier} {duration_minutes}")
def main():
listing_paths = sorted(list(pathlib.Path("data/rs").glob("*/listing.json")))
calculate_route(listing_paths)
if __name__ == "__main__":
main()

View file

@ -4,7 +4,9 @@ import pandas as pd
def export_to_csv( def export_to_csv(
listings: list[Listing], output_file: Path, columns: list[str] listings: list[Listing],
output_file: Path,
columns: list[str],
) -> None: ) -> None:
ds = [listing.dict_nicely() for listing in listings] ds = [listing.dict_nicely() for listing in listings]
df = pd.DataFrame(ds) df = pd.DataFrame(ds)

View file

@ -2,7 +2,7 @@ import asyncio
from dataclasses import dataclass from dataclasses import dataclass
import json import json
import pathlib import pathlib
from typing import List, Dict from typing import Any, List, Dict
from rec import floorplan, routing from rec import floorplan, routing
import re import re
import datetime import datetime
@ -11,7 +11,7 @@ import datetime
@dataclass() @dataclass()
class Listing: class Listing:
identifier: int identifier: int
_cached: Dict = None _cached: Dict | None = None
data_dir: pathlib.Path = pathlib.Path("data/rs/") data_dir: pathlib.Path = pathlib.Path("data/rs/")
ALL_COLUMNS = [ ALL_COLUMNS = [
"identifier", "identifier",
@ -46,10 +46,8 @@ class Listing:
while str(d['identifier']) in str(data_dir.resolve().absolute()): while str(d['identifier']) in str(data_dir.resolve().absolute()):
data_dir = data_dir.parent data_dir = data_dir.parent
listing = Listing(d["identifier"], data_dir=data_dir) listing = Listing(d["identifier"], data_dir=data_dir)
if ( if (listing.last_seen is not None
listing.last_seen is not None and listing.last_seen < seen_in_the_last_n_days):
and listing.last_seen < seen_in_the_last_n_days
):
identifiers.append(listing) identifiers.append(listing)
return identifiers return identifiers
@ -107,18 +105,15 @@ class Listing:
objs = [] objs = []
for floorplan_path in self.list_floorplans(): for floorplan_path in self.list_floorplans():
estimated_sqm, model_output, predictions = floorplan.calculate_model( estimated_sqm, model_output, predictions = floorplan.calculate_model(
floorplan_path floorplan_path)
) objs.append({
objs.append( "floorplan_path": str(floorplan_path),
{ "estimated_sqm": estimated_sqm,
"floorplan_path": str(floorplan_path), "model_output": model_output,
"estimated_sqm": estimated_sqm, "no_predictions": len(
"model_output": model_output, predictions
"no_predictions": len( ), # cant serialize the predictions itself since its a tensor
predictions })
), # cant serialize the predictions itself since its a tensor
}
)
with open(self.path_floorplan_model_json(), "w") as f: with open(self.path_floorplan_model_json(), "w") as f:
json.dump(objs, f) json.dump(objs, f)
@ -131,9 +126,8 @@ class Listing:
with open(self.path_floorplan_json()) as f: with open(self.path_floorplan_json()) as f:
objs = json.load(f) objs = json.load(f)
max_sqm = max( max_sqm = max([o["estimated_sqm"] for o in objs
[o["estimated_sqm"] for o in objs if o is None] if o is None]) # filter out Nones
) # filter out Nones
return max_sqm return max_sqm
async def calculate_sqm_ocr(self, recalculate=True): async def calculate_sqm_ocr(self, recalculate=True):
@ -143,15 +137,12 @@ class Listing:
objs = [] objs = []
for floorplan_path in self.list_floorplans(): for floorplan_path in self.list_floorplans():
estimated_sqm, model_output = await asyncio.to_thread( estimated_sqm, model_output = await asyncio.to_thread(
floorplan.calculate_ocr, floorplan_path floorplan.calculate_ocr, floorplan_path)
) objs.append({
objs.append( "floorplan_path": str(floorplan_path),
{ "estimated_sqm": estimated_sqm,
"floorplan_path": str(floorplan_path), "text": model_output,
"estimated_sqm": estimated_sqm, })
"text": model_output,
}
)
with open(self.path_floorplan_ocr_json(), "w") as f: with open(self.path_floorplan_ocr_json(), "w") as f:
json.dump(objs, f) json.dump(objs, f)
@ -164,19 +155,23 @@ class Listing:
with open(self.path_floorplan_ocr_json()) as f: with open(self.path_floorplan_ocr_json()) as f:
objs = json.load(f) objs = json.load(f)
sqms = [o["estimated_sqm"] for o in objs if o["estimated_sqm"] is not None] sqms = [
o["estimated_sqm"] for o in objs if o["estimated_sqm"] is not None
]
if len(sqms) == 0: if len(sqms) == 0:
return None return None
max_sqm = max(sqms) max_sqm = max(sqms)
return max_sqm return max_sqm
def calculate_route(self, dest_lat: float, dest_lon: float, recalculate=False): def calculate_route(self,
dest_lat: float,
dest_lon: float,
recalculate=False):
if self.path_routing_json().exists() and not recalculate: if self.path_routing_json().exists() and not recalculate:
return return
result = routing.transit_route( result = routing.transit_route(self.latitude, self.longitude, dest_lat,
self.latitude, self.longitude, dest_lat, dest_lon dest_lon)
)
with open(self.path_routing_json(), "w") as f: with open(self.path_routing_json(), "w") as f:
json.dump(result, f) json.dump(result, f)
@ -200,11 +195,11 @@ class Listing:
return json.load(f) return json.load(f)
@property @property
def detailobject(self): def detailobject(self) -> dict[str, Any]:
if self._cached is None: if self._cached is None:
with open(self.path_detail_json()) as f: with open(self.path_detail_json()) as f:
self._cached = json.load(f) self._cached = json.load(f)
return self._cached return self._cached # type: ignore
@property @property
def price(self) -> float: def price(self) -> float:
@ -217,7 +212,7 @@ class Listing:
@property @property
def price_per_sqm(self) -> float: def price_per_sqm(self) -> float:
if self.sqm_ocr is None or self.sqm_ocr == 0: if self.sqm_ocr is None or self.sqm_ocr == 0:
return None return -1
return self.price / self.sqm_ocr return self.price / self.sqm_ocr
@property @property
@ -233,8 +228,9 @@ class Listing:
return self.detailobject["property"]["longitude"] return self.detailobject["property"]["longitude"]
@property @property
def leaseLeft(self) -> int: def leaseLeft(self) -> float | None:
ds = self.detailobject["property"].get("tenureInfo", {}).get("content", []) ds = self.detailobject["property"].get("tenureInfo",
{}).get("content", [])
for d in ds: for d in ds:
if d["type"] == "lengthOfLease": if d["type"] == "lengthOfLease":
matches = re.findall(r"(\d+\.?\d*)", d["value"]) matches = re.findall(r"(\d+\.?\d*)", d["value"])
@ -250,7 +246,7 @@ class Listing:
return (now - ds).days return (now - ds).days
@property @property
def last_seen(self) -> int: def last_seen(self) -> int | None:
if not self.path_last_seen_listing().exists(): if not self.path_last_seen_listing().exists():
return None return None
@ -260,8 +256,9 @@ class Listing:
return (datetime.datetime.now() - dt).days return (datetime.datetime.now() - dt).days
@property @property
def serviceCharge(self) -> float: def serviceCharge(self) -> float | None:
ds = self.detailobject["property"].get("tenureInfo", {}).get("content", []) ds = self.detailobject["property"].get("tenureInfo",
{}).get("content", [])
for d in ds: for d in ds:
if d["type"] == "annualServiceCharge": if d["type"] == "annualServiceCharge":
matches = re.findall(r"([\d,.]+)", d["value"]) matches = re.findall(r"([\d,.]+)", d["value"])
@ -276,8 +273,7 @@ class Listing:
# aka new home # aka new home
try: try:
return self.detailobject["property"]["development"] return self.detailobject["property"]["development"]
except: except Exception:
print(self.identifier)
return False return False
@property @property
@ -294,39 +290,33 @@ class Listing:
def dict_nicely(self): def dict_nicely(self):
return { return {
"identifier": "identifier":
self.identifier, self.identifier,
"sqm_ocr": "sqm_ocr":
self.sqm_ocr, self.sqm_ocr,
"price": "price":
self.price, self.price,
"price_per_sqm": "price_per_sqm":
self.price_per_sqm, self.price_per_sqm,
"url": "url":
self.url, self.url,
"bedrooms": "bedrooms":
self.bedrooms, self.bedrooms,
"travel_time_fastest": "travel_time_fastest":
None if len(self.travel_time) == 0 else self.travel_time[0], None if len(self.travel_time) == 0 else self.travel_time[0],
"travel_time_second": "travel_time_second":
None if len(self.travel_time) < 2 else self.travel_time[1], None if len(self.travel_time) < 2 else self.travel_time[1],
"lease_left": "lease_left":
self.leaseLeft, self.leaseLeft,
"service_charge": "service_charge":
self.serviceCharge, self.serviceCharge,
"development": "development":
self.development, self.development,
"tenure_type": "tenure_type":
self.tenure_type, self.tenure_type,
"updated_days": "updated_days":
self.updateDaysAgo, self.updateDaysAgo,
"status": "status":
self.status, self.status,
"last_seen": "last_seen":
self.last_seen, self.last_seen,
} }
if __name__ == "__main__":
listing_paths = sorted(list(pathlib.Path("data/rs").glob("*/listing.json")))
listings = Listing.get_all_listings()
print(listings[0].list_floorplans())

View file

@ -1,11 +1,11 @@
# from diskcache import Cache # from diskcache import Cache
import enum import enum
from typing import List from typing import Any, List
import aiohttp import aiohttp
import requests import requests
import urllib3 import urllib3
urllib3.disable_warnings() urllib3.disable_warnings() # type: ignore
class ListingType(enum.StrEnum): class ListingType(enum.StrEnum):
@ -38,12 +38,12 @@ async def detail_query(detail_id: int):
} }
url = f"https://api.rightmove.co.uk/api/property/{detail_id}" url = f"https://api.rightmove.co.uk/api/property/{detail_id}"
async with aiohttp.ClientSession() as session: async with aiohttp.ClientSession() as session:
async with session.get(url, params=params, headers=headers) as response: async with session.get(url, params=params,
headers=headers) as response:
if response.status != 200: if response.status != 200:
raise Exception( raise Exception(
f"""id: {detail_id}. Status Code: {response.status}.""" f"""id: {detail_id}. Status Code: {response.status}."""
f"""Failed due to: {await response.text()}""" f"""Failed due to: {await response.text()}""")
)
return await response.json() return await response.json()
@ -57,11 +57,11 @@ async def listing_query(
max_price: int, max_price: int,
location_id: str = "STATION^5168", # kings cross station location_id: str = "STATION^5168", # kings cross station
mustNewHome: bool = False, mustNewHome: bool = False,
max_days_since_added: int = None, max_days_since_added: int = 30,
property_type: List["PropertyType"] = [], property_type: List["PropertyType"] = [],
page_size=25, page_size: int = 25,
) -> dict: ) -> dict[str, Any]:
params = { params: dict[str, str] = {
"locationIdentifier": location_id, "locationIdentifier": location_id,
"channel": channel.upper(), "channel": channel.upper(),
"page": str(page), "page": str(page),
@ -77,14 +77,14 @@ async def listing_query(
"appVersion": "4.28.0", "appVersion": "4.28.0",
} }
if channel is ListingType.BUY: if channel is ListingType.BUY:
params["dontShow"] = "sharedOwnership,retirement", params["dontShow"] = "sharedOwnership,retirement"
if len(property_type) > 0: if len(property_type) > 0:
params["propertyTypes"] = ",".join(property_type) params["propertyTypes"] = ",".join(property_type)
if max_days_since_added is not None and max_days_since_added not in [ if max_days_since_added is not None and max_days_since_added not in [
1, 3, 7, 14 1, 3, 7, 14
]: ]:
raise Exception("Invalid max days. Can only be", [1, 3, 7, 14]) raise Exception("Invalid max days. Can only be", [1, 3, 7, 14])
params["maxDaysSinceAdded"] = max_days_since_added params["maxDaysSinceAdded"] = str(max_days_since_added)
if mustNewHome: if mustNewHome:
params["mustHave"] = "newHome" params["mustHave"] = "newHome"