From b873eaf203f8259505a87ed51abbf492ee851544 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sun, 18 May 2025 12:27:26 +0000 Subject: [PATCH] fix types and format --- crawler/1_dump_listings.py | 48 ++++++------- crawler/2_dump_detail.py | 13 +--- crawler/3_dump_images.py | 16 ++--- crawler/4_detect_floorplan.py | 24 ++----- crawler/5_routing.py | 38 ++++------- crawler/csv_exporter.py | 4 +- crawler/data_access.py | 122 ++++++++++++++++------------------ crawler/rec/query.py | 24 +++---- 8 files changed, 117 insertions(+), 172 deletions(-) diff --git a/crawler/1_dump_listings.py b/crawler/1_dump_listings.py index 188f5aa..6f76e04 100644 --- a/crawler/1_dump_listings.py +++ b/crawler/1_dump_listings.py @@ -16,12 +16,13 @@ class QueryParameters: district_names: set[str] radius: float = 0 page_size: int = 500 # items per page - max_days_since_added: int | None = None + max_days_since_added: int = 30 + # available from; furnished/unfurnished; council tax async def dump_listings( - parameters: QueryParameters, - data_dir: pathlib.Path = pathlib.Path("data/rs/"), + parameters: QueryParameters, + data_dir: pathlib.Path = pathlib.Path("data/rs/"), ) -> list[Listing]: districts = { district: locid @@ -31,29 +32,28 @@ async def dump_listings( print("Valid districts to scrape:", districts.keys()) listings = [] - json_responses = await asyncio.gather( - *[ - listing_query( - page=i, - channel=parameters.listing_type, - min_bedrooms=parameters.min_bedrooms, - max_bedrooms=parameters.max_bedrooms, - radius=parameters.radius, - min_price=parameters.min_price, - max_price=parameters.max_price, - location_id=locid, - page_size=parameters.page_size, - max_days_since_added=parameters.max_days_since_added, - ) for locid in districts.values() for i in [1, 2] - ] - ) + json_responses = await asyncio.gather(*[ + listing_query( + page=i, + channel=parameters.listing_type, + min_bedrooms=parameters.min_bedrooms, + max_bedrooms=parameters.max_bedrooms, + radius=parameters.radius, + min_price=parameters.min_price, + max_price=parameters.max_price, + location_id=locid, + page_size=parameters.page_size, + max_days_since_added=parameters.max_days_since_added, + ) for locid in districts.values() for i in [1, 2] + ]) listings = [] for response_json in json_responses: if response_json["totalAvailableResults"] == 0: print("No results found") continue if response_json["totalAvailableResults"] > 0: - print("totalAvailableResults: ", response_json["totalAvailableResults"]) + print("totalAvailableResults: ", + response_json["totalAvailableResults"]) for property in response_json["properties"]: identifier = property["identifier"] @@ -62,11 +62,3 @@ async def dump_listings( listings.append(listing) return listings - - -def main(): - dump_listings() - - -if __name__ == "__main__": - main() diff --git a/crawler/2_dump_detail.py b/crawler/2_dump_detail.py index 8019fe2..ba87135 100644 --- a/crawler/2_dump_detail.py +++ b/crawler/2_dump_detail.py @@ -1,6 +1,5 @@ import asyncio import json -import pathlib from rec.query import detail_query from tqdm.asyncio import tqdm @@ -13,8 +12,7 @@ semaphore = asyncio.Semaphore(10) async def dump_detail(listing_paths: list[str]): listings = Listing.get_all_listings(listing_paths) filtered_listings = await tqdm.gather( - *[_dump_detail_for_listing(listing) for listing in listings] - ) + *[_dump_detail_for_listing(listing) for listing in listings]) return filtered_listings @@ -27,12 +25,3 @@ async def _dump_detail_for_listing(listing: Listing): d = await detail_query(listing.identifier) with open(listing.path_detail_json(), "w") as f: json.dump(d, f) - - -def main(): - listing_paths = sorted(list(pathlib.Path("data/rs").glob("*/listing.json"))) - dump_detail(listing_paths) - - -if __name__ == "__main__": - main() diff --git a/crawler/3_dump_images.py b/crawler/3_dump_images.py index a659235..c11e5eb 100644 --- a/crawler/3_dump_images.py +++ b/crawler/3_dump_images.py @@ -1,6 +1,5 @@ import asyncio import json -import pathlib import aiohttp from tqdm.asyncio import tqdm from data_access import Listing @@ -11,7 +10,8 @@ semaphore = asyncio.Semaphore(10) async def dump_images(listing_paths: list[str]): listings = Listing.get_all_listings(listing_paths) - await tqdm.gather(*[dump_images_for_listing(listing) for listing in listings]) + await tqdm.gather( + *[dump_images_for_listing(listing) for listing in listings]) async def dump_images_for_listing(listing: Listing): @@ -30,17 +30,9 @@ async def dump_images_for_listing(listing: Listing): async with semaphore: async with session.get(url) as response: if response.status != 200: - raise Exception(f"Error for {url}: {response.status}") + raise Exception( + f"Error for {url}: {response.status}") with open(p, "wb") as f: f.write(await response.read()) except Exception as e: tqdm.write(f"Error for {url}: {e}") - - -def main(): - listing_paths = sorted(list(pathlib.Path("data/rs").glob("*/listing.json"))) - dump_images(listing_paths) - - -if __name__ == "__main__": - main() diff --git a/crawler/4_detect_floorplan.py b/crawler/4_detect_floorplan.py index bcb1c71..199b601 100644 --- a/crawler/4_detect_floorplan.py +++ b/crawler/4_detect_floorplan.py @@ -1,5 +1,4 @@ import asyncio -import pathlib from data_access import Listing from tqdm.asyncio import tqdm import multiprocessing @@ -7,25 +6,16 @@ import multiprocessing async def detect_floorplan(listing_paths: list[str]): listings = Listing.get_all_listings(listing_paths) - cpu_count = multiprocessing.cpu_count() / 4 + cpu_count = multiprocessing.cpu_count() // 4 semaphore = asyncio.Semaphore(cpu_count) - await tqdm.gather( - *[_detect_floorplan_with_semaphore(listing, semaphore) for listing in listings] - ) + await tqdm.gather(*[ + _detect_floorplan_with_semaphore(listing, semaphore) + for listing in listings + ]) -async def _detect_floorplan_with_semaphore( - listing: Listing, semaphore: asyncio.Semaphore -): +async def _detect_floorplan_with_semaphore(listing: Listing, + semaphore: asyncio.Semaphore): async with semaphore: return await listing.calculate_sqm_ocr(recalculate=False) - - -def main(): - listing_paths = sorted(list(pathlib.Path("data/rs").glob("*/listing.json"))) - detect_floorplan(listing_paths) - - -if __name__ == "__main__": - main() diff --git a/crawler/5_routing.py b/crawler/5_routing.py index 1ce61df..815b6f4 100644 --- a/crawler/5_routing.py +++ b/crawler/5_routing.py @@ -1,4 +1,3 @@ -import pathlib from data_access import Listing from tqdm import tqdm from geopy.distance import geodesic @@ -15,30 +14,30 @@ def calculate_route(listing_paths: list[str]): # reduce listings to everything within 7 miles filtered_listings = [] for listing in listings: - miles = geodesic( - BROCK_STREET_LAT_LONG, (listing.latitude, listing.longitude) - ).miles + miles = geodesic(BROCK_STREET_LAT_LONG, + (listing.latitude, listing.longitude)).miles if listing.isRemoved: - log.info(f"Removed-Skip: Skipping {listing.identifier} is already removed.") + log.info(f"Removed-Skip: Skipping {listing.identifier} " + "is already removed.") continue if miles > 7: - log.info( - f"Miles-Skip: Skipping {listing.identifier} as it is {miles} miles away" - ) + log.info(f"Miles-Skip: Skipping {listing.identifier} as it is " + f"{miles} miles away") continue if listing.path_routing_json().exists(): log.info( - f"Path-Skip: Skipping {listing.identifier} as path routing already exists" - ) + (f"Path-Skip: Skipping {listing.identifier} as path routing " + "already exists")) continue - if listing.sqm_ocr is None or listing.sqm_ocr < 30 or listing.sqm_ocr > 200: - log.info( - f"Floorplan-Skip: Skipping {listing.identifier} as sqm_ocr is {listing.sqm_ocr}" - ) + if (listing.sqm_ocr is None or listing.sqm_ocr < 30 + or listing.sqm_ocr > 200): + log.info((f"Floorplan-Skip: Skipping {listing.identifier} as " + f"sqm_ocr is {listing.sqm_ocr}")) continue filtered_listings.append(listing) - print(f"Filtered listings from {len(listings)} to {len(filtered_listings)}") + print( + f"Filtered listings from {len(listings)} to {len(filtered_listings)}") for listing in tqdm(filtered_listings): lat, long = BROCK_STREET_LAT_LONG @@ -47,12 +46,3 @@ def calculate_route(listing_paths: list[str]): duration_minutes = traveltime["duration"] / 60.0 tqdm.write(f"{listing.identifier} {duration_minutes}") - - -def main(): - listing_paths = sorted(list(pathlib.Path("data/rs").glob("*/listing.json"))) - calculate_route(listing_paths) - - -if __name__ == "__main__": - main() diff --git a/crawler/csv_exporter.py b/crawler/csv_exporter.py index 10d0a0f..1c6d023 100644 --- a/crawler/csv_exporter.py +++ b/crawler/csv_exporter.py @@ -4,7 +4,9 @@ import pandas as pd def export_to_csv( - listings: list[Listing], output_file: Path, columns: list[str] + listings: list[Listing], + output_file: Path, + columns: list[str], ) -> None: ds = [listing.dict_nicely() for listing in listings] df = pd.DataFrame(ds) diff --git a/crawler/data_access.py b/crawler/data_access.py index 1955255..a658543 100644 --- a/crawler/data_access.py +++ b/crawler/data_access.py @@ -2,7 +2,7 @@ import asyncio from dataclasses import dataclass import json import pathlib -from typing import List, Dict +from typing import Any, List, Dict from rec import floorplan, routing import re import datetime @@ -11,7 +11,7 @@ import datetime @dataclass() class Listing: identifier: int - _cached: Dict = None + _cached: Dict | None = None data_dir: pathlib.Path = pathlib.Path("data/rs/") ALL_COLUMNS = [ "identifier", @@ -46,10 +46,8 @@ class Listing: while str(d['identifier']) in str(data_dir.resolve().absolute()): data_dir = data_dir.parent listing = Listing(d["identifier"], data_dir=data_dir) - if ( - listing.last_seen is not None - and listing.last_seen < seen_in_the_last_n_days - ): + if (listing.last_seen is not None + and listing.last_seen < seen_in_the_last_n_days): identifiers.append(listing) return identifiers @@ -107,18 +105,15 @@ class Listing: objs = [] for floorplan_path in self.list_floorplans(): estimated_sqm, model_output, predictions = floorplan.calculate_model( - floorplan_path - ) - objs.append( - { - "floorplan_path": str(floorplan_path), - "estimated_sqm": estimated_sqm, - "model_output": model_output, - "no_predictions": len( - predictions - ), # cant serialize the predictions itself since its a tensor - } - ) + floorplan_path) + objs.append({ + "floorplan_path": str(floorplan_path), + "estimated_sqm": estimated_sqm, + "model_output": model_output, + "no_predictions": len( + predictions + ), # cant serialize the predictions itself since its a tensor + }) with open(self.path_floorplan_model_json(), "w") as f: json.dump(objs, f) @@ -131,9 +126,8 @@ class Listing: with open(self.path_floorplan_json()) as f: objs = json.load(f) - max_sqm = max( - [o["estimated_sqm"] for o in objs if o is None] - ) # filter out Nones + max_sqm = max([o["estimated_sqm"] for o in objs + if o is None]) # filter out Nones return max_sqm async def calculate_sqm_ocr(self, recalculate=True): @@ -143,15 +137,12 @@ class Listing: objs = [] for floorplan_path in self.list_floorplans(): estimated_sqm, model_output = await asyncio.to_thread( - floorplan.calculate_ocr, floorplan_path - ) - objs.append( - { - "floorplan_path": str(floorplan_path), - "estimated_sqm": estimated_sqm, - "text": model_output, - } - ) + floorplan.calculate_ocr, floorplan_path) + objs.append({ + "floorplan_path": str(floorplan_path), + "estimated_sqm": estimated_sqm, + "text": model_output, + }) with open(self.path_floorplan_ocr_json(), "w") as f: json.dump(objs, f) @@ -164,19 +155,23 @@ class Listing: with open(self.path_floorplan_ocr_json()) as f: objs = json.load(f) - sqms = [o["estimated_sqm"] for o in objs if o["estimated_sqm"] is not None] + sqms = [ + o["estimated_sqm"] for o in objs if o["estimated_sqm"] is not None + ] if len(sqms) == 0: return None max_sqm = max(sqms) return max_sqm - def calculate_route(self, dest_lat: float, dest_lon: float, recalculate=False): + def calculate_route(self, + dest_lat: float, + dest_lon: float, + recalculate=False): if self.path_routing_json().exists() and not recalculate: return - result = routing.transit_route( - self.latitude, self.longitude, dest_lat, dest_lon - ) + result = routing.transit_route(self.latitude, self.longitude, dest_lat, + dest_lon) with open(self.path_routing_json(), "w") as f: json.dump(result, f) @@ -200,11 +195,11 @@ class Listing: return json.load(f) @property - def detailobject(self): + def detailobject(self) -> dict[str, Any]: if self._cached is None: with open(self.path_detail_json()) as f: self._cached = json.load(f) - return self._cached + return self._cached # type: ignore @property def price(self) -> float: @@ -217,7 +212,7 @@ class Listing: @property def price_per_sqm(self) -> float: if self.sqm_ocr is None or self.sqm_ocr == 0: - return None + return -1 return self.price / self.sqm_ocr @property @@ -233,8 +228,9 @@ class Listing: return self.detailobject["property"]["longitude"] @property - def leaseLeft(self) -> int: - ds = self.detailobject["property"].get("tenureInfo", {}).get("content", []) + def leaseLeft(self) -> float | None: + ds = self.detailobject["property"].get("tenureInfo", + {}).get("content", []) for d in ds: if d["type"] == "lengthOfLease": matches = re.findall(r"(\d+\.?\d*)", d["value"]) @@ -250,7 +246,7 @@ class Listing: return (now - ds).days @property - def last_seen(self) -> int: + def last_seen(self) -> int | None: if not self.path_last_seen_listing().exists(): return None @@ -260,8 +256,9 @@ class Listing: return (datetime.datetime.now() - dt).days @property - def serviceCharge(self) -> float: - ds = self.detailobject["property"].get("tenureInfo", {}).get("content", []) + def serviceCharge(self) -> float | None: + ds = self.detailobject["property"].get("tenureInfo", + {}).get("content", []) for d in ds: if d["type"] == "annualServiceCharge": matches = re.findall(r"([\d,.]+)", d["value"]) @@ -276,8 +273,7 @@ class Listing: # aka new home try: return self.detailobject["property"]["development"] - except: - print(self.identifier) + except Exception: return False @property @@ -294,39 +290,33 @@ class Listing: def dict_nicely(self): return { "identifier": - self.identifier, + self.identifier, "sqm_ocr": - self.sqm_ocr, + self.sqm_ocr, "price": - self.price, + self.price, "price_per_sqm": - self.price_per_sqm, + self.price_per_sqm, "url": - self.url, + self.url, "bedrooms": - self.bedrooms, + self.bedrooms, "travel_time_fastest": - None if len(self.travel_time) == 0 else self.travel_time[0], + None if len(self.travel_time) == 0 else self.travel_time[0], "travel_time_second": - None if len(self.travel_time) < 2 else self.travel_time[1], + None if len(self.travel_time) < 2 else self.travel_time[1], "lease_left": - self.leaseLeft, + self.leaseLeft, "service_charge": - self.serviceCharge, + self.serviceCharge, "development": - self.development, + self.development, "tenure_type": - self.tenure_type, + self.tenure_type, "updated_days": - self.updateDaysAgo, + self.updateDaysAgo, "status": - self.status, + self.status, "last_seen": - self.last_seen, + self.last_seen, } - - -if __name__ == "__main__": - listing_paths = sorted(list(pathlib.Path("data/rs").glob("*/listing.json"))) - listings = Listing.get_all_listings() - print(listings[0].list_floorplans()) diff --git a/crawler/rec/query.py b/crawler/rec/query.py index 121fcd8..4e7b2f4 100644 --- a/crawler/rec/query.py +++ b/crawler/rec/query.py @@ -1,11 +1,11 @@ # from diskcache import Cache import enum -from typing import List +from typing import Any, List import aiohttp import requests import urllib3 -urllib3.disable_warnings() +urllib3.disable_warnings() # type: ignore class ListingType(enum.StrEnum): @@ -38,12 +38,12 @@ async def detail_query(detail_id: int): } url = f"https://api.rightmove.co.uk/api/property/{detail_id}" async with aiohttp.ClientSession() as session: - async with session.get(url, params=params, headers=headers) as response: + async with session.get(url, params=params, + headers=headers) as response: if response.status != 200: raise Exception( f"""id: {detail_id}. Status Code: {response.status}.""" - f"""Failed due to: {await response.text()}""" - ) + f"""Failed due to: {await response.text()}""") return await response.json() @@ -57,11 +57,11 @@ async def listing_query( max_price: int, location_id: str = "STATION^5168", # kings cross station mustNewHome: bool = False, - max_days_since_added: int = None, + max_days_since_added: int = 30, property_type: List["PropertyType"] = [], - page_size=25, -) -> dict: - params = { + page_size: int = 25, +) -> dict[str, Any]: + params: dict[str, str] = { "locationIdentifier": location_id, "channel": channel.upper(), "page": str(page), @@ -77,14 +77,14 @@ async def listing_query( "appVersion": "4.28.0", } if channel is ListingType.BUY: - params["dontShow"] = "sharedOwnership,retirement", + params["dontShow"] = "sharedOwnership,retirement" if len(property_type) > 0: params["propertyTypes"] = ",".join(property_type) if max_days_since_added is not None and max_days_since_added not in [ - 1, 3, 7, 14 + 1, 3, 7, 14 ]: raise Exception("Invalid max days. Can only be", [1, 3, 7, 14]) - params["maxDaysSinceAdded"] = max_days_since_added + params["maxDaysSinceAdded"] = str(max_days_since_added) if mustNewHome: params["mustHave"] = "newHome"