From b873eaf203f8259505a87ed51abbf492ee851544 Mon Sep 17 00:00:00 2001
From: Viktor Barzin <vbarzin@gmail.com>
Date: Sun, 18 May 2025 12:27:26 +0000
Subject: [PATCH] Fix type annotations and formatting; remove broken main() entry points

---
 crawler/1_dump_listings.py    |  48 ++++++-------
 crawler/2_dump_detail.py      |  13 +---
 crawler/3_dump_images.py      |  16 ++---
 crawler/4_detect_floorplan.py |  24 ++-----
 crawler/5_routing.py          |  38 ++++-------
 crawler/csv_exporter.py       |   4 +-
 crawler/data_access.py        | 122 ++++++++++++++++------------------
 crawler/rec/query.py          |  24 +++----
 8 files changed, 117 insertions(+), 172 deletions(-)

diff --git a/crawler/1_dump_listings.py b/crawler/1_dump_listings.py
index 188f5aa..6f76e04 100644
--- a/crawler/1_dump_listings.py
+++ b/crawler/1_dump_listings.py
@@ -16,12 +16,13 @@ class QueryParameters:
     district_names: set[str]
     radius: float = 0
     page_size: int = 500  # items per page
-    max_days_since_added: int | None = None
+    max_days_since_added: int = 14  # listing_query only accepts 1, 3, 7 or 14
+    # available from; furnished/unfurnished; council tax
 
 
 async def dump_listings(
-    parameters: QueryParameters,
-    data_dir: pathlib.Path = pathlib.Path("data/rs/"),
+        parameters: QueryParameters,
+        data_dir: pathlib.Path = pathlib.Path("data/rs/"),
 ) -> list[Listing]:
     districts = {
         district: locid
@@ -31,29 +32,28 @@ async def dump_listings(
     print("Valid districts to scrape:", districts.keys())
     listings = []
 
-    json_responses = await asyncio.gather(
-        *[
-            listing_query(
-                page=i,
-                channel=parameters.listing_type,
-                min_bedrooms=parameters.min_bedrooms,
-                max_bedrooms=parameters.max_bedrooms,
-                radius=parameters.radius,
-                min_price=parameters.min_price,
-                max_price=parameters.max_price,
-                location_id=locid,
-                page_size=parameters.page_size,
-                max_days_since_added=parameters.max_days_since_added,
-            ) for locid in districts.values() for i in [1, 2]
-        ]
-    )
+    json_responses = await asyncio.gather(*[
+        listing_query(
+            page=i,
+            channel=parameters.listing_type,
+            min_bedrooms=parameters.min_bedrooms,
+            max_bedrooms=parameters.max_bedrooms,
+            radius=parameters.radius,
+            min_price=parameters.min_price,
+            max_price=parameters.max_price,
+            location_id=locid,
+            page_size=parameters.page_size,
+            max_days_since_added=parameters.max_days_since_added,
+        ) for locid in districts.values() for i in [1, 2]
+    ])
     listings = []
     for response_json in json_responses:
         if response_json["totalAvailableResults"] == 0:
             print("No results found")
             continue
         if response_json["totalAvailableResults"] > 0:
-            print("totalAvailableResults: ", response_json["totalAvailableResults"])
+            print("totalAvailableResults: ",
+                  response_json["totalAvailableResults"])
         for property in response_json["properties"]:
             identifier = property["identifier"]
 
@@ -62,11 +62,3 @@ async def dump_listings(
             listings.append(listing)
 
     return listings
-
-
-def main():
-    dump_listings()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/crawler/2_dump_detail.py b/crawler/2_dump_detail.py
index 8019fe2..ba87135 100644
--- a/crawler/2_dump_detail.py
+++ b/crawler/2_dump_detail.py
@@ -1,6 +1,5 @@
 import asyncio
 import json
-import pathlib
 from rec.query import detail_query
 from tqdm.asyncio import tqdm
 
@@ -13,8 +12,7 @@ semaphore = asyncio.Semaphore(10)
 async def dump_detail(listing_paths: list[str]):
     listings = Listing.get_all_listings(listing_paths)
     filtered_listings = await tqdm.gather(
-        *[_dump_detail_for_listing(listing) for listing in listings]
-    )
+        *[_dump_detail_for_listing(listing) for listing in listings])
     return filtered_listings
 
 
@@ -27,12 +25,3 @@ async def _dump_detail_for_listing(listing: Listing):
         d = await detail_query(listing.identifier)
     with open(listing.path_detail_json(), "w") as f:
         json.dump(d, f)
-
-
-def main():
-    listing_paths = sorted(list(pathlib.Path("data/rs").glob("*/listing.json")))
-    dump_detail(listing_paths)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/crawler/3_dump_images.py b/crawler/3_dump_images.py
index a659235..c11e5eb 100644
--- a/crawler/3_dump_images.py
+++ b/crawler/3_dump_images.py
@@ -1,6 +1,5 @@
 import asyncio
 import json
-import pathlib
 import aiohttp
 from tqdm.asyncio import tqdm
 from data_access import Listing
@@ -11,7 +10,8 @@ semaphore = asyncio.Semaphore(10)
 
 async def dump_images(listing_paths: list[str]):
     listings = Listing.get_all_listings(listing_paths)
-    await tqdm.gather(*[dump_images_for_listing(listing) for listing in listings])
+    await tqdm.gather(
+        *[dump_images_for_listing(listing) for listing in listings])
 
 
 async def dump_images_for_listing(listing: Listing):
@@ -30,17 +30,9 @@ async def dump_images_for_listing(listing: Listing):
                 async with semaphore:
                     async with session.get(url) as response:
                         if response.status != 200:
-                            raise Exception(f"Error for {url}: {response.status}")
+                            raise Exception(
+                                f"Error for {url}: {response.status}")
                         with open(p, "wb") as f:
                             f.write(await response.read())
         except Exception as e:
             tqdm.write(f"Error for {url}: {e}")
-
-
-def main():
-    listing_paths = sorted(list(pathlib.Path("data/rs").glob("*/listing.json")))
-    dump_images(listing_paths)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/crawler/4_detect_floorplan.py b/crawler/4_detect_floorplan.py
index bcb1c71..199b601 100644
--- a/crawler/4_detect_floorplan.py
+++ b/crawler/4_detect_floorplan.py
@@ -1,5 +1,4 @@
 import asyncio
-import pathlib
 from data_access import Listing
 from tqdm.asyncio import tqdm
 import multiprocessing
@@ -7,25 +6,16 @@ import multiprocessing
 
 async def detect_floorplan(listing_paths: list[str]):
     listings = Listing.get_all_listings(listing_paths)
-    cpu_count = multiprocessing.cpu_count() / 4
+    cpu_count = max(1, multiprocessing.cpu_count() // 4)  # Semaphore(0) would block every task forever
     semaphore = asyncio.Semaphore(cpu_count)
 
-    await tqdm.gather(
-        *[_detect_floorplan_with_semaphore(listing, semaphore) for listing in listings]
-    )
+    await tqdm.gather(*[
+        _detect_floorplan_with_semaphore(listing, semaphore)
+        for listing in listings
+    ])
 
 
-async def _detect_floorplan_with_semaphore(
-    listing: Listing, semaphore: asyncio.Semaphore
-):
+async def _detect_floorplan_with_semaphore(listing: Listing,
+                                           semaphore: asyncio.Semaphore):
     async with semaphore:
         return await listing.calculate_sqm_ocr(recalculate=False)
-
-
-def main():
-    listing_paths = sorted(list(pathlib.Path("data/rs").glob("*/listing.json")))
-    detect_floorplan(listing_paths)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/crawler/5_routing.py b/crawler/5_routing.py
index 1ce61df..815b6f4 100644
--- a/crawler/5_routing.py
+++ b/crawler/5_routing.py
@@ -1,4 +1,3 @@
-import pathlib
 from data_access import Listing
 from tqdm import tqdm
 from geopy.distance import geodesic
@@ -15,30 +14,30 @@ def calculate_route(listing_paths: list[str]):
     # reduce listings to everything within 7 miles
     filtered_listings = []
     for listing in listings:
-        miles = geodesic(
-            BROCK_STREET_LAT_LONG, (listing.latitude, listing.longitude)
-        ).miles
+        miles = geodesic(BROCK_STREET_LAT_LONG,
+                         (listing.latitude, listing.longitude)).miles
         if listing.isRemoved:
-            log.info(f"Removed-Skip: Skipping {listing.identifier} is already removed.")
+            log.info(f"Removed-Skip: Skipping {listing.identifier} "
+                     "is already removed.")
             continue
         if miles > 7:
-            log.info(
-                f"Miles-Skip: Skipping {listing.identifier} as it is {miles} miles away"
-            )
+            log.info(f"Miles-Skip: Skipping {listing.identifier} as it is "
+                     f"{miles} miles away")
             continue
         if listing.path_routing_json().exists():
             log.info(
-                f"Path-Skip: Skipping {listing.identifier} as path routing already exists"
-            )
+                (f"Path-Skip: Skipping {listing.identifier} as path routing "
+                 "already exists"))
             continue
-        if listing.sqm_ocr is None or listing.sqm_ocr < 30 or listing.sqm_ocr > 200:
-            log.info(
-                f"Floorplan-Skip: Skipping {listing.identifier} as sqm_ocr is {listing.sqm_ocr}"
-            )
+        if (listing.sqm_ocr is None or listing.sqm_ocr < 30
+                or listing.sqm_ocr > 200):
+            log.info((f"Floorplan-Skip: Skipping {listing.identifier} as "
+                      f"sqm_ocr is {listing.sqm_ocr}"))
             continue
         filtered_listings.append(listing)
 
-    print(f"Filtered listings from {len(listings)} to {len(filtered_listings)}")
+    print(
+        f"Filtered listings from {len(listings)} to {len(filtered_listings)}")
 
     for listing in tqdm(filtered_listings):
         lat, long = BROCK_STREET_LAT_LONG
@@ -47,12 +46,3 @@ def calculate_route(listing_paths: list[str]):
         duration_minutes = traveltime["duration"] / 60.0
 
         tqdm.write(f"{listing.identifier} {duration_minutes}")
-
-
-def main():
-    listing_paths = sorted(list(pathlib.Path("data/rs").glob("*/listing.json")))
-    calculate_route(listing_paths)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/crawler/csv_exporter.py b/crawler/csv_exporter.py
index 10d0a0f..1c6d023 100644
--- a/crawler/csv_exporter.py
+++ b/crawler/csv_exporter.py
@@ -4,7 +4,9 @@ import pandas as pd
 
 
 def export_to_csv(
-    listings: list[Listing], output_file: Path, columns: list[str]
+    listings: list[Listing],
+    output_file: Path,
+    columns: list[str],
 ) -> None:
     ds = [listing.dict_nicely() for listing in listings]
     df = pd.DataFrame(ds)
diff --git a/crawler/data_access.py b/crawler/data_access.py
index 1955255..a658543 100644
--- a/crawler/data_access.py
+++ b/crawler/data_access.py
@@ -2,7 +2,7 @@ import asyncio
 from dataclasses import dataclass
 import json
 import pathlib
-from typing import List, Dict
+from typing import Any, List, Dict
 from rec import floorplan, routing
 import re
 import datetime
@@ -11,7 +11,7 @@ import datetime
 @dataclass()
 class Listing:
     identifier: int
-    _cached: Dict = None
+    _cached: Dict | None = None
     data_dir: pathlib.Path = pathlib.Path("data/rs/")
     ALL_COLUMNS = [
         "identifier",
@@ -46,10 +46,8 @@ class Listing:
             while str(d['identifier']) in str(data_dir.resolve().absolute()):
                 data_dir = data_dir.parent
             listing = Listing(d["identifier"], data_dir=data_dir)
-            if (
-                listing.last_seen is not None
-                and listing.last_seen < seen_in_the_last_n_days
-            ):
+            if (listing.last_seen is not None
+                    and listing.last_seen < seen_in_the_last_n_days):
                 identifiers.append(listing)
 
         return identifiers
@@ -107,18 +105,15 @@ class Listing:
         objs = []
         for floorplan_path in self.list_floorplans():
             estimated_sqm, model_output, predictions = floorplan.calculate_model(
-                floorplan_path
-            )
-            objs.append(
-                {
-                    "floorplan_path": str(floorplan_path),
-                    "estimated_sqm": estimated_sqm,
-                    "model_output": model_output,
-                    "no_predictions": len(
-                        predictions
-                    ),  # cant serialize the predictions itself since its a tensor
-                }
-            )
+                floorplan_path)
+            objs.append({
+                "floorplan_path": str(floorplan_path),
+                "estimated_sqm": estimated_sqm,
+                "model_output": model_output,
+                "no_predictions": len(
+                    predictions
+                ),  # cant serialize the predictions itself since its a tensor
+            })
 
         with open(self.path_floorplan_model_json(), "w") as f:
             json.dump(objs, f)
@@ -131,9 +126,8 @@ class Listing:
         with open(self.path_floorplan_json()) as f:
             objs = json.load(f)
 
-        max_sqm = max(
-            [o["estimated_sqm"] for o in objs if o is None]
-        )  # filter out Nones
+        max_sqm = max((o["estimated_sqm"] for o in objs  # skip missing estimates
+                       if o["estimated_sqm"] is not None), default=None)
         return max_sqm
 
     async def calculate_sqm_ocr(self, recalculate=True):
@@ -143,15 +137,12 @@ class Listing:
         objs = []
         for floorplan_path in self.list_floorplans():
             estimated_sqm, model_output = await asyncio.to_thread(
-                floorplan.calculate_ocr, floorplan_path
-            )
-            objs.append(
-                {
-                    "floorplan_path": str(floorplan_path),
-                    "estimated_sqm": estimated_sqm,
-                    "text": model_output,
-                }
-            )
+                floorplan.calculate_ocr, floorplan_path)
+            objs.append({
+                "floorplan_path": str(floorplan_path),
+                "estimated_sqm": estimated_sqm,
+                "text": model_output,
+            })
 
         with open(self.path_floorplan_ocr_json(), "w") as f:
             json.dump(objs, f)
@@ -164,19 +155,23 @@ class Listing:
         with open(self.path_floorplan_ocr_json()) as f:
             objs = json.load(f)
 
-        sqms = [o["estimated_sqm"] for o in objs if o["estimated_sqm"] is not None]
+        sqms = [
+            o["estimated_sqm"] for o in objs if o["estimated_sqm"] is not None
+        ]
         if len(sqms) == 0:
             return None
         max_sqm = max(sqms)
         return max_sqm
 
-    def calculate_route(self, dest_lat: float, dest_lon: float, recalculate=False):
+    def calculate_route(self,
+                        dest_lat: float,
+                        dest_lon: float,
+                        recalculate=False):
         if self.path_routing_json().exists() and not recalculate:
             return
 
-        result = routing.transit_route(
-            self.latitude, self.longitude, dest_lat, dest_lon
-        )
+        result = routing.transit_route(self.latitude, self.longitude, dest_lat,
+                                       dest_lon)
         with open(self.path_routing_json(), "w") as f:
             json.dump(result, f)
 
@@ -200,11 +195,11 @@ class Listing:
                 return json.load(f)
 
     @property
-    def detailobject(self):
+    def detailobject(self) -> dict[str, Any]:
         if self._cached is None:
             with open(self.path_detail_json()) as f:
                 self._cached = json.load(f)
-        return self._cached
+        return self._cached  # type: ignore
 
     @property
     def price(self) -> float:
@@ -217,7 +212,7 @@ class Listing:
     @property
-    def price_per_sqm(self) -> float:
+    def price_per_sqm(self) -> float | None:  # None when the floor area is unknown or zero
         if self.sqm_ocr is None or self.sqm_ocr == 0:
             return None
         return self.price / self.sqm_ocr
 
     @property
@@ -233,8 +228,9 @@ class Listing:
         return self.detailobject["property"]["longitude"]
 
     @property
-    def leaseLeft(self) -> int:
-        ds = self.detailobject["property"].get("tenureInfo", {}).get("content", [])
+    def leaseLeft(self) -> float | None:
+        ds = self.detailobject["property"].get("tenureInfo",
+                                               {}).get("content", [])
         for d in ds:
             if d["type"] == "lengthOfLease":
                 matches = re.findall(r"(\d+\.?\d*)", d["value"])
@@ -250,7 +246,7 @@ class Listing:
         return (now - ds).days
 
     @property
-    def last_seen(self) -> int:
+    def last_seen(self) -> int | None:
         if not self.path_last_seen_listing().exists():
             return None
 
@@ -260,8 +256,9 @@ class Listing:
             return (datetime.datetime.now() - dt).days
 
     @property
-    def serviceCharge(self) -> float:
-        ds = self.detailobject["property"].get("tenureInfo", {}).get("content", [])
+    def serviceCharge(self) -> float | None:
+        ds = self.detailobject["property"].get("tenureInfo",
+                                               {}).get("content", [])
         for d in ds:
             if d["type"] == "annualServiceCharge":
                 matches = re.findall(r"([\d,.]+)", d["value"])
@@ -276,8 +273,7 @@ class Listing:
         # aka new home
         try:
             return self.detailobject["property"]["development"]
-        except:
-            print(self.identifier)
+        except Exception:
             return False
 
     @property
@@ -294,39 +290,33 @@ class Listing:
     def dict_nicely(self):
         return {
             "identifier":
-                self.identifier,
+            self.identifier,
             "sqm_ocr":
-                self.sqm_ocr,
+            self.sqm_ocr,
             "price":
-                self.price,
+            self.price,
             "price_per_sqm":
-                self.price_per_sqm,
+            self.price_per_sqm,
             "url":
-                self.url,
+            self.url,
             "bedrooms":
-                self.bedrooms,
+            self.bedrooms,
             "travel_time_fastest":
-                None if len(self.travel_time) == 0 else self.travel_time[0],
+            None if len(self.travel_time) == 0 else self.travel_time[0],
             "travel_time_second":
-                None if len(self.travel_time) < 2 else self.travel_time[1],
+            None if len(self.travel_time) < 2 else self.travel_time[1],
             "lease_left":
-                self.leaseLeft,
+            self.leaseLeft,
             "service_charge":
-                self.serviceCharge,
+            self.serviceCharge,
             "development":
-                self.development,
+            self.development,
             "tenure_type":
-                self.tenure_type,
+            self.tenure_type,
             "updated_days":
-                self.updateDaysAgo,
+            self.updateDaysAgo,
             "status":
-                self.status,
+            self.status,
             "last_seen":
-                self.last_seen,
+            self.last_seen,
         }
-
-
-if __name__ == "__main__":
-    listing_paths = sorted(list(pathlib.Path("data/rs").glob("*/listing.json")))
-    listings = Listing.get_all_listings()
-    print(listings[0].list_floorplans())
diff --git a/crawler/rec/query.py b/crawler/rec/query.py
index 121fcd8..4e7b2f4 100644
--- a/crawler/rec/query.py
+++ b/crawler/rec/query.py
@@ -1,11 +1,11 @@
 # from diskcache import Cache
 import enum
-from typing import List
+from typing import Any, List
 import aiohttp
 import requests
 import urllib3
 
-urllib3.disable_warnings()
+urllib3.disable_warnings()  # type: ignore
 
 
 class ListingType(enum.StrEnum):
@@ -38,12 +38,12 @@ async def detail_query(detail_id: int):
     }
     url = f"https://api.rightmove.co.uk/api/property/{detail_id}"
     async with aiohttp.ClientSession() as session:
-        async with session.get(url, params=params, headers=headers) as response:
+        async with session.get(url, params=params,
+                               headers=headers) as response:
             if response.status != 200:
                 raise Exception(
                     f"""id: {detail_id}. Status Code: {response.status}."""
-                    f"""Failed due to: {await response.text()}"""
-                )
+                    f"""Failed due to: {await response.text()}""")
             return await response.json()
 
 
@@ -57,11 +57,11 @@ async def listing_query(
     max_price: int,
     location_id: str = "STATION^5168",  # kings cross station
     mustNewHome: bool = False,
-    max_days_since_added: int = None,
+    max_days_since_added: int = 14,  # must be one of 1, 3, 7, 14 (validated below)
     property_type: List["PropertyType"] = [],
-    page_size=25,
-) -> dict:
-    params = {
+    page_size: int = 25,
+) -> dict[str, Any]:
+    params: dict[str, str] = {
         "locationIdentifier": location_id,
         "channel": channel.upper(),
         "page": str(page),
@@ -77,14 +77,14 @@ async def listing_query(
         "appVersion": "4.28.0",
     }
     if channel is ListingType.BUY:
-        params["dontShow"] = "sharedOwnership,retirement",
+        params["dontShow"] = "sharedOwnership,retirement"
         if len(property_type) > 0:
             params["propertyTypes"] = ",".join(property_type)
         if max_days_since_added is not None and max_days_since_added not in [
-            1, 3, 7, 14
+                1, 3, 7, 14
         ]:
             raise Exception("Invalid max days. Can only be", [1, 3, 7, 14])
-        params["maxDaysSinceAdded"] = max_days_since_added
+        params["maxDaysSinceAdded"] = str(max_days_since_added)
 
         if mustNewHome:
             params["mustHave"] = "newHome"