diff --git a/crawler/1_dump_listings.py b/crawler/1_dump_listings.py index cb3cc2f..fdc12ac 100644 --- a/crawler/1_dump_listings.py +++ b/crawler/1_dump_listings.py @@ -4,17 +4,18 @@ import pathlib from typing import Any from rec.query import detail_query, listing_query, QueryParameters from rec.districts import get_districts -import repositories +from repositories import ListingRepository from sqlalchemy import Engine from tqdm.asyncio import tqdm from data_access import Listing +from models import Listing as modelListing async def dump_listings( parameters: QueryParameters, - db_engine: Engine, + repository: ListingRepository, data_dir: pathlib.Path = pathlib.Path("data/rs/"), -) -> list[Listing]: +) -> list[modelListing]: if parameters.district_names: districts = { district: locid @@ -28,7 +29,7 @@ async def dump_listings( semaphore = asyncio.Semaphore(5) # if too high, rightmove drops connections json_responses = await tqdm.gather( *[ - _dump_listings_with_semaphore(semaphore, i, parameters, locid) + _fetch_listings_with_semaphore(semaphore, i, parameters, locid) for locid in districts.values() for i in [1, 2] ], @@ -41,29 +42,36 @@ async def dump_listings( for property in response_json["properties"]: identifier = property["identifier"] - listing = Listing(identifier, data_dir=data_dir) - listing.dump_listing(property) + listing = Listing(identifier, data_dir=data_dir, _listing_object=property) listings.append(listing) # if listing is already in db, do not fetch details again - repository = repositories.ListingRepository(db_engine) all_listings = await repository.get_listings( only_ids=[listing.identifier for listing in listings] ) all_listing_ids = {listing.id for listing in all_listings} - await tqdm.gather( + listings_without_details = [ + listing for listing in listings if not listing.path_detail_json().exists() + ] + listing_details = await tqdm.gather( *[ - _dump_detail_with_semaphore(semaphore, listing) - for listing in listings + _fetch_detail_with_semaphore(semaphore, listing.identifier) + for listing in listings_without_details # if listing.identifier not in all_listing_ids # One day we will rely solely on the model data ], desc="Fetching details", ) - return listings + for listing, detail in zip(listings_without_details, listing_details): + listing._details_object = detail + + model_listings = await repository.upsert_listings(listings) # upsert in db + await dump_listings_to_fs(listings) + + return model_listings -async def _dump_listings_with_semaphore( +async def _fetch_listings_with_semaphore( semaphore: asyncio.Semaphore, page_id: int, parameters: QueryParameters, @@ -87,12 +95,18 @@ async def _dump_listings_with_semaphore( return listing_query_result -async def _dump_detail_with_semaphore(semaphore: asyncio.Semaphore, listing: Listing): - if listing.path_detail_json().exists(): - return - - # for listing in tqdm(filtered_listings): +async def _fetch_detail_with_semaphore( + semaphore: asyncio.Semaphore, listing_id: int +) -> dict[str, Any]: async with semaphore: - d = await detail_query(listing.identifier) - with open(listing.path_detail_json(), "w") as f: - json.dump(d, f) + d = await detail_query(listing_id) + return d + + +async def dump_listings_to_fs(listings: list[Listing]) -> None: + for listing in listings: + if not listing.path_listing_json().exists(): + listing.dump_listing() + if not listing.path_detail_json().exists(): + with open(listing.path_detail_json(), "w") as f: + json.dump(listing._details_object, f, indent=4) diff --git a/crawler/data_access.py b/crawler/data_access.py index b9a7b53..de6609d 100644 --- a/crawler/data_access.py +++ b/crawler/data_access.py @@ -13,7 +13,8 @@ import datetime @dataclass() class Listing: identifier: int - _cached: Dict | None = None + _details_object: dict[str, Any] | None = None + _listing_object: dict[str, Any] | None = None data_dir: pathlib.Path = pathlib.Path("data/rs/") ALL_COLUMNS = [ "identifier", @@ -98,15 +99,19 @@ class Listing: def path_price_history(self) -> pathlib.Path: return self.path_listing() / "price_history.json" - def dump_listing(self, d: dict): + def dump_listing(self) -> None: + if self._listing_object is None: + raise ValueError("No listing data provided to dump.") with open(self.path_listing_json(), "w") as f: - json.dump(d, f) + json.dump(self._listing_object, f) with open(self.path_last_seen_listing(), "w") as f: dt = datetime.datetime.now().isoformat() json.dump(dt, f) # some places list pw in price and others pcm - price = max(d["price"], d.get("monthlyRent", 0)) + price = max( + self._listing_object["price"], self._listing_object.get("monthlyRent", 0) + ) self.append_price_history(price) def append_price_history(self, price: float) -> None: @@ -249,19 +254,18 @@ class Listing: @property def listingobject(self): - if self._cached is None: - with open(self.path_listing_json()) as f: - return json.load(f) + with open(self.path_listing_json()) as f: + return json.load(f) @property def detailobject(self) -> dict[str, Any]: - if self._cached is None: + if self._details_object is None: if self.path_detail_json().exists(): with open(self.path_detail_json()) as f: - self._cached = json.load(f) + self._details_object = json.load(f) else: return {} - return self._cached # type: ignore + return self._details_object # type: ignore @property def price(self) -> float: diff --git a/crawler/main.py b/crawler/main.py index 35c26cf..b051530 100644 --- a/crawler/main.py +++ b/crawler/main.py @@ -155,11 +155,10 @@ def dump_listings( f"{query_parameters}" ) data_dir_path = pathlib.Path(data_dir) - listings = asyncio.run( - dump_listings_module.dump_listings(query_parameters, engine, data_dir_path) - ) repository = ListingRepository(engine=engine) - asyncio.run(repository.upsert_listings(listings)) + asyncio.run( + dump_listings_module.dump_listings(query_parameters, repository, data_dir_path) + ) @cli.command() diff --git a/crawler/models/listing.py b/crawler/models/listing.py index efdd2aa..1b4c9aa 100644 --- a/crawler/models/listing.py +++ b/crawler/models/listing.py @@ -50,9 +50,6 @@ class RentListing(Listing, table=True): class BuyListing(Listing, table=True): service_charge: float | None = Field(default=None, nullable=True) - council_tax_band: str | None = Field( - default=None, nullable=True - ) # e.g., A, B, C, D, E, F, G lease_left: int | None = Field( default=None, nullable=True ) # in years, e.g., 90, 80, etc. diff --git a/crawler/rec/query.py b/crawler/rec/query.py index 758cc6c..5ce2bce 100644 --- a/crawler/rec/query.py +++ b/crawler/rec/query.py @@ -115,7 +115,7 @@ async def listing_query( radius: float, min_price: int, max_price: int, - location_id: str = "STATION^5168", # kings cross station + location_id: str, # = "STATION^5168", # kings cross station mustNewHome: bool = False, max_days_since_added: int = 30, property_type: list[PropertyType] = [],