From 289206afc0b60bb87c548e0efb2d91f75bdbfcfc Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sun, 8 Jun 2025 20:58:28 +0000 Subject: [PATCH] some cleanups --- crawler/1_dump_listings.py | 5 ++--- crawler/3_dump_images.py | 1 - crawler/data_access.py | 8 ++++++-- crawler/main.py | 10 +++++----- crawler/models/listing.py | 2 +- crawler/rec/query.py | 5 ++++- crawler/runall.sh | 4 ++-- crawler/ui_exporter.py | 2 -- 8 files changed, 20 insertions(+), 17 deletions(-) diff --git a/crawler/1_dump_listings.py b/crawler/1_dump_listings.py index 8f44e22..94ab514 100644 --- a/crawler/1_dump_listings.py +++ b/crawler/1_dump_listings.py @@ -5,7 +5,6 @@ from typing import Any from rec.query import detail_query, listing_query, QueryParameters from rec.districts import get_districts from repositories import ListingRepository -from sqlalchemy import Engine from tqdm.asyncio import tqdm from data_access import Listing from models import Listing as modelListing @@ -61,13 +60,13 @@ async def dump_listings( for listing in listings_without_details if listing.identifier not in all_listing_ids ], - desc="Fetching details", + desc="Fetching details (only missing)", ) for listing, detail in zip(listings_without_details, listing_details): listing._details_object = detail - model_listings = await repository.upsert_listings_legacy(listings) # upsert in db await dump_listings_to_fs(listings) + model_listings = await repository.upsert_listings_legacy(listings) # upsert in db return model_listings diff --git a/crawler/3_dump_images.py b/crawler/3_dump_images.py index 2cac145..2277814 100644 --- a/crawler/3_dump_images.py +++ b/crawler/3_dump_images.py @@ -5,7 +5,6 @@ import aiohttp from repositories import ListingRepository from tqdm.asyncio import tqdm -# from data_access import Listing from models import Listing # Setting this too high either crashes rightmove or gets us blocked diff --git a/crawler/data_access.py b/crawler/data_access.py index f6e4bf2..7493bf1 100644 --- 
a/crawler/data_access.py +++ b/crawler/data_access.py @@ -263,9 +263,13 @@ class Listing: if self.path_detail_json().exists(): with open(self.path_detail_json()) as f: self._details_object = json.load(f) + return self._details_object # type: ignore else: - return {} - return self._details_object # type: ignore + raise ValueError( + f"Detail object for listing {self.identifier} not found." + ) + else: + return self._details_object @property def price(self) -> float: diff --git a/crawler/main.py b/crawler/main.py index 150efa5..580e4ac 100644 --- a/crawler/main.py +++ b/crawler/main.py @@ -44,9 +44,9 @@ def listing_filter_options(func): ) @click.option( "--max-bedrooms", - default=5, + default=10, help="Maximum number of bedrooms", - type=click.IntRange(min=1), + type=click.IntRange(min=1, max=10), # Rightmove gets unhappy with >10 ) @click.option( "--min-price", @@ -56,9 +56,9 @@ def listing_filter_options(func): ) @click.option( "--max-price", - default=1000000, + default=999_999, help="Maximum price", - type=click.IntRange(min=0), + type=click.IntRange(min=0, max=40_000), # 40k for renting ) @click.option( "--district", @@ -359,7 +359,7 @@ def populate_db( listings = Listing.get_all_listings( [path for path in pathlib.Path(data_dir).glob("*/listing.json")] ) - asyncio.run(repository.upsert_listings(listings)) + asyncio.run(repository.upsert_listings_legacy(listings)) if __name__ == "__main__": diff --git a/crawler/models/listing.py b/crawler/models/listing.py index f97b53d..6feb0f7 100644 --- a/crawler/models/listing.py +++ b/crawler/models/listing.py @@ -239,7 +239,7 @@ class QueryParameters: district_names: set[str] = dataclasses.field(default_factory=set) radius: float = 0 page_size: int = 500 # items per page - max_days_since_added: int = 30 + max_days_since_added: int = 14 # for buy listings furnish_types: list[FurnishType] | None = None # The values below are not supported by rightmove # hence we apply them after fetching diff --git a/crawler/rec/query.py 
b/crawler/rec/query.py index 003fb2b..b60ea6b 100644 --- a/crawler/rec/query.py +++ b/crawler/rec/query.py @@ -84,7 +84,10 @@ async def listing_query( 7, 14, ]: - raise Exception("Invalid max days. Can only be", [1, 3, 7, 14]) + raise Exception( + f"Invalid max days - {max_days_since_added} Can only be got", + [1, 3, 7, 14], + ) params["maxDaysSinceAdded"] = str(max_days_since_added) if mustNewHome: diff --git a/crawler/runall.sh b/crawler/runall.sh index b5500af..fdbd35b 100755 --- a/crawler/runall.sh +++ b/crawler/runall.sh @@ -2,9 +2,9 @@ set -euxo pipefail -DATA_DIR="data/rs/test" +DATA_DIR="data/rs" -LISTING_FILTER_OPTIONS="--min-price 2000 --max-price 4000 --min-bedrooms 2 --max-bedrooms 4 -t rent --available-from $(date +%Y-%m-%d) --last-seen-days 7 --furnish-types furnished" +LISTING_FILTER_OPTIONS="--min-price 2000 --max-price 4000 --min-bedrooms 2 -t rent --available-from $(date +%Y-%m-%d) --last-seen-days 7 --furnish-types furnished" #LISTING_FILTER_OPTIONS="--min-price 2000 --max-price 2500 --min-bedrooms 2 --max-bedrooms 4 -t rent --available-from $(date +%Y-%m-%d) --last-seen-days 7 --furnish-types furnished --district Islington" # DEBUG: UNCOMMENT ME WHEN TESTING diff --git a/crawler/ui_exporter.py b/crawler/ui_exporter.py index 2091f86..78b180a 100644 --- a/crawler/ui_exporter.py +++ b/crawler/ui_exporter.py @@ -1,8 +1,6 @@ -import dataclasses import json import pathlib -from data_access import Listing from rec.query import QueryParameters from repositories.listing_repository import ListingRepository