diff --git a/crawler/data_access.py b/crawler/data_access.py
index cda1708..a2a9838 100644
--- a/crawler/data_access.py
+++ b/crawler/data_access.py
@@ -158,7 +158,7 @@ class Listing:
             json.dump(objs, f)
 
     @property
-    def sqm_model(self, recalculate=True):
+    def sqm_model(self, recalculate=True) -> float:
         if not self.path_floorplan_model_json().exists() or recalculate:
             self.calculate_sqm_model()
 
@@ -193,7 +193,7 @@ class Listing:
         with open(self.path_floorplan_ocr_json(), "w") as f:
             json.dump(objs, f)
 
-    async def sqm_ocr(self, recalculate=False):
+    async def sqm_ocr(self, recalculate=False) -> float | None:
         if not self.path_floorplan_ocr_json().exists() or recalculate:
             await self.calculate_sqm_ocr()
 
diff --git a/crawler/main.py b/crawler/main.py
index c833c41..4d8f697 100644
--- a/crawler/main.py
+++ b/crawler/main.py
@@ -88,6 +88,12 @@ def listing_filter_options(func):
         default=14,
         type=int,
     )
+    @click.option(
+        "--min-sqm",
+        help="Minimum square meters for the listing",
+        default=None,
+        type=int,
+    )
     @wraps(func)
     def wrapper(*args, **kwargs):
         return func(*args, **kwargs)
@@ -128,6 +134,7 @@ def dump_listings(
     furnish_types: list[str],
     available_from: datetime | None,
     last_seen_days: int,
+    min_sqm: int | None = None,
 ):
     data_dir: str = ctx.obj["data_dir"]
     query_parameters = QueryParameters(
@@ -140,6 +147,7 @@ def dump_listings(
         furnish_types=[FurnishType[furnish_type] for furnish_type in furnish_types],
         let_date_available_from=available_from,
         last_seen_days=last_seen_days,
+        min_sqm=min_sqm,
     )
     click.echo(
         f"Running dump_listings for districts {district}, data dir {data_dir} and parameters: "
@@ -288,6 +296,7 @@ def export_immoweb(
     furnish_types: list[str],
     available_from: datetime | None,
     last_seen_days: int,
+    min_sqm: int | None = None,
 ):
     query_parameters = QueryParameters(
         listing_type=ListingType[type],
@@ -299,6 +308,7 @@ def export_immoweb(
         furnish_types=[FurnishType[furnish_type] for furnish_type in furnish_types],
         let_date_available_from=available_from,
         last_seen_days=last_seen_days,
+        min_sqm=min_sqm,
     )
     click.echo(
         f"Exporting data to {output_file} that matches the query parameters: {query_parameters}"
diff --git a/crawler/rec/query.py b/crawler/rec/query.py
index e340bd0..3c0c50e 100644
--- a/crawler/rec/query.py
+++ b/crawler/rec/query.py
@@ -36,6 +36,7 @@ class QueryParameters:
     # available from; council tax
     let_date_available_from: datetime | None = None
     last_seen_days: int | None = None
+    min_sqm: int | None = None
 
 
 async def filter_listings(
@@ -68,6 +69,9 @@ async def filter_listings(
             and listing.letDateAvailable < query_parameters.let_date_available_from
         ):
             continue
+        if query_parameters.min_sqm is not None:
+            if (await listing.sqm_ocr() or 0) < query_parameters.min_sqm:
+                continue
         filtered_listings.append(listing)
 
     return filtered_listings