From bb9afc76fef76caf09390c94732fa2125e7f6c36 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 17 May 2025 20:40:44 +0000 Subject: [PATCH] expose rightmove query parameters as cli options --- crawler/1_dump_listings.py | 40 +++++++++++++++++++++++---------- crawler/main.py | 45 +++++++++++++++++++++++++++++++++++--- 2 files changed, 70 insertions(+), 15 deletions(-) diff --git a/crawler/1_dump_listings.py b/crawler/1_dump_listings.py index 36fe470..ea6162f 100644 --- a/crawler/1_dump_listings.py +++ b/crawler/1_dump_listings.py @@ -1,33 +1,47 @@ +from dataclasses import dataclass import pathlib from rec.query import listing_query from rec.districts import get_districts from data_access import Listing +@dataclass(frozen=True) +class QueryParameters: + min_bedrooms: int + max_bedrooms: int + min_price: int + max_price: int + district_names: set[str] + radius: float = 0 + page_size: int = 500 # items per page + max_days_since_added: int | None = None + + def dump_listings( - district_names: set[str] | None = None, - data_dir: pathlib.Path = pathlib.Path("data/rs/") -): - districts = get_districts() if district_names is None else { + parameters: QueryParameters, + data_dir: pathlib.Path = pathlib.Path("data/rs/"), +) -> list[Listing]: + districts = { district: locid for district, locid in get_districts().items() - if district in district_names + if district in parameters.district_names } print("Valid districts to scrape:", districts.keys()) + listings = [] for district, locid in districts.items(): print("#### District:", district) for i in [1, 2]: try: d = listing_query( page=i, - min_bedrooms=1, - max_bedrooms=4, - radius=0, - min_price=0, - max_price=1000000, + min_bedrooms=parameters.min_bedrooms, + max_bedrooms=parameters.max_bedrooms, + radius=parameters.radius, + min_price=parameters.min_price, + max_price=parameters.max_price, location_id=locid, - page_size=500, - max_days_since_added=None, + page_size=parameters.page_size, + 
+ max_days_since_added=parameters.max_days_since_added, ) except Exception as e: print(e) @@ -43,7 +57,9 @@ def dump_listings( listing = Listing(identifier, data_dir=data_dir) listing.dump_listing(property) + listings.append(listing) print() # break line as we used end=, above. + return listings def main(): diff --git a/crawler/main.py b/crawler/main.py index 6e40416..66e5687 100644 --- a/crawler/main.py +++ b/crawler/main.py @@ -41,6 +41,30 @@ def cli(ctx, data_dir: str): @cli.command() +@click.option( + '--min-bedrooms', + default=1, + help='Minimum number of bedrooms', + type=click.IntRange(min=1), +) +@click.option( + '--max-bedrooms', + default=5, + help='Maximum number of bedrooms', + type=click.IntRange(min=1), +) +@click.option( + '--min-price', + default=0, + help='Minimum price', + type=click.IntRange(min=0), +) +@click.option( + '--max-price', + default=1000000, + help='Maximum price', + type=click.IntRange(min=0), +) @click.option( '--district', default=None, @@ -49,13 +73,28 @@ def cli(ctx, data_dir: str): multiple=True, ) @click.pass_context -def dump_listings(ctx: click.core.Context, district: list[str]): +def dump_listings( + ctx: click.core.Context, + district: list[str], + min_bedrooms: int, + max_bedrooms: int, + min_price: int, + max_price: int, +): data_dir: str = ctx.obj['data_dir'] + query_parameters = dump_listings_module.QueryParameters( + district_names=set(district), + min_bedrooms=min_bedrooms, + max_bedrooms=max_bedrooms, + min_price=min_price, + max_price=max_price, + ) click.echo( - f'Running dump_listings for districts {district} and data dir {data_dir}' + f'Running dump_listings for districts {district}, data dir {data_dir} and parameters: ' + f'{query_parameters}' ) data_dir_path = pathlib.Path(data_dir) - dump_listings_module.dump_listings(query_parameters, data_dir_path) @cli.command()