diff --git a/crawler/2_dump_detail.py b/crawler/2_dump_detail.py
index e29e995..bf9696a 100644
--- a/crawler/2_dump_detail.py
+++ b/crawler/2_dump_detail.py
@@ -1,13 +1,14 @@
 import json
+import pathlib
 from rec.query import detail_query
 from tqdm import tqdm
 
 from data_access import Listing
 
 
-def dump_detail():
+def dump_detail(listing_paths: list[pathlib.Path]):
     incremental = True
-    listings = Listing.get_all_listings()
+    listings = Listing.get_all_listings(listing_paths)
     filtered_listings = []
     for listing in listings:
         # We introduced last_seen later, so not all entries have it.
@@ -32,7 +33,8 @@ def dump_detail():
 
 
 def main():
-    dump_detail()
+    listing_paths = sorted(list(pathlib.Path("data/rs").glob("*/listing.json")))
+    dump_detail(listing_paths)
 
 
 if __name__ == "__main__":
diff --git a/crawler/data_access.py b/crawler/data_access.py
index b0be4b1..e7fea83 100644
--- a/crawler/data_access.py
+++ b/crawler/data_access.py
@@ -13,8 +13,8 @@ class Listing:
     _cached: Dict = None
     data_dir: pathlib.Path = pathlib.Path("data/rs/")
 
-    def get_all_listings(self) -> List["Listing"]:
-        listing_paths = sorted(list(self.data_dir.glob("*/listing.json")))
+    @staticmethod
+    def get_all_listings(listing_paths: list[pathlib.Path]) -> List["Listing"]:
         identifiers = []
         for listing_path in listing_paths:
             with open(listing_path) as f:
@@ -294,5 +294,6 @@ class Listing:
 
 
 if __name__ == "__main__":
-    listings = Listing.get_all_listings()
+    listing_paths = sorted(list(pathlib.Path("data/rs").glob("*/listing.json")))
+    listings = Listing.get_all_listings(listing_paths)
     print(listings[0].list_floorplans())
diff --git a/crawler/main.py b/crawler/main.py
index 847bf91..5175357 100644
--- a/crawler/main.py
+++ b/crawler/main.py
@@ -20,18 +20,6 @@ steps_to_handlers = {
 
 
 @click.group()
-def cli():
-    pass
-
-
-@cli.command()
-@click.option(
-    '--district',
-    default=None,
-    help='Districts to scrape',
-    type=click.Choice(get_districts().keys(), case_sensitive=False),
-    multiple=True,
-)
 @click.option(
     '--data-dir',
     default=pathlib.Path("data/rs/"),
@@ -43,7 +31,24 @@ def cli():
         resolve_path=True,
     ),
 )
-def dump_listings(district: list[str], data_dir: str):
+@click.pass_context
+def cli(ctx, data_dir: str):
+    ctx.ensure_object(dict)
+    ctx.obj['data_dir'] = data_dir
+    pass
+
+
+@cli.command()
+@click.option(
+    '--district',
+    default=None,
+    help='Districts to scrape',
+    type=click.Choice(get_districts().keys(), case_sensitive=False),
+    multiple=True,
+)
+@click.pass_context
+def dump_listings(ctx: click.core.Context, district: list[str]):
+    data_dir: str = ctx.obj['data_dir']
     click.echo(
         f'Running dump_listings for districts {district} and data dir {data_dir}'
     )
@@ -52,9 +57,12 @@ def dump_listings(district: list[str], data_dir: str):
 
 
 @cli.command()
-def dump_detail():
-    click.echo('Running dump_detail')
-    dump_detail_module.dump_detail()
+@click.pass_context
+def dump_detail(ctx: click.core.Context):
+    data_dir = ctx.obj['data_dir']
+    click.echo(f'Running dump_detail for listings stored in {data_dir}')
+    listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json")))
+    dump_detail_module.dump_detail(listing_paths)
 
 
 @cli.command()