From 48d379567b72297dc0d1da5cb127ec27ee7ae87a Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Wed, 14 May 2025 20:19:08 +0000 Subject: [PATCH] parameterize data path when fetching listings --- crawler/1_dump_listings.py | 8 ++++++-- crawler/data_access.py | 10 ++++------ crawler/main.py | 21 ++++++++++++++++++--- 3 files changed, 28 insertions(+), 11 deletions(-) diff --git a/crawler/1_dump_listings.py b/crawler/1_dump_listings.py index 8ba92d1..36fe470 100644 --- a/crawler/1_dump_listings.py +++ b/crawler/1_dump_listings.py @@ -1,9 +1,13 @@ +import pathlib from rec.query import listing_query from rec.districts import get_districts from data_access import Listing -def dump_listings(district_names: set[str] | None = None): +def dump_listings( + district_names: set[str] | None = None, + data_dir: pathlib.Path = pathlib.Path("data/rs/") +): districts = get_districts() if district_names is None else { district: locid for district, locid in get_districts().items() @@ -37,7 +41,7 @@ def dump_listings(district_names: set[str] | None = None): for property in d["properties"]: identifier = property["identifier"] - listing = Listing(identifier) + listing = Listing(identifier, data_dir=data_dir) listing.dump_listing(property) print() # break line as we used end=, above. 
diff --git a/crawler/data_access.py b/crawler/data_access.py index 52e4157..b0be4b1 100644 --- a/crawler/data_access.py +++ b/crawler/data_access.py @@ -6,17 +6,15 @@ from rec import floorplan, routing import re import datetime -_DATA_DIR = pathlib.Path("data/rs/") - @dataclass() class Listing: identifier: int _cached: Dict = None + data_dir: pathlib.Path = pathlib.Path("data/rs/") - @staticmethod - def get_all_listings() -> List["Listing"]: - listing_paths = sorted(list(_DATA_DIR.glob("*/listing.json"))) + def get_all_listings(self) -> List["Listing"]: + listing_paths = sorted(list(self.data_dir.glob("*/listing.json"))) identifiers = [] for listing_path in listing_paths: with open(listing_path) as f: @@ -26,7 +24,7 @@ class Listing: return identifiers def path_listing(self) -> pathlib.Path: - p = _DATA_DIR / str(self.identifier) + p = self.data_dir / str(self.identifier) p.mkdir(parents=True, exist_ok=True) return p diff --git a/crawler/main.py b/crawler/main.py index e9a60a2..847bf91 100644 --- a/crawler/main.py +++ b/crawler/main.py @@ -1,3 +1,4 @@ +import pathlib import click import importlib @@ -31,9 +32,23 @@ def cli(): type=click.Choice(get_districts().keys(), case_sensitive=False), multiple=True, ) -def dump_listings(district: list[str]): - click.echo(f'Running dump_listings for districts {district}') - dump_listings_module.dump_listings(set(district)) +@click.option( + '--data-dir', + default=pathlib.Path("data/rs/"), + help='Directory to store scraped listing data', + type=click.Path( + writable=True, + file_okay=False, + dir_okay=True, + resolve_path=True, + ), +) +def dump_listings(district: list[str], data_dir: str): + click.echo( + f'Running dump_listings for districts {district} and data dir {data_dir}' + ) + data_dir_path = pathlib.Path(data_dir) + dump_listings_module.dump_listings(set(district), data_dir_path) @cli.command()