Reviewed revision of this patch.

Notes for whoever applies it:
- Line structure, indentation and blank context lines were reconstructed
  from the hunk line counts; regenerate the patch from the repository if
  it does not apply cleanly.
- The "index" blob ids are those of the original patch and do not
  describe the revised added lines.
- crawler/main.py and crawler/data_access.py use pathlib, but no hunk here
  adds that import to them; confirm it already exists in both files.

diff --git a/crawler/3_dump_images.py b/crawler/3_dump_images.py
index bd45491..2036ffa 100644
--- a/crawler/3_dump_images.py
+++ b/crawler/3_dump_images.py
@@ -1,11 +1,12 @@
 import json
+import pathlib
 from urllib.request import urlretrieve
 from tqdm import tqdm
 from data_access import Listing
 
 
-def dump_images():
-    for listing in tqdm(Listing.get_all_listings()):
+def dump_images(listing_paths: list[str]):
+    for listing in tqdm(Listing.get_all_listings(listing_paths)):
         with open(listing.path_detail_json()) as f:
             detail = json.load(f)
 
@@ -29,12 +30,13 @@ def dump_images():
             tqdm.write(str(p))
             try:
                 urlretrieve(url, p)
-            except:
-                tqdm.write(f"404 for {url}")
+            except Exception as e:
+                tqdm.write(f"Error for {url}: {e}")
 
 
 def main():
-    dump_images()
+    listing_paths = sorted(pathlib.Path("data/rs").glob("*/listing.json"))
+    dump_images(listing_paths)
 
 
 if __name__ == "__main__":
diff --git a/crawler/data_access.py b/crawler/data_access.py
index e7fea83..71a9c82 100644
--- a/crawler/data_access.py
+++ b/crawler/data_access.py
@@ -19,7 +19,19 @@ class Listing:
         for listing_path in listing_paths:
             with open(listing_path) as f:
                 d = json.load(f)
-            identifiers.append(Listing(d["identifier"]))
+
+            # data_dir is the first ancestor of listing_path whose resolved
+            # path no longer contains the identifier.
+            identifier = str(d["identifier"])
+            data_dir = pathlib.Path(listing_path)
+            # Stop at the filesystem root or "." (where parent == self) so an
+            # identifier that matches every ancestor cannot loop forever.
+            while (
+                identifier in str(data_dir.resolve())
+                and data_dir != data_dir.parent
+            ):
+                data_dir = data_dir.parent
+            identifiers.append(Listing(d["identifier"], data_dir=data_dir))
 
         return identifiers
 
diff --git a/crawler/main.py b/crawler/main.py
index 5175357..1d1ef62 100644
--- a/crawler/main.py
+++ b/crawler/main.py
@@ -66,9 +66,12 @@ def dump_detail(ctx: click.core.Context):
 
 
 @cli.command()
-def dump_images():
-    click.echo('Running dump_images')
-    dump_images_module.dump_images()
+@click.pass_context
+def dump_images(ctx: click.core.Context):
+    data_dir = ctx.obj['data_dir']
+    click.echo(f'Running dump_images stored in {data_dir}')
+    listing_paths = sorted(pathlib.Path(data_dir).glob("*/listing.json"))
+    dump_images_module.dump_images(listing_paths)
 
 
 @cli.command()