parameterize dump images step to work with custom data paths

This commit is contained in:
Viktor Barzin 2025-05-14 21:01:58 +00:00
parent e424361ed9
commit 07fef7fbab
No known key found for this signature in database
GPG key ID: 4056458DBDBF8863
3 changed files with 19 additions and 9 deletions

View file

@ -1,11 +1,12 @@
import json import json
import pathlib
from urllib.request import urlretrieve from urllib.request import urlretrieve
from tqdm import tqdm from tqdm import tqdm
from data_access import Listing from data_access import Listing
def dump_images(): def dump_images(listing_paths: list[str]):
for listing in tqdm(Listing.get_all_listings()): for listing in tqdm(Listing.get_all_listings(listing_paths)):
with open(listing.path_detail_json()) as f: with open(listing.path_detail_json()) as f:
detail = json.load(f) detail = json.load(f)
@ -29,12 +30,13 @@ def dump_images():
tqdm.write(str(p)) tqdm.write(str(p))
try: try:
urlretrieve(url, p) urlretrieve(url, p)
except: except Exception as e:
tqdm.write(f"404 for {url}") tqdm.write(f"Error for {url}: {e}")
def main(): def main():
dump_images() listing_paths = sorted(list(pathlib.Path("data/rs").glob("*/listing.json")))
dump_images(listing_paths)
if __name__ == "__main__": if __name__ == "__main__":

View file

@ -19,7 +19,12 @@ class Listing:
for listing_path in listing_paths: for listing_path in listing_paths:
with open(listing_path) as f: with open(listing_path) as f:
d = json.load(f) d = json.load(f)
identifiers.append(Listing(d["identifier"]))
# data_dir is the nearest ancestor of listing_path whose resolved path does not contain the listing identifier
data_dir = pathlib.Path(listing_path)
while str(d['identifier']) in str(data_dir.resolve().absolute()):
data_dir = data_dir.parent
identifiers.append(Listing(d["identifier"], data_dir=data_dir))
return identifiers return identifiers

View file

@ -66,9 +66,12 @@ def dump_detail(ctx: click.core.Context):
@cli.command() @cli.command()
def dump_images(): @click.pass_context
click.echo('Running dump_images') def dump_images(ctx: click.core.Context):
dump_images_module.dump_images() data_dir = ctx.obj['data_dir']
click.echo(f'Running dump_images stored in {data_dir}')
listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json")))
dump_images_module.dump_images(listing_paths)
@cli.command() @cli.command()