Parameterize the dump-images step to work with custom data paths

This commit is contained in:
Viktor Barzin 2025-05-14 21:01:58 +00:00
parent e424361ed9
commit 07fef7fbab
No known key found for this signature in database
GPG key ID: 4056458DBDBF8863
3 changed files with 19 additions and 9 deletions

View file

@@ -1,11 +1,12 @@
import json
import pathlib
from urllib.request import urlretrieve
from tqdm import tqdm
from data_access import Listing
def dump_images():
for listing in tqdm(Listing.get_all_listings()):
def dump_images(listing_paths: list[str]):
for listing in tqdm(Listing.get_all_listings(listing_paths)):
with open(listing.path_detail_json()) as f:
detail = json.load(f)
@@ -29,12 +30,13 @@ def dump_images():
tqdm.write(str(p))
try:
urlretrieve(url, p)
except:
tqdm.write(f"404 for {url}")
except Exception as e:
tqdm.write(f"Error for {url}: {e}")
def main():
dump_images()
listing_paths = sorted(list(pathlib.Path("data/rs").glob("*/listing.json")))
dump_images(listing_paths)
if __name__ == "__main__":

View file

@@ -19,7 +19,12 @@ class Listing:
for listing_path in listing_paths:
with open(listing_path) as f:
d = json.load(f)
identifiers.append(Listing(d["identifier"]))
# data_dir is the nearest ancestor of listing_path whose resolved absolute path no longer contains the listing identifier
data_dir = pathlib.Path(listing_path)
while str(d['identifier']) in str(data_dir.resolve().absolute()):
data_dir = data_dir.parent
identifiers.append(Listing(d["identifier"], data_dir=data_dir))
return identifiers

View file

@@ -66,9 +66,12 @@ def dump_detail(ctx: click.core.Context):
@cli.command()
def dump_images():
click.echo('Running dump_images')
dump_images_module.dump_images()
@click.pass_context
def dump_images(ctx: click.core.Context):
data_dir = ctx.obj['data_dir']
click.echo(f'Running dump_images stored in {data_dir}')
listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json")))
dump_images_module.dump_images(listing_paths)
@cli.command()