From 70e8ef9f954f24f7e840971fe36109a0a540bb03 Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Sun, 11 May 2025 19:04:19 +0000
Subject: [PATCH] [3/n] click-ify add dump images command

run with poetry run python main.py --step dump_images
---
 crawler/3_dump_images.py | 58 +++++++++++++++++++++++-----------------
 crawler/main.py          |  2 ++
 2 files changed, 36 insertions(+), 24 deletions(-)

diff --git a/crawler/3_dump_images.py b/crawler/3_dump_images.py
index 5792b5b..bd45491 100644
--- a/crawler/3_dump_images.py
+++ b/crawler/3_dump_images.py
@@ -3,29 +3,39 @@ from urllib.request import urlretrieve
 from tqdm import tqdm
 from data_access import Listing
 
-for listing in tqdm(Listing.get_all_listings()):
-    with open(listing.path_detail_json()) as f:
-        detail = json.load(f)
-    # for photo in detail["property"]["photos"]:
-    #     url = photo["maxSizeUrl"]
-    #     picname = url.split("/")[-1]
-    #     order = photo["order"]
-    #     p = listing.path_pic_file(order, picname)
-    #     if p.exists():
-    #         continue
-    #     tqdm.write(str(p))
-    #     urlretrieve(url, p)
+def dump_images():
+    for listing in tqdm(Listing.get_all_listings()):
+        with open(listing.path_detail_json()) as f:
+            detail = json.load(f)
 
-    for photo in detail["property"]["floorplans"]:
-        url = photo["url"]
-        picname = url.split("/")[-1]
-        order = photo["order"]
-        p = listing.path_floorplan_file(order, picname)
-        if p.exists():
-            continue
-        tqdm.write(str(p))
-        try:
-            urlretrieve(url, p)
-        except:
-            tqdm.write(f"404 for {url}")
+        # for photo in detail["property"]["photos"]:
+        #     url = photo["maxSizeUrl"]
+        #     picname = url.split("/")[-1]
+        #     order = photo["order"]
+        #     p = listing.path_pic_file(order, picname)
+        #     if p.exists():
+        #         continue
+        #     tqdm.write(str(p))
+        #     urlretrieve(url, p)
+
+        for photo in detail["property"]["floorplans"]:
+            url = photo["url"]
+            picname = url.split("/")[-1]
+            order = photo["order"]
+            p = listing.path_floorplan_file(order, picname)
+            if p.exists():
+                continue
+            tqdm.write(str(p))
+            try:
+                urlretrieve(url, p)
+            except Exception:
+                tqdm.write(f"404 for {url}")
+
+
+def main():
+    dump_images()
+
+
+if __name__ == "__main__":
+    main()
 
diff --git a/crawler/main.py b/crawler/main.py
index b7767c3..530a58f 100644
--- a/crawler/main.py
+++ b/crawler/main.py
@@ -3,8 +3,10 @@ import importlib
 
 dump_listings_module = importlib.import_module('1_dump_listings')
 dump_detail_module = importlib.import_module('2_dump_detail')
+dump_images_module = importlib.import_module('3_dump_images')
 
 steps_to_handlers = {
     'dump_listings': dump_listings_module.dump_listings,
     'dump_detail': dump_detail_module.dump_detail,
+    'dump_images': dump_images_module.dump_images,
 }