From 68cc70bd115188b94edcc155d70fa0f2ac959c70 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 17 May 2025 22:42:37 +0000 Subject: [PATCH] dump images using aiohttp and concurrently --- crawler/2_dump_detail.py | 3 +-- crawler/3_dump_images.py | 55 +++++++++++++++++++++------------------- 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/crawler/2_dump_detail.py b/crawler/2_dump_detail.py index 42984e5..8019fe2 100644 --- a/crawler/2_dump_detail.py +++ b/crawler/2_dump_detail.py @@ -19,8 +19,7 @@ async def dump_detail(listing_paths: list[str]): async def _dump_detail_for_listing(listing: Listing): - incremental = True - if incremental and not listing.path_detail_json().exists(): + if listing.path_detail_json().exists(): return # for listing in tqdm(filtered_listings): diff --git a/crawler/3_dump_images.py b/crawler/3_dump_images.py index 2036ffa..ecbbe2c 100644 --- a/crawler/3_dump_images.py +++ b/crawler/3_dump_images.py @@ -1,37 +1,40 @@ +import asyncio import json import pathlib -from urllib.request import urlretrieve +import aiohttp from tqdm import tqdm from data_access import Listing +# Setting this too high either crashes rightmove or gets us blocked +semaphore = asyncio.Semaphore(10) -def dump_images(listing_paths: list[str]): - for listing in tqdm(Listing.get_all_listings(listing_paths)): - with open(listing.path_detail_json()) as f: - detail = json.load(f) - # for photo in detail["property"]["photos"]: - # url = photo["maxSizeUrl"] - # picname = url.split("/")[-1] - # order = photo["order"] - # p = listing.path_pic_file(order, picname) - # if p.exists(): - # continue - # tqdm.write(str(p)) - # urlretrieve(url, p) +async def dump_images(listing_paths: list[str]): + listings = Listing.get_all_listings(listing_paths) + await tqdm.gather(*[dump_images_for_listing(listing) for listing in listings]) - for photo in detail["property"]["floorplans"]: - url = photo["url"] - picname = url.split("/")[-1] - order = photo["order"] - p = listing.path_floorplan_file(order, picname) - if p.exists(): - continue - tqdm.write(str(p)) - try: - urlretrieve(url, p) - except Exception as e: - tqdm.write(f"Error for {url}: {e}") + +async def dump_images_for_listing(listing: Listing): + with open(listing.path_detail_json()) as f: + detail = json.load(f) + + for photo in detail["property"]["floorplans"]: + url = photo["url"] + picname = url.split("/")[-1] + order = photo["order"] + p = listing.path_floorplan_file(order, picname) + if p.exists(): + continue + try: + async with aiohttp.ClientSession() as session: + async with semaphore: + async with session.get(url) as response: + if response.status != 200: + raise Exception(f"Error for {url}: {response.status}") + with open(p, "wb") as f: + f.write(await response.read()) + except Exception as e: + tqdm.write(f"Error for {url}: {e}") def main():