"""Download the floorplan images of stored listings and record their local paths."""

import asyncio
from pathlib import Path

import aiohttp
from tenacity import retry, stop_after_attempt, wait_random
from tqdm.asyncio import tqdm

from models import Listing
from repositories import ListingRepository

# Setting this too high either crashes rightmove or gets us blocked
semaphore = asyncio.Semaphore(5)


async def dump_images(
    repository: ListingRepository,
    image_base_path: Path = Path("data/rs/"),
) -> None:
    """Fetch floorplan images for every listing in *repository*.

    All listings returned by ``repository.get_listings()`` are processed
    concurrently (with a progress bar via ``tqdm.gather``); the listings that
    gained at least one new floorplan path are written back with
    ``repository.upsert_listings()``.

    Args:
        repository: Source of the listings and sink for the updated ones.
        image_base_path: Directory under which images are stored, laid out as
            ``<image_base_path>/<listing.id>/floorplans/<file name>``.
    """
    listings = await repository.get_listings()
    updated_listings = await tqdm.gather(
        *[dump_images_for_listing(listing, image_base_path) for listing in listings]
    )
    # Listings for which nothing new was recorded come back as None.
    await repository.upsert_listings(
        [listing for listing in updated_listings if listing is not None]
    )


async def _download_floorplan(
    session: aiohttp.ClientSession, url: str, destination: Path
) -> bool:
    """Download *url* into *destination*; return False if the server answers 404.

    Any status other than 200 or 404 raises, so the caller's retry kicks in.
    """
    async with session.get(url) as response:
        if response.status == 404:
            return False
        if response.status != 200:
            raise Exception(f"Error for {url}: {response.status}")
        payload = await response.read()

    destination.parent.mkdir(parents=True, exist_ok=True)
    # Write to a sibling temp file and rename it into place, so an interrupted
    # write never leaves a truncated image that a later run would treat as
    # already downloaded.
    partial = destination.with_name(destination.name + ".part")
    partial.write_bytes(payload)
    partial.replace(destination)
    return True


@retry(wait=wait_random(min=1, max=2), stop=stop_after_attempt(3))
async def dump_images_for_listing(listing: Listing, base_path: Path) -> Listing | None:
    """Download every missing floorplan image of *listing*.

    Floorplan URLs are read from
    ``listing.additional_info["property"]["floorplans"][i]["url"]``; each image
    is stored at ``<base_path>/<listing.id>/floorplans/<last URL segment>``.
    Files already present on disk are not downloaded again, and a floorplan
    whose URL answers 404 is skipped.

    Args:
        listing: Listing whose floorplans should be fetched; its
            ``floorplan_image_paths`` gains the newly recorded paths.
        base_path: Root directory for stored images.

    Returns:
        *listing* if at least one path was added to
        ``listing.floorplan_image_paths``, otherwise ``None``.

    Raises:
        Exception: On an unexpected HTTP status or any download/write error.
            The ``retry`` decorator re-runs the whole function on any
            exception, up to three attempts with a random 1-2 wait between
            them.
    """
    all_floorplans = listing.additional_info.get("property", {}).get("floorplans", [])

    # Map each target path to its URL; a dict keeps the floorplan order and
    # collapses duplicate entries that would resolve to the same file.
    targets: dict[Path, str] = {}
    for floorplan in all_floorplans:
        url = floorplan["url"]
        picname = url.split("/")[-1]
        floorplan_path = Path(base_path, str(listing.id), "floorplans", picname)
        targets.setdefault(floorplan_path, url)

    missing = [(url, path) for path, url in targets.items() if not path.exists()]
    unavailable: set[Path] = set()
    if missing:
        # One session per listing, opened only while holding the semaphore, so
        # at most five requests are in flight across all listings.
        async with semaphore, aiohttp.ClientSession() as session:
            for url, path in missing:
                try:
                    if not await _download_floorplan(session, url, path):
                        unavailable.add(path)
                except Exception as e:
                    tqdm.write(f"Error for {url}: {e}")
                    raise  # raise so that we retry it

    # Record paths only after every download of this attempt succeeded: if an
    # attempt fails midway, the retry finds the already-downloaded files on
    # disk and still records them here. This also picks up files left behind
    # by an earlier run that stopped before the listing was saved.
    recorded_any = False
    for path in targets:
        path_str = str(path)
        if path in unavailable or path_str in listing.floorplan_image_paths:
            continue
        listing.floorplan_image_paths.append(path_str)
        recorded_any = True

    return listing if recorded_any else None