"""Image fetcher service - downloads floorplan images for listings."""

import asyncio
import logging
from pathlib import Path
from urllib.parse import urlparse

import aiohttp
from rec.exceptions import FloorplanDownloadError
from repositories import ListingRepository
from tenacity import retry, stop_after_attempt, wait_random
from tqdm.asyncio import tqdm

from models import Listing

logger = logging.getLogger(__name__)

# Maximum number of concurrent image downloads.
# Setting this too high either crashes Rightmove or gets us blocked.
MAX_CONCURRENT_DOWNLOADS = 5
semaphore = asyncio.Semaphore(MAX_CONCURRENT_DOWNLOADS)


async def dump_images(
    repository: ListingRepository,
    image_base_path: Path = Path("data/rs/"),
) -> None:
    """Download floorplan images for all listings.

    Fetches every listing from ``repository``, runs
    ``dump_images_for_listing`` for each of them over one shared HTTP
    session (with a tqdm progress bar), and upserts the listings for which
    that call returned a listing rather than ``None``.

    Args:
        repository: Source of the listings and target of the upsert.
        image_base_path: Root directory under which images are stored.
    """
    listings = await repository.get_listings()

    # One session for the whole batch; the module-level semaphore inside
    # dump_images_for_listing bounds how many requests are in flight.
    async with aiohttp.ClientSession() as session:
        updated_listings = await tqdm.gather(
            *[
                dump_images_for_listing(listing, image_base_path, session=session)
                for listing in listings
            ]
        )

    await repository.upsert_listings(
        [listing for listing in updated_listings if listing is not None]
    )


@retry(wait=wait_random(min=1, max=2), stop=stop_after_attempt(3))
async def dump_images_for_listing(
    listing: Listing,
    base_path: Path,
    session: aiohttp.ClientSession | None = None,
) -> Listing | None:
    """Download floorplan images for a single listing.

    Walks the floorplans listed under ``additional_info["property"]
    ["floorplans"]`` and downloads the first one whose target file
    ``<base_path>/<listing.id>/floorplans/<file name from URL>`` does not
    exist yet. The whole coroutine is retried by tenacity (up to three
    attempts, 1-2 seconds random wait) when it raises.

    Args:
        listing: Listing whose floorplans should be fetched. Its
            ``floorplan_image_paths`` gets the new file path appended.
        base_path: Root directory under which images are stored.
        session: HTTP session to use. When ``None`` a temporary session is
            created for the request and closed afterwards.

    Returns:
        The listing after a floorplan was downloaded and recorded, or
        ``None`` when the server answered 404 or nothing needed downloading.

    Raises:
        FloorplanDownloadError: The server answered with a status other
            than 200 or 404.
        Exception: Any other failure during the download is logged and
            re-raised, which triggers the retry.
    """
    all_floorplans = listing.additional_info.get("property", {}).get("floorplans", [])

    for floorplan in all_floorplans:
        url = floorplan["url"]
        picname = Path(urlparse(url).path).name
        floorplan_path = Path(base_path, str(listing.id), "floorplans", picname)

        # Already on disk from an earlier run (or an earlier retry attempt).
        if floorplan_path.exists():
            continue

        try:
            owns_session = session is None
            active_session = aiohttp.ClientSession() if owns_session else session
            try:
                async with semaphore:
                    async with active_session.get(url) as response:
                        if response.status == 404:
                            logger.warning(
                                "Listing %s: floorplan not found (404) at %s",
                                listing.id,
                                url,
                            )
                            # NOTE(review): this gives up on the listing's
                            # remaining floorplans as well - confirm intended.
                            return None
                        if response.status != 200:
                            raise FloorplanDownloadError(url, response.status)

                        # Read the body BEFORE touching the destination, and
                        # publish the file with an atomic rename. Otherwise a
                        # failed read/write leaves an empty or truncated file
                        # that the exists() check above would treat as
                        # "already downloaded" on the retry and on later runs.
                        body = await response.read()
                        floorplan_path.parent.mkdir(parents=True, exist_ok=True)
                        partial_path = floorplan_path.with_name(
                            floorplan_path.name + ".part"
                        )
                        partial_path.write_bytes(body)
                        partial_path.replace(floorplan_path)

                        listing.floorplan_image_paths.append(str(floorplan_path))
                        # NOTE(review): returns after the first successful
                        # download, so a listing with several missing
                        # floorplans needs several runs - confirm intended.
                        return listing
            finally:
                # Only close sessions created here; a caller-supplied session
                # stays open for the caller's other requests.
                if owns_session:
                    await active_session.close()
        except Exception as e:
            logger.error(
                "Listing %s: error downloading floorplan from %s: %s",
                listing.id,
                url,
                e,
            )
            raise

    return None