"""Image fetcher service - downloads floorplan images for listings."""

import asyncio
import logging
from pathlib import Path
from urllib.parse import urlparse

import aiohttp
from repositories import ListingRepository
from tenacity import retry, stop_after_attempt, wait_random
from tqdm.asyncio import tqdm

from models import Listing

logger = logging.getLogger(__name__)

# Maximum number of concurrent image downloads.
# Setting this too high either crashes Rightmove or gets us blocked.
MAX_CONCURRENT_DOWNLOADS = 5
semaphore = asyncio.Semaphore(MAX_CONCURRENT_DOWNLOADS)


async def dump_images(
    repository: ListingRepository,
    image_base_path: Path = Path("data/rs/"),
) -> None:
    """Download floorplan images for all listings.

    Fetches every listing from ``repository``, downloads the missing
    floorplan images concurrently over one shared HTTP session, and upserts
    the listings for which at least one new image was stored.

    Args:
        repository: Source of the listings and sink for the updated ones.
        image_base_path: Root directory under which images are written.

    Raises:
        Exception: Whatever a per-listing download raises once its retries
            are exhausted; in that case nothing is upserted.
    """
    listings = await repository.get_listings()

    async with aiohttp.ClientSession() as session:
        updated_listings = await tqdm.gather(
            *[
                dump_images_for_listing(listing, image_base_path, session=session)
                for listing in listings
            ]
        )

    # The HTTP session is no longer needed once all downloads have finished.
    await repository.upsert_listings(
        [listing for listing in updated_listings if listing is not None]
    )


def _write_file_atomically(destination: Path, payload: bytes) -> None:
    """Write ``payload`` to ``destination`` without exposing a partial file.

    The bytes go to a sibling ``.part`` file that is renamed over the target
    only after the write succeeded, so a failure never leaves a truncated
    image that a later ``exists()`` check would mistake for a finished one.
    """
    destination.parent.mkdir(parents=True, exist_ok=True)
    partial = destination.with_name(f"{destination.name}.part")
    try:
        partial.write_bytes(payload)
        partial.replace(destination)
    finally:
        # No-op after a successful rename; removes the leftover on failure.
        partial.unlink(missing_ok=True)


async def _download_floorplan(
    session: aiohttp.ClientSession,
    url: str,
    destination: Path,
    listing_id: object,
) -> bool:
    """Download one floorplan to ``destination``.

    Returns:
        True when the image was stored, False when the server answered 404.

    Raises:
        RuntimeError: For any HTTP status other than 200 or 404.
    """
    async with semaphore:
        async with session.get(url) as response:
            if response.status == 404:
                logger.warning(
                    "Listing %s: floorplan not found (404) at %s",
                    listing_id,
                    url,
                )
                return False
            if response.status != 200:
                raise RuntimeError(
                    f"Error downloading floorplan for listing {listing_id} "
                    f"from {url}: HTTP {response.status}"
                )
            # Read the whole body before touching the filesystem so a broken
            # transfer cannot leave an empty file behind.
            payload = await response.read()

    # Disk I/O runs in a worker thread, outside the download semaphore, so it
    # neither blocks the event loop nor occupies a download slot.
    await asyncio.to_thread(_write_file_atomically, destination, payload)
    return True


@retry(wait=wait_random(min=1, max=2), stop=stop_after_attempt(3))
async def dump_images_for_listing(
    listing: Listing,
    base_path: Path,
    session: aiohttp.ClientSession | None = None,
) -> Listing | None:
    """Download floorplan images for a single listing.

    Every floorplan under ``additional_info["property"]["floorplans"]`` is
    stored at ``<base_path>/<listing.id>/floorplans/<file name>``. Files that
    already exist are skipped, and a floorplan answering 404 is logged and
    skipped without abandoning the remaining ones. The path of each newly
    stored image is appended to ``listing.floorplan_image_paths``.

    Args:
        listing: Listing whose floorplans should be fetched; mutated in place.
        base_path: Root directory for downloaded images.
        session: HTTP session to reuse. When omitted, one is created on
            demand and closed before returning.

    Returns:
        ``listing`` if this attempt stored at least one new image, else None.

    Raises:
        Exception: Any download or write error is logged and re-raised, which
            makes the ``retry`` decorator run the whole function again (up to
            three attempts). Images stored by a failed attempt stay on disk
            and on ``listing``; the next attempt skips them.
    """
    all_floorplans = listing.additional_info.get("property", {}).get("floorplans", [])

    owns_session = session is None
    active_session = session
    downloaded_any = False

    try:
        for floorplan in all_floorplans:
            url = floorplan["url"]
            picname = Path(urlparse(url).path).name
            floorplan_path = Path(base_path, str(listing.id), "floorplans", picname)

            if floorplan_path.exists():
                continue

            try:
                if active_session is None:
                    # Created lazily so listings with nothing to fetch never
                    # open a connection pool.
                    active_session = aiohttp.ClientSession()
                stored = await _download_floorplan(
                    active_session, url, floorplan_path, listing.id
                )
            except Exception as e:
                logger.error(
                    "Listing %s: error downloading floorplan from %s: %s",
                    listing.id,
                    url,
                    e,
                )
                raise

            if stored:
                listing.floorplan_image_paths.append(str(floorplan_path))
                downloaded_any = True
    finally:
        if owns_session and active_session is not None:
            await active_session.close()

    return listing if downloaded_any else None