wrongmove/services/image_fetcher.py

88 lines
3.2 KiB
Python
Raw Normal View History

"""Image fetcher service - downloads floorplan images for listings."""
import asyncio
import logging
from pathlib import Path
from urllib.parse import urlparse
import aiohttp
from rec.exceptions import FloorplanDownloadError
from repositories import ListingRepository
from tenacity import retry, stop_after_attempt, wait_random
from models import Listing
# Upper bound on the number of image downloads in flight at once.
# Too high a value either crashes Rightmove or gets us blocked.
MAX_CONCURRENT_DOWNLOADS = 5

# Logger for this module, named after the module itself.
logger = logging.getLogger(name=__name__)

# Shared by every download in this module so the cap above applies to all of
# them together rather than per call.
semaphore = asyncio.Semaphore(value=MAX_CONCURRENT_DOWNLOADS)
async def dump_images(
    repository: ListingRepository,
    image_base_path: Path = Path("data/rs/"),
) -> None:
    """Download floorplan images for all listings and persist the results.

    Every listing returned by ``repository.get_listings()`` is handed to
    ``dump_images_for_listing`` concurrently, all sharing one HTTP session.
    Listings for which that call returned a listing (i.e. something was
    downloaded) are written back with ``repository.upsert_listings()``.

    A failure for one listing does not discard the work done for the others:
    successful listings are upserted first, and only then is the first
    failure re-raised. This matters because files that already exist on disk
    are skipped on later runs, so a listing that was downloaded but never
    upserted would not be picked up again.

    Args:
        repository: Source of the listings and sink for the updated ones.
        image_base_path: Root directory under which images are stored.

    Raises:
        BaseException: The first exception raised by any per-listing
            download, re-raised after the successful listings were saved.
    """
    listings = await repository.get_listings()
    logger.info("Downloading images for %d listings", len(listings))

    async with aiohttp.ClientSession() as session:
        # return_exceptions=True lets every download run to completion and
        # hands failures back as values instead of aborting the whole batch.
        results = await asyncio.gather(
            *(
                dump_images_for_listing(listing, image_base_path, session=session)
                for listing in listings
            ),
            return_exceptions=True,
        )
    logger.info("Finished downloading images for %d listings", len(listings))

    failures = [result for result in results if isinstance(result, BaseException)]
    updated_listings = [
        result
        for result in results
        if result is not None and not isinstance(result, BaseException)
    ]

    # Persist what succeeded before surfacing any error.
    await repository.upsert_listings(updated_listings)

    if failures:
        logger.error(
            "Image download failed for %d of %d listings",
            len(failures),
            len(listings),
        )
        raise failures[0]
@retry(wait=wait_random(min=1, max=2), stop=stop_after_attempt(3))
async def dump_images_for_listing(
    listing: Listing,
    base_path: Path,
    session: aiohttp.ClientSession | None = None,
) -> Listing | None:
    """Download the next missing floorplan image for a single listing.

    Floorplan URLs are read from
    ``listing.additional_info["property"]["floorplans"]``. Each image is
    stored at ``<base_path>/<listing.id>/floorplans/<file name from URL>``;
    floorplans whose file already exists are skipped.

    The image body is fully read before anything is written, and the file is
    written under a temporary ``.part`` name and then renamed into place.
    A failed or interrupted download therefore never leaves a file at the
    final path, where it would be mistaken for a completed download by the
    ``exists()`` check on the next attempt.

    The whole call is wrapped by the ``retry`` decorator (up to 3 attempts
    with a random 1-2 second wait between them).

    NOTE(review): the function returns as soon as one floorplan has been
    downloaded, so a listing with several missing floorplans needs several
    calls - confirm this is intended.

    Args:
        listing: Listing whose floorplans should be downloaded. Its
            ``floorplan_image_paths`` list is appended to in place.
        base_path: Root directory under which images are stored.
        session: HTTP session to use. When omitted, a temporary session is
            created for the download and closed afterwards.

    Returns:
        The listing if a floorplan was downloaded; ``None`` if a floorplan
        URL answered 404 or there was nothing left to download.

    Raises:
        FloorplanDownloadError: Raised within an attempt when the server
            answers with a status other than 200 or 404. How this surfaces
            after the final attempt is determined by the ``retry`` decorator.
    """
    all_floorplans = listing.additional_info.get("property", {}).get("floorplans", [])
    for floorplan in all_floorplans:
        url = floorplan["url"]
        picname = Path(urlparse(url).path).name
        floorplan_path = Path(base_path, str(listing.id), "floorplans", picname)
        if floorplan_path.exists():
            continue
        try:
            # Only close the session if it was created here.
            owns_session = session is None
            active_session = (
                session if session is not None else aiohttp.ClientSession()
            )
            try:
                async with semaphore:
                    async with active_session.get(url) as response:
                        if response.status == 404:
                            logger.warning(
                                "Listing %s: floorplan not found (404) at %s",
                                listing.id,
                                url,
                            )
                            return None
                        if response.status != 200:
                            raise FloorplanDownloadError(url, response.status)
                        # Read the body before touching the filesystem so a
                        # network error cannot leave an empty file behind.
                        payload = await response.read()
                        floorplan_path.parent.mkdir(parents=True, exist_ok=True)
                        # Write under a temporary name and rename, so the
                        # final path only ever holds a complete image.
                        partial_path = floorplan_path.with_name(
                            floorplan_path.name + ".part"
                        )
                        partial_path.write_bytes(payload)
                        partial_path.replace(floorplan_path)
                        listing.floorplan_image_paths.append(str(floorplan_path))
                        return listing
            finally:
                if owns_session:
                    await active_session.close()
        except Exception as e:
            logger.error(
                "Listing %s: error downloading floorplan from %s: %s",
                listing.id,
                url,
                e,
            )
            raise
    return None