- Extract rate limiter DRY: consolidate 3 duplicated check/respond paths into _check_counter and _enforce_limit helpers, add proper type annotations
- Replace bare Exception raises with FloorplanDownloadError and RightmoveApiError; narrow catch clauses to specific exception types; fix Step base class to inherit from ABC
- Consolidate MAX_OCR_WORKERS into config/scraper_config.py; extract _find_tenure_value helper to deduplicate tenure parsing
- Extract _build_poi_distances_lookup from stream endpoint to reduce nesting
- Fix csv_exporter: optional decisions.json, NaN instead of -1 sentinels, guard against division by zero on missing square meters
- Fix notifications.py broken list[Surface]() constructor, database.py stale comments and missing type annotation, auth.py type:ignore, ui_exporter.py stale TODO
- Fix 3 pre-existing test failures: mock cache layer in streaming tests, bypass rate limiter for test isolation, fix cache invalidation test to account for two-pattern scan loop
86 lines
3.1 KiB
Python
86 lines
3.1 KiB
Python
"""Image fetcher service - downloads floorplan images for listings."""
|
|
import asyncio
|
|
import logging
|
|
from pathlib import Path
|
|
from urllib.parse import urlparse
|
|
|
|
import aiohttp
|
|
from rec.exceptions import FloorplanDownloadError
|
|
from repositories import ListingRepository
|
|
from tenacity import retry, stop_after_attempt, wait_random
|
|
from tqdm.asyncio import tqdm
|
|
|
|
from models import Listing
|
|
|
|
logger = logging.getLogger(__name__)

# Maximum number of concurrent image downloads.
# Setting this too high either crashes Rightmove or gets us blocked.
MAX_CONCURRENT_DOWNLOADS = 5

# Module-wide semaphore, created once at import time. dump_images_for_listing
# acquires it around each HTTP request, so at most MAX_CONCURRENT_DOWNLOADS
# requests are in flight at any moment regardless of how many listings are
# being processed concurrently.
semaphore = asyncio.Semaphore(MAX_CONCURRENT_DOWNLOADS)
|
|
|
|
|
|
async def dump_images(
    repository: ListingRepository,
    image_base_path: Path = Path("data/rs/"),
) -> None:
    """Fetch floorplan images for every stored listing and persist the results.

    Args:
        repository: Provides the listings to process via ``get_listings`` and
            receives the listings that produced a result via
            ``upsert_listings``.
        image_base_path: Root directory handed to each per-listing download.
    """
    all_listings = await repository.get_listings()

    # A single HTTP session is shared by all per-listing downloads.
    async with aiohttp.ClientSession() as http_session:
        pending = [
            dump_images_for_listing(item, image_base_path, session=http_session)
            for item in all_listings
        ]
        # tqdm.gather runs the downloads concurrently with a progress bar.
        results = await tqdm.gather(*pending)

    # A per-listing download returns None when it stored nothing new; only
    # listings that actually came back are written to the repository.
    changed = [result for result in results if result is not None]
    await repository.upsert_listings(changed)
|
|
|
|
|
|
@retry(wait=wait_random(min=1, max=2), stop=stop_after_attempt(3))
async def dump_images_for_listing(
    listing: Listing,
    base_path: Path,
    session: aiohttp.ClientSession | None = None,
) -> Listing | None:
    """Download a missing floorplan image for a single listing.

    Floorplan entries are read from
    ``listing.additional_info["property"]["floorplans"]``. Each image is stored
    at ``<base_path>/<listing.id>/floorplans/<file name taken from the URL>``;
    entries whose file already exists on disk are skipped.

    The whole call is retried (up to three attempts, with a random 1-2 second
    wait in between) whenever it raises.

    NOTE(review): the function returns right after the first successful
    download, so a listing with several missing floorplans gets one new image
    per call -- confirm this is intended.

    Args:
        listing: Listing whose floorplans should be fetched. Its
            ``floorplan_image_paths`` list is appended to in place.
        base_path: Root directory under which images are stored.
        session: Optional shared HTTP session. When omitted, a temporary
            session is created for the request and closed afterwards.

    Returns:
        ``listing`` after a floorplan was downloaded and its path recorded;
        ``None`` when the server answered 404 or when there was nothing left
        to download.

    Raises:
        FloorplanDownloadError: The server answered with a status other than
            200 or 404.
        Exception: Any other failure is logged and re-raised so the retry
            decorator can make another attempt.
    """
    all_floorplans = listing.additional_info.get("property", {}).get("floorplans", [])
    for floorplan in all_floorplans:
        url = floorplan["url"]
        picname = Path(urlparse(url).path).name
        floorplan_path = Path(base_path, str(listing.id), "floorplans", picname)
        if floorplan_path.exists():
            # Already downloaded on a previous run or attempt.
            continue
        try:
            owns_session = session is None
            active_session = aiohttp.ClientSession() if owns_session else session
            try:
                async with semaphore, active_session.get(url) as response:
                    if response.status == 404:
                        logger.warning(
                            "Listing %s: floorplan not found (404) at %s",
                            listing.id,
                            url,
                        )
                        return None
                    if response.status != 200:
                        raise FloorplanDownloadError(url, response.status)
                    # Read the complete body BEFORE creating the file. If the
                    # file were opened first and the read failed, an empty
                    # file would stay on disk and the exists() check above
                    # would make every retry skip this floorplan for good.
                    image_bytes = await response.read()
                    floorplan_path.parent.mkdir(parents=True, exist_ok=True)
                    floorplan_path.write_bytes(image_bytes)
                    listing.floorplan_image_paths.append(str(floorplan_path))
                    return listing
            finally:
                # Only close sessions this call created; a caller-provided
                # session stays open for the caller's other downloads.
                if owns_session:
                    await active_session.close()
        except Exception as e:
            logger.error(
                "Listing %s: error downloading floorplan from %s: %s",
                listing.id,
                url,
                e,
            )
            raise
    return None
|