wrongmove/services/image_fetcher.py
Viktor Barzin f833309297
Refactor backend for cleaner error handling, DRY, and type safety
- Extract rate limiter DRY: consolidate 3 duplicated check/respond paths
  into _check_counter and _enforce_limit helpers, add proper type annotations
- Replace bare Exception raises with FloorplanDownloadError and
  RightmoveApiError; narrow catch clauses to specific exception types;
  fix Step base class to inherit from ABC
- Consolidate MAX_OCR_WORKERS into config/scraper_config.py; extract
  _find_tenure_value helper to deduplicate tenure parsing
- Extract _build_poi_distances_lookup from stream endpoint to reduce nesting
- Fix csv_exporter: optional decisions.json, NaN instead of -1 sentinels,
  guard against division by zero on missing square meters
- Fix notifications.py broken list[Surface]() constructor, database.py
  stale comments and missing type annotation, auth.py type:ignore,
  ui_exporter.py stale TODO
- Fix 3 pre-existing test failures: mock cache layer in streaming tests,
  bypass rate limiter for test isolation, fix cache invalidation test to
  account for two-pattern scan loop
2026-02-10 22:19:24 +00:00

86 lines
3.1 KiB
Python

"""Image fetcher service - downloads floorplan images for listings."""
import asyncio
import logging
from pathlib import Path
from urllib.parse import urlparse
import aiohttp
from rec.exceptions import FloorplanDownloadError
from repositories import ListingRepository
from tenacity import retry, stop_after_attempt, wait_random
from tqdm.asyncio import tqdm
from models import Listing
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Maximum number of concurrent image downloads.
# Setting this too high either crashes Rightmove or gets us blocked.
MAX_CONCURRENT_DOWNLOADS = 5
# Shared by every dump_images_for_listing call: each HTTP GET is issued while
# holding this semaphore, which is what enforces the limit above.
# NOTE(review): this is created once at import time; asyncio synchronization
# primitives are tied to a single event loop once used, so confirm that
# callers never drive this module from more than one loop (e.g. repeated
# asyncio.run() calls in tests).
semaphore = asyncio.Semaphore(MAX_CONCURRENT_DOWNLOADS)
async def dump_images(
    repository: ListingRepository,
    image_base_path: Path = Path("data/rs/"),
) -> None:
    """Download floorplan images for all listings and persist the results.

    Every listing returned by ``repository.get_listings()`` is handed to
    ``dump_images_for_listing``; all downloads share one ``aiohttp`` session
    and are awaited together behind a tqdm progress bar.

    A failure for one listing no longer discards the work done for the
    others: listings that were updated are always upserted first, and only
    then is the first captured failure re-raised. Previously the first
    exception aborted the batch before the upsert, leaving image files on
    disk whose paths were never stored (and which later runs skip because
    the file already exists).

    Args:
        repository: Source of the listings and sink for the updated ones.
        image_base_path: Root directory under which images are stored.

    Raises:
        Exception: The first error raised by any per-listing download,
            re-raised after the successful listings have been upserted.
    """
    listings = await repository.get_listings()

    async with aiohttp.ClientSession() as session:

        async def download_or_capture(listing):
            # Return the failure as a value: tqdm.gather would otherwise
            # propagate the first exception and abandon the whole batch.
            try:
                return await dump_images_for_listing(
                    listing, image_base_path, session=session
                )
            except Exception as exc:
                return exc

        outcomes = await tqdm.gather(
            *[download_or_capture(listing) for listing in listings]
        )

    failures = [outcome for outcome in outcomes if isinstance(outcome, Exception)]
    updated_listings = [
        outcome
        for outcome in outcomes
        if outcome is not None and not isinstance(outcome, Exception)
    ]

    # Persist successes before surfacing any error so completed downloads
    # are never left unrecorded.
    await repository.upsert_listings(updated_listings)

    if failures:
        raise failures[0]
@retry(wait=wait_random(min=1, max=2), stop=stop_after_attempt(3))
async def dump_images_for_listing(
    listing: Listing,
    base_path: Path,
    session: aiohttp.ClientSession | None = None,
) -> Listing | None:
    """Download a missing floorplan image for a single listing.

    Floorplan URLs are read from
    ``listing.additional_info["property"]["floorplans"]``. Each image is
    stored at ``<base_path>/<listing.id>/floorplans/<file name from URL>``;
    floorplans whose file already exists are skipped.

    The image is written to a temporary ``.part`` sibling and renamed into
    place only once the full body has been received. An interrupted download
    therefore never leaves an empty or truncated file at the final path,
    which the ``exists()`` check would otherwise treat as already downloaded
    on every retry and later run.

    The call is wrapped by tenacity: when it raises it is re-run, up to
    3 attempts with a random 1-2 second wait between them.

    NOTE(review): the function returns right after the first successful
    download, so at most one new floorplan is fetched per call; a 404 also
    ends the call without trying the remaining floorplans. Confirm this is
    intended for listings with several floorplans.

    Args:
        listing: Listing whose floorplans should be fetched; its
            ``floorplan_image_paths`` list is appended to in place.
        base_path: Root directory under which images are stored.
        session: HTTP session to reuse. When omitted, a temporary session is
            created per download and closed afterwards.

    Returns:
        ``listing`` after a floorplan was downloaded, or ``None`` when the
        floorplan URL answered 404 or there was nothing left to download.

    Raises:
        FloorplanDownloadError: When the server answers with a status other
            than 200 or 404. Any other error from the request or the file
            write is logged and re-raised as well. Once the attempts are
            exhausted tenacity reports the failure according to its default
            policy (a ``RetryError`` unless ``reraise`` is configured).
    """
    all_floorplans = listing.additional_info.get("property", {}).get("floorplans", [])
    for floorplan in all_floorplans:
        url = floorplan["url"]
        picname = Path(urlparse(url).path).name
        floorplan_path = Path(base_path, str(listing.id), "floorplans", picname)
        if floorplan_path.exists():
            continue
        try:
            owns_session = session is None
            active_session = session if session is not None else aiohttp.ClientSession()
            try:
                async with semaphore:
                    async with active_session.get(url) as response:
                        if response.status == 404:
                            logger.warning(
                                "Listing %s: floorplan not found (404) at %s",
                                listing.id,
                                url,
                            )
                            return None
                        if response.status != 200:
                            raise FloorplanDownloadError(url, response.status)
                        # Fetch the whole body BEFORE touching the filesystem:
                        # a failed read must not leave a file behind.
                        payload = await response.read()
                        floorplan_path.parent.mkdir(parents=True, exist_ok=True)
                        # Write to a sibling temp file, then rename, so the
                        # final path only ever holds a complete image.
                        partial_path = floorplan_path.with_name(
                            floorplan_path.name + ".part"
                        )
                        partial_path.write_bytes(payload)
                        partial_path.replace(floorplan_path)
                        listing.floorplan_image_paths.append(str(floorplan_path))
                        return listing
            finally:
                # Only close sessions created here; a caller-provided session
                # is shared with other downloads.
                if owns_session:
                    await active_session.close()
        except Exception as e:
            logger.error(
                "Listing %s: error downloading floorplan from %s: %s",
                listing.id,
                url,
                e,
            )
            raise
    return None