wrongmove/crawler/services/image_fetcher.py
Viktor Barzin 150342bb9e
Refactor codebase following Clean Code principles and add 229 tests
- Extract helpers to reduce function sizes (listing_tasks, app.py, query.py, listing_fetcher)
  - Replace nonlocal mutations with _PipelineState dataclass in listing_tasks
  - Fix bugs: isinstance→equality check in repository, verify_exp for OIDC tokens
  - Consolidate duplicate filter methods in listing_repository
  - Move hardcoded config to env vars with backward-compatible defaults
  - Simplify CLI decorator to auto-build QueryParameters
  - Add deprecation docstring to data_access.py
  - Test count: 158 → 387 (all passing)
2026-02-07 20:19:57 +00:00

88 lines
3.2 KiB
Python

"""Image fetcher service - downloads floorplan images for listings."""
import asyncio
import logging
from pathlib import Path
from urllib.parse import urlparse
import aiohttp
from repositories import ListingRepository
from tenacity import retry, stop_after_attempt, wait_random
from tqdm.asyncio import tqdm
from models import Listing
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Maximum number of concurrent image downloads.
# Setting this too high either crashes Rightmove or gets us blocked.
MAX_CONCURRENT_DOWNLOADS = 5
# Shared semaphore acquired around each floorplan HTTP request in this module,
# so at most MAX_CONCURRENT_DOWNLOADS requests are in flight at the same time.
semaphore = asyncio.Semaphore(MAX_CONCURRENT_DOWNLOADS)
async def dump_images(
    repository: ListingRepository,
    image_base_path: Path = Path("data/rs/"),
) -> None:
    """Fetch floorplan images for every listing and persist the updated ones.

    Args:
        repository: Source of the listings (``get_listings``) and sink for the
            listings that were updated (``upsert_listings``).
        image_base_path: Directory under which the images are stored.

    All per-listing downloads share a single HTTP session and are awaited
    together behind a progress bar. Listings for which the per-listing
    download returned ``None`` are left out of the upsert.
    """
    stored_listings = await repository.get_listings()

    async with aiohttp.ClientSession() as http_session:
        # One coroutine per listing; they all reuse the same session.
        download_jobs = [
            dump_images_for_listing(entry, image_base_path, session=http_session)
            for entry in stored_listings
        ]
        outcomes = await tqdm.gather(*download_jobs)

    # Keep only the listings that were actually returned by a download job.
    changed_listings = [outcome for outcome in outcomes if outcome is not None]
    await repository.upsert_listings(changed_listings)
async def _download_floorplan(
    session: aiohttp.ClientSession,
    url: str,
    destination: Path,
    listing_id: object,
) -> bool:
    """Download a single floorplan image to *destination*.

    Returns:
        ``True`` when the image was written, ``False`` when the server
        answered 404 (logged as a warning).

    Raises:
        RuntimeError: for any other non-200 HTTP status.
    """
    # The shared semaphore caps the number of requests in flight.
    async with semaphore:
        async with session.get(url) as response:
            if response.status == 404:
                logger.warning(
                    "Listing %s: floorplan not found (404) at %s",
                    listing_id,
                    url,
                )
                return False
            if response.status != 200:
                raise RuntimeError(
                    f"Error downloading floorplan for listing {listing_id} "
                    f"from {url}: HTTP {response.status}"
                )
            # Read the body BEFORE creating the file: if the read fails we must
            # not leave an empty file behind, because existing files are
            # treated as "already downloaded" and skipped on later runs.
            payload = await response.read()

    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_bytes(payload)
    return True


@retry(wait=wait_random(min=1, max=2), stop=stop_after_attempt(3))
async def dump_images_for_listing(
    listing: Listing,
    base_path: Path,
    session: aiohttp.ClientSession | None = None,
) -> Listing | None:
    """Download all missing floorplan images for a single listing.

    Floorplan URLs are read from
    ``listing.additional_info["property"]["floorplans"]``. Each image is stored
    at ``<base_path>/<listing.id>/floorplans/<file name from URL>``; images
    whose target file already exists are skipped. The path of every newly
    written image is appended to ``listing.floorplan_image_paths``.

    Args:
        listing: Listing whose floorplans should be downloaded (mutated).
        base_path: Root directory for stored images.
        session: Optional HTTP session to reuse. When omitted, a session is
            created on first use and closed before returning.

    Returns:
        The listing when at least one image was downloaded during this
        attempt, otherwise ``None`` (nothing to download, or only 404s).

    Raises:
        Any download error is logged and re-raised, which triggers the retry
        policy of the decorator (3 attempts, 1-2 s random wait). Images saved
        by an earlier attempt are skipped on the next one because their files
        already exist.
    """
    all_floorplans = listing.additional_info.get("property", {}).get("floorplans", [])

    owned_session: aiohttp.ClientSession | None = None
    downloaded_any = False
    try:
        for floorplan in all_floorplans:
            url = floorplan["url"]
            picname = Path(urlparse(url).path).name
            floorplan_path = Path(base_path, str(listing.id), "floorplans", picname)
            if floorplan_path.exists():
                continue

            # Create the fallback session lazily and only once per call.
            if session is None and owned_session is None:
                owned_session = aiohttp.ClientSession()
            active_session = session if session is not None else owned_session

            try:
                saved = await _download_floorplan(
                    active_session, url, floorplan_path, listing.id
                )
            except Exception as e:
                logger.error(
                    "Listing %s: error downloading floorplan from %s: %s",
                    listing.id,
                    url,
                    e,
                )
                raise

            # A 404 only skips this one floorplan; keep going with the rest
            # instead of returning after the first image.
            if saved:
                listing.floorplan_image_paths.append(str(floorplan_path))
                downloaded_any = True
    finally:
        if owned_session is not None:
            await owned_session.close()

    return listing if downloaded_any else None