wrongmove/services/image_fetcher.py
Viktor Barzin cde3540a1e
Remove watchdog and tqdm dependencies, replace with logging
- Remove watchdog (unused) and tqdm from pyproject.toml dependencies
- Replace tqdm.gather() with asyncio.gather() + logger.info() in
  image_fetcher, floorplan_detector, and route_calculator services
- Replace tqdm progress bar with logger.info() in listing_repository
- Remove tqdm from mypy ignore_missing_imports overrides
2026-02-21 19:39:49 +00:00

87 lines
3.2 KiB
Python

"""Image fetcher service - downloads floorplan images for listings."""
import asyncio
import logging
from pathlib import Path
from urllib.parse import urlparse
import aiohttp
from rec.exceptions import FloorplanDownloadError
from repositories import ListingRepository
from tenacity import retry, stop_after_attempt, wait_random
from models import Listing
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Maximum number of concurrent image downloads.
# Setting this too high either crashes Rightmove or gets us blocked.
MAX_CONCURRENT_DOWNLOADS = 5
# Shared semaphore that dump_images_for_listing acquires around each HTTP
# request, capping in-flight requests at MAX_CONCURRENT_DOWNLOADS.
# NOTE(review): created at import time, outside any running event loop --
# confirm this is fine for every supported Python version and for test setups
# that start more than one event loop.
semaphore = asyncio.Semaphore(MAX_CONCURRENT_DOWNLOADS)
async def dump_images(
    repository: ListingRepository,
    image_base_path: Path = Path("data/rs/"),
) -> None:
    """Download floorplan images for all listings and persist the updates.

    Every listing returned by ``repository.get_listings()`` is passed to
    ``dump_images_for_listing`` concurrently, sharing one HTTP session. The
    listings that call returns (i.e. not ``None``) are written back with
    ``repository.upsert_listings``.

    All downloads are allowed to settle before the shared session is closed,
    and successful results are upserted even when some listings failed.
    Without this, a single failure would close the session underneath the
    downloads still in flight and discard the results of listings whose
    images had already been written to disk.

    Args:
        repository: Source of the listings and sink for the updated ones.
        image_base_path: Root directory under which images are stored.

    Raises:
        BaseException: The first failure (in listing order) raised by
            ``dump_images_for_listing``, re-raised after the successful
            listings have been upserted.
    """
    listings = await repository.get_listings()
    logger.info("Downloading images for %d listings", len(listings))
    async with aiohttp.ClientSession() as session:
        # return_exceptions=True makes gather wait for every download instead
        # of propagating the first error while the others are still using the
        # session that this ``async with`` is about to close.
        outcomes = await asyncio.gather(
            *(
                dump_images_for_listing(listing, image_base_path, session=session)
                for listing in listings
            ),
            return_exceptions=True,
        )
    logger.info("Finished downloading images for %d listings", len(listings))

    failures = [outcome for outcome in outcomes if isinstance(outcome, BaseException)]
    updated_listings = [
        outcome
        for outcome in outcomes
        if outcome is not None and not isinstance(outcome, BaseException)
    ]
    # Persist what succeeded before reporting any failure.
    await repository.upsert_listings(updated_listings)

    if failures:
        logger.error(
            "Image download failed for %d of %d listings",
            len(failures),
            len(listings),
        )
        # Keep the original contract: a failed download still surfaces to the
        # caller as an exception.
        raise failures[0]
@retry(wait=wait_random(min=1, max=2), stop=stop_after_attempt(3))
async def dump_images_for_listing(
    listing: Listing,
    base_path: Path,
    session: aiohttp.ClientSession | None = None,
) -> Listing | None:
    """Download the next missing floorplan image for a single listing.

    Floorplan URLs are read from
    ``listing.additional_info["property"]["floorplans"]``. Each image is
    stored at ``<base_path>/<listing.id>/floorplans/<file name from the URL>``
    and floorplans whose file already exists are skipped.

    The response body is read in full before anything is written, and the
    image is written under a temporary name and then renamed into place. A
    download that fails part-way therefore never leaves an empty or truncated
    file at the final path, which the ``exists()`` check would otherwise treat
    as already downloaded when the ``retry`` decorator re-runs the function.

    Args:
        listing: Listing whose floorplans should be fetched. Its
            ``floorplan_image_paths`` list is appended to in place.
        base_path: Root directory under which images are stored.
        session: HTTP session to reuse. When ``None`` a temporary session is
            created for the request and closed afterwards.

    Returns:
        The same ``listing`` object once a floorplan has been downloaded, or
        ``None`` when nothing was downloaded (no floorplans, every file
        already present, or the server answered 404).

    Raises:
        FloorplanDownloadError: Raised inside the function for any response
            status other than 200 or 404.
        Exception: Any other error from the request or the file write is
            logged and re-raised. Raised errors trigger the ``retry``
            decorator (up to three attempts, 1-2 s random wait).
    """
    all_floorplans = listing.additional_info.get("property", {}).get("floorplans", [])
    for floorplan in all_floorplans:
        url = floorplan["url"]
        picname = Path(urlparse(url).path).name
        floorplan_path = Path(base_path, str(listing.id), "floorplans", picname)
        if floorplan_path.exists():
            continue
        try:
            owns_session = session is None
            active_session = aiohttp.ClientSession() if owns_session else session
            try:
                async with semaphore:
                    async with active_session.get(url) as response:
                        if response.status == 404:
                            logger.warning(
                                "Listing %s: floorplan not found (404) at %s",
                                listing.id,
                                url,
                            )
                            return None
                        if response.status != 200:
                            raise FloorplanDownloadError(url, response.status)
                        # Fetch the whole body before touching the filesystem
                        # so a failed read cannot leave an empty file behind.
                        image_bytes = await response.read()
                floorplan_path.parent.mkdir(parents=True, exist_ok=True)
                # Write to a sibling temp file and rename it into place, so
                # the final path only ever holds a complete image.
                partial_path = floorplan_path.with_name(floorplan_path.name + ".part")
                partial_path.write_bytes(image_bytes)
                partial_path.replace(floorplan_path)
                listing.floorplan_image_paths.append(str(floorplan_path))
                # NOTE(review): returning here means a listing with several
                # missing floorplans gets one new image per call -- confirm
                # this is intended.
                return listing
            finally:
                if owns_session:
                    await active_session.close()
        except Exception as e:
            logger.error(
                "Listing %s: error downloading floorplan from %s: %s",
                listing.id,
                url,
                e,
            )
            raise
    return None