Refactor codebase following Clean Code principles and add 229 tests
- Extract helpers to reduce function sizes (listing_tasks, app.py, query.py, listing_fetcher)
- Replace nonlocal mutations with _PipelineState dataclass in listing_tasks
- Fix bugs: isinstance→equality check in repository, verify_exp for OIDC tokens
- Consolidate duplicate filter methods in listing_repository
- Move hardcoded config to env vars with backward-compatible defaults
- Simplify CLI decorator to auto-build QueryParameters
- Add deprecation docstring to data_access.py
- Test count: 158 → 387 (all passing)
This commit is contained in:
parent
7e05b3c971
commit
150342bb9e
48 changed files with 5029 additions and 990 deletions
|
|
@ -1,6 +1,9 @@
|
|||
"""Image fetcher service - downloads floorplan images for listings."""
|
||||
import asyncio
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import aiohttp
|
||||
from repositories import ListingRepository
|
||||
from tenacity import retry, stop_after_attempt, wait_random
|
||||
|
|
@ -8,8 +11,12 @@ from tqdm.asyncio import tqdm
|
|||
|
||||
from models import Listing
|
||||
|
||||
# Setting this too high either crashes rightmove or gets us blocked
|
||||
semaphore = asyncio.Semaphore(5)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Maximum number of concurrent image downloads.
|
||||
# Setting this too high either crashes Rightmove or gets us blocked.
|
||||
MAX_CONCURRENT_DOWNLOADS = 5
|
||||
semaphore = asyncio.Semaphore(MAX_CONCURRENT_DOWNLOADS)
|
||||
|
||||
|
||||
async def dump_images(
|
||||
|
|
@ -18,38 +25,64 @@ async def dump_images(
|
|||
) -> None:
|
||||
"""Download floorplan images for all listings."""
|
||||
listings = await repository.get_listings()
|
||||
updated_listings = await tqdm.gather(
|
||||
*[dump_images_for_listing(listing, image_base_path) for listing in listings]
|
||||
)
|
||||
async with aiohttp.ClientSession() as session:
|
||||
updated_listings = await tqdm.gather(
|
||||
*[
|
||||
dump_images_for_listing(listing, image_base_path, session=session)
|
||||
for listing in listings
|
||||
]
|
||||
)
|
||||
await repository.upsert_listings(
|
||||
[listing for listing in updated_listings if listing is not None]
|
||||
)
|
||||
|
||||
|
||||
@retry(wait=wait_random(min=1, max=2), stop=stop_after_attempt(3))
|
||||
async def dump_images_for_listing(listing: Listing, base_path: Path) -> Listing | None:
|
||||
async def dump_images_for_listing(
|
||||
listing: Listing,
|
||||
base_path: Path,
|
||||
session: aiohttp.ClientSession | None = None,
|
||||
) -> Listing | None:
|
||||
"""Download floorplan images for a single listing."""
|
||||
all_floorplans = listing.additional_info.get("property", {}).get("floorplans", [])
|
||||
for floorplan in all_floorplans:
|
||||
url = floorplan["url"]
|
||||
picname = url.split("/")[-1]
|
||||
picname = Path(urlparse(url).path).name
|
||||
floorplan_path = Path(base_path, str(listing.id), "floorplans", picname)
|
||||
if floorplan_path.exists():
|
||||
continue
|
||||
try:
|
||||
async with semaphore:
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.get(url) as response:
|
||||
owns_session = session is None
|
||||
active_session = session or aiohttp.ClientSession()
|
||||
try:
|
||||
async with semaphore:
|
||||
async with active_session.get(url) as response:
|
||||
if response.status == 404:
|
||||
logger.warning(
|
||||
"Listing %s: floorplan not found (404) at %s",
|
||||
listing.id,
|
||||
url,
|
||||
)
|
||||
return None
|
||||
if response.status != 200:
|
||||
raise Exception(f"Error for {url}: {response.status}")
|
||||
raise Exception(
|
||||
f"Error downloading floorplan for listing {listing.id} "
|
||||
f"from {url}: HTTP {response.status}"
|
||||
)
|
||||
floorplan_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(floorplan_path, "wb") as f:
|
||||
f.write(await response.read())
|
||||
listing.floorplan_image_paths.append(str(floorplan_path))
|
||||
return listing
|
||||
finally:
|
||||
if owns_session:
|
||||
await active_session.close()
|
||||
except Exception as e:
|
||||
tqdm.write(f"Error for {url}: {e}")
|
||||
raise e # raise so that we retry it
|
||||
logger.error(
|
||||
"Listing %s: error downloading floorplan from %s: %s",
|
||||
listing.id,
|
||||
url,
|
||||
e,
|
||||
)
|
||||
raise
|
||||
return None
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue