Refactor codebase following Clean Code principles and add 229 tests
- Extract helpers to reduce function sizes (listing_tasks, app.py, query.py, listing_fetcher)
- Replace nonlocal mutations with _PipelineState dataclass in listing_tasks
- Fix bugs: isinstance→equality check in repository, verify_exp for OIDC tokens
- Consolidate duplicate filter methods in listing_repository
- Move hardcoded config to env vars with backward-compatible defaults
- Simplify CLI decorator to auto-build QueryParameters
- Add deprecation docstring to data_access.py
- Test count: 158 → 387 (all passing)
This commit is contained in:
parent
7e05b3c971
commit
150342bb9e
48 changed files with 5029 additions and 990 deletions
|
|
@ -6,6 +6,7 @@ from datetime import datetime
|
|||
import logging
|
||||
import multiprocessing
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
import aiohttp
|
||||
from models.listing import FurnishType, Listing, ListingSite, RentListing
|
||||
from rec import floorplan
|
||||
|
|
@ -14,8 +15,33 @@ from repositories.listing_repository import ListingRepository
|
|||
|
||||
# Module logger, bound to the logger named "uvicorn.error".
logger = logging.getLogger("uvicorn.error")
|
||||
|
||||
# Also use celery task logger for visibility in worker output
|
||||
celery_logger = logging.getLogger("celery.task")
|
||||
# Limit OCR threads to 25% of available cores to avoid starving other work.
# max(1, ...) keeps the value at least 1 when cpu_count() is below 4, where the
# integer division alone would yield 0.
|
||||
MAX_OCR_WORKERS = max(1, multiprocessing.cpu_count() // 4)
|
||||
|
||||
|
||||
def _parse_furnish_type(raw: str | None) -> FurnishType:
    """Normalise the raw furnish-type string from the API into a FurnishType enum.

    Args:
        raw: Furnish-type text as delivered by the API, or None when absent.

    Returns:
        FurnishType.UNKNOWN when ``raw`` is None or matches no enum value,
        FurnishType.ASK_LANDLORD when the text mentions "landlord", otherwise
        the member whose value equals the normalised text.
    """
    if raw is None:
        return FurnishType.UNKNOWN

    # Normalise once: matching is case-insensitive and ignores stray padding.
    normalised = raw.strip().lower()
    if "landlord" in normalised:
        return FurnishType.ASK_LANDLORD

    try:
        return FurnishType(normalised)
    except ValueError:
        # Unrecognised value: degrade to UNKNOWN instead of failing the caller.
        return FurnishType.UNKNOWN
|
||||
|
||||
|
||||
def _parse_available_from(raw: str | None) -> datetime | None:
|
||||
"""Parse the available-from date string into a datetime, or None."""
|
||||
if raw is None:
|
||||
return None
|
||||
if raw.lower() == "now":
|
||||
return datetime.now()
|
||||
try:
|
||||
return datetime.strptime(raw, "%d/%m/%Y")
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
class ListingProcessor:
|
||||
|
|
@ -62,7 +88,6 @@ class ListingProcessor:
|
|||
on_step_complete(short_name)
|
||||
except Exception as e:
|
||||
logger.error(f"[{listing_id}] {step_class_name} failed: {e}")
|
||||
celery_logger.error(f"[{listing_id}] {step_class_name} failed: {e}")
|
||||
return None
|
||||
return listing
|
||||
|
||||
|
|
@ -92,7 +117,7 @@ class FetchListingDetailsStep(Step):
|
|||
|
||||
async def process(self, listing_id: int) -> Listing:
|
||||
logger.debug(f"[{listing_id}] Fetching property details from API")
|
||||
celery_logger.info(f"[{listing_id}] Fetching details...")
|
||||
logger.info(f"[{listing_id}] Fetching details...")
|
||||
|
||||
existing_listings = await self.listing_repository.get_listings(
|
||||
only_ids=[listing_id]
|
||||
|
|
@ -105,30 +130,15 @@ class FetchListingDetailsStep(Step):
|
|||
|
||||
listing_details = await detail_query(listing_id)
|
||||
|
||||
furnish_type_str = listing_details["property"].get("letFurnishType", "unknown")
|
||||
if furnish_type_str is None:
|
||||
furnish_type_str = "unknown"
|
||||
elif "landlord" in furnish_type_str.lower():
|
||||
furnish_type_str = "ask landlord"
|
||||
else:
|
||||
furnish_type_str = furnish_type_str.lower()
|
||||
furnish_type = FurnishType(furnish_type_str)
|
||||
furnish_type = _parse_furnish_type(
|
||||
listing_details["property"].get("letFurnishType", "unknown")
|
||||
)
|
||||
|
||||
available_from: datetime | None = None
|
||||
available_from_str: str | None = listing_details["property"]["letDateAvailable"]
|
||||
if available_from_str is None:
|
||||
available_from = None
|
||||
elif available_from_str.lower() == "now":
|
||||
available_from = datetime.now()
|
||||
else:
|
||||
try:
|
||||
available_from = datetime.strptime(available_from_str, "%d/%m/%Y")
|
||||
except ValueError:
|
||||
# If the date format is not as expected, return None
|
||||
available_from = None
|
||||
available_from = _parse_available_from(
|
||||
listing_details["property"]["letDateAvailable"]
|
||||
)
|
||||
|
||||
photos = listing_details["property"]["photos"]
|
||||
# listing = Listing(
|
||||
listing = RentListing( # TODO: should pick based on price?
|
||||
id=listing_id,
|
||||
price=listing_details["property"]["price"],
|
||||
|
|
@ -150,7 +160,7 @@ class FetchListingDetailsStep(Step):
|
|||
)
|
||||
await self.listing_repository.upsert_listings([listing])
|
||||
|
||||
celery_logger.info(
|
||||
logger.info(
|
||||
f"[{listing_id}] Details fetched: £{listing.price}, "
|
||||
f"{listing.number_of_bedrooms}BR, {listing.agency}"
|
||||
)
|
||||
|
|
@ -190,13 +200,13 @@ class FetchImagesStep(Step):
|
|||
|
||||
downloaded = 0
|
||||
client_timeout = aiohttp.ClientTimeout(total=30)
|
||||
for floorplan_obj in all_floorplans:
|
||||
url = floorplan_obj["url"]
|
||||
picname = url.split("/")[-1]
|
||||
floorplan_path = Path(base_path, str(listing.id), "floorplans", picname)
|
||||
if floorplan_path.exists():
|
||||
continue
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with aiohttp.ClientSession() as session:
|
||||
for floorplan_obj in all_floorplans:
|
||||
url = floorplan_obj["url"]
|
||||
picname = Path(urlparse(url).path).name
|
||||
floorplan_path = Path(base_path, str(listing.id), "floorplans", picname)
|
||||
if floorplan_path.exists():
|
||||
continue
|
||||
async with session.get(url, timeout=client_timeout) as response:
|
||||
if response.status == 404:
|
||||
return listing
|
||||
|
|
@ -210,7 +220,7 @@ class FetchImagesStep(Step):
|
|||
|
||||
await self.listing_repository.upsert_listings([listing])
|
||||
|
||||
celery_logger.info(f"[{listing_id}] Downloaded {downloaded} floorplan images")
|
||||
logger.info(f"[{listing_id}] Downloaded {downloaded} floorplan images")
|
||||
logger.debug(f"[{listing_id}] Image fetch complete")
|
||||
return listing
|
||||
|
||||
|
|
@ -220,7 +230,7 @@ class DetectFloorplanStep(Step):
|
|||
|
||||
def __init__(self, listing_repository: ListingRepository):
|
||||
super().__init__(listing_repository)
|
||||
self.ocr_semaphore = asyncio.Semaphore(multiprocessing.cpu_count() // 4)
|
||||
self.ocr_semaphore = asyncio.Semaphore(MAX_OCR_WORKERS)
|
||||
|
||||
async def needs_processing(self, listing_id: int) -> bool:
|
||||
listings = await self.listing_repository.get_listings(only_ids=[listing_id])
|
||||
|
|
@ -256,7 +266,7 @@ class DetectFloorplanStep(Step):
|
|||
await self.listing_repository.upsert_listings([listing])
|
||||
|
||||
if max_sqm > 0:
|
||||
celery_logger.info(f"[{listing_id}] OCR detected {max_sqm} sqm")
|
||||
logger.info(f"[{listing_id}] OCR detected {max_sqm} sqm")
|
||||
else:
|
||||
logger.debug(f"[{listing_id}] OCR: no square meters detected")
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue