Refactor codebase following Clean Code principles and add 229 tests

- Extract helpers to reduce function sizes (listing_tasks, app.py, query.py, listing_fetcher)
  - Replace nonlocal mutations with _PipelineState dataclass in listing_tasks
  - Fix bugs: isinstance→equality check in repository, verify_exp for OIDC tokens
  - Consolidate duplicate filter methods in listing_repository
  - Move hardcoded config to env vars with backward-compatible defaults
  - Simplify CLI decorator to auto-build QueryParameters
  - Add deprecation docstring to data_access.py
  - Test count: 158 → 387 (all passing)
This commit is contained in:
Viktor Barzin 2026-02-07 20:19:57 +00:00
parent 7e05b3c971
commit 150342bb9e
No known key found for this signature in database
GPG key ID: 0EB088298288D958
48 changed files with 5029 additions and 990 deletions

View file

@ -6,6 +6,7 @@ from datetime import datetime
import logging
import multiprocessing
from pathlib import Path
from urllib.parse import urlparse
import aiohttp
from models.listing import FurnishType, Listing, ListingSite, RentListing
from rec import floorplan
@ -14,8 +15,33 @@ from repositories.listing_repository import ListingRepository
# Primary logger for this module; records are emitted under "uvicorn.error".
logger = logging.getLogger("uvicorn.error")
# Also use celery task logger for visibility in worker output
celery_logger = logging.getLogger("celery.task")
# Limit OCR workers to 25% of available cores to avoid starving other work.
# max(1, ...) keeps the limit at one or more on machines with fewer than four
# cores, where the integer division alone would yield 0.
MAX_OCR_WORKERS = max(1, multiprocessing.cpu_count() // 4)
def _parse_furnish_type(raw: str | None) -> FurnishType:
    """Normalise the raw furnish-type string from the API into a FurnishType enum.

    Args:
        raw: Furnish-type text as delivered by the API, or None when absent.

    Returns:
        FurnishType.UNKNOWN when ``raw`` is None or matches no enum value;
        FurnishType.ASK_LANDLORD when the text contains "landlord"
        (case-insensitive); otherwise the member whose value equals the
        trimmed, lower-cased text.
    """
    if raw is None:
        return FurnishType.UNKNOWN
    # Normalise once: matching is case-insensitive, and stray surrounding
    # whitespace should not turn an otherwise valid value into UNKNOWN.
    normalised = raw.strip().lower()
    if "landlord" in normalised:
        return FurnishType.ASK_LANDLORD
    try:
        return FurnishType(normalised)
    except ValueError:
        # Unrecognised text is reported as UNKNOWN instead of being raised.
        return FurnishType.UNKNOWN
def _parse_available_from(raw: str | None) -> datetime | None:
"""Parse the available-from date string into a datetime, or None."""
if raw is None:
return None
if raw.lower() == "now":
return datetime.now()
try:
return datetime.strptime(raw, "%d/%m/%Y")
except ValueError:
return None
class ListingProcessor:
@ -62,7 +88,6 @@ class ListingProcessor:
on_step_complete(short_name)
except Exception as e:
logger.error(f"[{listing_id}] {step_class_name} failed: {e}")
celery_logger.error(f"[{listing_id}] {step_class_name} failed: {e}")
return None
return listing
@ -92,7 +117,7 @@ class FetchListingDetailsStep(Step):
async def process(self, listing_id: int) -> Listing:
logger.debug(f"[{listing_id}] Fetching property details from API")
celery_logger.info(f"[{listing_id}] Fetching details...")
logger.info(f"[{listing_id}] Fetching details...")
existing_listings = await self.listing_repository.get_listings(
only_ids=[listing_id]
@ -105,30 +130,15 @@ class FetchListingDetailsStep(Step):
listing_details = await detail_query(listing_id)
furnish_type_str = listing_details["property"].get("letFurnishType", "unknown")
if furnish_type_str is None:
furnish_type_str = "unknown"
elif "landlord" in furnish_type_str.lower():
furnish_type_str = "ask landlord"
else:
furnish_type_str = furnish_type_str.lower()
furnish_type = FurnishType(furnish_type_str)
furnish_type = _parse_furnish_type(
listing_details["property"].get("letFurnishType", "unknown")
)
available_from: datetime | None = None
available_from_str: str | None = listing_details["property"]["letDateAvailable"]
if available_from_str is None:
available_from = None
elif available_from_str.lower() == "now":
available_from = datetime.now()
else:
try:
available_from = datetime.strptime(available_from_str, "%d/%m/%Y")
except ValueError:
# If the date format is not as expected, return None
available_from = None
available_from = _parse_available_from(
listing_details["property"]["letDateAvailable"]
)
photos = listing_details["property"]["photos"]
# listing = Listing(
listing = RentListing( # TODO: should pick based on price?
id=listing_id,
price=listing_details["property"]["price"],
@ -150,7 +160,7 @@ class FetchListingDetailsStep(Step):
)
await self.listing_repository.upsert_listings([listing])
celery_logger.info(
logger.info(
f"[{listing_id}] Details fetched: £{listing.price}, "
f"{listing.number_of_bedrooms}BR, {listing.agency}"
)
@ -190,13 +200,13 @@ class FetchImagesStep(Step):
downloaded = 0
client_timeout = aiohttp.ClientTimeout(total=30)
for floorplan_obj in all_floorplans:
url = floorplan_obj["url"]
picname = url.split("/")[-1]
floorplan_path = Path(base_path, str(listing.id), "floorplans", picname)
if floorplan_path.exists():
continue
async with aiohttp.ClientSession() as session:
async with aiohttp.ClientSession() as session:
for floorplan_obj in all_floorplans:
url = floorplan_obj["url"]
picname = Path(urlparse(url).path).name
floorplan_path = Path(base_path, str(listing.id), "floorplans", picname)
if floorplan_path.exists():
continue
async with session.get(url, timeout=client_timeout) as response:
if response.status == 404:
return listing
@ -210,7 +220,7 @@ class FetchImagesStep(Step):
await self.listing_repository.upsert_listings([listing])
celery_logger.info(f"[{listing_id}] Downloaded {downloaded} floorplan images")
logger.info(f"[{listing_id}] Downloaded {downloaded} floorplan images")
logger.debug(f"[{listing_id}] Image fetch complete")
return listing
@ -220,7 +230,7 @@ class DetectFloorplanStep(Step):
def __init__(self, listing_repository: ListingRepository):
super().__init__(listing_repository)
self.ocr_semaphore = asyncio.Semaphore(multiprocessing.cpu_count() // 4)
self.ocr_semaphore = asyncio.Semaphore(MAX_OCR_WORKERS)
async def needs_processing(self, listing_id: int) -> bool:
listings = await self.listing_repository.get_listings(only_ids=[listing_id])
@ -256,7 +266,7 @@ class DetectFloorplanStep(Step):
await self.listing_repository.upsert_listings([listing])
if max_sqm > 0:
celery_logger.info(f"[{listing_id}] OCR detected {max_sqm} sqm")
logger.info(f"[{listing_id}] OCR detected {max_sqm} sqm")
else:
logger.debug(f"[{listing_id}] OCR: no square meters detected")