Structured logging via JsonFormatter replaces uvicorn's default format so Loki can parse timestamps and fields. 14 business metrics (scrape stats, throttle events, circuit breaker state, cache hit rate, OCR success rate, Celery task lifecycle) are defined in a shared metrics module and instrumented across the scraper pipeline, API, and workers. Celery workers expose a Prometheus HTTP endpoint on configurable ports.
342 lines
13 KiB
Python
342 lines
13 KiB
Python
from __future__ import annotations
|
|
from abc import ABC, abstractmethod
|
|
import asyncio
|
|
from collections.abc import Callable
|
|
from datetime import datetime
|
|
import logging
|
|
from pathlib import Path
|
|
import re
|
|
from typing import Any
|
|
from urllib.parse import urlparse
|
|
import aiohttp
|
|
from config.scraper_config import MAX_OCR_WORKERS
|
|
from models.listing import (
|
|
BuyListing,
|
|
FurnishType,
|
|
Listing,
|
|
ListingSite,
|
|
ListingType,
|
|
QueryParameters,
|
|
RentListing,
|
|
)
|
|
from rec import floorplan
|
|
from rec.exceptions import FloorplanDownloadError
|
|
from rec.query import detail_query
|
|
from repositories.listing_repository import ListingRepository
|
|
|
|
# Module-wide logger. It is looked up under the "uvicorn.error" name rather
# than this module's own name, so records go through that logger's handlers.
logger = logging.getLogger("uvicorn.error")
|
|
|
|
|
|
def _parse_furnish_type(raw: str | None) -> FurnishType:
    """Map the API's free-text furnish type onto a FurnishType member.

    ``None`` and text that matches no enum value yield ``FurnishType.UNKNOWN``;
    any text containing "landlord" (case-insensitive) yields
    ``FurnishType.ASK_LANDLORD``. Everything else is lower-cased and looked up
    by value.
    """
    if raw is None:
        return FurnishType.UNKNOWN

    normalised = raw.lower()
    if "landlord" in normalised:
        return FurnishType.ASK_LANDLORD

    try:
        furnish_type = FurnishType(normalised)
    except ValueError:
        # Not one of the enum's values.
        furnish_type = FurnishType.UNKNOWN
    return furnish_type
|
|
|
|
|
|
def _parse_available_from(raw: str | None) -> datetime | None:
    """Convert the API's available-from text into a datetime.

    Returns the current local time for "now" (any casing), the parsed date for
    ``DD/MM/YYYY`` strings, and ``None`` for ``None`` or unparseable text.
    """
    if raw is None:
        return None
    if raw.lower() == "now":
        return datetime.now()

    try:
        parsed = datetime.strptime(raw, "%d/%m/%Y")
    except ValueError:
        # Not in DD/MM/YYYY form.
        parsed = None
    return parsed
|
|
|
|
|
|
class ListingProcessor:
    """Run the ordered processing steps for individual listings.

    Each step is asked whether it still needs to run for a listing; steps that
    do are executed while holding a shared semaphore, which caps how many step
    executions this processor runs at once.
    """

    semaphore: asyncio.Semaphore
    process_steps: list[Step]
    listing_repository: ListingRepository

    # Short, human-friendly step names used for progress callbacks, keyed by
    # the step's class name.
    STEP_NAMES: dict[str, str] = {
        "FetchListingDetailsStep": "details",
        "FetchImagesStep": "images",
        "DetectFloorplanStep": "ocr",
    }

    def __init__(self, listing_repository: ListingRepository, listing_type: ListingType = ListingType.RENT):
        """Store the collaborators and build the step pipeline."""
        self.listing_repository = listing_repository
        self.listing_type = listing_type
        self.semaphore = asyncio.Semaphore(20)
        # New steps get registered here. They run in list order, so the order
        # matters.
        self.process_steps = [
            FetchListingDetailsStep(listing_repository, listing_type),
            FetchImagesStep(listing_repository, listing_type),
            DetectFloorplanStep(listing_repository, listing_type),
        ]

    async def process_listing(
        self,
        listing_id: int,
        on_step_complete: Callable[[str], None] | None = None,
    ) -> Listing | None:
        """Mark the listing as seen, then run every step that still applies.

        Args:
            listing_id: ID of the listing to process.
            on_step_complete: Optional callback invoked with the step's short
                name (or its class name when no short name is registered)
                after each step that ran successfully.

        Returns:
            The listing returned by the last step that ran, or ``None`` when
            no step ran or a step failed with one of the handled errors.
        """
        await self.listing_repository.mark_seen(listing_id, self.listing_type)

        result: Listing | None = None
        for step in self.process_steps:
            if not await step.needs_processing(listing_id):
                continue

            async with self.semaphore:
                step_name = type(step).__name__
                try:
                    result = await step.process(listing_id)
                    logger.debug(f"[{listing_id}] {step_name} completed")
                    if on_step_complete:
                        on_step_complete(self.STEP_NAMES.get(step_name, step_name))
                except (ValueError, KeyError, aiohttp.ClientError, FloorplanDownloadError) as exc:
                    # A failed step aborts the remaining steps for this listing.
                    logger.error(f"[{listing_id}] {step_name} failed: {exc}")
                    return None

        return result
|
|
|
|
|
|
class Step(ABC):
    """Abstract base for one stage of listing processing.

    Concrete steps implement ``needs_processing`` (should this step run for
    the listing?) and ``process`` (do the work and return the listing).
    """

    listing_repository: ListingRepository
    listing_type: ListingType

    def __init__(self, listing_repository: ListingRepository, listing_type: ListingType = ListingType.RENT):
        """Remember the repository and the listing type this step works on."""
        self.listing_type = listing_type
        self.listing_repository = listing_repository

    def _query_params(self) -> QueryParameters:
        """Return QueryParameters carrying only this step's listing type.

        Used for the ID-based repository lookups the steps perform.
        """
        params = QueryParameters(listing_type=self.listing_type)
        return params

    @abstractmethod
    async def process(self, listing_id: int) -> Listing:
        """Run this step for ``listing_id`` and return the resulting listing."""

    @abstractmethod
    async def needs_processing(self, listing_id: int) -> bool:
        """Tell whether this step still has to run for ``listing_id``.

        Subclasses must override this; the base implementation answers True.
        """
        return True
|
|
|
|
|
|
def _find_tenure_value(details: dict[str, Any], tenure_type: str) -> str | None:
    """Return the ``value`` of the first tenure-info entry of the given type.

    Entries are read from ``details["property"]["tenureInfo"]["content"]``.
    A level that is missing *or explicitly null* is treated as empty, so a
    response carrying ``"tenureInfo": null`` yields ``None`` instead of raising
    ``AttributeError``.

    Args:
        details: The API response for a listing.
        tenure_type: The ``type`` key to look for in the tenure content.

    Returns:
        The matching entry's ``value``, or ``None`` when nothing matches.
    """
    # ``.get(key, {})`` only covers a missing key; ``or`` also covers a key
    # that is present with a None value.
    property_info = details.get("property") or {}
    tenure_info = property_info.get("tenureInfo") or {}
    entries = tenure_info.get("content") or []

    for entry in entries:
        # Skip anything that is not a mapping rather than failing on ``.get``.
        if isinstance(entry, dict) and entry.get("type") == tenure_type:
            return entry.get("value")
    return None
|
|
|
|
|
|
def _parse_service_charge(details: dict[str, Any]) -> float | None:
    """Parse the annual service charge from the tenure info in the API response.

    The first number found in the ``annualServiceCharge`` tenure value is
    returned, with thousands separators (commas) removed.

    Args:
        details: The API response for a listing.

    Returns:
        The charge as a float, or ``None`` when the value is absent or
        contains no number.
    """
    value = _find_tenure_value(details, "annualServiceCharge")
    if value is None:
        return None

    # The match must start at a digit (or a ".5"-style fraction). A looser
    # character class such as [\d,.]+ would pick up stray punctuation first
    # (e.g. the "." in "Approx. £1,500") and make float() raise ValueError.
    match = re.search(r"\d[\d,]*(?:\.\d+)?|\.\d+", str(value))
    if match is None:
        return None
    return float(match.group().replace(",", ""))
|
|
|
|
|
|
def _parse_lease_left(details: dict[str, Any]) -> int | None:
    """Extract the remaining lease length from the API response's tenure info.

    Returns the first run of digits in the ``lengthOfLease`` tenure value as
    an int, or ``None`` when the value is absent or holds no digits.
    """
    raw_value = _find_tenure_value(details, "lengthOfLease")
    if raw_value is None:
        return None

    first_number = re.search(r"(\d+)", str(raw_value))
    return int(first_number.group(1)) if first_number else None
|
|
|
|
|
|
class FetchListingDetailsStep(Step):
    """Create the stored listing from the remote detail query.

    The step only needs to run for listings the repository does not hold yet;
    listings that already exist are returned untouched.
    """

    async def needs_processing(self, listing_id: int) -> bool:
        """Return True when the repository holds no listing with this ID."""
        existing_listings = await self.listing_repository.get_listings(
            only_ids=[listing_id], query_parameters=self._query_params()
        )
        return len(existing_listings) == 0

    @staticmethod
    def _council_tax_band(property_details: dict[str, Any]) -> Any:
        """Return the first council-tax content entry's value.

        Raises:
            KeyError: A required key is missing.
            ValueError: The council-tax content is empty or null. Raised in
                place of IndexError/TypeError so the failure is one of the
                error types ListingProcessor.process_listing handles.
        """
        try:
            return property_details["councilTaxInfo"]["content"][0]["value"]
        except (IndexError, TypeError) as exc:
            raise ValueError("councilTaxInfo has no usable content entry") from exc

    async def process(self, listing_id: int) -> Listing:
        """Fetch the listing's details, build the model and upsert it.

        Args:
            listing_id: ID of the listing to fetch.

        Returns:
            The already-stored listing if one exists, otherwise the newly
            created BuyListing or RentListing (depending on the step's
            listing type).

        Raises:
            KeyError: A required field is missing from the detail response.
            ValueError: The council-tax information cannot be read.
        """
        logger.debug(f"[{listing_id}] Fetching property details from API")
        logger.info(f"[{listing_id}] Fetching details...")

        existing_listings = await self.listing_repository.get_listings(
            only_ids=[listing_id], query_parameters=self._query_params()
        )
        if len(existing_listings) > 0:
            # listing exists, do not refresh
            logger.debug(f"[{listing_id}] Already exists, skipping refresh")
            return existing_listings[0]

        now = datetime.now()
        listing_details = await detail_query(listing_id)
        property_details = listing_details["property"]

        # A null photo list is treated like an empty one (no thumbnail).
        photos = property_details["photos"] or []
        common_kwargs = dict(
            id=listing_id,
            price=property_details["price"],
            number_of_bedrooms=property_details["bedrooms"],
            square_meters=None,  # populated later
            agency=property_details["branch"]["brandName"],
            council_tax_band=self._council_tax_band(property_details),
            longitude=property_details["longitude"],
            latitude=property_details["latitude"],
            price_history_json="{}",  # TODO: should upsert from existing
            listing_site=ListingSite.RIGHTMOVE,
            last_seen=now,
            photo_thumbnail=photos[0]["thumbnailUrl"] if len(photos) > 0 else None,
            additional_info=listing_details,
        )

        listing: Listing
        if self.listing_type == ListingType.BUY:
            listing = BuyListing(
                **common_kwargs,
                service_charge=_parse_service_charge(listing_details),
                lease_left=_parse_lease_left(listing_details),
            )
        else:
            furnish_type = _parse_furnish_type(
                property_details.get("letFurnishType", "unknown")
            )
            available_from = _parse_available_from(
                property_details["letDateAvailable"]
            )
            listing = RentListing(
                **common_kwargs,
                furnish_type=furnish_type,
                available_from=available_from,
            )

        await self.listing_repository.upsert_listings([listing])

        logger.info(
            f"[{listing_id}] Details fetched: £{listing.price}, "
            f"{listing.number_of_bedrooms}BR, {listing.agency}"
        )
        logger.debug(f"[{listing_id}] Details fetch complete")
        # TODO: dump to filesystem
        return listing
|
|
|
|
|
|
class FetchImagesStep(Step):
    """Download a listing's floorplan images and record their local paths.

    Images are stored under ``data/rs/<listing id>/floorplans/`` and the paths
    are appended to the listing's ``floorplan_image_paths``.
    """

    async def needs_processing(self, listing_id: int) -> bool:
        """Return True when the listing exists and has no floorplan paths yet."""
        existing_listings = await self.listing_repository.get_listings(
            only_ids=[listing_id], query_parameters=self._query_params()
        )
        if len(existing_listings) == 0:
            return False  # if listing doesn't exist, we can't process it
        listing = existing_listings[0]
        return len(listing.floorplan_image_paths) == 0

    async def process(self, listing_id: int) -> Listing:
        """Download every floorplan of the listing that is not on disk yet.

        Files already present on disk are not downloaded again, but their
        paths are still recorded on the listing so that a retry after a
        partial failure does not lose them. A floorplan answering 404 is
        skipped; the remaining floorplans are still processed and the result
        is persisted.

        Args:
            listing_id: ID of the listing whose floorplans are fetched.

        Returns:
            The listing, with ``floorplan_image_paths`` updated.

        Raises:
            ValueError: The listing is not in the repository.
            KeyError: A floorplan entry has no ``url``.
            FloorplanDownloadError: A download answered with a status other
                than 200 or 404.
        """
        logger.debug(f"[{listing_id}] Fetching floorplan images")

        existing_listings = await self.listing_repository.get_listings(
            only_ids=[listing_id], query_parameters=self._query_params()
        )
        if len(existing_listings) == 0:
            raise ValueError(f"Listing {listing_id} not found")
        listing = existing_listings[0]

        base_path = Path("data/rs/")
        # ``or`` also covers keys that are present with a null value.
        property_info = listing.additional_info.get("property") or {}
        all_floorplans = property_info.get("floorplans") or []

        if len(all_floorplans) == 0:
            logger.debug(f"[{listing_id}] No floorplans available")
            return listing

        downloaded = 0
        client_timeout = aiohttp.ClientTimeout(total=30)
        async with aiohttp.ClientSession() as session:
            for floorplan_obj in all_floorplans:
                url = floorplan_obj["url"]
                picname = Path(urlparse(url).path).name
                floorplan_path = Path(base_path, str(listing.id), "floorplans", picname)
                path_str = str(floorplan_path)

                if floorplan_path.exists():
                    # Already downloaded (e.g. by an earlier, interrupted run):
                    # make sure the listing knows about it.
                    if path_str not in listing.floorplan_image_paths:
                        listing.floorplan_image_paths.append(path_str)
                    continue

                async with session.get(url, timeout=client_timeout) as response:
                    if response.status == 404:
                        logger.debug(f"[{listing_id}] Floorplan not found (404): {url}")
                        continue
                    if response.status != 200:
                        raise FloorplanDownloadError(url, response.status)
                    # Read the whole body before touching the filesystem so a
                    # failed read cannot leave an empty file behind that later
                    # runs would mistake for a finished download.
                    payload = await response.read()

                floorplan_path.parent.mkdir(parents=True, exist_ok=True)
                floorplan_path.write_bytes(payload)
                listing.floorplan_image_paths.append(path_str)
                downloaded += 1

        await self.listing_repository.upsert_listings([listing])

        logger.info(f"[{listing_id}] Downloaded {downloaded} floorplan images")
        logger.debug(f"[{listing_id}] Image fetch complete")
        return listing
|
|
|
|
|
|
class DetectFloorplanStep(Step):
    """Estimate a listing's floor area by running OCR over its floorplans.

    OCR calls run in worker threads; ``ocr_semaphore`` limits how many of them
    this step instance runs at the same time to ``MAX_OCR_WORKERS``.
    """

    ocr_semaphore: asyncio.Semaphore

    def __init__(self, listing_repository: ListingRepository, listing_type: ListingType = ListingType.RENT):
        """Initialise the base step and the OCR concurrency limiter."""
        super().__init__(listing_repository, listing_type)
        self.ocr_semaphore = asyncio.Semaphore(MAX_OCR_WORKERS)

    async def needs_processing(self, listing_id: int) -> bool:
        """Return True when the listing exists and has no square-meter value."""
        listings = await self.listing_repository.get_listings(
            only_ids=[listing_id], query_parameters=self._query_params()
        )
        if len(listings) == 0:
            return False
        return listings[0].square_meters is None

    def _record_ocr_metrics(self, listing_id: int, succeeded: bool) -> None:
        """Best-effort update of the OCR attempt/success counters.

        Any failure (including the metrics module being unavailable) is logged
        at debug level and otherwise ignored, so metrics can never fail the
        step.
        """
        try:
            from api.metrics import ocr_attempts, ocr_successes

            ocr_attempts.add(1)
            if succeeded:
                ocr_successes.add(1)
        except Exception as exc:
            # Metrics not initialised (or otherwise unusable): keep going, but
            # leave a trace instead of swallowing the error silently.
            logger.debug(f"[{listing_id}] OCR metrics not recorded: {exc!r}")

    async def process(self, listing_id: int) -> Listing:
        """Run OCR on every stored floorplan and save the largest estimate.

        ``square_meters`` is set to 0 when there are no floorplan images or no
        image yields an estimate. Because ``needs_processing`` only checks for
        ``None``, that 0 also marks the listing as handled so OCR is attempted
        once only.

        Args:
            listing_id: ID of the listing to analyse.

        Returns:
            The listing with ``square_meters`` populated.

        Raises:
            ValueError: The listing is not in the repository.
        """
        logger.debug(f"[{listing_id}] Running OCR on floorplans")

        listings = await self.listing_repository.get_listings(
            only_ids=[listing_id], query_parameters=self._query_params()
        )
        if len(listings) == 0:
            raise ValueError(f"Listing {listing_id} does not exist")
        listing = listings[0]

        if len(listing.floorplan_image_paths) == 0:
            logger.debug(f"[{listing_id}] No floorplan images to process")
            listing.square_meters = 0
            await self.listing_repository.upsert_listings([listing])
            return listing

        sqms = []
        for floorplan_path in listing.floorplan_image_paths:
            async with self.ocr_semaphore:
                # calculate_ocr is run in a thread; only the first element of
                # its 2-tuple result (the estimate) is used here.
                estimated_sqm, _ = await asyncio.to_thread(
                    floorplan.calculate_ocr, floorplan_path
                )
            if estimated_sqm is not None:
                sqms.append(estimated_sqm)

        max_sqm = max(sqms, default=0)  # try once, if we fail, keep as 0
        listing.square_meters = max_sqm
        await self.listing_repository.upsert_listings([listing])

        # Record OCR metrics
        self._record_ocr_metrics(listing_id, succeeded=max_sqm > 0)

        if max_sqm > 0:
            logger.info(f"[{listing_id}] OCR detected {max_sqm} sqm")
        else:
            logger.debug(f"[{listing_id}] OCR: no square meters detected")

        logger.debug(f"[{listing_id}] OCR complete")
        return listing
|