2025-07-27 18:33:39 +00:00
|
|
|
from __future__ import annotations
|
|
|
|
|
from abc import abstractmethod
|
|
|
|
|
import asyncio
|
2026-02-06 22:37:53 +00:00
|
|
|
from collections.abc import Callable
|
2025-07-27 18:33:39 +00:00
|
|
|
from datetime import datetime
|
|
|
|
|
import logging
|
|
|
|
|
import multiprocessing
|
|
|
|
|
from pathlib import Path
|
2026-02-07 23:34:08 +00:00
|
|
|
import re
|
|
|
|
|
from typing import Any
|
2026-02-07 20:19:57 +00:00
|
|
|
from urllib.parse import urlparse
|
2025-07-27 18:33:39 +00:00
|
|
|
import aiohttp
|
2026-02-07 23:34:08 +00:00
|
|
|
from models.listing import (
|
|
|
|
|
BuyListing,
|
|
|
|
|
FurnishType,
|
|
|
|
|
Listing,
|
|
|
|
|
ListingSite,
|
|
|
|
|
ListingType,
|
|
|
|
|
QueryParameters,
|
|
|
|
|
RentListing,
|
|
|
|
|
)
|
2025-07-27 18:33:39 +00:00
|
|
|
from rec import floorplan
|
|
|
|
|
from rec.query import detail_query
|
|
|
|
|
from repositories.listing_repository import ListingRepository
|
|
|
|
|
|
|
|
|
|
# Module-wide logger; records are emitted under the "uvicorn.error" logger name.
logger = logging.getLogger("uvicorn.error")

# Cap concurrent OCR work at a quarter of the available cores (never fewer
# than one worker) so OCR does not starve other work.
MAX_OCR_WORKERS = max(multiprocessing.cpu_count() // 4, 1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _parse_furnish_type(raw: str | None) -> FurnishType:
    """Map the API's raw furnish-type text onto a ``FurnishType`` member.

    ``None`` yields ``FurnishType.UNKNOWN``. Text containing "landlord"
    (compared case-insensitively) yields ``FurnishType.ASK_LANDLORD``. Any
    other text is lower-cased and looked up as a ``FurnishType`` value,
    falling back to ``FurnishType.UNKNOWN`` when the lookup fails.
    """
    if raw is None:
        return FurnishType.UNKNOWN

    normalised = raw.lower()
    if "landlord" in normalised:
        return FurnishType.ASK_LANDLORD

    try:
        furnish_type = FurnishType(normalised)
    except ValueError:
        # Not a recognised enum value.
        furnish_type = FurnishType.UNKNOWN
    return furnish_type
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _parse_available_from(raw: str | None) -> datetime | None:
|
|
|
|
|
"""Parse the available-from date string into a datetime, or None."""
|
|
|
|
|
if raw is None:
|
|
|
|
|
return None
|
|
|
|
|
if raw.lower() == "now":
|
|
|
|
|
return datetime.now()
|
|
|
|
|
try:
|
|
|
|
|
return datetime.strptime(raw, "%d/%m/%Y")
|
|
|
|
|
except ValueError:
|
|
|
|
|
return None
|
2026-02-02 23:01:13 +00:00
|
|
|
|
2025-07-27 18:33:39 +00:00
|
|
|
|
|
|
|
|
class ListingProcessor:
    """Drive a single listing through the registered processing steps.

    Steps run in the order they appear in ``process_steps``. A step is skipped
    when its ``needs_processing`` check returns false.
    """

    semaphore: asyncio.Semaphore
    process_steps: list[Step]
    listing_repository: ListingRepository
    listing_type: ListingType

    # Map step class names to short names for progress reporting
    STEP_NAMES: dict[str, str] = {
        "FetchListingDetailsStep": "details",
        "FetchImagesStep": "images",
        "DetectFloorplanStep": "ocr",
    }

    def __init__(self, listing_repository: ListingRepository, listing_type: ListingType = ListingType.RENT):
        """Create the processor and its step instances.

        Args:
            listing_repository: Repository handed to every step and used to
                mark listings as seen.
            listing_type: Listing type handed to every step; defaults to
                ``ListingType.RENT``.
        """
        # Each step execution holds this semaphore, so at most 20 run at once
        # through this processor.
        self.semaphore = asyncio.Semaphore(20)
        self.listing_repository = listing_repository
        self.listing_type = listing_type
        # Register new processing steps here
        # Order is important
        self.process_steps = [
            FetchListingDetailsStep(listing_repository, listing_type),
            FetchImagesStep(listing_repository, listing_type),
            DetectFloorplanStep(listing_repository, listing_type),
        ]

    async def process_listing(
        self,
        listing_id: int,
        on_step_complete: Callable[[str], None] | None = None,
    ) -> Listing | None:
        """Mark the listing as seen, then run every step that still needs to run.

        Args:
            listing_id: Identifier of the listing to process.
            on_step_complete: Optional callback invoked after each step that
                ran successfully. It receives the step's short name from
                ``STEP_NAMES``, or the step's class name when no short name is
                registered.

        Returns:
            The listing returned by the last step that ran. ``None`` is
            returned both when a step (or the callback) raised and when no
            step needed processing, so ``None`` alone does not signal failure.
        """
        await self.listing_repository.mark_seen(listing_id, self.listing_type)

        listing = None
        for step in self.process_steps:
            if not await step.needs_processing(listing_id):
                continue

            step_class_name = type(step).__name__
            async with self.semaphore:
                try:
                    listing = await step.process(listing_id)
                    logger.debug(f"[{listing_id}] {step_class_name} completed")
                    if on_step_complete:
                        short_name = self.STEP_NAMES.get(
                            step_class_name, step_class_name
                        )
                        on_step_complete(short_name)
                except Exception as e:
                    # Same message as before, but logged with the traceback so
                    # a failing step can actually be diagnosed. Later steps
                    # are not attempted for this listing.
                    logger.exception(f"[{listing_id}] {step_class_name} failed: {e}")
                    return None
        return listing
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Step:
    """Base class for one stage of listing processing.

    Holds the repository and listing type shared by all stages and declares
    the two coroutines a stage provides: ``needs_processing`` and ``process``.

    NOTE(review): ``@abstractmethod`` is only enforced for classes whose
    metaclass is ``ABCMeta``. This class declares no such base, so the
    decorators below act as documentation only.
    """

    listing_repository: ListingRepository
    listing_type: ListingType

    def __init__(self, listing_repository: ListingRepository, listing_type: ListingType = ListingType.RENT):
        """Store the repository and the listing type this step operates on."""
        self.listing_type = listing_type
        self.listing_repository = listing_repository

    def _query_params(self) -> QueryParameters:
        """Return QueryParameters carrying only this step's listing type.

        Used for ID-based lookups so they target the correct table.
        """
        params = QueryParameters(listing_type=self.listing_type)
        return params

    @abstractmethod
    async def process(self, listing_id: int) -> Listing: ...

    @abstractmethod
    async def needs_processing(self, listing_id: int) -> bool:
        # Default answer for implementations that delegate here: always run.
        return True
|
|
|
|
|
|
|
|
|
|
|
2026-02-07 23:34:08 +00:00
|
|
|
def _parse_service_charge(details: dict[str, Any]) -> float | None:
|
|
|
|
|
"""Parse annual service charge from the tenure info in API response."""
|
|
|
|
|
tenure_content = (
|
|
|
|
|
details.get("property", {}).get("tenureInfo", {}).get("content", [])
|
|
|
|
|
)
|
|
|
|
|
for item in tenure_content:
|
|
|
|
|
if item.get("type") == "annualServiceCharge":
|
|
|
|
|
matches = re.findall(r"([\d,.]+)", str(item.get("value", "")))
|
|
|
|
|
if matches:
|
|
|
|
|
return float(matches[0].replace(",", ""))
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _parse_lease_left(details: dict[str, Any]) -> int | None:
|
|
|
|
|
"""Parse remaining lease years from the tenure info in API response."""
|
|
|
|
|
tenure_content = (
|
|
|
|
|
details.get("property", {}).get("tenureInfo", {}).get("content", [])
|
|
|
|
|
)
|
|
|
|
|
for item in tenure_content:
|
|
|
|
|
if item.get("type") == "lengthOfLease":
|
|
|
|
|
matches = re.findall(r"(\d+)", str(item.get("value", "")))
|
|
|
|
|
if matches:
|
|
|
|
|
return int(matches[0])
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
2025-07-27 18:33:39 +00:00
|
|
|
class FetchListingDetailsStep(Step):
    """Fetch a listing's details from the API and store it as a new listing.

    Existing listings are never refreshed: the step only runs for ids that are
    not stored yet, and ``process`` returns the stored listing unchanged if
    one is found.
    """

    async def needs_processing(self, listing_id: int) -> bool:
        """Return True when no listing with this id is stored yet."""
        existing_listings = await self.listing_repository.get_listings(
            only_ids=[listing_id], query_parameters=self._query_params()
        )
        return len(existing_listings) == 0

    async def process(self, listing_id: int) -> Listing:
        """Fetch the listing's details, build the listing and upsert it.

        Builds a ``BuyListing`` when the step's listing type is
        ``ListingType.BUY`` and a ``RentListing`` otherwise.

        Returns:
            The stored listing if one already exists, otherwise the newly
            created listing.

        Raises:
            KeyError, IndexError: When the API payload lacks one of the
                required fields (price, bedrooms, branch, council tax info,
                coordinates, photos).
        """
        logger.debug(f"[{listing_id}] Fetching property details from API")
        logger.info(f"[{listing_id}] Fetching details...")

        existing_listings = await self.listing_repository.get_listings(
            only_ids=[listing_id], query_parameters=self._query_params()
        )
        if existing_listings:
            # listing exists, do not refresh
            logger.debug(f"[{listing_id}] Already exists, skipping refresh")
            return existing_listings[0]

        listing_details = await detail_query(listing_id)
        property_details = listing_details["property"]

        photos = property_details["photos"]
        now = datetime.now()
        common_kwargs = {
            "id": listing_id,
            "price": property_details["price"],
            "number_of_bedrooms": property_details["bedrooms"],
            "square_meters": None,  # populated later
            "agency": property_details["branch"]["brandName"],
            "council_tax_band": property_details["councilTaxInfo"]["content"][0]["value"],
            "longitude": property_details["longitude"],
            "latitude": property_details["latitude"],
            "price_history_json": "{}",  # TODO: should upsert from existing
            "listing_site": ListingSite.RIGHTMOVE,
            "last_seen": now,
            # Truthiness also covers a null photo list, not just an empty one.
            "photo_thumbnail": photos[0]["thumbnailUrl"] if photos else None,
            "additional_info": listing_details,
        }

        listing: Listing
        if self.listing_type == ListingType.BUY:
            listing = BuyListing(
                **common_kwargs,
                service_charge=_parse_service_charge(listing_details),
                lease_left=_parse_lease_left(listing_details),
            )
        else:
            furnish_type = _parse_furnish_type(
                property_details.get("letFurnishType", "unknown")
            )
            # Use .get like the furnish type above: the parser already maps a
            # missing value to None, so an absent key should not abort the step.
            available_from = _parse_available_from(
                property_details.get("letDateAvailable")
            )
            listing = RentListing(
                **common_kwargs,
                furnish_type=furnish_type,
                available_from=available_from,
            )

        await self.listing_repository.upsert_listings([listing])

        logger.info(
            f"[{listing_id}] Details fetched: £{listing.price}, "
            f"{listing.number_of_bedrooms}BR, {listing.agency}"
        )
        logger.debug(f"[{listing_id}] Details fetch complete")
        # TODO: dump to filesystem
        return listing
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class FetchImagesStep(Step):
    """Download a listing's floorplan images and record their local paths.

    Images are stored under ``data/rs/<listing id>/floorplans/`` and each
    stored path is appended to the listing's ``floorplan_image_paths``.
    """

    async def needs_processing(self, listing_id: int) -> bool:
        """Return True for a stored listing with no recorded floorplan paths."""
        existing_listings = await self.listing_repository.get_listings(
            only_ids=[listing_id], query_parameters=self._query_params()
        )
        if not existing_listings:
            return False  # if listing doesn't exist, we can't process it
        return len(existing_listings[0].floorplan_image_paths) == 0

    async def process(self, listing_id: int) -> Listing:
        """Download every floorplan listed in the stored API payload.

        Returns:
            The listing, with ``floorplan_image_paths`` updated and upserted.
            A listing without floorplans is returned unchanged.

        Raises:
            ValueError: When the listing is not stored.
            Exception: When an image request answers with a status other than
                200 or 404.
        """
        logger.debug(f"[{listing_id}] Fetching floorplan images")

        existing_listings = await self.listing_repository.get_listings(
            only_ids=[listing_id], query_parameters=self._query_params()
        )
        if not existing_listings:
            raise ValueError(f"Listing {listing_id} not found")
        listing = existing_listings[0]

        base_path = Path("data/rs/")
        all_floorplans = listing.additional_info.get("property", {}).get(
            "floorplans", []
        )

        if not all_floorplans:
            logger.debug(f"[{listing_id}] No floorplans available")
            return listing

        downloaded = 0
        client_timeout = aiohttp.ClientTimeout(total=30)
        async with aiohttp.ClientSession() as session:
            for floorplan_obj in all_floorplans:
                url = floorplan_obj["url"]
                picname = Path(urlparse(url).path).name
                floorplan_path = Path(base_path, str(listing.id), "floorplans", picname)
                recorded_path = str(floorplan_path)

                if floorplan_path.exists():
                    # Left on disk by an earlier run that did not get as far
                    # as saving the listing: record it instead of skipping it,
                    # otherwise the path would never reach the listing.
                    if (
                        floorplan_path.is_file()
                        and recorded_path not in listing.floorplan_image_paths
                    ):
                        listing.floorplan_image_paths.append(recorded_path)
                    continue

                async with session.get(url, timeout=client_timeout) as response:
                    if response.status == 404:
                        # Skip only the missing image; returning here would
                        # drop the paths gathered so far without saving them.
                        logger.debug(f"[{listing_id}] Floorplan not found (404): {url}")
                        continue
                    if response.status != 200:
                        raise Exception(f"Error for {url}: {response.status}")
                    # Read the whole body before touching the filesystem so a
                    # failed read cannot leave an empty file that later runs
                    # would mistake for a finished download.
                    payload = await response.read()

                floorplan_path.parent.mkdir(parents=True, exist_ok=True)
                floorplan_path.write_bytes(payload)
                listing.floorplan_image_paths.append(recorded_path)
                downloaded += 1

        await self.listing_repository.upsert_listings([listing])

        logger.info(f"[{listing_id}] Downloaded {downloaded} floorplan images")
        logger.debug(f"[{listing_id}] Image fetch complete")
        return listing
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DetectFloorplanStep(Step):
    """Estimate a listing's floor area by running OCR over its floorplan images.

    The largest estimate across all images is stored in ``square_meters``;
    ``0`` is stored when there are no images or no estimate was produced.
    """

    ocr_semaphore: asyncio.Semaphore

    def __init__(self, listing_repository: ListingRepository, listing_type: ListingType = ListingType.RENT):
        """Initialise the base step and the semaphore guarding OCR calls."""
        super().__init__(listing_repository, listing_type)
        # Bounds how many OCR calls this step instance runs at the same time.
        self.ocr_semaphore = asyncio.Semaphore(MAX_OCR_WORKERS)

    async def needs_processing(self, listing_id: int) -> bool:
        """Return True only for a stored listing whose ``square_meters`` is None."""
        matches = await self.listing_repository.get_listings(
            only_ids=[listing_id], query_parameters=self._query_params()
        )
        return len(matches) != 0 and matches[0].square_meters is None

    async def process(self, listing_id: int) -> Listing:
        """Run OCR on each floorplan image and persist the largest area found.

        Raises:
            ValueError: When the listing is not stored.
        """
        logger.debug(f"[{listing_id}] Running OCR on floorplans")

        matches = await self.listing_repository.get_listings(
            only_ids=[listing_id], query_parameters=self._query_params()
        )
        if len(matches) == 0:
            raise ValueError(f"Listing {listing_id} does not exist")
        target = matches[0]

        image_paths = target.floorplan_image_paths
        if len(image_paths) == 0:
            # Nothing to analyse: store 0 so square_meters is no longer None.
            logger.debug(f"[{listing_id}] No floorplan images to process")
            target.square_meters = 0
            await self.listing_repository.upsert_listings([target])
            return target

        estimates = []
        for image_path in image_paths:
            # The OCR call runs in a worker thread, gated by the semaphore.
            async with self.ocr_semaphore:
                ocr_result = await asyncio.to_thread(
                    floorplan.calculate_ocr, image_path
                )
            area, _ = ocr_result
            if area is not None:
                estimates.append(area)

        # try once, if we fail, keep as 0
        best = max(estimates, default=0)
        target.square_meters = best
        await self.listing_repository.upsert_listings([target])

        if best > 0:
            logger.info(f"[{listing_id}] OCR detected {best} sqm")
        else:
            logger.debug(f"[{listing_id}] OCR: no square meters detected")

        logger.debug(f"[{listing_id}] OCR complete")
        return target
|