wrongmove/listing_processor.py
Viktor Barzin e5ce8c1201
Fix buy listing support: thread ListingType through processing pipeline
The listing processor was hardcoded to create RentListing objects and
query only the rentlisting table. Buy listings fetched from Rightmove
were stored in the wrong table with missing fields. This threads
ListingType through ListingProcessor and all Step subclasses so the
correct model (RentListing/BuyListing) is created, the correct table
is queried, and buy-specific fields (service_charge, lease_left) are
parsed from the API response and included in GeoJSON streaming output.
2026-02-07 23:34:08 +00:00

332 lines
12 KiB
Python

from __future__ import annotations

from abc import ABC, abstractmethod
import asyncio
from collections.abc import Callable
from datetime import datetime
import logging
import multiprocessing
from pathlib import Path
import re
from typing import Any
from urllib.parse import urlparse

import aiohttp

from models.listing import (
    BuyListing,
    FurnishType,
    Listing,
    ListingSite,
    ListingType,
    QueryParameters,
    RentListing,
)
from rec import floorplan
from rec.query import detail_query
from repositories.listing_repository import ListingRepository
# Module-wide logger; messages are emitted under the "uvicorn.error" logger name.
logger = logging.getLogger("uvicorn.error")
# Limit OCR threads to 25% of available cores to avoid starving other work.
# The max(1, ...) floor keeps at least one worker on machines with fewer
# than four cores, where the integer division would otherwise yield 0.
MAX_OCR_WORKERS = max(1, multiprocessing.cpu_count() // 4)
def _parse_furnish_type(raw: str | None) -> FurnishType:
    """Map the API's raw furnish-type text onto a ``FurnishType`` member.

    Args:
        raw: Furnish-type string from the API response, or ``None``.

    Returns:
        ``FurnishType.UNKNOWN`` when ``raw`` is ``None`` or does not match any
        enum value, ``FurnishType.ASK_LANDLORD`` when the text mentions
        "landlord" (case-insensitive), otherwise the member whose value equals
        the lower-cased text.
    """
    if raw is None:
        return FurnishType.UNKNOWN

    normalised = raw.lower()
    if "landlord" in normalised:
        return FurnishType.ASK_LANDLORD

    try:
        furnish_type = FurnishType(normalised)
    except ValueError:
        # Text that is not a known enum value is treated as unknown.
        furnish_type = FurnishType.UNKNOWN
    return furnish_type
def _parse_available_from(raw: str | None) -> datetime | None:
    """Convert the API's available-from text into a ``datetime``.

    Args:
        raw: Either the word "now" (any casing), a ``DD/MM/YYYY`` date
            string, or ``None``.

    Returns:
        The current local time for "now", the parsed date for a valid
        ``DD/MM/YYYY`` string, and ``None`` for ``None`` or unparsable input.
    """
    if raw is None:
        return None

    wants_immediate = raw.lower() == "now"
    if wants_immediate:
        return datetime.now()

    try:
        parsed = datetime.strptime(raw, "%d/%m/%Y")
    except ValueError:
        # Anything that is not a DD/MM/YYYY date is reported as "no date".
        parsed = None
    return parsed
class ListingProcessor:
    """Run the ordered processing steps for a single listing.

    Each step is asked whether it still has work to do for the listing; steps
    that do are executed one after another while holding a shared semaphore
    that caps the number of steps running concurrently across listings.
    """

    semaphore: asyncio.Semaphore
    process_steps: list[Step]
    listing_repository: ListingRepository
    listing_type: ListingType

    # Map step class names to short names for progress reporting
    STEP_NAMES: dict[str, str] = {
        "FetchListingDetailsStep": "details",
        "FetchImagesStep": "images",
        "DetectFloorplanStep": "ocr",
    }

    def __init__(
        self,
        listing_repository: ListingRepository,
        listing_type: ListingType = ListingType.RENT,
    ):
        """Create a processor bound to one repository and one listing type.

        Args:
            listing_repository: Repository handed to every step and used to
                mark listings as seen.
            listing_type: Listing type forwarded to every step and to
                ``mark_seen``; defaults to ``ListingType.RENT``.
        """
        # At most 20 steps may execute at the same time through this processor.
        self.semaphore = asyncio.Semaphore(20)
        self.listing_repository = listing_repository
        self.listing_type = listing_type
        # Register new processing steps here
        # Order is important
        self.process_steps = [
            FetchListingDetailsStep(listing_repository, listing_type),
            FetchImagesStep(listing_repository, listing_type),
            DetectFloorplanStep(listing_repository, listing_type),
        ]

    async def process_listing(
        self,
        listing_id: int,
        on_step_complete: Callable[[str], None] | None = None,
    ) -> Listing | None:
        """Mark the listing as seen and run every step that still needs to run.

        Args:
            listing_id: Identifier of the listing to process.
            on_step_complete: Optional callback invoked after each successful
                step with the step's short name from ``STEP_NAMES`` (or the
                class name when no short name is registered).

        Returns:
            The listing returned by the last step that ran, or ``None`` when
            no step needed to run or when a step raised.
        """
        await self.listing_repository.mark_seen(listing_id, self.listing_type)
        listing = None
        for step in self.process_steps:
            if not await step.needs_processing(listing_id):
                continue
            async with self.semaphore:
                step_class_name = step.__class__.__name__
                try:
                    listing = await step.process(listing_id)
                    logger.debug(f"[{listing_id}] {step_class_name} completed")
                    if on_step_complete:
                        short_name = self.STEP_NAMES.get(
                            step_class_name, step_class_name
                        )
                        on_step_complete(short_name)
                except Exception as e:
                    # logger.exception keeps the ERROR level and message but
                    # also records the traceback, so the failing line inside
                    # the step can be located from the logs.
                    logger.exception(
                        f"[{listing_id}] {step_class_name} failed: {e}"
                    )
                    # Later steps depend on earlier ones, so stop here.
                    return None
        return listing
class Step(ABC):
    """Abstract base class for one stage of the listing processing pipeline.

    Deriving from ``ABC`` is what makes the ``@abstractmethod`` markers below
    effective: a subclass must implement both ``process`` and
    ``needs_processing`` before it can be instantiated.
    """

    listing_repository: ListingRepository
    listing_type: ListingType

    def __init__(
        self,
        listing_repository: ListingRepository,
        listing_type: ListingType = ListingType.RENT,
    ):
        """Store the repository and listing type the step operates on.

        Args:
            listing_repository: Repository used by the step for lookups and
                upserts.
            listing_type: Listing type used to build query parameters;
                defaults to ``ListingType.RENT``.
        """
        self.listing_repository = listing_repository
        self.listing_type = listing_type

    def _query_params(self) -> QueryParameters:
        """Build minimal QueryParameters for ID-based lookups in the correct table."""
        return QueryParameters(listing_type=self.listing_type)

    @abstractmethod
    async def process(self, listing_id: int) -> Listing:
        """Execute the step for ``listing_id`` and return the resulting listing."""
        ...

    @abstractmethod
    async def needs_processing(self, listing_id: int) -> bool:
        """Return whether the step still has work to do for ``listing_id``.

        The base implementation answers ``True``; subclasses must override it
        but may delegate here via ``super()``.
        """
        return True
def _parse_service_charge(details: dict[str, Any]) -> float | None:
    """Parse annual service charge from the tenure info in API response.

    Looks through ``details["property"]["tenureInfo"]["content"]`` for items
    whose ``type`` is ``"annualServiceCharge"`` and returns the first number
    found in such an item's ``value``. Thousands separators (commas) are
    removed before conversion.

    Args:
        details: Listing details dictionary as returned by the API.

    Returns:
        The service charge as a float, or ``None`` when the tenure info is
        missing, null, or contains no parsable number.
    """
    # ``dict.get(key, default)`` only covers absent keys; ``or`` additionally
    # covers keys that are present but null.
    tenure_info = (details.get("property") or {}).get("tenureInfo") or {}
    for item in tenure_info.get("content") or []:
        if item.get("type") != "annualServiceCharge":
            continue
        for candidate in re.findall(r"[\d,.]+", str(item.get("value", ""))):
            try:
                return float(candidate.replace(",", ""))
            except ValueError:
                # The pattern also matches bare punctuation (e.g. the "." in
                # "Approx."); skip it and try the next candidate.
                continue
    return None
def _parse_lease_left(details: dict[str, Any]) -> int | None:
    """Parse remaining lease years from the tenure info in API response.

    Looks through ``details["property"]["tenureInfo"]["content"]`` for items
    whose ``type`` is ``"lengthOfLease"`` and returns the first run of digits
    found in such an item's ``value``.

    Args:
        details: Listing details dictionary as returned by the API.

    Returns:
        The lease length as an int, or ``None`` when the tenure info is
        missing, null, or contains no digits.
    """
    # ``dict.get(key, default)`` only covers absent keys; ``or`` additionally
    # covers keys that are present but null.
    tenure_info = (details.get("property") or {}).get("tenureInfo") or {}
    for item in tenure_info.get("content") or []:
        if item.get("type") != "lengthOfLease":
            continue
        first_number = re.search(r"\d+", str(item.get("value", "")))
        if first_number:
            return int(first_number.group())
    return None
class FetchListingDetailsStep(Step):
    """Fetch a listing's details from the API and persist a new listing.

    Builds a ``BuyListing`` or ``RentListing`` depending on the step's
    ``listing_type``. Listings that already exist are returned unchanged.
    """

    async def needs_processing(self, listing_id: int) -> bool:
        """Return ``True`` when the listing is not yet stored for this listing type."""
        existing_listings = await self.listing_repository.get_listings(
            only_ids=[listing_id], query_parameters=self._query_params()
        )
        return len(existing_listings) == 0

    async def process(self, listing_id: int) -> Listing:
        """Create and store the listing from the detail API response.

        Args:
            listing_id: Identifier of the listing to fetch.

        Returns:
            The already-stored listing when one exists, otherwise the newly
            created and upserted listing.

        Raises:
            KeyError, IndexError: When the API response lacks one of the
                required fields (price, bedrooms, branch, council tax info,
                coordinates, photos).
        """
        logger.debug(f"[{listing_id}] Fetching property details from API")
        logger.info(f"[{listing_id}] Fetching details...")
        existing_listings = await self.listing_repository.get_listings(
            only_ids=[listing_id], query_parameters=self._query_params()
        )
        if len(existing_listings) > 0:
            # listing exists, do not refresh
            logger.debug(f"[{listing_id}] Already exists, skipping refresh")
            return existing_listings[0]

        now = datetime.now()
        listing_details = await detail_query(listing_id)
        property_details = listing_details["property"]
        photos = property_details["photos"]
        common_kwargs = dict(
            id=listing_id,
            price=property_details["price"],
            number_of_bedrooms=property_details["bedrooms"],
            square_meters=None,  # populated later
            agency=property_details["branch"]["brandName"],
            council_tax_band=property_details["councilTaxInfo"]["content"][0][
                "value"
            ],
            longitude=property_details["longitude"],
            latitude=property_details["latitude"],
            price_history_json="{}",  # TODO: should upsert from existing
            listing_site=ListingSite.RIGHTMOVE,
            last_seen=now,
            photo_thumbnail=photos[0]["thumbnailUrl"] if len(photos) > 0 else None,
            additional_info=listing_details,
        )

        listing: Listing
        if self.listing_type == ListingType.BUY:
            listing = BuyListing(
                **common_kwargs,
                service_charge=_parse_service_charge(listing_details),
                lease_left=_parse_lease_left(listing_details),
            )
        else:
            furnish_type = _parse_furnish_type(
                property_details.get("letFurnishType", "unknown")
            )
            # Read with .get, like letFurnishType above: a missing key yields
            # None, which _parse_available_from already maps to "no date"
            # instead of failing the whole listing with a KeyError.
            available_from = _parse_available_from(
                property_details.get("letDateAvailable")
            )
            listing = RentListing(
                **common_kwargs,
                furnish_type=furnish_type,
                available_from=available_from,
            )

        await self.listing_repository.upsert_listings([listing])
        logger.info(
            f"[{listing_id}] Details fetched: £{listing.price}, "
            f"{listing.number_of_bedrooms}BR, {listing.agency}"
        )
        logger.debug(f"[{listing_id}] Details fetch complete")
        # TODO: dump to filesystem
        return listing
class FetchImagesStep(Step):
    """Download a listing's floorplan images and record their local paths."""

    async def needs_processing(self, listing_id: int) -> bool:
        """Return ``True`` when the listing exists and has no floorplan paths recorded."""
        existing_listings = await self.listing_repository.get_listings(
            only_ids=[listing_id], query_parameters=self._query_params()
        )
        if len(existing_listings) == 0:
            return False  # if listing doesn't exist, we can't process it
        listing = existing_listings[0]
        return len(listing.floorplan_image_paths) == 0

    async def process(self, listing_id: int) -> Listing:
        """Fetch every floorplan referenced by the listing and persist the paths.

        Images are stored under ``data/rs/<listing id>/floorplans/``. A file
        that is already on disk is not downloaded again but is still recorded
        on the listing. A floorplan URL answering 404 is skipped; the other
        floorplans are still processed and persisted.

        Args:
            listing_id: Identifier of the listing whose floorplans to fetch.

        Returns:
            The listing, with ``floorplan_image_paths`` extended by every
            floorplan available locally.

        Raises:
            ValueError: When the listing is not stored.
            Exception: When a floorplan URL answers with a status other than
                200 or 404.
        """
        logger.debug(f"[{listing_id}] Fetching floorplan images")
        existing_listings = await self.listing_repository.get_listings(
            only_ids=[listing_id], query_parameters=self._query_params()
        )
        if len(existing_listings) == 0:
            raise ValueError(f"Listing {listing_id} not found")
        listing = existing_listings[0]

        base_path = Path("data/rs/")
        all_floorplans = listing.additional_info.get("property", {}).get(
            "floorplans", []
        )
        if len(all_floorplans) == 0:
            logger.debug(f"[{listing_id}] No floorplans available")
            return listing

        downloaded = 0
        client_timeout = aiohttp.ClientTimeout(total=30)
        async with aiohttp.ClientSession() as session:
            for floorplan_obj in all_floorplans:
                url = floorplan_obj["url"]
                picname = Path(urlparse(url).path).name
                floorplan_path = Path(
                    base_path, str(listing.id), "floorplans", picname
                )
                stored_path = str(floorplan_path)

                if floorplan_path.exists():
                    # The file survived an earlier run; skip the download but
                    # make sure the listing knows about it, otherwise the path
                    # would never reach the database.
                    if stored_path not in listing.floorplan_image_paths:
                        listing.floorplan_image_paths.append(stored_path)
                    continue

                async with session.get(url, timeout=client_timeout) as response:
                    if response.status == 404:
                        # One missing image must not discard the floorplans
                        # already collected for this listing.
                        logger.debug(
                            f"[{listing_id}] Floorplan not found (404): {url}"
                        )
                        continue
                    if response.status != 200:
                        raise Exception(f"Error for {url}: {response.status}")
                    content = await response.read()

                floorplan_path.parent.mkdir(parents=True, exist_ok=True)
                floorplan_path.write_bytes(content)
                listing.floorplan_image_paths.append(stored_path)
                downloaded += 1

        await self.listing_repository.upsert_listings([listing])
        logger.info(f"[{listing_id}] Downloaded {downloaded} floorplan images")
        logger.debug(f"[{listing_id}] Image fetch complete")
        return listing
class DetectFloorplanStep(Step):
    """Estimate a listing's floor area by running OCR over its floorplan images."""

    ocr_semaphore: asyncio.Semaphore

    def __init__(
        self,
        listing_repository: ListingRepository,
        listing_type: ListingType = ListingType.RENT,
    ):
        """Set up the step and its OCR concurrency limit of ``MAX_OCR_WORKERS``."""
        super().__init__(listing_repository, listing_type)
        self.ocr_semaphore = asyncio.Semaphore(MAX_OCR_WORKERS)

    async def needs_processing(self, listing_id: int) -> bool:
        """Return ``True`` when the listing exists and has no area recorded yet."""
        found = await self.listing_repository.get_listings(
            only_ids=[listing_id], query_parameters=self._query_params()
        )
        return len(found) > 0 and found[0].square_meters is None

    async def process(self, listing_id: int) -> Listing:
        """Run OCR on each floorplan image and store the largest estimate.

        The area is stored as ``0`` when the listing has no floorplan images
        or when no image yields an estimate.

        Args:
            listing_id: Identifier of the listing to analyse.

        Returns:
            The listing with ``square_meters`` populated.

        Raises:
            ValueError: When the listing is not stored.
        """
        logger.debug(f"[{listing_id}] Running OCR on floorplans")
        found = await self.listing_repository.get_listings(
            only_ids=[listing_id], query_parameters=self._query_params()
        )
        if len(found) == 0:
            raise ValueError(f"Listing {listing_id} does not exist")
        target = found[0]

        if len(target.floorplan_image_paths) == 0:
            logger.debug(f"[{listing_id}] No floorplan images to process")
            target.square_meters = 0
            await self.listing_repository.upsert_listings([target])
            return target

        estimates = []
        for image_path in target.floorplan_image_paths:
            # OCR runs in a worker thread; the semaphore bounds how many
            # such threads this step keeps busy at once.
            async with self.ocr_semaphore:
                area, _ = await asyncio.to_thread(
                    floorplan.calculate_ocr, image_path
                )
            if area is not None:
                estimates.append(area)

        # try once, if we fail, keep as 0
        best_estimate = max(estimates, default=0)
        target.square_meters = best_estimate
        await self.listing_repository.upsert_listings([target])

        if best_estimate > 0:
            logger.info(f"[{listing_id}] OCR detected {best_estimate} sqm")
        else:
            logger.debug(f"[{listing_id}] OCR: no square meters detected")
        logger.debug(f"[{listing_id}] OCR complete")
        return target