# wrongmove/listing_processor.py

from __future__ import annotations
from abc import ABC, abstractmethod
import asyncio
from collections.abc import Callable
from datetime import datetime
import logging
from pathlib import Path
import re
from typing import Any
from urllib.parse import urlparse
import aiohttp
from config.scraper_config import MAX_OCR_WORKERS
from models.listing import (
    BuyListing,
    FurnishType,
    Listing,
    ListingSite,
    ListingType,
    QueryParameters,
    RentListing,
)
from rec import floorplan
from rec.exceptions import FloorplanDownloadError
from rec.query import detail_query
from repositories.listing_repository import ListingRepository

# Module-level logger shared by every step below. It is fetched under the name
# "uvicorn.error" rather than __name__ — presumably so records go through
# uvicorn's handlers; confirm against the deployment's logging configuration.
logger = logging.getLogger("uvicorn.error")


def _parse_furnish_type(raw: str | None) -> FurnishType:
    """Map the API's raw furnish-type string onto a ``FurnishType`` member.

    Matching is case-insensitive. ``None`` and any string that is not a valid
    ``FurnishType`` value yield ``FurnishType.UNKNOWN``; any string containing
    "landlord" yields ``FurnishType.ASK_LANDLORD``.
    """
    if raw is None:
        return FurnishType.UNKNOWN

    normalised = raw.lower()
    if "landlord" in normalised:
        return FurnishType.ASK_LANDLORD

    try:
        furnish_type = FurnishType(normalised)
    except ValueError:
        # Not one of the enum's values.
        furnish_type = FurnishType.UNKNOWN
    return furnish_type


def _parse_available_from(raw: str | None) -> datetime | None:
    """Convert the API's available-from string into a ``datetime``.

    Returns ``None`` when ``raw`` is ``None`` or is not a ``DD/MM/YYYY`` date.
    The literal "now" (any letter case) maps to the current local time.
    """
    if raw is None:
        return None

    if raw.lower() == "now":
        return datetime.now()

    parsed: datetime | None
    try:
        parsed = datetime.strptime(raw, "%d/%m/%Y")
    except ValueError:
        # Unrecognised format: treat the date as unknown.
        parsed = None
    return parsed


class ListingProcessor:
    """Run the registered processing steps, in order, for one listing at a time.

    Each step is asked whether it ``needs_processing`` the listing and is
    skipped when it does not. Step executions are guarded by a shared
    semaphore, so at most ``max_concurrency`` steps run at once across all
    concurrent ``process_listing`` calls made on the same instance.
    """

    semaphore: asyncio.Semaphore
    process_steps: list[Step]
    listing_repository: ListingRepository

    # Map step class names to short names for progress reporting
    STEP_NAMES: dict[str, str] = {
        "FetchListingDetailsStep": "details",
        "FetchImagesStep": "images",
        "DetectFloorplanStep": "ocr",
    }

    def __init__(
        self,
        listing_repository: ListingRepository,
        listing_type: ListingType = ListingType.RENT,
        *,
        max_concurrency: int = 20,
    ):
        """Create the processor and its step pipeline.

        Args:
            listing_repository: Repository handed to every step and used to
                mark listings as seen.
            listing_type: Listing type passed to every step and to
                ``mark_seen``.
            max_concurrency: Upper bound on simultaneously running steps.
        """
        self.semaphore = asyncio.Semaphore(max_concurrency)
        self.listing_repository = listing_repository
        self.listing_type = listing_type
        # Register new processing steps here
        # Order is important
        self.process_steps = [
            FetchListingDetailsStep(listing_repository, listing_type),
            FetchImagesStep(listing_repository, listing_type),
            DetectFloorplanStep(listing_repository, listing_type),
        ]

    async def process_listing(
        self,
        listing_id: int,
        on_step_complete: Callable[[str], None] | None = None,
    ) -> Listing | None:
        """Mark the listing as seen, then run every step that still applies.

        Args:
            listing_id: Identifier of the listing to process.
            on_step_complete: Optional callback invoked after each step that
                ran successfully, with the step's short name from
                ``STEP_NAMES`` (or its class name when it has no entry).

        Returns:
            The listing returned by the last step that ran. ``None`` when no
            step needed to run, or when a step failed with one of the handled
            errors (the failure is logged and later steps are skipped).
        """
        await self.listing_repository.mark_seen(listing_id, self.listing_type)
        listing = None
        for step in self.process_steps:
            if not await step.needs_processing(listing_id):
                continue
            step_class_name = step.__class__.__name__
            async with self.semaphore:
                try:
                    listing = await step.process(listing_id)
                    logger.debug(f"[{listing_id}] {step_class_name} completed")
                    if on_step_complete:
                        short_name = self.STEP_NAMES.get(
                            step_class_name, step_class_name
                        )
                        on_step_complete(short_name)
                except (
                    ValueError,
                    KeyError,
                    # Empty list in the API payload where an entry is indexed.
                    IndexError,
                    aiohttp.ClientError,
                    # aiohttp reports an expired ClientTimeout with
                    # asyncio.TimeoutError, which is not an aiohttp.ClientError.
                    asyncio.TimeoutError,
                    FloorplanDownloadError,
                ) as e:
                    # Timeouts carry no message; fall back to the class name
                    # so the log line stays informative.
                    reason = str(e) or type(e).__name__
                    logger.error(f"[{listing_id}] {step_class_name} failed: {reason}")
                    return None
        return listing


class Step(ABC):
    """Abstract base class for one stage of listing processing.

    Subclasses must implement ``process`` and ``needs_processing``. Each step
    holds the repository and listing type it was constructed with.
    """

    # Repository the step reads listings from and writes them back to.
    listing_repository: ListingRepository
    # Listing type this step works on; also used to build query parameters.
    listing_type: ListingType

    def __init__(self, listing_repository: ListingRepository, listing_type: ListingType = ListingType.RENT):
        self.listing_repository = listing_repository
        self.listing_type = listing_type

    def _query_params(self) -> QueryParameters:
        """Build minimal QueryParameters for ID-based lookups in the correct table.

        Only ``listing_type`` is set; every other field keeps its default.
        """
        return QueryParameters(listing_type=self.listing_type)

    # Run this step for the given listing id and return the resulting Listing.
    # Must be implemented by every concrete step.
    @abstractmethod
    async def process(self, listing_id: int) -> Listing: ...

    @abstractmethod
    async def needs_processing(self, listing_id: int) -> bool:
        """Return whether ``process`` should run for ``listing_id``.

        Abstract, so subclasses must override it; this base body returns True
        and is only reachable through an explicit ``super()`` call.
        """
        return True


def _find_tenure_value(details: dict[str, Any], tenure_type: str) -> str | None:
    """Return the ``value`` of the first tenure entry whose ``type`` matches.

    Entries are read from ``details["property"]["tenureInfo"]["content"]``.
    Any level of that path may be missing or ``None`` (a JSON ``null``), in
    which case the lookup returns ``None`` instead of raising.

    Args:
        details: Decoded listing-details payload.
        tenure_type: Value of the entry's ``type`` key to look for.

    Returns:
        The matching entry's ``value`` (``None`` if that key is absent), or
        ``None`` when no entry matches.
    """
    # `or {}` / `or []` instead of .get() defaults: a default only applies to
    # an absent key, while a key present with a null value yields None.
    property_info = details.get("property") or {}
    tenure_info = property_info.get("tenureInfo") or {}
    for item in tenure_info.get("content") or []:
        # Skip null or otherwise malformed entries.
        if isinstance(item, dict) and item.get("type") == tenure_type:
            return item.get("value")
    return None


def _parse_service_charge(details: dict[str, Any]) -> float | None:
    """Parse the annual service charge from the tenure info in the API response.

    The "annualServiceCharge" tenure value is scanned for runs of digits,
    commas and dots; the first run that converts to a number is returned with
    thousands separators removed. Runs that are pure punctuation (for example
    the "." in "approx. 1,200") are skipped instead of raising ``ValueError``.

    Returns:
        The charge as a float, or ``None`` when the value is missing or
        contains no parsable number.
    """
    value = _find_tenure_value(details, "annualServiceCharge")
    if value is None:
        return None

    for token in re.findall(r"([\d,.]+)", str(value)):
        # Drop thousands separators and any sentence-ending full stop.
        cleaned = token.replace(",", "").rstrip(".")
        try:
            return float(cleaned)
        except ValueError:
            # Punctuation-only or malformed run (e.g. "1.2.3"): try the next.
            continue
    return None


def _parse_lease_left(details: dict[str, Any]) -> int | None:
    """Extract the remaining lease length from the tenure info.

    Reads the "lengthOfLease" tenure value and returns the first run of
    digits in it as an int, or ``None`` when the value is missing or contains
    no digits.
    """
    raw_lease = _find_tenure_value(details, "lengthOfLease")
    if raw_lease is None:
        return None

    first_number = re.search(r"(\d+)", str(raw_lease))
    return int(first_number.group(1)) if first_number else None


class FetchListingDetailsStep(Step):
    """Fetch a listing's details from the API and store them as a new listing.

    Listings that already exist in the repository are never refreshed.
    """

    async def needs_processing(self, listing_id: int) -> bool:
        """Return True only when the listing is not in the repository yet."""
        existing_listings = await self.listing_repository.get_listings(
            only_ids=[listing_id], query_parameters=self._query_params()
        )
        return not existing_listings

    async def process(self, listing_id: int) -> Listing:
        """Query the listing details, build the listing and upsert it.

        Returns:
            The stored listing when it already exists, otherwise the newly
            built ``BuyListing`` or ``RentListing`` (chosen by
            ``self.listing_type``).

        Raises:
            KeyError: A required field is missing from the API response.
            ValueError: The response's council tax section has no entries.
        """
        logger.debug(f"[{listing_id}] Fetching property details from API")
        logger.info(f"[{listing_id}] Fetching details...")

        existing_listings = await self.listing_repository.get_listings(
            only_ids=[listing_id], query_parameters=self._query_params()
        )
        if existing_listings:
            # listing exists, do not refresh
            logger.debug(f"[{listing_id}] Already exists, skipping refresh")
            return existing_listings[0]

        now = datetime.now()
        listing_details = await detail_query(listing_id)
        property_details = listing_details["property"]

        photos = property_details["photos"]
        council_tax_content = property_details["councilTaxInfo"]["content"]
        if not council_tax_content:
            # Fail with ValueError, as the other steps do for unusable data,
            # rather than an IndexError from indexing an empty list.
            raise ValueError(
                f"Listing {listing_id} has no council tax entries in the API response"
            )

        common_kwargs: dict[str, Any] = {
            "id": listing_id,
            "price": property_details["price"],
            "number_of_bedrooms": property_details["bedrooms"],
            "square_meters": None,  # populated later
            "agency": property_details["branch"]["brandName"],
            "council_tax_band": council_tax_content[0]["value"],
            "longitude": property_details["longitude"],
            "latitude": property_details["latitude"],
            "price_history_json": "{}",  # TODO: should upsert from existing
            "listing_site": ListingSite.RIGHTMOVE,
            "last_seen": now,
            "photo_thumbnail": photos[0]["thumbnailUrl"] if photos else None,
            "additional_info": listing_details,
        }

        listing: Listing
        if self.listing_type == ListingType.BUY:
            listing = BuyListing(
                **common_kwargs,
                service_charge=_parse_service_charge(listing_details),
                lease_left=_parse_lease_left(listing_details),
            )
        else:
            furnish_type = _parse_furnish_type(
                property_details.get("letFurnishType", "unknown")
            )
            # Read with .get like letFurnishType: a missing date becomes
            # available_from=None instead of failing the whole listing.
            available_from = _parse_available_from(
                property_details.get("letDateAvailable")
            )
            listing = RentListing(
                **common_kwargs,
                furnish_type=furnish_type,
                available_from=available_from,
            )

        await self.listing_repository.upsert_listings([listing])

        logger.info(
            f"[{listing_id}] Details fetched: £{listing.price}, "
            f"{listing.number_of_bedrooms}BR, {listing.agency}"
        )
        logger.debug(f"[{listing_id}] Details fetch complete")
        # TODO: dump to filesystem
        return listing


class FetchImagesStep(Step):
    """Download a listing's floorplan images to disk and record their paths."""

    async def needs_processing(self, listing_id: int) -> bool:
        """Return True when the listing exists and has no floorplan paths yet."""
        existing_listings = await self.listing_repository.get_listings(
            only_ids=[listing_id], query_parameters=self._query_params()
        )
        if not existing_listings:
            return False  # if listing doesn't exist, we can't process it
        listing = existing_listings[0]
        return len(listing.floorplan_image_paths) == 0

    async def process(self, listing_id: int) -> Listing:
        """Fetch every floorplan listed in ``additional_info`` and save the paths.

        Files are written to ``data/rs/<listing id>/floorplans/<file name>``.
        A file that is already on disk is not downloaded again, but its path
        is still recorded on the listing. A floorplan answering 404 is
        skipped; the remaining ones are still processed and persisted.

        Returns:
            The listing, with ``floorplan_image_paths`` updated and upserted
            (unchanged when the listing has no floorplans).

        Raises:
            ValueError: The listing is not in the repository.
            FloorplanDownloadError: A download answered with a status other
                than 200 or 404.
        """
        logger.debug(f"[{listing_id}] Fetching floorplan images")

        existing_listings = await self.listing_repository.get_listings(
            only_ids=[listing_id], query_parameters=self._query_params()
        )
        if not existing_listings:
            raise ValueError(f"Listing {listing_id} not found")
        listing = existing_listings[0]

        base_path = Path("data/rs/")
        all_floorplans = listing.additional_info.get("property", {}).get(
            "floorplans", []
        )

        if len(all_floorplans) == 0:
            logger.debug(f"[{listing_id}] No floorplans available")
            return listing

        downloaded = 0
        client_timeout = aiohttp.ClientTimeout(total=30)
        async with aiohttp.ClientSession() as session:
            for floorplan_obj in all_floorplans:
                url = floorplan_obj["url"]
                picname = Path(urlparse(url).path).name
                floorplan_path = Path(base_path, str(listing.id), "floorplans", picname)
                stored_path = str(floorplan_path)

                if floorplan_path.exists():
                    # Left over from an earlier, interrupted run: record the
                    # path so the file is not orphaned, without re-downloading.
                    if stored_path not in listing.floorplan_image_paths:
                        listing.floorplan_image_paths.append(stored_path)
                    continue

                async with session.get(url, timeout=client_timeout) as response:
                    if response.status == 404:
                        # Skip only this image; returning here would drop the
                        # paths already collected in this run.
                        logger.debug(f"[{listing_id}] Floorplan not found: {url}")
                        continue
                    if response.status != 200:
                        raise FloorplanDownloadError(url, response.status)
                    image_bytes = await response.read()

                floorplan_path.parent.mkdir(parents=True, exist_ok=True)
                floorplan_path.write_bytes(image_bytes)
                listing.floorplan_image_paths.append(stored_path)
                downloaded += 1

        await self.listing_repository.upsert_listings([listing])

        logger.info(f"[{listing_id}] Downloaded {downloaded} floorplan images")
        logger.debug(f"[{listing_id}] Image fetch complete")
        return listing


class DetectFloorplanStep(Step):
    """Estimate a listing's floor area by running OCR over its floorplan images."""

    # Limits how many OCR calls this step instance runs at the same time.
    ocr_semaphore: asyncio.Semaphore

    def __init__(self, listing_repository: ListingRepository, listing_type: ListingType = ListingType.RENT):
        super().__init__(listing_repository, listing_type)
        self.ocr_semaphore = asyncio.Semaphore(MAX_OCR_WORKERS)

    async def needs_processing(self, listing_id: int) -> bool:
        """Return True when the listing exists and ``square_meters`` is unset."""
        found = await self.listing_repository.get_listings(
            only_ids=[listing_id], query_parameters=self._query_params()
        )
        return bool(found) and found[0].square_meters is None

    async def process(self, listing_id: int) -> Listing:
        """Run OCR on each floorplan and store the largest estimate.

        ``square_meters`` is set to 0 when the listing has no floorplan
        images or when no image produced an estimate. The listing is upserted
        in every case.

        Raises:
            ValueError: The listing is not in the repository.
        """
        logger.debug(f"[{listing_id}] Running OCR on floorplans")

        found = await self.listing_repository.get_listings(
            only_ids=[listing_id], query_parameters=self._query_params()
        )
        if not found:
            raise ValueError(f"Listing {listing_id} does not exist")
        listing = found[0]

        if not listing.floorplan_image_paths:
            logger.debug(f"[{listing_id}] No floorplan images to process")
            listing.square_meters = 0
            await self.listing_repository.upsert_listings([listing])
            return listing

        estimates = []
        for image_path in listing.floorplan_image_paths:
            # OCR runs in a worker thread; the semaphore bounds how many run
            # at once.
            async with self.ocr_semaphore:
                sqm_estimate, _ = await asyncio.to_thread(
                    floorplan.calculate_ocr, image_path
                )
            if sqm_estimate is None:
                continue
            estimates.append(sqm_estimate)

        # try once, if we fail, keep as 0
        largest_sqm = max(estimates, default=0)
        listing.square_meters = largest_sqm
        await self.listing_repository.upsert_listings([listing])

        if largest_sqm > 0:
            logger.info(f"[{listing_id}] OCR detected {largest_sqm} sqm")
        else:
            logger.debug(f"[{listing_id}] OCR: no square meters detected")

        logger.debug(f"[{listing_id}] OCR complete")
        return listing