Fix buy listing support: thread ListingType through processing pipeline

The listing processor was hardcoded to create RentListing objects and
query only the rentlisting table. Buy listings fetched from Rightmove
were stored in the wrong table with missing fields. This threads
ListingType through ListingProcessor and all Step subclasses so the
correct model (RentListing/BuyListing) is created, the correct table
is queried, and buy-specific fields (service_charge, lease_left) are
parsed from the API response and included in GeoJSON streaming output.
This commit is contained in:
Viktor Barzin 2026-02-07 23:34:08 +00:00
parent 5e48a26958
commit e5ce8c1201
No known key found for this signature in database
GPG key ID: 0EB088298288D958
6 changed files with 116 additions and 41 deletions

View file

@ -6,9 +6,19 @@ from datetime import datetime
import logging
import multiprocessing
from pathlib import Path
import re
from typing import Any
from urllib.parse import urlparse
import aiohttp
from models.listing import FurnishType, Listing, ListingSite, RentListing
from models.listing import (
BuyListing,
FurnishType,
Listing,
ListingSite,
ListingType,
QueryParameters,
RentListing,
)
from rec import floorplan
from rec.query import detail_query
from repositories.listing_repository import ListingRepository
@ -56,15 +66,16 @@ class ListingProcessor:
"DetectFloorplanStep": "ocr",
}
def __init__(self, listing_repository: ListingRepository):
def __init__(self, listing_repository: ListingRepository, listing_type: ListingType = ListingType.RENT):
self.semaphore = asyncio.Semaphore(20)
self.listing_repository = listing_repository
self.listing_type = listing_type
# Register new processing steps here
# Order is important
self.process_steps = [
FetchListingDetailsStep(listing_repository),
FetchImagesStep(listing_repository),
DetectFloorplanStep(listing_repository),
FetchListingDetailsStep(listing_repository, listing_type),
FetchImagesStep(listing_repository, listing_type),
DetectFloorplanStep(listing_repository, listing_type),
]
async def process_listing(
@ -72,7 +83,7 @@ class ListingProcessor:
listing_id: int,
on_step_complete: Callable[[str], None] | None = None,
) -> Listing | None:
await self.listing_repository.mark_seen(listing_id)
await self.listing_repository.mark_seen(listing_id, self.listing_type)
listing = None
for step in self.process_steps:
if await step.needs_processing(listing_id):
@ -94,9 +105,15 @@ class ListingProcessor:
class Step:
listing_repository: ListingRepository
listing_type: ListingType
def __init__(self, listing_repository: ListingRepository):
def __init__(self, listing_repository: ListingRepository, listing_type: ListingType = ListingType.RENT):
self.listing_repository = listing_repository
self.listing_type = listing_type
def _query_params(self) -> QueryParameters:
"""Build minimal QueryParameters for ID-based lookups in the correct table."""
return QueryParameters(listing_type=self.listing_type)
@abstractmethod
async def process(self, listing_id: int) -> Listing: ...
@ -106,10 +123,36 @@ class Step:
return True
def _parse_service_charge(details: dict[str, Any]) -> float | None:
"""Parse annual service charge from the tenure info in API response."""
tenure_content = (
details.get("property", {}).get("tenureInfo", {}).get("content", [])
)
for item in tenure_content:
if item.get("type") == "annualServiceCharge":
matches = re.findall(r"([\d,.]+)", str(item.get("value", "")))
if matches:
return float(matches[0].replace(",", ""))
return None
def _parse_lease_left(details: dict[str, Any]) -> int | None:
"""Parse remaining lease years from the tenure info in API response."""
tenure_content = (
details.get("property", {}).get("tenureInfo", {}).get("content", [])
)
for item in tenure_content:
if item.get("type") == "lengthOfLease":
matches = re.findall(r"(\d+)", str(item.get("value", "")))
if matches:
return int(matches[0])
return None
class FetchListingDetailsStep(Step):
async def needs_processing(self, listing_id: int) -> bool:
existing_listings = await self.listing_repository.get_listings(
only_ids=[listing_id]
only_ids=[listing_id], query_parameters=self._query_params()
)
if len(existing_listings) == 0:
return True
@ -120,7 +163,7 @@ class FetchListingDetailsStep(Step):
logger.info(f"[{listing_id}] Fetching details...")
existing_listings = await self.listing_repository.get_listings(
only_ids=[listing_id]
only_ids=[listing_id], query_parameters=self._query_params()
)
now = datetime.now()
if len(existing_listings) > 0:
@ -130,16 +173,8 @@ class FetchListingDetailsStep(Step):
listing_details = await detail_query(listing_id)
furnish_type = _parse_furnish_type(
listing_details["property"].get("letFurnishType", "unknown")
)
available_from = _parse_available_from(
listing_details["property"]["letDateAvailable"]
)
photos = listing_details["property"]["photos"]
listing = RentListing( # TODO: should pick based on price?
common_kwargs = dict(
id=listing_id,
price=listing_details["property"]["price"],
number_of_bedrooms=listing_details["property"]["bedrooms"],
@ -154,10 +189,29 @@ class FetchListingDetailsStep(Step):
listing_site=ListingSite.RIGHTMOVE,
last_seen=now,
photo_thumbnail=photos[0]["thumbnailUrl"] if len(photos) > 0 else None,
furnish_type=furnish_type,
available_from=available_from,
additional_info=listing_details,
)
listing: Listing
if self.listing_type == ListingType.BUY:
listing = BuyListing(
**common_kwargs,
service_charge=_parse_service_charge(listing_details),
lease_left=_parse_lease_left(listing_details),
)
else:
furnish_type = _parse_furnish_type(
listing_details["property"].get("letFurnishType", "unknown")
)
available_from = _parse_available_from(
listing_details["property"]["letDateAvailable"]
)
listing = RentListing(
**common_kwargs,
furnish_type=furnish_type,
available_from=available_from,
)
await self.listing_repository.upsert_listings([listing])
logger.info(
@ -172,7 +226,7 @@ class FetchListingDetailsStep(Step):
class FetchImagesStep(Step):
async def needs_processing(self, listing_id: int) -> bool:
existing_listings = await self.listing_repository.get_listings(
only_ids=[listing_id]
only_ids=[listing_id], query_parameters=self._query_params()
)
if len(existing_listings) == 0:
return False # if listing doesn't exist, we can't process it
@ -183,7 +237,7 @@ class FetchImagesStep(Step):
logger.debug(f"[{listing_id}] Fetching floorplan images")
existing_listings = await self.listing_repository.get_listings(
only_ids=[listing_id]
only_ids=[listing_id], query_parameters=self._query_params()
)
if len(existing_listings) == 0:
raise ValueError(f"Listing {listing_id} not found")
@ -228,12 +282,14 @@ class FetchImagesStep(Step):
class DetectFloorplanStep(Step):
ocr_semaphore: asyncio.Semaphore
def __init__(self, listing_repository: ListingRepository):
super().__init__(listing_repository)
def __init__(self, listing_repository: ListingRepository, listing_type: ListingType = ListingType.RENT):
super().__init__(listing_repository, listing_type)
self.ocr_semaphore = asyncio.Semaphore(MAX_OCR_WORKERS)
async def needs_processing(self, listing_id: int) -> bool:
listings = await self.listing_repository.get_listings(only_ids=[listing_id])
listings = await self.listing_repository.get_listings(
only_ids=[listing_id], query_parameters=self._query_params()
)
if len(listings) == 0:
return False
return listings[0].square_meters is None
@ -241,7 +297,9 @@ class DetectFloorplanStep(Step):
async def process(self, listing_id: int) -> Listing:
logger.debug(f"[{listing_id}] Running OCR on floorplans")
listings = await self.listing_repository.get_listings(only_ids=[listing_id])
listings = await self.listing_repository.get_listings(
only_ids=[listing_id], query_parameters=self._query_params()
)
if len(listings) == 0:
raise ValueError(f"Listing {listing_id} does not exist")
listing = listings[0]