wrongmove/repositories/listing_repository.py
Viktor Barzin e5ce8c1201
Fix buy listing support: thread ListingType through processing pipeline
The listing processor was hardcoded to create RentListing objects and
query only the rentlisting table. Buy listings fetched from Rightmove
were stored in the wrong table with missing fields. This threads
ListingType through ListingProcessor and all Step subclasses so the
correct model (RentListing/BuyListing) is created, the correct table
is queried, and buy-specific fields (service_charge, lease_left) are
parsed from the API response and included in GeoJSON streaming output.
2026-02-07 23:34:08 +00:00

360 lines
13 KiB
Python

from datetime import datetime, timedelta
import logging
from typing import Any, Generator
from data_access import Listing
from models.listing import (
BuyListing,
FurnishType,
Listing as modelListing,
ListingType,
QueryParameters,
RentListing,
)
from sqlalchemy import Engine, func, select as sa_select
from sqlmodel import Session, select
from tqdm import tqdm
logger = logging.getLogger("uvicorn.error")
# Columns needed for GeoJSON streaming (excludes routing_info_json, additional_info)
STREAMING_COLUMNS = [
'id', 'price', 'number_of_bedrooms', 'square_meters',
'longitude', 'latitude', 'photo_thumbnail', 'last_seen',
'agency', 'price_history_json', 'available_from',
'service_charge', 'lease_left',
]
class ListingRepository:
engine: Engine
# Monthly rent prices in the UK are always below 20,000 GBP.
# Any listing priced at or above this threshold is treated as a purchase (buy) listing.
BUY_LISTING_PRICE_THRESHOLD: int = 20_000
def __init__(self, engine: Engine):
self.engine = engine
async def get_listings(
self,
query_parameters: QueryParameters | None = None,
only_ids: list[int] | None = None,
limit: int | None = None,
) -> list[modelListing]:
"""
Get all listings from the database.
"""
only_ids = only_ids or []
model = self._get_model_for_query(query_parameters)
query = select(model)
if only_ids:
query = query.where(model.id.in_(only_ids)) # type: ignore
query = self._apply_query_filters(query, model, query_parameters)
if limit:
query = query.limit(limit)
with Session(self.engine) as session:
rows = list(session.exec(query).all())
logging.debug(f"Found {len(rows)} listings")
return rows
def stream_listings(
self,
query_parameters: QueryParameters | None = None,
limit: int | None = None,
chunk_size: int = 100,
) -> Generator[modelListing, None, None]:
"""Yield listings one at a time for streaming.
Uses yield_per for memory-efficient iteration over large result sets.
Args:
query_parameters: Filtering parameters
limit: Maximum number of listings to yield
chunk_size: Number of rows to fetch at a time from the database
"""
model = self._get_model_for_query(query_parameters)
query = select(model)
query = self._apply_query_filters(query, model, query_parameters)
if limit:
query = query.limit(limit)
with Session(self.engine) as session:
for listing in session.exec(query).yield_per(chunk_size):
yield listing
def _get_model_for_query(
self, query_parameters: QueryParameters | None
) -> type[RentListing] | type[BuyListing]:
"""Get the appropriate model class based on query parameters."""
if query_parameters and query_parameters.listing_type == ListingType.BUY:
return BuyListing
return RentListing
def count_listings(self, query_parameters: QueryParameters | None = None) -> int:
"""Fast count for progress estimation."""
model = self._get_model_for_query(query_parameters)
query = sa_select(func.count(model.id))
query = self._apply_query_filters(query, model, query_parameters)
with Session(self.engine) as session:
return session.execute(query).scalar() or 0
def stream_listings_optimized(
self,
query_parameters: QueryParameters | None = None,
limit: int | None = None,
page_size: int = 100,
) -> Generator[dict, None, None]:
"""Stream listings with keyset pagination and column projection.
Uses keyset pagination for O(1) performance at any offset, and only
fetches columns needed for GeoJSON (excludes large JSON blobs).
Args:
query_parameters: Filtering parameters
limit: Maximum number of listings to yield
page_size: Number of rows to fetch per database round-trip
"""
model = self._get_model_for_query(query_parameters)
# Select only needed columns (excludes routing_info_json, additional_info)
columns = [
getattr(model, col) for col in STREAMING_COLUMNS if hasattr(model, col)
]
last_id: int | None = None
total_yielded = 0
while True:
if limit and total_yielded >= limit:
break
query = sa_select(*columns)
query = self._apply_query_filters(
query, model, query_parameters
)
# Keyset pagination: WHERE id > last_id (O(1) performance)
if last_id is not None:
query = query.where(model.id > last_id)
batch_limit = page_size
if limit:
batch_limit = min(page_size, limit - total_yielded)
query = query.order_by(model.id).limit(batch_limit)
with Session(self.engine) as session:
results = session.execute(query).fetchall()
if not results:
break
for row in results:
yield row._asdict()
last_id = row.id
total_yielded += 1
if len(results) < page_size:
break
def _apply_query_filters(
self,
query: Any,
model: type[RentListing] | type[BuyListing],
query_parameters: QueryParameters | None = None,
) -> Any:
"""Apply WHERE clauses from query parameters to a query.
Works with both SQLModel select() and raw SQLAlchemy sa_select() queries,
since both support the .where() interface.
Args:
query: A SQLModel or SQLAlchemy select query
model: The listing model class (RentListing or BuyListing)
query_parameters: Optional filtering parameters
Returns:
The query with WHERE clauses applied
"""
if query_parameters is None:
return query
query = query.where(
model.number_of_bedrooms.between(
query_parameters.min_bedrooms, query_parameters.max_bedrooms
),
model.price.between(query_parameters.min_price, query_parameters.max_price),
)
if query_parameters.min_sqm is not None:
query = query.where(model.square_meters >= query_parameters.min_sqm)
if query_parameters.furnish_types and model == RentListing:
query = query.where(model.furnish_type.in_(query_parameters.furnish_types))
if (
model == RentListing
and query_parameters.let_date_available_from is not None
):
query = query.where(
model.available_from >= query_parameters.let_date_available_from
)
if query_parameters.last_seen_days is not None:
last_seen_threshold = datetime.now() - timedelta(
days=query_parameters.last_seen_days
)
query = query.where(model.last_seen >= last_seen_threshold)
return query
async def upsert_listings(
self,
listings: list[modelListing],
) -> list[modelListing]:
"""
Upsert listings into the database.
"""
models = []
with Session(self.engine) as session:
for listing in listings:
session.merge(listing)
models.append(listing)
session.commit()
return models
async def upsert_listings_legacy(
self,
listings: list[Listing],
) -> list[modelListing]:
"""Upsert legacy Listing objects into the database.
.. deprecated::
This method converts legacy data_access.Listing objects to SQLModel
entities. Use upsert_listings() with RentListing/BuyListing directly.
Legacy Listing objects from filesystem-based data may contain malformed
or incomplete data, so conversion errors are logged and skipped rather
than aborting the entire batch.
"""
models = []
failed_to_upsert = []
with Session(self.engine) as session:
for listing in tqdm(listings, desc="Upserting listings"):
# Convert legacy Listing to the appropriate SQLModel entity
try:
model_listing = await self._get_concrete_listing(listing)
except Exception as e:
# Legacy Listing -> model conversion may fail for malformed data
# (e.g. missing required fields, invalid types). Log and skip.
logger.error(f"Error converting listing {listing.identifier}: {e}")
failed_to_upsert.append(listing)
continue
session.merge(model_listing)
models.append(model_listing)
session.commit()
if failed_to_upsert:
logger.warning(f"Failed to upsert {len(failed_to_upsert)} listings.")
return models
@staticmethod
def _parse_furnish_type(listing: Listing) -> FurnishType:
"""Extract and normalize the furnish type from a legacy Listing's detail object.
Handles missing/null detailobject, missing property key, missing or null
letFurnishType value, and normalizes "landlord" variants to ASK_LANDLORD.
Args:
listing: A legacy data_access.Listing object
Returns:
The parsed FurnishType enum value
"""
if (
listing.detailobject is None
or listing.detailobject.get("property") is None
or listing.detailobject["property"].get("letFurnishType") is None
):
return FurnishType.UNKNOWN
furnish_type_str = listing.detailobject["property"]["letFurnishType"]
if furnish_type_str is None:
return FurnishType.UNKNOWN
elif "landlord" in furnish_type_str.lower():
furnish_type_str = "ask landlord"
else:
furnish_type_str = furnish_type_str.lower()
return FurnishType(furnish_type_str)
async def _get_concrete_listing(
self,
listing: Listing,
) -> modelListing:
now = datetime.now()
furnish_type = self._parse_furnish_type(listing)
if listing.price < self.BUY_LISTING_PRICE_THRESHOLD:
model_listing = RentListing(
id=listing.identifier,
price=listing.price,
number_of_bedrooms=listing.bedrooms,
square_meters=await listing.sqm_ocr(),
agency=listing.agency,
council_tax_band=listing.councilTaxBand,
longitude=listing.longitude,
latitude=listing.latitude,
price_history_json=modelListing.serialize_price_history(
listing.priceHistory
),
listing_site=listing.listing_site,
last_seen=now,
photo_thumbnail=listing.photoThumbnail,
furnish_type=furnish_type,
available_from=listing.letDateAvailable,
additional_info=listing.detailobject,
)
else:
model_listing = BuyListing(
id=listing.identifier,
price=listing.price,
number_of_bedrooms=listing.bedrooms,
square_meters=await listing.sqm_ocr(),
agency=listing.agency,
council_tax_band=listing.councilTaxBand,
longitude=listing.longitude,
latitude=listing.latitude,
price_history_json=modelListing.serialize_price_history(
listing.priceHistory
),
listing_site=listing.listing_site,
last_seen=now,
photo_thumbnail=listing.photoThumbnail,
service_charge=listing.serviceCharge,
additional_info=listing.detailobject,
)
return model_listing
def get_listing_ids(
self,
listing_type: ListingType = ListingType.RENT,
) -> set[int]:
"""Get all listing IDs from the database (ID-only projection).
Much faster than get_listings() when only IDs are needed for
filtering against API results.
"""
model = RentListing if listing_type == ListingType.RENT else BuyListing
with Session(self.engine) as session:
result = session.execute(sa_select(model.id))
return {row[0] for row in result.fetchall()}
async def mark_seen(self, listing_id: int, listing_type: ListingType = ListingType.RENT) -> None:
query_params = QueryParameters(listing_type=listing_type)
listings = await self.get_listings(only_ids=[listing_id], query_parameters=query_params)
if len(listings) == 0:
return
listing = listings[0]
now = datetime.now()
listing.last_seen = now
await self.upsert_listings([listing])