from datetime import datetime, timedelta
import logging
from typing import Generator

from data_access import Listing
from models.listing import (
    BuyListing,
    FurnishType,
    Listing as modelListing,
    ListingType,
    QueryParameters,
    RentListing,
)
from sqlalchemy import Engine, func, select as sa_select
from sqlmodel import Session, select
from sqlmodel.sql.expression import SelectOfScalar
from tqdm import tqdm

logger = logging.getLogger("uvicorn.error")

# Columns needed for GeoJSON streaming (excludes routing_info_json, additional_info)
STREAMING_COLUMNS = [
    "id",
    "price",
    "number_of_bedrooms",
    "square_meters",
    "longitude",
    "latitude",
    "photo_thumbnail",
    "last_seen",
    "agency",
    "price_history_json",
    "available_from",
]


class ListingRepository:
    """Database access layer for rent and buy listings."""

    engine: Engine
    # Listings priced below this threshold are stored as rent listings;
    # anything at or above it is treated as a buy listing.
    buy_listing_price_threshold: int = 20_000

    def __init__(self, engine: Engine):
        self.engine = engine

    async def get_listings(
        self,
        query_parameters: QueryParameters | None = None,
        only_ids: list[int] | None = None,
        limit: int | None = None,
    ) -> list[modelListing]:
        """Return the listings matching the given filters.

        Args:
            query_parameters: Filtering parameters. When omitted, rent
                listings are queried without any filter.
            only_ids: If non-empty, restrict the result to these listing ids.
            limit: Maximum number of rows to return (a falsy value means
                no limit).
        """
        only_ids = only_ids or []
        model = RentListing  # if no query params, default to renting listings
        if query_parameters:
            model = (
                RentListing
                if query_parameters.listing_type == ListingType.RENT
                else BuyListing
            )

        query = select(model)
        if only_ids:
            query = query.where(model.id.in_(only_ids))  # type: ignore
        query = self._add_where_from_query_parameters(query, model, query_parameters)
        if limit:
            query = query.limit(limit)

        with Session(self.engine) as session:
            rows = list(session.exec(query).all())
            # Use the module logger (not the root logger) like the rest of the file.
            logger.debug("Found %d listings", len(rows))
        return rows

    def stream_listings(
        self,
        query_parameters: QueryParameters | None = None,
        limit: int | None = None,
        chunk_size: int = 100,
    ) -> Generator[modelListing, None, None]:
        """Yield listings one at a time for streaming.

        Iterates the result with ``yield_per(chunk_size)`` instead of loading
        every row up front.

        Args:
            query_parameters: Filtering parameters
            limit: Maximum number of listings to yield
            chunk_size: Number of rows to fetch at a time from the database
        """
        model = RentListing  # if no query params, default to renting listings
        if query_parameters:
            model = (
                RentListing
                if query_parameters.listing_type == ListingType.RENT
                else BuyListing
            )

        query = select(model)
        query = self._add_where_from_query_parameters(query, model, query_parameters)
        if limit:
            query = query.limit(limit)

        with Session(self.engine) as session:
            for listing in session.exec(query).yield_per(chunk_size):
                yield listing

    def _get_model_for_query(
        self, query_parameters: QueryParameters | None
    ) -> type[RentListing] | type[BuyListing]:
        """Get the appropriate model class based on query parameters."""
        if query_parameters and query_parameters.listing_type == ListingType.BUY:
            return BuyListing
        return RentListing

    def count_listings(self, query_parameters: QueryParameters | None = None) -> int:
        """Return the number of listings matching the filters (0 if none)."""
        model = self._get_model_for_query(query_parameters)
        query = sa_select(func.count(model.id))
        query = self._add_where_from_query_parameters_raw(query, model, query_parameters)
        with Session(self.engine) as session:
            return session.execute(query).scalar() or 0

    def stream_listings_optimized(
        self,
        query_parameters: QueryParameters | None = None,
        limit: int | None = None,
        page_size: int = 100,
    ) -> Generator[dict, None, None]:
        """Stream listings as dicts with keyset pagination and column projection.

        Each page is fetched with ``WHERE id > last_id ORDER BY id LIMIT n``
        rather than an OFFSET, and only the columns named in
        ``STREAMING_COLUMNS`` that exist on the model are selected.

        Args:
            query_parameters: Filtering parameters
            limit: Maximum number of listings to yield
            page_size: Number of rows to fetch per database round-trip
        """
        model = self._get_model_for_query(query_parameters)

        # Select only needed columns; columns a model lacks are skipped.
        columns = [
            getattr(model, col) for col in STREAMING_COLUMNS if hasattr(model, col)
        ]

        # The filtered base query is identical for every page, so build it once.
        # This also pins the `last_seen` cutoff to a single timestamp for the
        # whole stream instead of recomputing it per page.
        base_query = self._add_where_from_query_parameters_raw(
            sa_select(*columns), model, query_parameters
        )

        last_id: int | None = None
        total_yielded = 0

        while True:
            if limit and total_yielded >= limit:
                break

            query = base_query
            # Keyset pagination: continue after the last id already yielded.
            if last_id is not None:
                query = query.where(model.id > last_id)

            batch_limit = page_size
            if limit:
                batch_limit = min(page_size, limit - total_yielded)
            query = query.order_by(model.id).limit(batch_limit)

            # Materialize the page before yielding so the session is not held
            # open while the consumer processes rows.
            with Session(self.engine) as session:
                results = session.execute(query).fetchall()

            if not results:
                break

            for row in results:
                yield row._asdict()
                last_id = row.id
                total_yielded += 1

            # A short page means the table has been exhausted.
            if len(results) < page_size:
                break

    def _apply_query_filters(
        self,
        query,
        model: type[RentListing] | type[BuyListing],
        query_parameters: QueryParameters | None,
    ):
        """Shared WHERE-clause builder used by both select flavours.

        Returns ``query`` unchanged when ``query_parameters`` is None.
        Furnish-type and availability filters are only applied to
        ``RentListing`` queries.
        """
        if query_parameters is None:
            return query

        query = query.where(
            model.number_of_bedrooms.between(
                query_parameters.min_bedrooms, query_parameters.max_bedrooms
            ),
            model.price.between(query_parameters.min_price, query_parameters.max_price),
        )

        if query_parameters.min_sqm is not None:
            query = query.where(model.square_meters >= query_parameters.min_sqm)

        # `model` is a class object, so compare by identity; isinstance() on a
        # class would always be False here.
        is_rent = model is RentListing

        if is_rent and query_parameters.furnish_types:
            query = query.where(model.furnish_type.in_(query_parameters.furnish_types))

        if is_rent and query_parameters.let_date_available_from is not None:
            query = query.where(
                model.available_from >= query_parameters.let_date_available_from
            )

        if query_parameters.last_seen_days is not None:
            last_seen_threshold = datetime.now() - timedelta(
                days=query_parameters.last_seen_days
            )
            query = query.where(model.last_seen >= last_seen_threshold)

        return query

    def _add_where_from_query_parameters_raw(
        self,
        query,
        model: type[RentListing] | type[BuyListing],
        query_parameters: QueryParameters | None = None,
    ):
        """Add WHERE clauses from query parameters (for raw SQLAlchemy selects)."""
        return self._apply_query_filters(query, model, query_parameters)

    def _add_where_from_query_parameters(
        self,
        query: SelectOfScalar[modelListing],
        model: type[RentListing] | type[BuyListing],
        query_parameters: QueryParameters | None = None,
    ) -> SelectOfScalar[modelListing]:
        """Add WHERE clauses from query parameters (for SQLModel selects)."""
        return self._apply_query_filters(query, model, query_parameters)

    async def upsert_listings(
        self,
        listings: list[modelListing],
    ) -> list[modelListing]:
        """Upsert listings into the database.

        Every listing is merged into one session and committed together.
        Returns a new list containing the listings that were passed in.
        """
        with Session(self.engine) as session:
            for listing in listings:
                session.merge(listing)
            session.commit()
        return list(listings)

    async def upsert_listings_legacy(
        self,
        listings: list[Listing],
    ) -> list[modelListing]:
        """Convert scraped listings to DB models and upsert them.

        Listings whose conversion raises are skipped and counted; the rest
        are merged and committed in a single transaction.

        Returns:
            The model instances that were converted and merged.
        """
        models = []
        failed_to_upsert = []
        with Session(self.engine) as session:
            for listing in tqdm(listings, desc="Upserting listings"):
                # Convert Listing to modelListing
                try:
                    model_listing = await self._get_concrete_listing(listing)
                except Exception as e:
                    # Best effort: a listing that cannot be converted is
                    # skipped and recorded; there is no retry.
                    # TODO(review): find out why conversions fail so often.
                    print(f"Error converting listing {listing.identifier}: {e}")
                    failed_to_upsert.append(listing)
                    continue
                session.merge(model_listing)
                models.append(model_listing)
            session.commit()
        print(f"Failed to upsert {len(failed_to_upsert)} listings.")
        return models

    async def _get_concrete_listing(
        self,
        listing: Listing,
    ) -> modelListing:
        """Build a RentListing or BuyListing model from a scraped listing.

        The model type is chosen by comparing ``listing.price`` with
        ``buy_listing_price_threshold``.

        Raises:
            ValueError: presumably, if the normalized furnish type is not a
                valid ``FurnishType`` value (depends on how that enum is
                declared).
        """
        now = datetime.now()

        # Dig out property.letFurnishType, tolerating missing levels.
        raw_furnish = None
        if listing.detailobject is not None:
            property_info = listing.detailobject.get("property")
            if property_info is not None:
                raw_furnish = property_info.get("letFurnishType")

        if raw_furnish is None:
            furnish_type_str = "unknown"
        elif "landlord" in raw_furnish.lower():
            furnish_type_str = "ask landlord"
        else:
            furnish_type_str = raw_furnish.lower()
        furnish_type = FurnishType(furnish_type_str)

        # Fields shared by both model types.
        common_fields = dict(
            id=listing.identifier,
            price=listing.price,
            number_of_bedrooms=listing.bedrooms,
            square_meters=await listing.sqm_ocr(),
            agency=listing.agency,
            council_tax_band=listing.councilTaxBand,
            longitude=listing.longitude,
            latitude=listing.latitude,
            price_history_json=modelListing.serialize_price_history(
                listing.priceHistory
            ),
            listing_site=listing.listing_site,
            last_seen=now,
            photo_thumbnail=listing.photoThumbnail,
            additional_info=listing.detailobject,
        )

        if listing.price < self.buy_listing_price_threshold:
            return RentListing(
                **common_fields,
                furnish_type=furnish_type,
                available_from=listing.letDateAvailable,
            )
        return BuyListing(
            **common_fields,
            service_charge=listing.serviceCharge,
        )

    async def mark_seen(self, listing_id: int) -> None:
        """Set ``last_seen`` to now for the listing with the given id, if found."""
        # NOTE(review): get_listings() without query parameters only queries
        # RentListing, so buy listings appear never to be marked - confirm
        # whether that is intended.
        listings = await self.get_listings(only_ids=[listing_id])
        if not listings:
            return
        listing = listings[0]
        listing.last_seen = datetime.now()
        await self.upsert_listings([listing])