diff --git a/crawler/api/app.py b/crawler/api/app.py index c894b19..a96679e 100644 --- a/crawler/api/app.py +++ b/crawler/api/app.py @@ -146,7 +146,7 @@ async def stream_listing_geojson( for row in repository.stream_listings_optimized( query_parameters, limit=limit, page_size=batch_size ): - feature = convert_row_to_geojson(row) + feature = convert_row_to_geojson(row, query_parameters.listing_type.value) batch.append(feature) count += 1 diff --git a/crawler/frontend/src/components/FilterPanel.tsx b/crawler/frontend/src/components/FilterPanel.tsx index 16b39e6..dc9ab3a 100644 --- a/crawler/frontend/src/components/FilterPanel.tsx +++ b/crawler/frontend/src/components/FilterPanel.tsx @@ -101,6 +101,24 @@ export function FilterPanel({ onSubmit, isLoading, listingCount }: FilterPanelPr }, }); + // Watch listing_type to make filters type-aware + const watchedListingType = form.watch('listing_type'); + + // Update price defaults when listing type changes + useEffect(() => { + if (watchedListingType === ListingType.BUY) { + form.setValue('min_price', 300000); + form.setValue('max_price', 600000); + } else { + form.setValue('min_price', 2000); + form.setValue('max_price', 3000); + } + // Clear furnish types when switching to BUY + if (watchedListingType === ListingType.BUY) { + setSelectedFurnishTypes([]); + } + }, [watchedListingType, form]); + const handleFormSubmit = (action: 'fetch-data' | 'visualize') => { return form.handleSubmit((values) => { const params: ParameterValues = { @@ -400,29 +418,31 @@ export function FilterPanel({ onSubmit, isLoading, listingCount }: FilterPanelPr )} /> -
- Furnishing -
- {[ - { value: FurnishType.FURNISHED, label: 'Furnished' }, - { value: FurnishType.PART_FURNISHED, label: 'Part' }, - { value: FurnishType.UNFURNISHED, label: 'Unfurn.' }, - ].map((option) => ( - - ))} + {watchedListingType === ListingType.RENT && ( +
+ Furnishing +
+ {[ + { value: FurnishType.FURNISHED, label: 'Furnished' }, + { value: FurnishType.PART_FURNISHED, label: 'Part' }, + { value: FurnishType.UNFURNISHED, label: 'Unfurn.' }, + ].map((option) => ( + + ))} +
-
+ )}
@@ -456,33 +476,32 @@ export function FilterPanel({ onSubmit, isLoading, listingCount }: FilterPanelPr - {/* Availability */} + {/* Availability / Recency */} - Availability + {watchedListingType === ListingType.RENT ? 'Availability' : 'Recency'}
- ( - - Available From - - - - - Rental listings only - - - )} - /> + {watchedListingType === ListingType.RENT && ( + ( + + Available From + + + + + )} + /> + )}
£{property.total_price.toLocaleString()} - /mo + {property.listing_type !== 'BUY' && ( + /mo + )}
{priceIndicator && ( @@ -119,7 +121,9 @@ export function PropertyCard({
£{property.total_price.toLocaleString()} - /mo + {property.listing_type !== 'BUY' && ( + /mo + )}
{priceIndicator && ( @@ -145,10 +149,18 @@ export function PropertyCard({ £{property.qmprice}/m²
-
- - Available {property.available_from} -
+ {property.listing_type !== 'BUY' && property.available_from && ( +
+ + Available {property.available_from} +
+ )} + {property.listing_type === 'BUY' && ( +
+ + Seen {lastSeenDays}d ago +
+ )}
{/* Agency and last seen */} diff --git a/crawler/frontend/src/types/index.ts b/crawler/frontend/src/types/index.ts new file mode 100644 index 0000000..f538af3 --- /dev/null +++ b/crawler/frontend/src/types/index.ts @@ -0,0 +1,71 @@ +// TypeScript types for the frontend application + +// GeoJSON types +export interface PropertyPriceHistory { + id: number; + price: number; + last_seen: string; +} + +export interface PropertyProperties { + url: string; + city: string; + country: string; + qm: number; + qmprice: number; + total_price: number; + rooms: number; + agency: string; + available_from: string; + last_seen: string; + photo_thumbnail: string; + price_history: PropertyPriceHistory[]; + listing_type?: 'RENT' | 'BUY'; +} + +export interface PropertyFeature { + type: 'Feature'; + geometry: { + type: 'Point'; + coordinates: [number, number]; // [longitude, latitude] + }; + properties: PropertyProperties; +} + +export interface GeoJSONFeatureCollection { + type: 'FeatureCollection'; + features: PropertyFeature[]; +} + +// Task status types +export enum TaskStatus { + PENDING = 'PENDING', + STARTED = 'STARTED', + SUCCESS = 'SUCCESS', + FAILURE = 'FAILURE', + REVOKED = 'REVOKED', +} + +export interface TaskStatusResponse { + status: TaskStatus; + result: string; // JSON string containing { progress: number } +} + +export interface TaskResult { + progress: number; +} + +export interface RefreshListingsResponse { + task_id: string; +} + +// API error type +export class ApiError extends Error { + constructor( + message: string, + public statusCode: number + ) { + super(message); + this.name = 'ApiError'; + } +} diff --git a/crawler/repositories/listing_repository.py b/crawler/repositories/listing_repository.py index c2bcaf8..187998c 100644 --- a/crawler/repositories/listing_repository.py +++ b/crawler/repositories/listing_repository.py @@ -1,5 +1,6 @@ from datetime import datetime, timedelta import logging +from typing import Generator from data_access import Listing from 
models.listing import ( BuyListing, @@ -9,13 +10,20 @@ from models.listing import ( QueryParameters, RentListing, ) -from sqlalchemy import Engine +from sqlalchemy import Engine, func, select as sa_select from sqlmodel import Session, select from sqlmodel.sql.expression import SelectOfScalar from tqdm import tqdm logger = logging.getLogger("uvicorn.error") +# Columns needed for GeoJSON streaming (excludes routing_info_json, additional_info) +STREAMING_COLUMNS = [ + 'id', 'price', 'number_of_bedrooms', 'square_meters', + 'longitude', 'latitude', 'photo_thumbnail', 'last_seen', + 'agency', 'price_history_json', 'available_from' +] + class ListingRepository: engine: Engine @@ -58,6 +66,147 @@ class ListingRepository: logging.debug(f"Found {len(rows)} listings") return rows + def stream_listings( + self, + query_parameters: QueryParameters | None = None, + limit: int | None = None, + chunk_size: int = 100, + ) -> Generator[modelListing, None, None]: + """Yield listings one at a time for streaming. + + Uses yield_per for memory-efficient iteration over large result sets. 
def stream_listings_optimized(
    self,
    query_parameters: QueryParameters | None = None,
    limit: int | None = None,
    page_size: int = 100,
) -> Generator[dict, None, None]:
    """Stream listings as dicts using keyset pagination and column projection.

    Rows are fetched page by page with ``WHERE id > last_id ORDER BY id
    LIMIT n`` instead of OFFSET, and only the columns named in
    ``STREAMING_COLUMNS`` that exist on the selected model are projected.

    Args:
        query_parameters: Filtering parameters; ``None`` applies no filters
            and selects the model chosen by ``_get_model_for_query``.
        limit: Maximum number of listings to yield. A falsy value
            (``None`` or ``0``) means "no limit".
        page_size: Number of rows to fetch per database round-trip.

    Yields:
        One dict per listing, keyed by the projected column names.
    """
    model = self._get_model_for_query(query_parameters)

    # Select only needed columns (excludes routing_info_json, additional_info)
    columns = [
        getattr(model, col) for col in STREAMING_COLUMNS if hasattr(model, col)
    ]

    # Build the filtered, ordered query once. The filter helper derives its
    # last_seen cutoff from datetime.now(), so rebuilding it for every page
    # would both repeat invariant work and let the cutoff drift while the
    # stream is being consumed.
    base_query = self._add_where_from_query_parameters_raw(
        sa_select(*columns), model, query_parameters
    ).order_by(model.id)

    last_id: int | None = None
    total_yielded = 0

    while True:
        if limit and total_yielded >= limit:
            break

        # Never ask the database for more rows than the caller still wants.
        batch_limit = page_size
        if limit:
            batch_limit = min(page_size, limit - total_yielded)

        page_query = base_query
        if last_id is not None:
            # Keyset pagination: resume right after the last row we yielded.
            page_query = page_query.where(model.id > last_id)
        page_query = page_query.limit(batch_limit)

        # The session is closed before yielding so no connection is held
        # while the consumer processes the page.
        with Session(self.engine) as session:
            results = session.execute(page_query).fetchall()

        if not results:
            break

        for row in results:
            yield row._asdict()
            last_id = row.id
            total_yielded += 1

        # A short page means the table is exhausted (or the limit was hit).
        if len(results) < page_size:
            break
def _isoformat_or_str(value: Any) -> str:
    """Return ``value.isoformat()`` when available, otherwise ``str(value)``."""
    if hasattr(value, "isoformat"):
        return value.isoformat()
    return str(value)


def _parse_price_history(raw: Any) -> list[dict[str, Any]]:
    """Normalise a stored price history into a list of plain dicts.

    Accepts either a JSON-encoded string or an already-decoded sequence.
    Entries that are not dicts are skipped; missing keys become ``None``.
    """
    if not raw:
        return []
    entries = json.loads(raw) if isinstance(raw, (str, bytes, bytearray)) else raw
    return [
        {
            "first_seen": entry.get("first_seen"),
            "last_seen": entry.get("last_seen"),
            "price": entry.get("price"),
        }
        for entry in entries or []
        if isinstance(entry, dict)
    ]


def convert_row_to_geojson(
    row: dict[str, Any], listing_type: str = "RENT"
) -> dict[str, Any]:
    """Convert a projected row dict to a GeoJSON Feature dict.

    Args:
        row: Mapping of column name to value. ``id``, ``price``,
            ``number_of_bedrooms``, ``longitude``, ``latitude`` and
            ``last_seen`` are required; ``square_meters``,
            ``photo_thumbnail``, ``agency``, ``price_history_json`` and
            ``available_from`` are optional.
        listing_type: Label copied verbatim into
            ``properties["listing_type"]``.

    Returns:
        A dict with ``type``, ``properties`` and ``geometry`` keys.

    Raises:
        KeyError: If a required key is missing from ``row``.
        json.JSONDecodeError: If ``price_history_json`` is a string that is
            not valid JSON.
    """
    sqm = row.get("square_meters")
    price = row["price"]

    # available_from is optional; keep None as None rather than "None".
    available_from = row.get("available_from")
    available_from_str = (
        None if available_from is None else _isoformat_or_str(available_from)
    )

    return {
        "type": "Feature",
        "properties": {
            "listing_type": listing_type,
            "city": "London",
            "country": "United Kingdom",
            "qm": sqm,
            # Guard against a missing/zero area and a missing price.
            "qmprice": (
                round(price / sqm, 2) if sqm and price is not None else None
            ),
            "rooms": row["number_of_bedrooms"],
            "total_price": price,
            "url": f"https://www.rightmove.co.uk/properties/{row['id']}",
            "photo_thumbnail": row.get("photo_thumbnail"),
            "last_seen": _isoformat_or_str(row["last_seen"]),
            "price_history": _parse_price_history(row.get("price_history_json")),
            "agency": row.get("agency"),
            "available_from": available_from_str,
        },
        "geometry": {
            "coordinates": [row["longitude"], row["latitude"]],
            "type": "Point",
        },
    }
def convert_to_geojson_feature(listing: RentListing | BuyListing) -> dict[str, Any]:
    """Convert a single listing model instance to a GeoJSON Feature dict.

    Args:
        listing: A RentListing or BuyListing model instance.

    Returns:
        A dict with ``type``, ``properties`` and ``geometry`` keys. The
        ``listing_type`` property is ``"RENT"`` for RentListing instances
        and ``"BUY"`` otherwise; ``available_from`` is taken from
        ``additional_info["property"]["letDateAvailable"]`` when present.
    """
    # additional_info itself, or its "property" entry, may be missing, empty
    # or null; treat all of those as "no property details" instead of
    # calling .get() on None.
    additional_info = listing.additional_info or {}
    property_info = additional_info.get("property") or {}
    listing_type = "RENT" if isinstance(listing, RentListing) else "BUY"

    return {
        "type": "Feature",
        "properties": {
            "listing_type": listing_type,
            "city": "London",  # change me
            "country": "United Kingdom",
            "qm": listing.square_meters,
            "qmprice": listing.price_per_square_meter,
            "rooms": listing.number_of_bedrooms,
            "total_price": listing.price,
            "url": listing.url,
            "photo_thumbnail": listing.photo_thumbnail,
            "last_seen": listing.last_seen.isoformat(),
            "price_history": [item.to_dict() for item in listing.price_history],
            "agency": listing.agency,
            "available_from": property_info.get("letDateAvailable", None),
        },
        "geometry": {
            "coordinates": [
                listing.longitude,
                listing.latitude,
            ],
            "type": "Point",
        },
    }
entire additional_info - # "info": listing.additional_info, - }, - "geometry": { - "coordinates": [ - listing.longitude, - listing.latitude, - ], - "type": "Point", - }, - } - immoweb_listings.append(immoweb_listing) + # Convert listings to GeoJSON features using the helper function + immoweb_listings = [convert_to_geojson_feature(listing) for listing in listings] prefix = "var data = " serialized_data = {"type": "FeatureCollection", "features": immoweb_listings}