Add services layer, tests, streaming UI, and clean up legacy code

This commit is contained in:
Viktor Barzin 2026-02-06 20:55:10 +00:00
parent 5514fa6381
commit d205d15c74
62 changed files with 3729 additions and 1024 deletions

View file

@ -0,0 +1,41 @@
"""Services package for real estate crawler.
This package contains two layers of services:
## Low-level services (internal implementation):
- listing_fetcher: Fetches listing data from Rightmove API
- image_fetcher: Downloads floorplan images
- floorplan_detector: OCR-based square meter detection from floorplans
- route_calculator: Calculates transit routes using Google Maps API
## High-level services (use these in CLI and API):
- listing_service: Unified listing operations (get, refresh, download images, etc.)
- export_service: Export listings to CSV, GeoJSON
- district_service: District lookup and validation
- task_service: Background task management
"""
# Low-level services (internal)
from services.listing_fetcher import dump_listings, dump_listings_full
from services.image_fetcher import dump_images
from services.floorplan_detector import detect_floorplan
from services.route_calculator import calculate_route
# High-level services (CLI and API should use these)
from services import listing_service
from services import export_service
from services import district_service
from services import task_service
__all__ = [
# Low-level
"dump_listings",
"dump_listings_full",
"dump_images",
"detect_floorplan",
"calculate_route",
# High-level
"listing_service",
"export_service",
"district_service",
"task_service",
]

View file

@ -0,0 +1,38 @@
"""Unified district service - shared between CLI and HTTP API."""
from rec.districts import get_districts as _get_districts
def get_all_districts() -> dict[str, str]:
    """Return the full mapping of district name -> region ID.

    Consumers:
        - CLI: populates the --district option choices
        - API: GET /api/get_districts
    """
    districts = _get_districts()
    return districts
def get_district_names() -> list[str]:
    """Return the names of all known districts as a list."""
    return [name for name in _get_districts()]
def validate_districts(district_names: list[str]) -> tuple[bool, list[str]]:
    """Check each candidate name against the known districts.

    Args:
        district_names: District names to validate.

    Returns:
        (all_valid, invalid_names) — invalid_names preserves the input
        order of any unrecognized entries.
    """
    known = _get_districts()
    invalid = [name for name in district_names if name not in known]
    return not invalid, invalid

View file

@ -0,0 +1,92 @@
"""Unified export service - shared between CLI and HTTP API.
This module provides export functionality for listings in various formats.
"""
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from models.listing import QueryParameters
from repositories.listing_repository import ListingRepository
@dataclass
class ExportResult:
    """Result of an export operation."""

    success: bool  # True when the export completed without error
    output_path: str | None  # Destination path for file exports; None for in-memory results
    data: Any | None  # Payload for in-memory exports (e.g. GeoJSON dict); None for file exports
    record_count: int  # Number of listings/features included in the export
    message: str  # Human-readable summary of the outcome
async def export_to_csv(
    repository: ListingRepository,
    output_path: Path,
    query_parameters: QueryParameters | None = None,
) -> ExportResult:
    """Write the (optionally filtered) listings to a CSV file.

    Used by:
    - CLI: export-csv
    - API: (could be added as download endpoint)
    """
    # Imported lazily, matching the module's pattern for exporter deps.
    from csv_exporter import export_to_csv as _export_csv

    await _export_csv(repository, output_path, query_parameters)
    # Re-query with the same filter so the result can report a row count.
    exported = await repository.get_listings(query_parameters=query_parameters)
    count = len(exported)
    return ExportResult(
        success=True,
        output_path=str(output_path),
        data=None,
        record_count=count,
        message=f"Exported {count} listings to {output_path}",
    )
async def export_to_geojson(
    repository: ListingRepository,
    query_parameters: QueryParameters | None = None,
    output_path: Path | None = None,
    limit: int | None = None,
) -> ExportResult:
    """Export listings as GeoJSON, either to a file or in memory.

    Args:
        repository: Database repository
        query_parameters: Filtering parameters
        output_path: If provided, write to file. Otherwise return data.
        limit: Maximum number of listings to export

    Used by:
    - CLI: export-immoweb
    - API: GET /api/listing_geojson
    """
    from ui_exporter import export_immoweb

    target_file = str(output_path) if output_path else None
    geojson_data = await export_immoweb(
        repository,
        output_file=target_file,
        query_parameters=query_parameters,
        limit=limit,
    )
    n_features = len(geojson_data.get("features", [])) if geojson_data else 0

    if output_path:
        # File export: the payload lives on disk, not in the result object.
        return ExportResult(
            success=True,
            output_path=str(output_path),
            data=None,
            record_count=n_features,
            message=f"Exported {n_features} listings to {output_path}",
        )

    # In-memory export: hand the GeoJSON back to the caller.
    return ExportResult(
        success=True,
        output_path=None,
        data=geojson_data,
        record_count=n_features,
        message=f"Generated GeoJSON with {n_features} features",
    )

View file

@ -0,0 +1,42 @@
"""Floorplan detector service - OCR-based square meter detection."""
import asyncio
from models import Listing
from rec import floorplan
from repositories.listing_repository import ListingRepository
from tqdm.asyncio import tqdm
import multiprocessing
async def detect_floorplan(repository: ListingRepository) -> None:
    """Detect square meters from floorplan images for all listings.

    Runs OCR concurrently, bounded by a semaphore sized from the CPU
    count, then persists every listing that was updated.
    """
    listings = await repository.get_listings()
    # OCR is CPU-heavy, so only use a quarter of the cores — but never
    # fewer than one: cpu_count() // 4 is 0 on machines with < 4 cores,
    # and asyncio.Semaphore(0) would deadlock every worker.
    concurrency = max(1, multiprocessing.cpu_count() // 4)
    semaphore = asyncio.Semaphore(concurrency)
    updated_listings = [
        listing
        for listing in await tqdm.gather(
            *[_calculate_sqm_ocr(listing, semaphore) for listing in listings]
        )
        if listing is not None  # None means "already had square meters"
    ]
    await repository.upsert_listings(updated_listings)
async def _calculate_sqm_ocr(
    listing: Listing, semaphore: asyncio.Semaphore
) -> Listing | None:
    """OCR the listing's floorplans and record the largest detected area.

    Returns the updated listing, or None when the listing already has a
    square-meter value (nothing to do).
    """
    if listing.square_meters is not None:
        return None
    estimates: list[float] = []
    for image_path in listing.floorplan_image_paths:
        async with semaphore:
            # calculate_ocr is blocking; run it off the event loop.
            detected, _ = await asyncio.to_thread(floorplan.calculate_ocr, image_path)
            if detected is not None:
                estimates.append(detected)
    # Single attempt: if every OCR pass fails, persist 0 so we don't retry.
    listing.square_meters = max(estimates, default=0)
    return listing

View file

@ -0,0 +1,55 @@
"""Image fetcher service - downloads floorplan images for listings."""
import asyncio
from pathlib import Path
import aiohttp
from repositories import ListingRepository
from tenacity import retry, stop_after_attempt, wait_random
from tqdm.asyncio import tqdm
from models import Listing
# Setting this too high either crashes rightmove or gets us blocked
semaphore = asyncio.Semaphore(5)
async def dump_images(
    repository: ListingRepository,
    image_base_path: Path = Path("data/rs/"),
) -> None:
    """Download floorplan images for every listing and persist the updates."""
    listings = await repository.get_listings()
    results = await tqdm.gather(
        *[dump_images_for_listing(listing, image_base_path) for listing in listings]
    )
    # None results mean "nothing new for this listing" — skip them.
    changed = [listing for listing in results if listing is not None]
    await repository.upsert_listings(changed)
@retry(wait=wait_random(min=1, max=2), stop=stop_after_attempt(3))
async def dump_images_for_listing(listing: Listing, base_path: Path) -> Listing | None:
    """Download every missing floorplan image for a single listing.

    Images are stored under ``base_path/<listing_id>/floorplans/`` and
    already-downloaded files are skipped, so retries are idempotent.

    Previously the function returned after the first successful download,
    so at most one new image was fetched per call, and a 404 aborted the
    remaining floorplans (discarding path updates made so far). Now all
    floorplans are processed and individual 404s are skipped.

    Returns:
        The listing with ``floorplan_image_paths`` extended when at least
        one new image was downloaded, otherwise None (nothing to persist).

    Raises:
        Exception: On any non-404 download failure, so that ``@retry``
            re-attempts the whole listing.
    """
    all_floorplans = listing.additional_info.get("property", {}).get("floorplans", [])
    downloaded_any = False
    for floorplan in all_floorplans:
        url = floorplan["url"]
        picname = url.split("/")[-1]
        floorplan_path = Path(base_path, str(listing.id), "floorplans", picname)
        if floorplan_path.exists():
            continue
        try:
            async with semaphore:
                async with aiohttp.ClientSession() as session:
                    async with session.get(url) as response:
                        if response.status == 404:
                            # Image is gone upstream; move on to the next
                            # floorplan instead of abandoning the listing.
                            continue
                        if response.status != 200:
                            raise Exception(f"Error for {url}: {response.status}")
                        floorplan_path.parent.mkdir(parents=True, exist_ok=True)
                        with open(floorplan_path, "wb") as f:
                            f.write(await response.read())
            listing.floorplan_image_paths.append(str(floorplan_path))
            downloaded_any = True
        except Exception as e:
            tqdm.write(f"Error for {url}: {e}")
            raise e  # raise so that we retry it
    # Only hand the listing back when something changed; None tells the
    # caller there is nothing to upsert.
    return listing if downloaded_any else None

View file

@ -0,0 +1,168 @@
"""Unified listing service - shared between CLI and HTTP API.
This module provides the core business logic for listing operations.
Both the CLI (main.py) and HTTP API (api/app.py) should use these functions.
"""
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any
from models.listing import Listing, QueryParameters
from repositories.listing_repository import ListingRepository
@dataclass
class ListingResult:
    """Result of a listing operation."""

    listings: list[Listing]  # The listings matching the request
    total_count: int  # Number of listings returned (== len(listings))
    message: str | None = None  # Optional human-readable note
@dataclass
class RefreshResult:
    """Result of a refresh operation."""

    task_id: str | None  # Background task id when run async; None if run synchronously
    new_listings_count: int  # Listings fetched synchronously (0 in async mode)
    message: str  # Human-readable summary of what happened
async def get_listings(
    repository: ListingRepository,
    query_parameters: QueryParameters | None = None,
    limit: int | None = None,
    only_ids: list[int] | None = None,
) -> ListingResult:
    """Fetch listings from the database with optional filtering.

    Used by:
    - CLI: export-csv, export-immoweb
    - API: GET /api/listing, GET /api/listing_geojson
    """
    found = await repository.get_listings(
        query_parameters=query_parameters,
        limit=limit,
        only_ids=only_ids,
    )
    return ListingResult(listings=found, total_count=len(found))
async def refresh_listings(
    repository: ListingRepository,
    query_parameters: QueryParameters,
    full: bool = False,
    async_mode: bool = False,
    user_email: str | None = None,
) -> RefreshResult:
    """Refresh listings by fetching from the external API.

    Args:
        repository: Database repository
        query_parameters: Filtering parameters
        full: If True, also fetch images and run OCR
        async_mode: If True, run as background task and return task_id
        user_email: User email for tracking (API mode)

    Used by:
    - CLI: dump-listings
    - API: POST /api/refresh_listings
    """
    if not async_mode:
        # Synchronous mode: run the fetch inline and report the count.
        from services.listing_fetcher import dump_listings, dump_listings_full

        fetcher = dump_listings_full if full else dump_listings
        new_listings = await fetcher(query_parameters, repository)
        return RefreshResult(
            task_id=None,
            new_listings_count=len(new_listings),
            message=f"Fetched {len(new_listings)} new listings",
        )

    # Async mode: hand off to a background task.
    # Imported here to avoid circular imports.
    from datetime import timedelta

    from tasks.listing_tasks import dump_listings_task

    expiry_time = datetime.now() + timedelta(minutes=10)
    task = dump_listings_task.apply_async(
        args=(query_parameters.model_dump_json(),),
        expires=expiry_time,
    )
    return RefreshResult(
        task_id=task.id,
        new_listings_count=0,
        message=f"Task {task.id} started",
    )
async def download_images(
    repository: ListingRepository,
    data_dir: Path = Path("data/rs/"),
) -> int:
    """Download floorplan images for all listings.

    Used by:
    - CLI: dump-images
    - API: (could be added)

    Returns:
        Number of listings processed
    """
    from services.image_fetcher import dump_images

    await dump_images(repository, image_base_path=data_dir)
    return len(await repository.get_listings())
async def detect_floorplans(
    repository: ListingRepository,
) -> int:
    """Run OCR on floorplan images to detect square meters.

    Used by:
    - CLI: detect-floorplan
    - API: (could be added)

    Returns:
        Number of listings processed
    """
    from services.floorplan_detector import detect_floorplan

    await detect_floorplan(repository)
    return len(await repository.get_listings())
async def calculate_routes(
    repository: ListingRepository,
    destination_address: str,
    travel_mode: str,
    limit: int | None = None,
) -> int:
    """Calculate transit routes for listings.

    Args:
        repository: Database repository
        destination_address: Route destination address
        travel_mode: Name of a ``rec.routing.TravelMode`` member
        limit: Optional cap on the number of listings to process

    Used by:
    - CLI: routing
    - API: (could be added)

    Returns:
        Number of listings processed
    """
    from rec.routing import TravelMode

    from services.route_calculator import calculate_route

    await calculate_route(
        repository,
        destination_address,
        TravelMode[travel_mode],
        limit=limit,
    )
    # Report how many listings were actually processed. The previous
    # "limit or 0" returned 0 for an unlimited run and over-counted
    # when limit exceeded the number of listings in the database.
    total = len(await repository.get_listings())
    return total if limit is None else min(limit, total)

View file

@ -0,0 +1,66 @@
"""Route calculator service - calculates transit routes using Google Maps API."""
from models.listing import DestinationMode, Route, RouteLegStep
from repositories.listing_repository import ListingRepository
from tqdm.asyncio import tqdm
from rec import routing
from models import Listing
async def calculate_route(
    repository: ListingRepository,
    destination_address: str,
    travel_mode: routing.TravelMode,
    limit: int | None = None,
) -> None:
    """Calculate transit routes from every listing to a destination."""
    listings = await repository.get_listings()
    if limit is not None:
        listings = listings[:limit]
    destination_mode = DestinationMode(destination_address, travel_mode)
    results = await tqdm.gather(
        *[update_routing_info(listing, destination_mode) for listing in listings],
        total=len(listings),
        desc="Updating routing info",
    )
    # None means "already had routes for this destination" — nothing to save.
    await repository.upsert_listings([r for r in results if r is not None])
async def update_routing_info(
    listing: Listing, destination_mode: DestinationMode
) -> Listing | None:
    """Update routing information for a single listing.

    Returns the listing with new routes merged into its routing info, or
    None when routes for this destination/mode were already computed.
    """
    if listing.routing_info.get(destination_mode) is not None:
        # already calculated, do not recompute to save API calls
        return None
    # NOTE(review): transit_route appears to return a Google Routes API
    # style response (routes -> legs -> steps, durations like "123s") —
    # confirm against rec.routing.
    routes_data = routing.transit_route(
        listing.latitude,
        listing.longitude,
        destination_mode.destination_address,
        destination_mode.travel_mode,
    )
    routes: list[Route] = []
    for route_data in routes_data["routes"]:
        # Durations arrive as strings such as "123s"; strip the suffix.
        duration_s = int(route_data["duration"].split("s")[0])
        route = Route(
            legs=[
                RouteLegStep(
                    distance_meters=step_data["distanceMeters"],
                    duration_s=int(step_data["staticDuration"].split("s")[0]),
                    travel_mode=routing.TravelMode(step_data["travelMode"]),
                )
                # NOTE(review): only the first leg is read — assumes one leg
                # per route (single origin/destination); confirm.
                for step_data in route_data["legs"][0]["steps"]
            ],
            distance_meters=route_data["distanceMeters"],
            duration_s=duration_s,
        )
        routes.append(route)
    # Merge the new routes into the existing routing info and re-serialize.
    listing.routing_info_json = listing.serialize_routing_info(
        {**listing.routing_info, **{destination_mode: routes}}
    )
    return listing

View file

@ -11,9 +11,14 @@ import json
class TaskStatus:
"""Status of a background task."""
task_id: str
status: str # PENDING, STARTED, SUCCESS, FAILURE, REVOKED
status: str # PENDING, STARTED, SUCCESS, FAILURE, REVOKED, SKIPPED
result: Any | None
progress: float | None # 0.0 to 1.0
processed: int | None # Number of items processed
total: int | None # Total number of items
message: str | None # Human-readable status message (e.g., "Fetching listings")
error: str | None # Error message if failed
traceback: str | None # Full traceback if failed
def get_task_status(task_id: str) -> TaskStatus:
@ -33,21 +38,50 @@ def get_task_status(task_id: str) -> TaskStatus:
task_result = dump_listings_task.AsyncResult(task_id)
# Try to serialize result
try:
result = json.loads(json.dumps(task_result.result))
except (TypeError, json.JSONDecodeError):
result = str(task_result.result) if task_result.result else None
result = None
error = None
if task_result.failed():
# Extract error message from failed task
error = str(task_result.result) if task_result.result else None
else:
try:
result = json.loads(json.dumps(task_result.result))
except (TypeError, json.JSONDecodeError):
result = str(task_result.result) if task_result.result else None
# Extract progress from task meta if available
# Extract traceback if available
task_traceback = task_result.traceback if task_result.failed() else None
# Extract progress, processed, total, and message from task meta
progress = None
processed = None
total = None
message = None
if task_result.info and isinstance(task_result.info, dict):
progress = task_result.info.get("progress")
processed = task_result.info.get("processed")
total = task_result.info.get("total")
# Use 'message' if available, fall back to 'reason' for SKIPPED tasks
message = task_result.info.get("message") or task_result.info.get("reason")
# For custom states (like "Fetching listings"), use the state as message
# if no message was provided in info
if not message and task_result.status not in (
"PENDING", "STARTED", "SUCCESS", "FAILURE", "REVOKED", "RETRY"
):
message = task_result.status
return TaskStatus(
task_id=task_id,
status=task_result.status,
result=result,
progress=progress,
processed=processed,
total=total,
message=message,
error=error,
traceback=task_traceback,
)