Add services layer, tests, streaming UI, and cleanup legacy code
This commit is contained in:
parent
5514fa6381
commit
d205d15c74
62 changed files with 3729 additions and 1024 deletions
41
crawler/services/__init__.py
Normal file
41
crawler/services/__init__.py
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
"""Services package for real estate crawler.
|
||||
|
||||
This package contains two layers of services:
|
||||
|
||||
## Low-level services (internal implementation):
|
||||
- listing_fetcher: Fetches listing data from Rightmove API
|
||||
- image_fetcher: Downloads floorplan images
|
||||
- floorplan_detector: OCR-based square meter detection from floorplans
|
||||
- route_calculator: Calculates transit routes using Google Maps API
|
||||
|
||||
## High-level services (use these in CLI and API):
|
||||
- listing_service: Unified listing operations (get, refresh, download images, etc.)
|
||||
- export_service: Export listings to CSV, GeoJSON
|
||||
- district_service: District lookup and validation
|
||||
- task_service: Background task management
|
||||
"""
|
||||
# Low-level services (internal)
|
||||
from services.listing_fetcher import dump_listings, dump_listings_full
|
||||
from services.image_fetcher import dump_images
|
||||
from services.floorplan_detector import detect_floorplan
|
||||
from services.route_calculator import calculate_route
|
||||
|
||||
# High-level services (CLI and API should use these)
|
||||
from services import listing_service
|
||||
from services import export_service
|
||||
from services import district_service
|
||||
from services import task_service
|
||||
|
||||
__all__ = [
|
||||
# Low-level
|
||||
"dump_listings",
|
||||
"dump_listings_full",
|
||||
"dump_images",
|
||||
"detect_floorplan",
|
||||
"calculate_route",
|
||||
# High-level
|
||||
"listing_service",
|
||||
"export_service",
|
||||
"district_service",
|
||||
"task_service",
|
||||
]
|
||||
38
crawler/services/district_service.py
Normal file
38
crawler/services/district_service.py
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
"""Unified district service - shared between CLI and HTTP API."""
|
||||
from rec.districts import get_districts as _get_districts
|
||||
|
||||
|
||||
def get_all_districts() -> dict[str, str]:
    """Return every available district mapped to its region ID.

    Used by:
    - CLI: --district option choices
    - API: GET /api/get_districts

    Returns:
        Dictionary mapping district names to region IDs
    """
    return _get_districts()
|
||||
|
||||
|
||||
def get_district_names() -> list[str]:
    """Return the names of all known districts.

    Returns:
        List of district names
    """
    return [name for name in _get_districts()]
|
||||
|
||||
|
||||
def validate_districts(district_names: list[str]) -> tuple[bool, list[str]]:
    """Check that every given district name is known.

    Args:
        district_names: List of district names to validate

    Returns:
        Tuple of (all_valid, invalid_names)
    """
    known = _get_districts()
    invalid: list[str] = []
    for name in district_names:
        # Dict membership is keyed on district names, same as set(keys()).
        if name not in known:
            invalid.append(name)
    return not invalid, invalid
|
||||
92
crawler/services/export_service.py
Normal file
92
crawler/services/export_service.py
Normal file
|
|
@ -0,0 +1,92 @@
|
|||
"""Unified export service - shared between CLI and HTTP API.
|
||||
|
||||
This module provides export functionality for listings in various formats.
|
||||
"""
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from models.listing import QueryParameters
|
||||
from repositories.listing_repository import ListingRepository
|
||||
|
||||
|
||||
@dataclass
class ExportResult:
    """Result of an export operation."""

    # Whether the export completed without error.
    success: bool
    # Path of the written file; None for in-memory exports. # For file exports
    output_path: str | None
    # In-memory payload; None for file exports. # For in-memory exports (GeoJSON)
    data: Any | None
    # Number of listings/features included in the export.
    record_count: int
    # Human-readable summary of the operation.
    message: str
||||
|
||||
|
||||
async def export_to_csv(
    repository: ListingRepository,
    output_path: Path,
    query_parameters: QueryParameters | None = None,
) -> ExportResult:
    """Export listings to CSV file.

    Used by:
    - CLI: export-csv
    - API: (could be added as download endpoint)
    """
    # Imported lazily so the service module does not pull exporter deps at import time.
    from csv_exporter import export_to_csv as _export_csv

    await _export_csv(repository, output_path, query_parameters)

    # Re-query with the same filters to report how many rows were covered.
    exported = await repository.get_listings(query_parameters=query_parameters)
    return ExportResult(
        success=True,
        output_path=str(output_path),
        data=None,
        record_count=len(exported),
        message=f"Exported {len(exported)} listings to {output_path}",
    )
|
||||
|
||||
|
||||
async def export_to_geojson(
    repository: ListingRepository,
    query_parameters: QueryParameters | None = None,
    output_path: Path | None = None,
    limit: int | None = None,
) -> ExportResult:
    """Export listings to GeoJSON format.

    Args:
        repository: Database repository
        query_parameters: Filtering parameters
        output_path: If provided, write to file. Otherwise return data.
        limit: Maximum number of listings to export

    Used by:
    - CLI: export-immoweb
    - API: GET /api/listing_geojson
    """
    # Lazy import keeps the UI exporter out of the module import graph.
    from ui_exporter import export_immoweb

    geojson = await export_immoweb(
        repository,
        output_file=str(output_path) if output_path else None,
        query_parameters=query_parameters,
        limit=limit,
    )

    count = len(geojson.get("features", [])) if geojson else 0

    if output_path:
        # File export: the data stays on disk, not in the result object.
        return ExportResult(
            success=True,
            output_path=str(output_path),
            data=None,
            record_count=count,
            message=f"Exported {count} listings to {output_path}",
        )

    # In-memory export, e.g. for the HTTP API response body.
    return ExportResult(
        success=True,
        output_path=None,
        data=geojson,
        record_count=count,
        message=f"Generated GeoJSON with {count} features",
    )
|
||||
42
crawler/services/floorplan_detector.py
Normal file
42
crawler/services/floorplan_detector.py
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
"""Floorplan detector service - OCR-based square meter detection."""
|
||||
import asyncio
|
||||
from models import Listing
|
||||
from rec import floorplan
|
||||
from repositories.listing_repository import ListingRepository
|
||||
from tqdm.asyncio import tqdm
|
||||
import multiprocessing
|
||||
|
||||
|
||||
async def detect_floorplan(repository: ListingRepository) -> None:
    """Detect square meters from floorplan images for all listings.

    OCR is CPU-heavy, so concurrency is capped at roughly a quarter of the
    available cores.

    Bug fix: on machines with fewer than 4 cores, ``cpu_count() // 4``
    evaluated to 0, producing ``asyncio.Semaphore(0)`` — no OCR task could
    ever acquire it, so the gather below would hang forever. Clamp to >= 1.
    """
    listings = await repository.get_listings()
    # At least one worker; Semaphore(0) would deadlock the gather below.
    worker_count = max(1, multiprocessing.cpu_count() // 4)
    semaphore = asyncio.Semaphore(worker_count)

    # Listings that already have a square-meter value come back as None.
    updated_listings = [
        listing
        for listing in await tqdm.gather(
            *[_calculate_sqm_ocr(listing, semaphore) for listing in listings]
        )
        if listing is not None
    ]
    await repository.upsert_listings(updated_listings)
|
||||
|
||||
|
||||
async def _calculate_sqm_ocr(
    listing: Listing, semaphore: asyncio.Semaphore
) -> Listing | None:
    """Calculate square meters from floorplan images using OCR.

    Returns the updated listing, or None when the listing already has a
    square-meter value so the caller can skip the upsert.
    """
    if listing.square_meters is not None:
        # Already known — avoid redundant OCR work.
        return None

    estimates: list[float] = []
    for image_path in listing.floorplan_image_paths:
        async with semaphore:
            # Run OCR in a worker thread so the event loop stays responsive.
            sqm, _ = await asyncio.to_thread(floorplan.calculate_ocr, image_path)
            if sqm is not None:
                estimates.append(sqm)

    # Largest estimate wins; 0 when OCR found nothing.
    # try once, if we fail, keep as 0
    listing.square_meters = max(estimates, default=0)
    return listing
|
||||
55
crawler/services/image_fetcher.py
Normal file
55
crawler/services/image_fetcher.py
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
"""Image fetcher service - downloads floorplan images for listings."""
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
import aiohttp
|
||||
from repositories import ListingRepository
|
||||
from tenacity import retry, stop_after_attempt, wait_random
|
||||
from tqdm.asyncio import tqdm
|
||||
|
||||
from models import Listing
|
||||
|
||||
# Setting this too high either crashes rightmove or gets us blocked
# NOTE(review): constructed at import time — safe on Python 3.10+, where
# asyncio.Semaphore no longer binds to an event loop at creation; confirm
# the project's minimum Python version.
semaphore = asyncio.Semaphore(5)
|
||||
|
||||
|
||||
async def dump_images(
    repository: ListingRepository,
    image_base_path: Path = Path("data/rs/"),
) -> None:
    """Download floorplan images for all listings."""
    listings = await repository.get_listings()
    results = await tqdm.gather(
        *(dump_images_for_listing(listing, image_base_path) for listing in listings)
    )
    # Listings with nothing new to download come back as None; skip them.
    changed = [listing for listing in results if listing is not None]
    await repository.upsert_listings(changed)
|
||||
|
||||
|
||||
@retry(wait=wait_random(min=1, max=2), stop=stop_after_attempt(3))
async def dump_images_for_listing(listing: Listing, base_path: Path) -> Listing | None:
    """Download floorplan images for a single listing.

    Returns the updated listing after a successful download, or None when
    there is nothing new to download or the image is gone upstream (404).

    NOTE(review): as written, this returns after the FIRST newly downloaded
    image, so listings with several missing floorplans need multiple runs —
    confirm whether that is intentional (retries via tenacity on any other
    failure).
    """
    all_floorplans = listing.additional_info.get("property", {}).get("floorplans", [])
    for floorplan in all_floorplans:
        url = floorplan["url"]
        picname = url.split("/")[-1]
        floorplan_path = Path(base_path, str(listing.id), "floorplans", picname)
        if floorplan_path.exists():
            # Already downloaded on a previous run — skip silently.
            continue
        try:
            async with semaphore:
                # NOTE(review): a fresh ClientSession per request is wasteful;
                # consider sharing one session — confirm before changing.
                async with aiohttp.ClientSession() as session:
                    async with session.get(url) as response:
                        if response.status == 404:
                            # Image removed upstream; nothing to persist.
                            return None
                        if response.status != 200:
                            raise Exception(f"Error for {url}: {response.status}")
                        floorplan_path.parent.mkdir(parents=True, exist_ok=True)
                        with open(floorplan_path, "wb") as f:
                            f.write(await response.read())
            listing.floorplan_image_paths.append(str(floorplan_path))
            return listing
        except Exception as e:
            tqdm.write(f"Error for {url}: {e}")
            raise e  # raise so that we retry it
    return None
|
||||
168
crawler/services/listing_service.py
Normal file
168
crawler/services/listing_service.py
Normal file
|
|
@ -0,0 +1,168 @@
|
|||
"""Unified listing service - shared between CLI and HTTP API.
|
||||
|
||||
This module provides the core business logic for listing operations.
|
||||
Both the CLI (main.py) and HTTP API (api/app.py) should use these functions.
|
||||
"""
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from models.listing import Listing, QueryParameters
|
||||
from repositories.listing_repository import ListingRepository
|
||||
|
||||
|
||||
@dataclass
class ListingResult:
    """Result of a listing operation."""

    # Listings matching the query.
    listings: list[Listing]
    # Convenience count of `listings`.
    total_count: int
    # Optional human-readable note about the operation.
    message: str | None = None
|
||||
|
||||
|
||||
@dataclass
class RefreshResult:
    """Result of a refresh operation."""

    # Background task id. # None if run synchronously
    task_id: str | None
    # Newly fetched listings (0 in async mode — unknown until the worker finishes).
    new_listings_count: int
    # Human-readable summary of what happened.
    message: str
|
||||
|
||||
|
||||
async def get_listings(
    repository: ListingRepository,
    query_parameters: QueryParameters | None = None,
    limit: int | None = None,
    only_ids: list[int] | None = None,
) -> ListingResult:
    """Get listings from the database with optional filtering.

    Used by:
    - CLI: export-csv, export-immoweb
    - API: GET /api/listing, GET /api/listing_geojson
    """
    rows = await repository.get_listings(
        query_parameters=query_parameters,
        limit=limit,
        only_ids=only_ids,
    )
    return ListingResult(listings=rows, total_count=len(rows))
|
||||
|
||||
|
||||
async def refresh_listings(
    repository: ListingRepository,
    query_parameters: QueryParameters,
    full: bool = False,
    async_mode: bool = False,
    user_email: str | None = None,
) -> RefreshResult:
    """Refresh listings by fetching from external API.

    Args:
        repository: Database repository
        query_parameters: Filtering parameters
        full: If True, also fetch images and run OCR
        async_mode: If True, run as background task and return task_id
        user_email: User email for tracking (API mode).
            NOTE(review): currently unused in this body — confirm whether it
            should be forwarded to the background task.

    Used by:
    - CLI: dump-listings
    - API: POST /api/refresh_listings
    """
    if async_mode:
        # Import here to avoid circular imports
        from tasks.listing_tasks import dump_listings_task
        from datetime import timedelta

        # Drop the task if no worker picks it up within 10 minutes.
        expiry_time = datetime.now() + timedelta(minutes=10)
        task = dump_listings_task.apply_async(
            args=(query_parameters.model_dump_json(),),
            expires=expiry_time,
        )
        return RefreshResult(
            task_id=task.id,
            new_listings_count=0,  # unknown until the worker completes
            message=f"Task {task.id} started",
        )

    # Synchronous mode - run directly
    from services.listing_fetcher import dump_listings, dump_listings_full

    if full:
        # Full refresh also fetches images and runs OCR downstream.
        new_listings = await dump_listings_full(query_parameters, repository)
    else:
        new_listings = await dump_listings(query_parameters, repository)

    return RefreshResult(
        task_id=None,
        new_listings_count=len(new_listings),
        message=f"Fetched {len(new_listings)} new listings",
    )
|
||||
|
||||
|
||||
async def download_images(
    repository: ListingRepository,
    data_dir: Path = Path("data/rs/"),
) -> int:
    """Download floorplan images for all listings.

    Used by:
    - CLI: dump-images
    - API: (could be added)

    Returns:
        Number of listings processed
    """
    # Lazy import to keep low-level fetchers out of the module import graph.
    from services.image_fetcher import dump_images

    await dump_images(repository, image_base_path=data_dir)
    return len(await repository.get_listings())
|
||||
|
||||
|
||||
async def detect_floorplans(
    repository: ListingRepository,
) -> int:
    """Run OCR on floorplan images to detect square meters.

    Used by:
    - CLI: detect-floorplan
    - API: (could be added)

    Returns:
        Number of listings processed
    """
    # Lazy import to keep low-level detectors out of the module import graph.
    from services.floorplan_detector import detect_floorplan

    await detect_floorplan(repository)
    return len(await repository.get_listings())
|
||||
|
||||
|
||||
async def calculate_routes(
    repository: ListingRepository,
    destination_address: str,
    travel_mode: str,
    limit: int | None = None,
) -> int:
    """Calculate transit routes for listings.

    Args:
        repository: Database repository
        destination_address: Address routes are calculated towards
        travel_mode: Name of a rec.routing.TravelMode member
        limit: Optional cap on how many listings are processed

    Used by:
    - CLI: routing
    - API: (could be added)

    Returns:
        Number of listings processed
    """
    from services.route_calculator import calculate_route
    from rec.routing import TravelMode

    await calculate_route(
        repository,
        destination_address,
        TravelMode[travel_mode],
        limit=limit,
    )

    # Bug fix: the previous `return limit or 0` reported 0 processed listings
    # whenever no limit was given, even though every listing was processed.
    # Mirror the sibling services and count from the repository, capped by limit.
    listings = await repository.get_listings()
    if limit is None:
        return len(listings)
    return min(limit, len(listings))
|
||||
66
crawler/services/route_calculator.py
Normal file
66
crawler/services/route_calculator.py
Normal file
|
|
@ -0,0 +1,66 @@
|
|||
"""Route calculator service - calculates transit routes using Google Maps API."""
|
||||
from models.listing import DestinationMode, Route, RouteLegStep
|
||||
from repositories.listing_repository import ListingRepository
|
||||
from tqdm.asyncio import tqdm
|
||||
from rec import routing
|
||||
from models import Listing
|
||||
|
||||
|
||||
async def calculate_route(
    repository: ListingRepository,
    destination_address: str,
    travel_mode: routing.TravelMode,
    limit: int | None = None,
) -> None:
    """Calculate transit routes for listings to a destination."""
    listings = await repository.get_listings()

    if limit is not None:
        # Only route the first `limit` listings to bound API usage.
        listings = listings[:limit]

    destination_mode = DestinationMode(destination_address, travel_mode)
    results = await tqdm.gather(
        *(update_routing_info(listing, destination_mode) for listing in listings),
        total=len(listings),
        desc="Updating routing info",
    )
    # Listings that already had routes come back as None; skip them.
    await repository.upsert_listings(
        [listing for listing in results if listing is not None]
    )
|
||||
|
||||
|
||||
async def update_routing_info(
    listing: Listing, destination_mode: DestinationMode
) -> Listing | None:
    """Update routing information for a single listing.

    Returns the updated listing, or None when routes for this
    destination/mode pair were already computed.
    """
    if listing.routing_info.get(destination_mode) is not None:
        # already calculated, do not recompute to save API calls
        return None

    # NOTE(review): routing.transit_route is called without await — presumably
    # synchronous; if so it blocks the event loop during the HTTP call. Confirm.
    routes_data = routing.transit_route(
        listing.latitude,
        listing.longitude,
        destination_mode.destination_address,
        destination_mode.travel_mode,
    )

    routes: list[Route] = []
    for route_data in routes_data["routes"]:
        # Durations arrive as strings like "123s"; strip the unit suffix.
        duration_s = int(route_data["duration"].split("s")[0])
        route = Route(
            legs=[
                RouteLegStep(
                    distance_meters=step_data["distanceMeters"],
                    duration_s=int(step_data["staticDuration"].split("s")[0]),
                    travel_mode=routing.TravelMode(step_data["travelMode"]),
                )
                # Only the first leg's steps are used — assumes single-leg
                # routes (origin -> destination, no waypoints). TODO confirm.
                for step_data in route_data["legs"][0]["steps"]
            ],
            distance_meters=route_data["distanceMeters"],
            duration_s=duration_s,
        )
        routes.append(route)
    # Merge new routes into the existing routing info and re-serialize.
    listing.routing_info_json = listing.serialize_routing_info(
        {**listing.routing_info, **{destination_mode: routes}}
    )
    return listing
|
||||
|
|
@ -11,9 +11,14 @@ import json
|
|||
class TaskStatus:
|
||||
"""Status of a background task."""
|
||||
task_id: str
|
||||
status: str # PENDING, STARTED, SUCCESS, FAILURE, REVOKED
|
||||
status: str # PENDING, STARTED, SUCCESS, FAILURE, REVOKED, SKIPPED
|
||||
result: Any | None
|
||||
progress: float | None # 0.0 to 1.0
|
||||
processed: int | None # Number of items processed
|
||||
total: int | None # Total number of items
|
||||
message: str | None # Human-readable status message (e.g., "Fetching listings")
|
||||
error: str | None # Error message if failed
|
||||
traceback: str | None # Full traceback if failed
|
||||
|
||||
|
||||
def get_task_status(task_id: str) -> TaskStatus:
|
||||
|
|
@ -33,21 +38,50 @@ def get_task_status(task_id: str) -> TaskStatus:
|
|||
task_result = dump_listings_task.AsyncResult(task_id)
|
||||
|
||||
# Try to serialize result
|
||||
try:
|
||||
result = json.loads(json.dumps(task_result.result))
|
||||
except (TypeError, json.JSONDecodeError):
|
||||
result = str(task_result.result) if task_result.result else None
|
||||
result = None
|
||||
error = None
|
||||
if task_result.failed():
|
||||
# Extract error message from failed task
|
||||
error = str(task_result.result) if task_result.result else None
|
||||
else:
|
||||
try:
|
||||
result = json.loads(json.dumps(task_result.result))
|
||||
except (TypeError, json.JSONDecodeError):
|
||||
result = str(task_result.result) if task_result.result else None
|
||||
|
||||
# Extract progress from task meta if available
|
||||
# Extract traceback if available
|
||||
task_traceback = task_result.traceback if task_result.failed() else None
|
||||
|
||||
# Extract progress, processed, total, and message from task meta
|
||||
progress = None
|
||||
processed = None
|
||||
total = None
|
||||
message = None
|
||||
|
||||
if task_result.info and isinstance(task_result.info, dict):
|
||||
progress = task_result.info.get("progress")
|
||||
processed = task_result.info.get("processed")
|
||||
total = task_result.info.get("total")
|
||||
# Use 'message' if available, fall back to 'reason' for SKIPPED tasks
|
||||
message = task_result.info.get("message") or task_result.info.get("reason")
|
||||
|
||||
# For custom states (like "Fetching listings"), use the state as message
|
||||
# if no message was provided in info
|
||||
if not message and task_result.status not in (
|
||||
"PENDING", "STARTED", "SUCCESS", "FAILURE", "REVOKED", "RETRY"
|
||||
):
|
||||
message = task_result.status
|
||||
|
||||
return TaskStatus(
|
||||
task_id=task_id,
|
||||
status=task_result.status,
|
||||
result=result,
|
||||
progress=progress,
|
||||
processed=processed,
|
||||
total=total,
|
||||
message=message,
|
||||
error=error,
|
||||
traceback=task_traceback,
|
||||
)
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue