Add services layer, tests, streaming UI, and cleanup legacy code
This commit is contained in:
parent
5514fa6381
commit
d205d15c74
62 changed files with 3729 additions and 1024 deletions
41
crawler/services/__init__.py
Normal file
41
crawler/services/__init__.py
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
"""Services package for real estate crawler.
|
||||
|
||||
This package contains two layers of services:
|
||||
|
||||
## Low-level services (internal implementation):
|
||||
- listing_fetcher: Fetches listing data from Rightmove API
|
||||
- image_fetcher: Downloads floorplan images
|
||||
- floorplan_detector: OCR-based square meter detection from floorplans
|
||||
- route_calculator: Calculates transit routes using Google Maps API
|
||||
|
||||
## High-level services (use these in CLI and API):
|
||||
- listing_service: Unified listing operations (get, refresh, download images, etc.)
|
||||
- export_service: Export listings to CSV, GeoJSON
|
||||
- district_service: District lookup and validation
|
||||
- task_service: Background task management
|
||||
"""
|
||||
# Low-level services (internal)
|
||||
from services.listing_fetcher import dump_listings, dump_listings_full
|
||||
from services.image_fetcher import dump_images
|
||||
from services.floorplan_detector import detect_floorplan
|
||||
from services.route_calculator import calculate_route
|
||||
|
||||
# High-level services (CLI and API should use these)
|
||||
from services import listing_service
|
||||
from services import export_service
|
||||
from services import district_service
|
||||
from services import task_service
|
||||
|
||||
__all__ = [
|
||||
# Low-level
|
||||
"dump_listings",
|
||||
"dump_listings_full",
|
||||
"dump_images",
|
||||
"detect_floorplan",
|
||||
"calculate_route",
|
||||
# High-level
|
||||
"listing_service",
|
||||
"export_service",
|
||||
"district_service",
|
||||
"task_service",
|
||||
]
|
||||
38
crawler/services/district_service.py
Normal file
38
crawler/services/district_service.py
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
"""Unified district service - shared between CLI and HTTP API."""
|
||||
from rec.districts import get_districts as _get_districts
|
||||
|
||||
|
||||
def get_all_districts() -> dict[str, str]:
    """Return every available district mapped to its region ID.

    Used by:
    - CLI: --district option choices
    - API: GET /api/get_districts

    Returns:
        Dictionary mapping district names to region IDs
    """
    return _get_districts()
|
||||
|
||||
|
||||
def get_district_names() -> list[str]:
    """Return the names of all known districts.

    Returns:
        List of district names
    """
    return [name for name in _get_districts()]
|
||||
|
||||
|
||||
def validate_districts(district_names: list[str]) -> tuple[bool, list[str]]:
    """Check that every given district name is known.

    Args:
        district_names: List of district names to validate

    Returns:
        Tuple of (all_valid, invalid_names)
    """
    known = _get_districts()
    invalid: list[str] = []
    for name in district_names:
        # Dict membership is keyed on district names, same as set(keys()).
        if name not in known:
            invalid.append(name)
    return not invalid, invalid
|
||||
92
crawler/services/export_service.py
Normal file
92
crawler/services/export_service.py
Normal file
|
|
@ -0,0 +1,92 @@
|
|||
"""Unified export service - shared between CLI and HTTP API.
|
||||
|
||||
This module provides export functionality for listings in various formats.
|
||||
"""
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from models.listing import QueryParameters
|
||||
from repositories.listing_repository import ListingRepository
|
||||
|
||||
|
||||
@dataclass
class ExportResult:
    """Result of an export operation."""

    # Whether the export completed without error.
    success: bool
    # Path of the written file; None for in-memory exports. # For file exports
    output_path: str | None
    # In-memory payload; None for file exports. # For in-memory exports (GeoJSON)
    data: Any | None
    # Number of listings/features included in the export.
    record_count: int
    # Human-readable summary of the operation.
    message: str
||||
|
||||
|
||||
async def export_to_csv(
    repository: ListingRepository,
    output_path: Path,
    query_parameters: QueryParameters | None = None,
) -> ExportResult:
    """Export listings to CSV file.

    Used by:
    - CLI: export-csv
    - API: (could be added as download endpoint)
    """
    # Imported lazily so the service module does not pull exporter deps at import time.
    from csv_exporter import export_to_csv as _export_csv

    await _export_csv(repository, output_path, query_parameters)

    # Re-query with the same filters to report how many rows were covered.
    exported = await repository.get_listings(query_parameters=query_parameters)
    return ExportResult(
        success=True,
        output_path=str(output_path),
        data=None,
        record_count=len(exported),
        message=f"Exported {len(exported)} listings to {output_path}",
    )
|
||||
|
||||
|
||||
async def export_to_geojson(
    repository: ListingRepository,
    query_parameters: QueryParameters | None = None,
    output_path: Path | None = None,
    limit: int | None = None,
) -> ExportResult:
    """Export listings to GeoJSON format.

    Args:
        repository: Database repository
        query_parameters: Filtering parameters
        output_path: If provided, write to file. Otherwise return data.
        limit: Maximum number of listings to export

    Used by:
    - CLI: export-immoweb
    - API: GET /api/listing_geojson
    """
    # Lazy import keeps the UI exporter out of the module import graph.
    from ui_exporter import export_immoweb

    geojson = await export_immoweb(
        repository,
        output_file=str(output_path) if output_path else None,
        query_parameters=query_parameters,
        limit=limit,
    )

    count = len(geojson.get("features", [])) if geojson else 0

    if output_path:
        # File export: the data stays on disk, not in the result object.
        return ExportResult(
            success=True,
            output_path=str(output_path),
            data=None,
            record_count=count,
            message=f"Exported {count} listings to {output_path}",
        )

    # In-memory export, e.g. for the HTTP API response body.
    return ExportResult(
        success=True,
        output_path=None,
        data=geojson,
        record_count=count,
        message=f"Generated GeoJSON with {count} features",
    )
|
||||
42
crawler/services/floorplan_detector.py
Normal file
42
crawler/services/floorplan_detector.py
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
"""Floorplan detector service - OCR-based square meter detection."""
|
||||
import asyncio
|
||||
from models import Listing
|
||||
from rec import floorplan
|
||||
from repositories.listing_repository import ListingRepository
|
||||
from tqdm.asyncio import tqdm
|
||||
import multiprocessing
|
||||
|
||||
|
||||
async def detect_floorplan(repository: ListingRepository) -> None:
    """Detect square meters from floorplan images for all listings.

    OCR is CPU-heavy, so concurrency is capped at roughly a quarter of the
    available cores.

    Bug fix: on machines with fewer than 4 cores, ``cpu_count() // 4``
    evaluated to 0, producing ``asyncio.Semaphore(0)`` — no OCR task could
    ever acquire it, so the gather below would hang forever. Clamp to >= 1.
    """
    listings = await repository.get_listings()
    # At least one worker; Semaphore(0) would deadlock the gather below.
    worker_count = max(1, multiprocessing.cpu_count() // 4)
    semaphore = asyncio.Semaphore(worker_count)

    # Listings that already have a square-meter value come back as None.
    updated_listings = [
        listing
        for listing in await tqdm.gather(
            *[_calculate_sqm_ocr(listing, semaphore) for listing in listings]
        )
        if listing is not None
    ]
    await repository.upsert_listings(updated_listings)
|
||||
|
||||
|
||||
async def _calculate_sqm_ocr(
    listing: Listing, semaphore: asyncio.Semaphore
) -> Listing | None:
    """Calculate square meters from floorplan images using OCR.

    Returns the updated listing, or None when the listing already has a
    square-meter value so the caller can skip the upsert.
    """
    if listing.square_meters is not None:
        # Already known — avoid redundant OCR work.
        return None

    estimates: list[float] = []
    for image_path in listing.floorplan_image_paths:
        async with semaphore:
            # Run OCR in a worker thread so the event loop stays responsive.
            sqm, _ = await asyncio.to_thread(floorplan.calculate_ocr, image_path)
            if sqm is not None:
                estimates.append(sqm)

    # Largest estimate wins; 0 when OCR found nothing.
    # try once, if we fail, keep as 0
    listing.square_meters = max(estimates, default=0)
    return listing
|
||||
55
crawler/services/image_fetcher.py
Normal file
55
crawler/services/image_fetcher.py
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
"""Image fetcher service - downloads floorplan images for listings."""
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
import aiohttp
|
||||
from repositories import ListingRepository
|
||||
from tenacity import retry, stop_after_attempt, wait_random
|
||||
from tqdm.asyncio import tqdm
|
||||
|
||||
from models import Listing
|
||||
|
||||
# Setting this too high either crashes rightmove or gets us blocked
# NOTE(review): constructed at import time — safe on Python 3.10+, where
# asyncio.Semaphore no longer binds to an event loop at creation; confirm
# the project's minimum Python version.
semaphore = asyncio.Semaphore(5)
|
||||
|
||||
|
||||
async def dump_images(
    repository: ListingRepository,
    image_base_path: Path = Path("data/rs/"),
) -> None:
    """Download floorplan images for all listings."""
    listings = await repository.get_listings()
    results = await tqdm.gather(
        *(dump_images_for_listing(listing, image_base_path) for listing in listings)
    )
    # Listings with nothing new to download come back as None; skip them.
    changed = [listing for listing in results if listing is not None]
    await repository.upsert_listings(changed)
|
||||
|
||||
|
||||
@retry(wait=wait_random(min=1, max=2), stop=stop_after_attempt(3))
async def dump_images_for_listing(listing: Listing, base_path: Path) -> Listing | None:
    """Download floorplan images for a single listing.

    Returns the updated listing after a successful download, or None when
    there is nothing new to download or the image is gone upstream (404).

    NOTE(review): as written, this returns after the FIRST newly downloaded
    image, so listings with several missing floorplans need multiple runs —
    confirm whether that is intentional (retries via tenacity on any other
    failure).
    """
    all_floorplans = listing.additional_info.get("property", {}).get("floorplans", [])
    for floorplan in all_floorplans:
        url = floorplan["url"]
        picname = url.split("/")[-1]
        floorplan_path = Path(base_path, str(listing.id), "floorplans", picname)
        if floorplan_path.exists():
            # Already downloaded on a previous run — skip silently.
            continue
        try:
            async with semaphore:
                # NOTE(review): a fresh ClientSession per request is wasteful;
                # consider sharing one session — confirm before changing.
                async with aiohttp.ClientSession() as session:
                    async with session.get(url) as response:
                        if response.status == 404:
                            # Image removed upstream; nothing to persist.
                            return None
                        if response.status != 200:
                            raise Exception(f"Error for {url}: {response.status}")
                        floorplan_path.parent.mkdir(parents=True, exist_ok=True)
                        with open(floorplan_path, "wb") as f:
                            f.write(await response.read())
            listing.floorplan_image_paths.append(str(floorplan_path))
            return listing
        except Exception as e:
            tqdm.write(f"Error for {url}: {e}")
            raise e  # raise so that we retry it
    return None
|
||||
168
crawler/services/listing_service.py
Normal file
168
crawler/services/listing_service.py
Normal file
|
|
@ -0,0 +1,168 @@
|
|||
"""Unified listing service - shared between CLI and HTTP API.
|
||||
|
||||
This module provides the core business logic for listing operations.
|
||||
Both the CLI (main.py) and HTTP API (api/app.py) should use these functions.
|
||||
"""
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from models.listing import Listing, QueryParameters
|
||||
from repositories.listing_repository import ListingRepository
|
||||
|
||||
|
||||
@dataclass
class ListingResult:
    """Result of a listing operation."""

    # Listings matching the query.
    listings: list[Listing]
    # Convenience count of `listings`.
    total_count: int
    # Optional human-readable note about the operation.
    message: str | None = None
|
||||
|
||||
|
||||
@dataclass
class RefreshResult:
    """Result of a refresh operation."""

    # Background task id. # None if run synchronously
    task_id: str | None
    # Newly fetched listings (0 in async mode — unknown until the worker finishes).
    new_listings_count: int
    # Human-readable summary of what happened.
    message: str
|
||||
|
||||
|
||||
async def get_listings(
    repository: ListingRepository,
    query_parameters: QueryParameters | None = None,
    limit: int | None = None,
    only_ids: list[int] | None = None,
) -> ListingResult:
    """Get listings from the database with optional filtering.

    Used by:
    - CLI: export-csv, export-immoweb
    - API: GET /api/listing, GET /api/listing_geojson
    """
    rows = await repository.get_listings(
        query_parameters=query_parameters,
        limit=limit,
        only_ids=only_ids,
    )
    return ListingResult(listings=rows, total_count=len(rows))
|
||||
|
||||
|
||||
async def refresh_listings(
    repository: ListingRepository,
    query_parameters: QueryParameters,
    full: bool = False,
    async_mode: bool = False,
    user_email: str | None = None,
) -> RefreshResult:
    """Refresh listings by fetching from external API.

    Args:
        repository: Database repository
        query_parameters: Filtering parameters
        full: If True, also fetch images and run OCR
        async_mode: If True, run as background task and return task_id
        user_email: User email for tracking (API mode).
            NOTE(review): currently unused in this body — confirm whether it
            should be forwarded to the background task.

    Used by:
    - CLI: dump-listings
    - API: POST /api/refresh_listings
    """
    if async_mode:
        # Import here to avoid circular imports
        from tasks.listing_tasks import dump_listings_task
        from datetime import timedelta

        # Drop the task if no worker picks it up within 10 minutes.
        expiry_time = datetime.now() + timedelta(minutes=10)
        task = dump_listings_task.apply_async(
            args=(query_parameters.model_dump_json(),),
            expires=expiry_time,
        )
        return RefreshResult(
            task_id=task.id,
            new_listings_count=0,  # unknown until the worker completes
            message=f"Task {task.id} started",
        )

    # Synchronous mode - run directly
    from services.listing_fetcher import dump_listings, dump_listings_full

    if full:
        # Full refresh also fetches images and runs OCR downstream.
        new_listings = await dump_listings_full(query_parameters, repository)
    else:
        new_listings = await dump_listings(query_parameters, repository)

    return RefreshResult(
        task_id=None,
        new_listings_count=len(new_listings),
        message=f"Fetched {len(new_listings)} new listings",
    )
|
||||
|
||||
|
||||
async def download_images(
    repository: ListingRepository,
    data_dir: Path = Path("data/rs/"),
) -> int:
    """Download floorplan images for all listings.

    Used by:
    - CLI: dump-images
    - API: (could be added)

    Returns:
        Number of listings processed
    """
    # Lazy import to keep low-level fetchers out of the module import graph.
    from services.image_fetcher import dump_images

    await dump_images(repository, image_base_path=data_dir)
    return len(await repository.get_listings())
|
||||
|
||||
|
||||
async def detect_floorplans(
    repository: ListingRepository,
) -> int:
    """Run OCR on floorplan images to detect square meters.

    Used by:
    - CLI: detect-floorplan
    - API: (could be added)

    Returns:
        Number of listings processed
    """
    # Lazy import to keep low-level detectors out of the module import graph.
    from services.floorplan_detector import detect_floorplan

    await detect_floorplan(repository)
    return len(await repository.get_listings())
|
||||
|
||||
|
||||
async def calculate_routes(
    repository: ListingRepository,
    destination_address: str,
    travel_mode: str,
    limit: int | None = None,
) -> int:
    """Calculate transit routes for listings.

    Args:
        repository: Database repository
        destination_address: Address routes are calculated towards
        travel_mode: Name of a rec.routing.TravelMode member
        limit: Optional cap on how many listings are processed

    Used by:
    - CLI: routing
    - API: (could be added)

    Returns:
        Number of listings processed
    """
    from services.route_calculator import calculate_route
    from rec.routing import TravelMode

    await calculate_route(
        repository,
        destination_address,
        TravelMode[travel_mode],
        limit=limit,
    )

    # Bug fix: the previous `return limit or 0` reported 0 processed listings
    # whenever no limit was given, even though every listing was processed.
    # Mirror the sibling services and count from the repository, capped by limit.
    listings = await repository.get_listings()
    if limit is None:
        return len(listings)
    return min(limit, len(listings))
|
||||
66
crawler/services/route_calculator.py
Normal file
66
crawler/services/route_calculator.py
Normal file
|
|
@ -0,0 +1,66 @@
|
|||
"""Route calculator service - calculates transit routes using Google Maps API."""
|
||||
from models.listing import DestinationMode, Route, RouteLegStep
|
||||
from repositories.listing_repository import ListingRepository
|
||||
from tqdm.asyncio import tqdm
|
||||
from rec import routing
|
||||
from models import Listing
|
||||
|
||||
|
||||
async def calculate_route(
    repository: ListingRepository,
    destination_address: str,
    travel_mode: routing.TravelMode,
    limit: int | None = None,
) -> None:
    """Calculate transit routes for listings to a destination."""
    listings = await repository.get_listings()

    if limit is not None:
        # Only route the first `limit` listings to bound API usage.
        listings = listings[:limit]

    destination_mode = DestinationMode(destination_address, travel_mode)
    results = await tqdm.gather(
        *(update_routing_info(listing, destination_mode) for listing in listings),
        total=len(listings),
        desc="Updating routing info",
    )
    # Listings that already had routes come back as None; skip them.
    await repository.upsert_listings(
        [listing for listing in results if listing is not None]
    )
|
||||
|
||||
|
||||
async def update_routing_info(
    listing: Listing, destination_mode: DestinationMode
) -> Listing | None:
    """Update routing information for a single listing.

    Returns the updated listing, or None when routes for this
    destination/mode pair were already computed.
    """
    if listing.routing_info.get(destination_mode) is not None:
        # already calculated, do not recompute to save API calls
        return None

    # NOTE(review): routing.transit_route is called without await — presumably
    # synchronous; if so it blocks the event loop during the HTTP call. Confirm.
    routes_data = routing.transit_route(
        listing.latitude,
        listing.longitude,
        destination_mode.destination_address,
        destination_mode.travel_mode,
    )

    routes: list[Route] = []
    for route_data in routes_data["routes"]:
        # Durations arrive as strings like "123s"; strip the unit suffix.
        duration_s = int(route_data["duration"].split("s")[0])
        route = Route(
            legs=[
                RouteLegStep(
                    distance_meters=step_data["distanceMeters"],
                    duration_s=int(step_data["staticDuration"].split("s")[0]),
                    travel_mode=routing.TravelMode(step_data["travelMode"]),
                )
                # Only the first leg's steps are used — assumes single-leg
                # routes (origin -> destination, no waypoints). TODO confirm.
                for step_data in route_data["legs"][0]["steps"]
            ],
            distance_meters=route_data["distanceMeters"],
            duration_s=duration_s,
        )
        routes.append(route)
    # Merge new routes into the existing routing info and re-serialize.
    listing.routing_info_json = listing.serialize_routing_info(
        {**listing.routing_info, **{destination_mode: routes}}
    )
    return listing
|
||||
|
|
@ -11,9 +11,14 @@ import json
|
|||
class TaskStatus:
|
||||
"""Status of a background task."""
|
||||
task_id: str
|
||||
status: str # PENDING, STARTED, SUCCESS, FAILURE, REVOKED
|
||||
status: str # PENDING, STARTED, SUCCESS, FAILURE, REVOKED, SKIPPED
|
||||
result: Any | None
|
||||
progress: float | None # 0.0 to 1.0
|
||||
processed: int | None # Number of items processed
|
||||
total: int | None # Total number of items
|
||||
message: str | None # Human-readable status message (e.g., "Fetching listings")
|
||||
error: str | None # Error message if failed
|
||||
traceback: str | None # Full traceback if failed
|
||||
|
||||
|
||||
def get_task_status(task_id: str) -> TaskStatus:
|
||||
|
|
@ -33,21 +38,50 @@ def get_task_status(task_id: str) -> TaskStatus:
|
|||
task_result = dump_listings_task.AsyncResult(task_id)
|
||||
|
||||
# Try to serialize result
|
||||
try:
|
||||
result = json.loads(json.dumps(task_result.result))
|
||||
except (TypeError, json.JSONDecodeError):
|
||||
result = str(task_result.result) if task_result.result else None
|
||||
result = None
|
||||
error = None
|
||||
if task_result.failed():
|
||||
# Extract error message from failed task
|
||||
error = str(task_result.result) if task_result.result else None
|
||||
else:
|
||||
try:
|
||||
result = json.loads(json.dumps(task_result.result))
|
||||
except (TypeError, json.JSONDecodeError):
|
||||
result = str(task_result.result) if task_result.result else None
|
||||
|
||||
# Extract progress from task meta if available
|
||||
# Extract traceback if available
|
||||
task_traceback = task_result.traceback if task_result.failed() else None
|
||||
|
||||
# Extract progress, processed, total, and message from task meta
|
||||
progress = None
|
||||
processed = None
|
||||
total = None
|
||||
message = None
|
||||
|
||||
if task_result.info and isinstance(task_result.info, dict):
|
||||
progress = task_result.info.get("progress")
|
||||
processed = task_result.info.get("processed")
|
||||
total = task_result.info.get("total")
|
||||
# Use 'message' if available, fall back to 'reason' for SKIPPED tasks
|
||||
message = task_result.info.get("message") or task_result.info.get("reason")
|
||||
|
||||
# For custom states (like "Fetching listings"), use the state as message
|
||||
# if no message was provided in info
|
||||
if not message and task_result.status not in (
|
||||
"PENDING", "STARTED", "SUCCESS", "FAILURE", "REVOKED", "RETRY"
|
||||
):
|
||||
message = task_result.status
|
||||
|
||||
return TaskStatus(
|
||||
task_id=task_id,
|
||||
status=task_result.status,
|
||||
result=result,
|
||||
progress=progress,
|
||||
processed=processed,
|
||||
total=total,
|
||||
message=message,
|
||||
error=error,
|
||||
traceback=task_traceback,
|
||||
)
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue