2026-02-06 20:55:10 +00:00
|
|
|
"""Floorplan detector service - OCR-based square meter detection."""
|
|
|
|
|
import asyncio
|
2026-02-21 19:39:49 +00:00
|
|
|
import logging
|
Refactor backend for cleaner error handling, DRY, and type safety
- Extract rate limiter DRY: consolidate 3 duplicated check/respond paths
into _check_counter and _enforce_limit helpers, add proper type annotations
- Replace bare Exception raises with FloorplanDownloadError and
RightmoveApiError; narrow catch clauses to specific exception types;
fix Step base class to inherit from ABC
- Consolidate MAX_OCR_WORKERS into config/scraper_config.py; extract
_find_tenure_value helper to deduplicate tenure parsing
- Extract _build_poi_distances_lookup from stream endpoint to reduce nesting
- Fix csv_exporter: optional decisions.json, NaN instead of -1 sentinels,
guard against division by zero on missing square meters
- Fix notifications.py broken list[Surface]() constructor, database.py
stale comments and missing type annotation, auth.py type:ignore,
ui_exporter.py stale TODO
- Fix 3 pre-existing test failures: mock cache layer in streaming tests,
bypass rate limiter for test isolation, fix cache invalidation test to
account for two-pattern scan loop
2026-02-10 22:19:24 +00:00
|
|
|
from config.scraper_config import MAX_OCR_WORKERS
|
2026-02-06 20:55:10 +00:00
|
|
|
from models import Listing
|
|
|
|
|
from rec import floorplan
|
|
|
|
|
from repositories.listing_repository import ListingRepository
|
2026-02-21 19:39:49 +00:00
|
|
|
|
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
2026-02-07 20:19:57 +00:00
|
|
|
|
2026-02-06 20:55:10 +00:00
|
|
|
|
|
|
|
|
async def detect_floorplan(repository: ListingRepository) -> None:
    """Run OCR square-meter detection over every listing in the repository.

    Fetches all listings from ``repository``, runs ``_calculate_sqm_ocr`` on
    each of them concurrently, and upserts every listing the helper handed
    back. Listings for which the helper returned ``None`` are not written.

    Args:
        repository: Provides ``get_listings`` and receives the
            ``upsert_listings`` call with the updated listings.
    """
    all_listings = await repository.get_listings()
    # A single semaphore is shared by every task, so at most MAX_OCR_WORKERS
    # OCR calls hold a slot at the same time.
    ocr_slots = asyncio.Semaphore(MAX_OCR_WORKERS)
    logger.info("Detecting floorplans for %d listings", len(all_listings))

    pending = [_calculate_sqm_ocr(item, ocr_slots) for item in all_listings]
    outcomes = await asyncio.gather(*pending)
    # None marks a listing the helper chose not to touch.
    changed = [outcome for outcome in outcomes if outcome is not None]

    logger.info("Finished floorplan detection, %d listings updated", len(changed))
    await repository.upsert_listings(changed)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def _calculate_sqm_ocr(
    listing: Listing, semaphore: asyncio.Semaphore
) -> Listing | None:
    """Fill in ``listing.square_meters`` from its floorplan images via OCR.

    Each floorplan image is passed to ``floorplan.calculate_ocr`` in a worker
    thread while holding ``semaphore``. The largest estimate found becomes the
    listing's ``square_meters``; if there are no images, or no image yields an
    estimate, ``0`` is stored instead.

    Args:
        listing: Listing to update in place.
        semaphore: Acquired around each OCR call to limit concurrency.

    Returns:
        The mutated listing, or ``None`` when ``square_meters`` was already
        set and the listing was left untouched.
    """
    # Any existing value (including a previously stored 0) means we skip it.
    if listing.square_meters is not None:
        return None

    readings: list[float] = []
    # A falsy path collection (None or empty) simply produces no readings,
    # which ends up as the 0 fallback below.
    for image_path in listing.floorplan_image_paths or ():
        async with semaphore:
            ocr_result = await asyncio.to_thread(floorplan.calculate_ocr, image_path)
        # Only the first element of the returned pair is used here.
        reading, _ = ocr_result
        if reading is None:
            continue
        readings.append(reading)

    # Try once; if nothing was detected, keep 0 so the value is no longer None.
    listing.square_meters = max(readings, default=0)
    return listing
|