Refactor backend for cleaner error handling, DRY, and type safety

- DRY up the rate limiter: consolidate 3 duplicated check/respond paths into _check_counter and _enforce_limit helpers, and add proper type annotations
- Replace bare Exception raises with FloorplanDownloadError and RightmoveApiError; narrow catch clauses to specific exception types; fix Step base class to inherit from ABC
- Consolidate MAX_OCR_WORKERS into config/scraper_config.py; extract _find_tenure_value helper to deduplicate tenure parsing
- Extract _build_poi_distances_lookup from stream endpoint to reduce nesting
- Fix csv_exporter: optional decisions.json, NaN instead of -1 sentinels, guard against division by zero on missing square meters
- Fix notifications.py broken list[Surface]() constructor, database.py stale comments and missing type annotation, auth.py type:ignore, ui_exporter.py stale TODO
- Fix 3 pre-existing test failures: mock cache layer in streaming tests, bypass rate limiter for test isolation, fix cache invalidation test to account for two-pattern scan loop
2026-02-10 22:19:24 +00:00 · 2026-02-10 22:19:24 +00:00 · f833309297
commit f833309297
parent 6897820cc7
20 changed files with 199 additions and 178 deletions
--- a/services/floorplan_detector.py
+++ b/services/floorplan_detector.py
@ -1,13 +1,10 @@
 """Floorplan detector service - OCR-based square meter detection."""
 import asyncio
+from config.scraper_config import MAX_OCR_WORKERS
 from models import Listing
 from rec import floorplan
 from repositories.listing_repository import ListingRepository
 from tqdm.asyncio import tqdm
-import multiprocessing
-
-# Use a quarter of available CPUs to avoid starving other processes
-MAX_OCR_WORKERS = max(1, multiprocessing.cpu_count() // 4)


 async def detect_floorplan(repository: ListingRepository) -> None:
--- a/services/image_fetcher.py
+++ b/services/image_fetcher.py
@ -5,6 +5,7 @@ from pathlib import Path
 from urllib.parse import urlparse

 import aiohttp
+from rec.exceptions import FloorplanDownloadError
 from repositories import ListingRepository
 from tenacity import retry, stop_after_attempt, wait_random
 from tqdm.asyncio import tqdm
@ -65,10 +66,7 @@ async def dump_images_for_listing(
                            )
                            return None
                        if response.status != 200:
-                            raise Exception(
-                                f"Error downloading floorplan for listing {listing.id} "
-                                f"from {url}: HTTP {response.status}"
-                            )
+                            raise FloorplanDownloadError(url, response.status)
                        floorplan_path.parent.mkdir(parents=True, exist_ok=True)
                        with open(floorplan_path, "wb") as f:
                            f.write(await response.read())
--- a/services/listing_fetcher.py
+++ b/services/listing_fetcher.py
@ -5,7 +5,7 @@ import logging
 from config.scraper_config import ScraperConfig
 from listing_processor import ListingProcessor
 from rec.query import create_session, listing_query
-from rec.exceptions import CircuitBreakerOpenError, ThrottlingError
+from rec.exceptions import CircuitBreakerOpenError, InvalidResponseError, ThrottlingError
 from rec.throttle_detector import get_throttle_metrics, reset_throttle_metrics
 from models.listing import Listing, QueryParameters
 from repositories import ListingRepository
@ -107,15 +107,15 @@ async def _fetch_subquery(
                    f"{sq.district}: {e}"
                )
                break
-            except Exception as e:
+            except InvalidResponseError:
                # Rightmove returns GENERIC_ERROR when requesting pages
                # past the last page of results. This is expected behavior
                # and signals we've exhausted this subquery's results.
-                if "GENERIC_ERROR" in str(e):
-                    logger.debug(
-                        f"Max page for {sq.district}: {page_id - 1}"
-                    )
-                    break
+                logger.debug(
+                    f"Max page for {sq.district}: {page_id - 1}"
+                )
+                break
+            except Exception as e:
                logger.warning(
                    f"Error fetching page {page_id} for "
                    f"{sq.district}: {e}"