Refactor backend for cleaner error handling, DRY, and type safety

- Extract rate limiter DRY: consolidate 3 duplicated check/respond paths
  into _check_counter and _enforce_limit helpers, add proper type annotations
- Replace bare Exception raises with FloorplanDownloadError and
  RightmoveApiError; narrow catch clauses to specific exception types;
  fix Step base class to inherit from ABC
- Consolidate MAX_OCR_WORKERS into config/scraper_config.py; extract
  _find_tenure_value helper to deduplicate tenure parsing
- Extract _build_poi_distances_lookup from stream endpoint to reduce nesting
- Fix csv_exporter: optional decisions.json, NaN instead of -1 sentinels,
  guard against division by zero on missing square meters
- Fix notifications.py broken list[Surface]() constructor, database.py
  stale comments and missing type annotation, auth.py type:ignore,
  ui_exporter.py stale TODO
- Fix 3 pre-existing test failures: mock cache layer in streaming tests,
  bypass rate limiter for test isolation, fix cache invalidation test to
  account for two-pattern scan loop
This commit is contained in:
Viktor Barzin 2026-02-10 22:19:24 +00:00
parent 6897820cc7
commit f833309297
No known key found for this signature in database
GPG key ID: 0EB088298288D958
20 changed files with 199 additions and 178 deletions

View file

@ -1,13 +1,10 @@
"""Floorplan detector service - OCR-based square meter detection."""
import asyncio
from config.scraper_config import MAX_OCR_WORKERS
from models import Listing
from rec import floorplan
from repositories.listing_repository import ListingRepository
from tqdm.asyncio import tqdm
import multiprocessing
# Use a quarter of available CPUs to avoid starving other processes
MAX_OCR_WORKERS = max(1, multiprocessing.cpu_count() // 4)
async def detect_floorplan(repository: ListingRepository) -> None:

View file

@ -5,6 +5,7 @@ from pathlib import Path
from urllib.parse import urlparse
import aiohttp
from rec.exceptions import FloorplanDownloadError
from repositories import ListingRepository
from tenacity import retry, stop_after_attempt, wait_random
from tqdm.asyncio import tqdm
@ -65,10 +66,7 @@ async def dump_images_for_listing(
)
return None
if response.status != 200:
raise Exception(
f"Error downloading floorplan for listing {listing.id} "
f"from {url}: HTTP {response.status}"
)
raise FloorplanDownloadError(url, response.status)
floorplan_path.parent.mkdir(parents=True, exist_ok=True)
with open(floorplan_path, "wb") as f:
f.write(await response.read())

View file

@ -5,7 +5,7 @@ import logging
from config.scraper_config import ScraperConfig
from listing_processor import ListingProcessor
from rec.query import create_session, listing_query
from rec.exceptions import CircuitBreakerOpenError, ThrottlingError
from rec.exceptions import CircuitBreakerOpenError, InvalidResponseError, ThrottlingError
from rec.throttle_detector import get_throttle_metrics, reset_throttle_metrics
from models.listing import Listing, QueryParameters
from repositories import ListingRepository
@ -107,15 +107,15 @@ async def _fetch_subquery(
f"{sq.district}: {e}"
)
break
except Exception as e:
except InvalidResponseError:
# Rightmove returns GENERIC_ERROR when requesting pages
# past the last page of results. This is expected behavior
# and signals we've exhausted this subquery's results.
if "GENERIC_ERROR" in str(e):
logger.debug(
f"Max page for {sq.district}: {page_id - 1}"
)
break
logger.debug(
f"Max page for {sq.district}: {page_id - 1}"
)
break
except Exception as e:
logger.warning(
f"Error fetching page {page_id} for "
f"{sq.district}: {e}"