Refactor backend for cleaner error handling, DRY, and type safety
- DRY up the rate limiter: consolidate 3 duplicated check/respond paths into _check_counter and _enforce_limit helpers, and add proper type annotations
- Replace bare Exception raises with FloorplanDownloadError and RightmoveApiError; narrow catch clauses to specific exception types; fix Step base class to inherit from ABC
- Consolidate MAX_OCR_WORKERS into config/scraper_config.py; extract _find_tenure_value helper to deduplicate tenure parsing
- Extract _build_poi_distances_lookup from the stream endpoint to reduce nesting
- Fix csv_exporter: make decisions.json optional, use NaN instead of -1 sentinels, guard against division by zero on missing square meters
- Fix notifications.py broken list[Surface]() constructor, database.py stale comments and missing type annotation, auth.py type:ignore, ui_exporter.py stale TODO
- Fix 3 pre-existing test failures: mock the cache layer in streaming tests, bypass the rate limiter for test isolation, fix the cache invalidation test to account for the two-pattern scan loop
This commit is contained in:
parent
6897820cc7
commit
f833309297
20 changed files with 199 additions and 178 deletions
|
|
@ -1,15 +1,15 @@
|
|||
from __future__ import annotations
|
||||
from abc import abstractmethod
|
||||
from abc import ABC, abstractmethod
|
||||
import asyncio
|
||||
from collections.abc import Callable
|
||||
from datetime import datetime
|
||||
import logging
|
||||
import multiprocessing
|
||||
from pathlib import Path
|
||||
import re
|
||||
from typing import Any
|
||||
from urllib.parse import urlparse
|
||||
import aiohttp
|
||||
from config.scraper_config import MAX_OCR_WORKERS
|
||||
from models.listing import (
|
||||
BuyListing,
|
||||
FurnishType,
|
||||
|
|
@ -20,14 +20,12 @@ from models.listing import (
|
|||
RentListing,
|
||||
)
|
||||
from rec import floorplan
|
||||
from rec.exceptions import FloorplanDownloadError
|
||||
from rec.query import detail_query
|
||||
from repositories.listing_repository import ListingRepository
|
||||
|
||||
logger = logging.getLogger("uvicorn.error")
|
||||
|
||||
# Limit OCR threads to 25% of available cores to avoid starving other work.
|
||||
MAX_OCR_WORKERS = max(1, multiprocessing.cpu_count() // 4)
|
||||
|
||||
|
||||
def _parse_furnish_type(raw: str | None) -> FurnishType:
|
||||
"""Normalise the raw furnish-type string from the API into a FurnishType enum."""
|
||||
|
|
@ -97,13 +95,13 @@ class ListingProcessor:
|
|||
step_class_name, step_class_name
|
||||
)
|
||||
on_step_complete(short_name)
|
||||
except Exception as e:
|
||||
except (ValueError, KeyError, aiohttp.ClientError, FloorplanDownloadError) as e:
|
||||
logger.error(f"[{listing_id}] {step_class_name} failed: {e}")
|
||||
return None
|
||||
return listing
|
||||
|
||||
|
||||
class Step:
|
||||
class Step(ABC):
|
||||
listing_repository: ListingRepository
|
||||
listing_type: ListingType
|
||||
|
||||
|
|
@ -123,29 +121,32 @@ class Step:
|
|||
return True
|
||||
|
||||
|
||||
def _find_tenure_value(details: dict[str, Any], tenure_type: str) -> str | None:
|
||||
"""Find a value in the tenure info content by type key."""
|
||||
tenure_content = details.get("property", {}).get("tenureInfo", {}).get("content", [])
|
||||
for item in tenure_content:
|
||||
if item.get("type") == tenure_type:
|
||||
return item.get("value")
|
||||
return None
|
||||
|
||||
|
||||
def _parse_service_charge(details: dict[str, Any]) -> float | None:
    """Parse annual service charge from the tenure info in API response.

    Args:
        details: Decoded detail payload, passed through to
            ``_find_tenure_value``.

    Returns:
        The first number found in the ``annualServiceCharge`` value, with
        thousands separators removed, or ``None`` when the entry is missing
        or contains no digits.
    """
    value = _find_tenure_value(details, "annualServiceCharge")
    if value is None:
        return None
    # Require at least one digit. The previous character-class pattern also
    # matched bare punctuation (e.g. the full stop in "Ask agent."), which
    # made float() raise ValueError and failed the whole step.
    match = re.search(r"\d[\d,]*(?:\.\d+)?|\.\d+", str(value))
    if match is None:
        return None
    return float(match.group().replace(",", ""))
|
||||
|
||||
|
||||
def _parse_lease_left(details: dict[str, Any]) -> int | None:
    """Parse remaining lease years from the tenure info in API response.

    Args:
        details: Decoded detail payload, passed through to
            ``_find_tenure_value``.

    Returns:
        The first run of digits in the ``lengthOfLease`` value as an int, or
        ``None`` when the entry is missing or contains no digits.
    """
    value = _find_tenure_value(details, "lengthOfLease")
    if value is None:
        return None
    # Only the first number is needed, so stop at the first match.
    match = re.search(r"\d+", str(value))
    return int(match.group()) if match else None
|
||||
|
||||
|
||||
|
|
@ -265,7 +266,7 @@ class FetchImagesStep(Step):
|
|||
if response.status == 404:
|
||||
return listing
|
||||
if response.status != 200:
|
||||
raise Exception(f"Error for {url}: {response.status}")
|
||||
raise FloorplanDownloadError(url, response.status)
|
||||
floorplan_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(floorplan_path, "wb") as f:
|
||||
f.write(await response.read())
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue