Flatten repo structure: move crawler/ to root, remove vqa/ and immoweb/

The crawler subdirectory was the only active project. Moving it to the
repo root simplifies paths and removes the unnecessary nesting. The
vqa/ and immoweb/ directories were legacy/unused and have been removed.

Updated .drone.yml, .gitignore, .claude/ docs, and skills to reflect
the new flat structure.
This commit is contained in:
Viktor Barzin 2026-02-07 23:01:20 +00:00
parent e2247be700
commit eafbc1ac52
No known key found for this signature in database
GPG key ID: 0EB088298288D958
221 changed files with 70 additions and 146140 deletions

0
rec/__init__.py Normal file
View file

137
rec/circuit_breaker.py Normal file
View file

@ -0,0 +1,137 @@
"""Circuit breaker pattern for protecting against cascading failures."""
from __future__ import annotations
import enum
import logging
import time
from dataclasses import dataclass
from rec.exceptions import CircuitBreakerOpenError
logger = logging.getLogger("uvicorn.error")
class CircuitState(enum.Enum):
    """The three phases a circuit breaker moves between."""

    # Traffic flows normally; failures are merely counted.
    CLOSED = "closed"
    # Failure limit was hit; requests are rejected.
    OPEN = "open"
    # Recovery probe: checking whether the service is healthy again.
    HALF_OPEN = "half_open"
@dataclass
class CircuitBreaker:
    """Guard that stops issuing requests after repeated consecutive failures.

    State machine:
    - CLOSED: requests are allowed; each failure increments a counter.
    - OPEN: entered once ``failure_count`` reaches ``failure_threshold``;
      ``call()`` raises until ``recovery_timeout`` seconds have elapsed since
      the last recorded failure.
    - HALF_OPEN: entered from OPEN by ``call()`` after the timeout; a success
      closes the circuit, a failure re-opens it.

    Attributes:
        failure_threshold: Consecutive failures needed to open the circuit.
        recovery_timeout: Seconds after the last failure before probing again.
        state: Current circuit state.
        failure_count: Consecutive failures seen so far.
        last_failure_time: ``time.time()`` value of the most recent failure.
        last_state_change: ``time.time()`` value of the most recent transition.
    """

    failure_threshold: int
    recovery_timeout: float
    state: CircuitState = CircuitState.CLOSED
    failure_count: int = 0
    last_failure_time: float = 0.0
    last_state_change: float = 0.0

    def __post_init__(self) -> None:
        """Stamp the creation time as the first state change."""
        self.last_state_change = time.time()

    def call(self) -> None:
        """Gate a request: return silently if allowed, raise if blocked.

        Raises:
            CircuitBreakerOpenError: The circuit is OPEN and the recovery
                timeout has not yet elapsed.
        """
        now = time.time()
        if self.state != CircuitState.OPEN:
            # CLOSED and HALF_OPEN both let the request through.
            return

        since_failure = now - self.last_failure_time
        if since_failure >= self.recovery_timeout:
            self._transition_to_half_open()
            return

        raise CircuitBreakerOpenError(
            f"Circuit breaker is open. "
            f"Waiting {self.recovery_timeout - since_failure:.1f}s "
            f"before retry."
        )

    def record_success(self) -> None:
        """Note a successful request; closes the circuit when probing."""
        if self.state == CircuitState.HALF_OPEN:
            self._transition_to_closed()
        # Any success breaks the streak of consecutive failures.
        self.failure_count = 0

    def record_failure(self) -> None:
        """Note a failed request and open the circuit when warranted."""
        self.failure_count += 1
        self.last_failure_time = time.time()

        probe_failed = self.state == CircuitState.HALF_OPEN
        limit_reached = (
            self.state == CircuitState.CLOSED
            and self.failure_count >= self.failure_threshold
        )
        if probe_failed or limit_reached:
            self._transition_to_open()

    def _enter(self, new_state: CircuitState) -> None:
        """Switch to ``new_state`` and timestamp the transition."""
        self.state = new_state
        self.last_state_change = time.time()

    def _transition_to_open(self) -> None:
        """Move to OPEN and log a warning."""
        self._enter(CircuitState.OPEN)
        logger.warning(
            f"Circuit breaker OPENED after {self.failure_count} consecutive failures. "
            f"Will retry in {self.recovery_timeout}s"
        )

    def _transition_to_half_open(self) -> None:
        """Move to HALF_OPEN so the next request acts as a recovery probe."""
        self._enter(CircuitState.HALF_OPEN)
        logger.info("Circuit breaker entering HALF_OPEN state, testing service recovery")

    def _transition_to_closed(self) -> None:
        """Move to CLOSED and clear the failure streak."""
        self._enter(CircuitState.CLOSED)
        self.failure_count = 0
        logger.info("Circuit breaker CLOSED, service recovered")

    def reset(self) -> None:
        """Force the breaker back to a pristine CLOSED state."""
        self.state = CircuitState.CLOSED
        self.failure_count = 0
        self.last_failure_time = 0.0
        self.last_state_change = time.time()
        logger.info("Circuit breaker manually reset to CLOSED state")

    @property
    def is_open(self) -> bool:
        """True while the circuit is OPEN."""
        return self.state == CircuitState.OPEN

    @property
    def is_closed(self) -> bool:
        """True while the circuit is CLOSED."""
        return self.state == CircuitState.CLOSED

    @property
    def is_half_open(self) -> bool:
        """True while the circuit is HALF_OPEN."""
        return self.state == CircuitState.HALF_OPEN

43
rec/districts.py Normal file
View file

@ -0,0 +1,43 @@
def get_districts() -> dict[str, str]:
    """Return a new mapping of district display name to location identifier.

    A fresh dict is built on every call, so callers may mutate the result.
    """
    name_to_region = (
        ("Barking and Dagenham", "REGION^61400"),
        ("Barnet", "REGION^93929"),
        ("Bexley", "REGION^93932"),
        ("Brent", "REGION^93935"),
        ("Bromley", "REGION^93938"),
        ("Camden", "REGION^93941"),
        ("City of London", "REGION^61224"),
        ("Croydon", "REGION^93944"),
        ("Ealing", "REGION^93947"),
        ("Enfield", "REGION^93950"),
        ("Greenwich", "REGION^61226"),
        ("Hackney", "REGION^93953"),
        ("Hammersmith and Fulham", "REGION^61407"),
        ("Haringey", "REGION^61227"),
        ("Harrow", "REGION^93956"),
        ("Havering", "REGION^61228"),
        ("Hillingdon", "REGION^93959"),
        ("Hounslow", "REGION^93962"),
        ("Islington", "REGION^93965"),
        ("London", "REGION^87490"),
        ("Kensington and Chelsea", "REGION^61229"),
        ("Kingston upon Thames", "REGION^93968"),
        ("Lambeth", "REGION^93971"),
        ("Lewisham", "REGION^61413"),
        ("Merton", "REGION^61414"),
        ("Newham", "REGION^61231"),
        ("Redbridge", "REGION^61537"),
        ("Richmond upon Thames", "REGION^61415"),
        ("Southwark", "REGION^61518"),
        ("Stratford", "REGION^85312"),
        ("Sutton", "REGION^93974"),
        ("Tower Hamlets", "REGION^61417"),
        ("Waltham Forest", "REGION^61232"),
        ("Wandsworth", "REGION^93977"),
        ("Westminster", "REGION^93980"),
    )
    return dict(name_to_region)
def get_district_by_name(name: str) -> str | None:
    """Look up the region identifier for ``name``; ``None`` when unknown."""
    known = get_districts()
    if name in known:
        return known[name]
    return None

85
rec/exceptions.py Normal file
View file

@ -0,0 +1,85 @@
"""Custom exceptions for Rightmove API errors."""
class RightmoveAPIError(Exception):
    """Root of the exception hierarchy for Rightmove API failures."""
class ThrottlingError(RightmoveAPIError):
    """Parent of all errors that signal Rightmove is throttling us.

    Seeing this (or a subclass) means requests are being limited and the
    caller should back off.
    """
class RateLimitError(ThrottlingError):
    """Rightmove answered HTTP 429 (Too Many Requests).

    This is an explicit rate-limit signal from the server.
    """
class ServiceUnavailableError(ThrottlingError):
    """Rightmove answered HTTP 503 (Service Unavailable).

    The service is temporarily down, possibly because it is overloaded.
    """
class IPBlockedError(ThrottlingError):
    """Rightmove answered HTTP 403 (Forbidden).

    Treated as a sign that our IP may have been blocked or blacklisted.
    """
class SlowResponseError(ThrottlingError):
    """The response took longer than the configured threshold.

    Very slow answers are treated as a hint of throttling or overload.
    """
class UnexpectedEmptyResponseError(RightmoveAPIError):
    """The response carried no data although data was expected."""
class InvalidResponseError(RightmoveAPIError):
    """The response body holds an error message or otherwise invalid data."""
class CircuitBreakerOpenError(RightmoveAPIError):
    """Raised when the circuit breaker is open and rejects a request.

    Too many failures were detected, so further requests are held back to
    give the service time to recover.
    """
class RoutingApiError(Exception):
    """Failure reported by the Google Routes API.

    Attributes:
        status_code: HTTP status returned by the API.
        response_body: Body returned alongside the failing status.
    """

    def __init__(self, status_code: int, response_body: dict):
        message = f"Routes API returned status {status_code}: {response_body}"
        super().__init__(message)
        self.status_code = status_code
        self.response_body = response_body

67
rec/floorplan.py Normal file
View file

@ -0,0 +1,67 @@
import logging
import re
from pathlib import Path
from typing import Any
from PIL import Image
import cv2
import numpy as np
logger = logging.getLogger(__name__)
# Plausibility window, in square metres, used by extract_total_sqm():
# only areas strictly between these two bounds are kept.
MIN_SQM = 30
MAX_SQM = 160
def inference(image_path: str | Path) -> tuple[str, Any]:
    """Run the ``google/deplot`` Pix2Struct model on a floor-plan image.

    The processor and model are loaded via ``from_pretrained`` on every call.

    Args:
        image_path: Location of the image to analyse.

    Returns:
        A pair of (decoded text of the first prediction, raw predictions as
        returned by ``model.generate``).
    """
    # Imported lazily so merely importing this module does not pull in transformers.
    from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor

    checkpoint = "google/deplot"
    # not sure if it even has an effect
    prompt = "How many living rooms are displayed on this floor plan?"

    floorplan = Image.open(image_path)
    processor = Pix2StructProcessor.from_pretrained(checkpoint)
    model = Pix2StructForConditionalGeneration.from_pretrained(checkpoint)

    encoded = processor(images=floorplan, text=prompt, return_tensors="pt")
    generated = model.generate(**encoded, max_new_tokens=512)
    decoded = processor.decode(generated[0], skip_special_tokens=True)
    return decoded, generated
def extract_total_sqm(input_str: str) -> float | None:
# Note: can be used on the output of inference() to extract sqm from model predictions.
sqmregex = r"(\d+\.?\d*) ?(sq ?m|sq. ?m)"
matches = re.findall(sqmregex, input_str.lower())
sqms = [float(m[0]) for m in matches]
filtered = [sqm for sqm in sqms if MIN_SQM < sqm < MAX_SQM]
if len(filtered) == 0:
return None
return max(filtered)
def improve_img_for_ocr(img: Image.Image) -> Image.Image:
    """Return a grayscale, upscaled and binarised copy of ``img`` for OCR.

    Steps: convert to 8-bit grayscale, enlarge by 1.2x with cubic
    interpolation, then apply Gaussian adaptive thresholding.
    """
    gray = np.array(img.convert("L"))
    enlarged = cv2.resize(
        gray,
        None,
        fx=1.2,
        fy=1.2,
        interpolation=cv2.INTER_CUBIC,
    )
    binarised = cv2.adaptiveThreshold(
        enlarged,
        255,  # value assigned to pixels that pass the threshold
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        11,  # neighbourhood block size
        2,  # constant subtracted from the weighted mean
    )
    return Image.fromarray(binarised)
def calculate_ocr(image_path: str | Path) -> tuple[float | None, str]:
    """OCR a floor-plan image and estimate its total area in square metres.

    The raw image is tried first. If no area is found, the image is
    pre-processed with ``improve_img_for_ocr`` and OCR is run a second time.

    Args:
        image_path: Location of the image file.

    Returns:
        ``(estimated_sqm, ocr_text)`` from whichever pass is returned;
        ``estimated_sqm`` is ``None`` when neither pass yields an area.

    Raises:
        FileNotFoundError: ``image_path`` does not exist.
    """
    # Imported lazily so importing this module does not require pytesseract.
    import pytesseract

    path = Path(image_path)
    if not path.exists():
        raise FileNotFoundError(f"Image not found: {image_path}")

    # The context manager closes the underlying file handle, which
    # Image.open() otherwise keeps open.
    with Image.open(path) as img:
        text = pytesseract.image_to_string(img)
        estimated_sqm = extract_total_sqm(text)
        if estimated_sqm is not None:
            return estimated_sqm, text

        improved_img = improve_img_for_ocr(img)
        text2 = pytesseract.image_to_string(improved_img)

    estimated_sqm2 = extract_total_sqm(text2)
    logger.debug(f"before: {estimated_sqm} after: {estimated_sqm2} - {image_path}")
    return estimated_sqm2, text2

507
rec/query.py Normal file
View file

@ -0,0 +1,507 @@
import enum
import logging
import time
from typing import Any
from contextlib import asynccontextmanager
from collections.abc import AsyncIterator
import aiohttp
from models.listing import FurnishType, ListingType
from rec import districts
from rec.exceptions import (
CircuitBreakerOpenError,
ThrottlingError,
)
from rec.throttle_detector import get_throttle_metrics, validate_response
from rec.circuit_breaker import CircuitBreaker
from tenacity import (
retry,
retry_if_exception_type,
stop_after_attempt,
wait_exponential,
wait_random,
)
from config.scraper_config import ScraperConfig
logger = logging.getLogger("uvicorn.error")
# Global circuit breaker instance; created lazily by get_circuit_breaker().
_circuit_breaker: CircuitBreaker | None = None
# API constants
# App version sent with property-detail requests (see detail_query).
ANDROID_APP_VERSION = "3.70.0"
# App version sent with listing/probe searches (see _build_base_params).
ANDROID_APP_VERSION_LISTING = "4.28.0"
RIGHTMOVE_API_BASE = "https://api.rightmove.co.uk/api"
PROPERTY_LISTING_ENDPOINT = f"{RIGHTMOVE_API_BASE}/property-listing"
# Headers used for sessions from create_session() and for detail requests.
DEFAULT_HEADERS = {
    "Host": "api.rightmove.co.uk",
    "User-Agent": "okhttp/4.12.0",
    "Connection": "keep-alive",
}
# Headers used for listing and probe requests: the defaults plus Accept-Encoding.
LISTING_HEADERS = {
    **DEFAULT_HEADERS,
    "Accept-Encoding": "gzip, deflate, br",
}
class PropertyType(enum.StrEnum):
BUNGALOW = "bungalow"
DETACHED = "detached"
FLAT = "flat"
LAND = "land"
PARK_HOME = "park-home"
SEMI_DETACHED = "semi-detached"
TERRACED = "terraced"
@asynccontextmanager
async def create_session(
    config: ScraperConfig | None = None,
) -> AsyncIterator[aiohttp.ClientSession]:
    """Create an aiohttp session with optional proxy support.

    The session is created with ``trust_env=True`` and ``DEFAULT_HEADERS`` and
    is closed when the context exits, whether normally or via an exception.

    Args:
        config: Scraper configuration. Loads from environment if not provided.

    Yields:
        Configured aiohttp ClientSession.

    Raises:
        ImportError: ``config.proxy_url`` is set but ``aiohttp-socks`` is not
            installed. The original import failure is attached as the cause.
    """
    if config is None:
        config = ScraperConfig.from_env()

    connector = None
    if config.proxy_url:
        # Only the optional import is guarded, so a failure while building
        # the connector is not mis-reported as a missing dependency.
        try:
            from aiohttp_socks import ProxyConnector
        except ImportError as err:
            raise ImportError(
                "aiohttp-socks is required for proxy support. "
                "Install with: pip install aiohttp-socks"
            ) from err
        connector = ProxyConnector.from_url(config.proxy_url)

    async with aiohttp.ClientSession(
        trust_env=True,
        connector=connector,
        headers=DEFAULT_HEADERS,
    ) as session:
        yield session
def get_circuit_breaker(config: ScraperConfig | None = None) -> CircuitBreaker | None:
    """Return the process-wide circuit breaker, creating it on first use.

    The breaker is built from the configuration of the first call that finds
    the feature enabled; later calls reuse that instance unchanged.

    Args:
        config: Settings used to decide whether the breaker is enabled and,
            on first use, to construct it. Loaded from the environment when
            omitted.

    Returns:
        The shared CircuitBreaker, or None when the feature is disabled.
    """
    global _circuit_breaker

    settings = ScraperConfig.from_env() if config is None else config
    if not settings.enable_circuit_breaker:
        return None

    if _circuit_breaker is None:
        _circuit_breaker = CircuitBreaker(
            failure_threshold=settings.circuit_breaker_failure_threshold,
            recovery_timeout=settings.circuit_breaker_recovery_timeout,
        )
    return _circuit_breaker
def reset_circuit_breaker() -> None:
    """Reset the shared circuit breaker, if one has been created."""
    breaker = _circuit_breaker
    if breaker is None:
        return
    breaker.reset()
def check_circuit_breaker(config: ScraperConfig | None = None) -> None:
    """Verify that the circuit breaker currently permits a request.

    Does nothing when the circuit breaker is disabled.

    Args:
        config: Configuration passed through to get_circuit_breaker().

    Raises:
        CircuitBreakerOpenError: If the circuit is open.
    """
    breaker = get_circuit_breaker(config)
    if breaker is None:
        return
    breaker.call()
def _build_base_params(
    *,
    channel: ListingType,
    page: int,
    page_size: int,
    radius: float,
    min_price: int,
    max_price: int,
    min_bedrooms: int,
    max_bedrooms: int,
    district: str,
) -> dict[str, str]:
    """Assemble the query parameters common to listing and probe searches.

    ``district`` must be a key of ``districts.get_districts()``; an unknown
    name raises ``KeyError``. All values are converted to strings.
    """
    location_identifier = districts.get_districts()[district]
    ordered_pairs = [
        ("locationIdentifier", location_identifier),
        ("channel", str(channel).upper()),
        ("page", str(page)),
        ("numberOfPropertiesPerPage", str(page_size)),
        ("radius", str(radius)),
        ("sortBy", "distance"),
        ("includeUnavailableProperties", "false"),
        ("minPrice", str(min_price)),
        ("maxPrice", str(max_price)),
        ("minBedrooms", str(min_bedrooms)),
        ("maxBedrooms", str(max_bedrooms)),
        ("apiApplication", "ANDROID"),
        ("appVersion", ANDROID_APP_VERSION_LISTING),
    ]
    return dict(ordered_pairs)
def _build_listing_params(
    *,
    page: int,
    channel: ListingType,
    min_bedrooms: int,
    max_bedrooms: int,
    radius: float,
    min_price: int,
    max_price: int,
    district: str,
    mustNewHome: bool,
    max_days_since_added: int,
    property_type: list[PropertyType],
    page_size: int,
    furnish_types: list[FurnishType],
) -> dict[str, str]:
    """Build the query parameters for a full listing search.

    Starts from ``_build_base_params`` and adds channel-specific filters:
    for BUY, ``dontShow``, ``propertyTypes``, ``maxDaysSinceAdded`` and
    ``mustHave``; for RENT, ``furnishTypes``.

    Raises:
        ValueError: On the BUY channel, ``max_days_since_added`` is neither
            ``None`` nor one of 1, 3, 7, 14.
    """
    allowed_days = (1, 3, 7, 14)

    params = _build_base_params(
        channel=channel,
        page=page,
        page_size=page_size,
        radius=radius,
        min_price=min_price,
        max_price=max_price,
        min_bedrooms=min_bedrooms,
        max_bedrooms=max_bedrooms,
        district=district,
    )

    if channel is ListingType.BUY:
        params["dontShow"] = "sharedOwnership,retirement"
        if property_type:
            params["propertyTypes"] = ",".join(property_type)
        # None means "no age filter": omit the parameter rather than sending
        # the literal string "None".
        if max_days_since_added is not None:
            if max_days_since_added not in allowed_days:
                raise ValueError(
                    f"Invalid max_days_since_added {max_days_since_added!r}; "
                    f"must be one of {list(allowed_days)} or None"
                )
            params["maxDaysSinceAdded"] = str(max_days_since_added)
        if mustNewHome:
            params["mustHave"] = "newHome"

    if channel is ListingType.RENT and furnish_types:
        params["furnishTypes"] = ",".join(furnish_types)

    return params
def _build_probe_params(
    *,
    channel: ListingType,
    min_bedrooms: int,
    max_bedrooms: int,
    radius: float,
    min_price: int,
    max_price: int,
    district: str,
    max_days_since_added: int,
    furnish_types: list[FurnishType],
) -> dict[str, str]:
    """Build the query parameters for a one-result probe request.

    Uses page 1 with a page size of 1. An unsupported
    ``max_days_since_added`` value is silently left out of the query.
    """
    probe = _build_base_params(
        channel=channel,
        page=1,
        page_size=1,  # Minimal page size for probing
        radius=radius,
        min_price=min_price,
        max_price=max_price,
        min_bedrooms=min_bedrooms,
        max_bedrooms=max_bedrooms,
        district=district,
    )

    if channel is ListingType.BUY:
        probe["dontShow"] = "sharedOwnership,retirement"
        # None is not a member, so it is skipped just like other unsupported values.
        if max_days_since_added in (1, 3, 7, 14):
            probe["maxDaysSinceAdded"] = str(max_days_since_added)

    if channel is ListingType.RENT and furnish_types:
        probe["furnishTypes"] = ",".join(furnish_types)

    return probe
async def _execute_api_request(
    *,
    url: str,
    params: dict[str, str],
    headers: dict[str, str],
    session: aiohttp.ClientSession | None,
    config: ScraperConfig,
    expect_data: bool = True,
    error_context: str = "",
) -> dict[str, Any]:
    """Perform one GET request with circuit-breaker and throttle checks.

    Args:
        url: Endpoint to call.
        params: Query-string parameters.
        headers: Request headers.
        session: Session to reuse; a temporary one is created when None.
        config: Scraper configuration (circuit breaker, slow-response threshold).
        expect_data: Passed to ``validate_response``.
        error_context: Prefix for the error raised on a non-200 status.

    Returns:
        The parsed JSON body of a 200 response.

    Raises:
        CircuitBreakerOpenError: The circuit breaker currently blocks requests.
        ThrottlingError: ``validate_response`` detected throttling.
        Exception: Any other failure, including a non-200 status; every
            failure is recorded on the circuit breaker before propagating.
    """
    check_circuit_breaker(config)
    cb = get_circuit_breaker(config)

    async def do_request(s: aiohttp.ClientSession) -> dict[str, Any]:
        # Monotonic clock: the measured latency cannot be distorted by
        # wall-clock adjustments.
        started = time.monotonic()
        try:
            async with s.get(url, params=params, headers=headers) as response:
                response_time = time.monotonic() - started
                body = await response.json() if response.status == 200 else None
                validate_response(
                    response,
                    response_time,
                    body,
                    config.slow_response_threshold,
                    expect_data=expect_data,
                )
                if response.status != 200:
                    # Include the status so contexts ending in "Status Code: "
                    # actually report it.
                    raise Exception(
                        f"{error_context}HTTP {response.status}. "
                        f"Failed due to: {await response.text()}"
                    )
        except Exception:
            # Throttling and all other failures count against the breaker alike.
            if cb is not None:
                cb.record_failure()
            raise
        else:
            if cb is not None:
                cb.record_success()
            return body  # type: ignore

    if session:
        return await do_request(session)
    async with aiohttp.ClientSession(trust_env=True) as new_session:
        return await do_request(new_session)
@retry(
    retry=retry_if_exception_type(ThrottlingError),
    wait=wait_exponential(multiplier=2, min=2, max=120),
    stop=stop_after_attempt(5),
)
async def detail_query(
    detail_id: int,
    session: aiohttp.ClientSession | None = None,
    config: ScraperConfig | None = None,
) -> dict[str, Any]:
    """Retrieve the full details of a single property.

    Retried on ThrottlingError with exponential back-off, up to 5 attempts.

    Args:
        detail_id: Identifier of the property to fetch.
        session: Session to reuse; a temporary one is created when omitted.
        config: Scraper configuration; read from the environment when omitted.

    Returns:
        The property details as a dictionary.

    Raises:
        CircuitBreakerOpenError: If the circuit breaker is open.
        ThrottlingError: If the request is throttled.
    """
    effective_config = ScraperConfig.from_env() if config is None else config
    query_params = {
        "apiApplication": "ANDROID",
        "appVersion": ANDROID_APP_VERSION,
    }
    return await _execute_api_request(
        url=f"{RIGHTMOVE_API_BASE}/property/{detail_id}",
        params=query_params,
        headers=DEFAULT_HEADERS,
        session=session,
        config=effective_config,
        expect_data=True,
        error_context=f"id: {detail_id}. Status Code: ",
    )
@retry(
    retry=retry_if_exception_type(ThrottlingError),
    wait=wait_exponential(multiplier=2, min=2, max=120),
    stop=stop_after_attempt(5),
)
async def listing_query(
    *,
    page: int,
    channel: ListingType,
    min_bedrooms: int,
    max_bedrooms: int,
    radius: float,
    min_price: int,
    max_price: int,
    district: str,
    mustNewHome: bool = False,
    max_days_since_added: int = 30,
    property_type: list[PropertyType] | None = None,
    page_size: int = 25,
    furnish_types: list[FurnishType] | None = None,
    session: aiohttp.ClientSession | None = None,
    config: ScraperConfig | None = None,
) -> dict[str, Any]:
    """Run one page of a property listing search.

    Retried on ThrottlingError with exponential back-off, up to 5 attempts.
    Data is only expected (for empty-response detection) on page 1.

    Args:
        page: Page number to fetch (1-indexed).
        channel: Listing type (BUY or RENT).
        min_bedrooms: Minimum number of bedrooms.
        max_bedrooms: Maximum number of bedrooms.
        radius: Search radius.
        min_price: Minimum price.
        max_price: Maximum price.
        district: District name, resolved through ``districts.get_districts()``.
        mustNewHome: Restrict to new homes (BUY only).
        max_days_since_added: Maximum listing age in days (BUY only).
        property_type: Property types to include (BUY only).
        page_size: Results per page (default 25).
        furnish_types: Furnish types to include (RENT only).
        session: Session to reuse; a temporary one is created when omitted.
        config: Scraper configuration; read from the environment when omitted.

    Returns:
        API response as a dictionary.

    Raises:
        CircuitBreakerOpenError: If the circuit breaker is open.
        ThrottlingError: If the request is throttled.
    """
    effective_config = ScraperConfig.from_env() if config is None else config
    wanted_types = [] if property_type is None else property_type
    wanted_furnish = [] if furnish_types is None else furnish_types

    query_params = _build_listing_params(
        page=page,
        channel=channel,
        min_bedrooms=min_bedrooms,
        max_bedrooms=max_bedrooms,
        radius=radius,
        min_price=min_price,
        max_price=max_price,
        district=district,
        mustNewHome=mustNewHome,
        max_days_since_added=max_days_since_added,
        property_type=wanted_types,
        page_size=page_size,
        furnish_types=wanted_furnish,
    )
    is_first_page = page == 1
    return await _execute_api_request(
        url=PROPERTY_LISTING_ENDPOINT,
        params=query_params,
        headers=LISTING_HEADERS,
        session=session,
        config=effective_config,
        expect_data=is_first_page,
    )
@retry(
    retry=retry_if_exception_type(ThrottlingError),
    wait=wait_exponential(multiplier=2, min=2, max=60),
    stop=stop_after_attempt(5),
)
async def probe_query(
    *,
    session: aiohttp.ClientSession,
    channel: ListingType,
    min_bedrooms: int,
    max_bedrooms: int,
    radius: float,
    min_price: int,
    max_price: int,
    district: str,
    max_days_since_added: int = 30,
    furnish_types: list[FurnishType] | None = None,
    config: ScraperConfig | None = None,
) -> dict[str, Any]:
    """Ask the API how many results a search has without fetching them all.

    Issues a minimal request (page 1, page size 1) so that
    totalAvailableResults can be read cheaply. Retried on ThrottlingError
    with exponential back-off, up to 5 attempts.

    Args:
        session: aiohttp session used for the request.
        channel: Listing type (BUY or RENT).
        min_bedrooms: Minimum number of bedrooms.
        max_bedrooms: Maximum number of bedrooms.
        radius: Search radius.
        min_price: Minimum price.
        max_price: Maximum price.
        district: District name, resolved through ``districts.get_districts()``.
        max_days_since_added: Maximum listing age in days (BUY only).
        furnish_types: Furnish types to include (RENT only).
        config: Scraper configuration; read from the environment when omitted.

    Returns:
        API response containing totalAvailableResults.

    Raises:
        CircuitBreakerOpenError: If the circuit breaker is open.
        ThrottlingError: If the request is throttled.
    """
    effective_config = ScraperConfig.from_env() if config is None else config
    wanted_furnish = [] if furnish_types is None else furnish_types

    query_params = _build_probe_params(
        channel=channel,
        min_bedrooms=min_bedrooms,
        max_bedrooms=max_bedrooms,
        radius=radius,
        min_price=min_price,
        max_price=max_price,
        district=district,
        max_days_since_added=max_days_since_added,
        furnish_types=wanted_furnish,
    )
    return await _execute_api_request(
        url=PROPERTY_LISTING_ENDPOINT,
        params=query_params,
        headers=LISTING_HEADERS,
        session=session,
        config=effective_config,
        expect_data=False,
        error_context="Probe failed: ",
    )

47
rec/route_serializer.py Normal file
View file

@ -0,0 +1,47 @@
import dataclasses
import json
from typing import List
from models.listing import DestinationMode, Route, RouteLegStep
from rec import routing
class RouteSerializer:
    """Convert routing results to and from a JSON string.

    Format: a JSON object whose keys are JSON-encoded ``DestinationMode``
    dicts and whose values are lists of JSON-encoded ``Route`` dicts.
    """

    @staticmethod
    def serialize(routing_info: dict[DestinationMode, list[Route]]) -> str:
        """Encode every destination and all of its routes as one JSON string."""
        return json.dumps(
            {
                json.dumps(dataclasses.asdict(destination_mode)): [
                    json.dumps(dataclasses.asdict(route)) for route in routes
                ]
                for destination_mode, routes in routing_info.items()
            }
        )

    @staticmethod
    def deserialize(route_data_json: str) -> dict[DestinationMode, List[Route]]:
        """Rebuild the mapping produced by ``serialize``.

        Every stored route is restored, not only the first one, and a
        destination with an empty route list yields an empty list.
        """
        destination_routes = {}
        for destination_mode_str, routes_json in json.loads(route_data_json).items():
            parsed_destination = json.loads(destination_mode_str)
            destination_mode = DestinationMode(
                destination_address=parsed_destination["destination_address"],
                travel_mode=routing.TravelMode(parsed_destination["travel_mode"]),
            )
            destination_routes[destination_mode] = [
                RouteSerializer._parse_route(json.loads(route_json))
                for route_json in routes_json
            ]
        return destination_routes

    @staticmethod
    def _parse_route(parsed_route: dict) -> Route:
        """Build one ``Route`` from its decoded dict form."""
        return Route(
            legs=[
                RouteLegStep(
                    distance_meters=step["distance_meters"],
                    duration_s=step["duration_s"],
                    travel_mode=routing.TravelMode(step["travel_mode"]),
                )
                for step in parsed_route["legs"]
            ],
            distance_meters=parsed_route["distance_meters"],
            duration_s=int(parsed_route["duration_s"]),
        )

63
rec/routing.py Normal file
View file

@ -0,0 +1,63 @@
import enum
import os
from typing import Any
import requests
from rec.utils import nextMonday
from rec.exceptions import RoutingApiError
# Endpoint that transit_route() POSTs to.
ROUTES_API_URL = "https://routes.googleapis.com/directions/v2:computeRoutes"
# Name of the environment variable holding the API key; it must be set.
API_KEY_ENVIRONMENT_VARIABLE = "ROUTING_API_KEY"
# Comma-separated response fields requested via the X-Goog-FieldMask header.
ROUTES_FIELD_MASK = (
    "routes.distanceMeters,"
    "routes.duration,"
    "routes.staticDuration,"
    "routes.legs.steps.distanceMeters,"
    "routes.legs.steps.staticDuration,"
    "routes.legs.steps.travelMode"
)
class TravelMode(enum.StrEnum):
TRANSIT = "TRANSIT"
BICYCLE = "BICYCLE"
WALK = "WALK"
DRIVE = "DRIVE"
def transit_route(
    origin_lat: float,
    origin_lon: float,
    dest_address: str,
    travel_mode: TravelMode,
    compute_alternative_routes: bool = True,
    timeout: float = 30.0,
) -> dict[str, Any]:
    """Request routes from a coordinate to an address via the Google Routes API.

    The departure time is the value returned by ``nextMonday()``.

    Args:
        origin_lat: Latitude of the origin.
        origin_lon: Longitude of the origin.
        dest_address: Free-text destination address.
        travel_mode: Mode of travel to request.
        compute_alternative_routes: Whether to ask for alternative routes.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON response of a successful (HTTP 200) call.

    Raises:
        KeyError: The API-key environment variable is not set.
        RoutingApiError: The API answered with a non-200 status.
        requests.RequestException: Network failure or timeout.
    """
    monday9am = nextMonday()
    # must be set
    api_key = os.environ[API_KEY_ENVIRONMENT_VARIABLE]
    header = {
        "X-Goog-Api-Key": api_key,
        "Content-Type": "application/json",
        "X-Goog-FieldMask": ROUTES_FIELD_MASK,
    }
    body = {
        "origin": {
            "location": {"latLng": {"latitude": origin_lat, "longitude": origin_lon}}
        },
        "destination": {"address": dest_address},
        "travelMode": travel_mode.value,
        "departureTime": monday9am.strftime("%Y-%m-%dT%H:%M:%S.%fZ"),
        "computeAlternativeRoutes": compute_alternative_routes,
        "languageCode": "en-US",
        "units": "METRIC",
    }
    # Without a timeout, requests can block indefinitely on a stalled connection.
    r = requests.post(ROUTES_API_URL, json=body, headers=header, timeout=timeout)
    if r.status_code == 200:
        return r.json()

    # Error bodies are not guaranteed to be JSON; keep the raw text so the
    # real failure is not hidden behind a decode error.
    try:
        error_body = r.json()
    except ValueError:
        error_body = {"raw": r.text}
    raise RoutingApiError(r.status_code, error_body)

232
rec/throttle_detector.py Normal file
View file

@ -0,0 +1,232 @@
"""Throttling detection and metrics for Rightmove API."""
from __future__ import annotations
import time
from dataclasses import dataclass, field
from typing import Any
import aiohttp
from rec.exceptions import (
InvalidResponseError,
IPBlockedError,
RateLimitError,
ServiceUnavailableError,
SlowResponseError,
UnexpectedEmptyResponseError,
)
@dataclass
class ThrottleMetrics:
    """Tracks throttling events and metrics.

    ``total_requests`` only counts responses whose timing was recorded
    (``record_request`` and ``record_slow_response``). Responses recorded via
    the 429/503/403, empty or invalid recorders are counted separately, so
    ``throttle_rate`` uses the sum of all of them as its denominator.

    Attributes:
        rate_limit_count: Number of HTTP 429 errors.
        service_unavailable_count: Number of HTTP 503 errors.
        ip_blocked_count: Number of HTTP 403 errors.
        slow_response_count: Number of slow responses.
        empty_response_count: Number of unexpected empty responses.
        invalid_response_count: Number of invalid/error responses.
        total_requests: Number of responses with a recorded response time.
        total_response_time: Cumulative response time in seconds.
    """

    rate_limit_count: int = 0
    service_unavailable_count: int = 0
    ip_blocked_count: int = 0
    slow_response_count: int = 0
    empty_response_count: int = 0
    invalid_response_count: int = 0
    total_requests: int = 0
    total_response_time: float = 0.0
    _start_time: float = field(default_factory=time.time)

    def record_rate_limit(self) -> None:
        """Record a rate limit error (HTTP 429)."""
        self.rate_limit_count += 1

    def record_service_unavailable(self) -> None:
        """Record a service unavailable error (HTTP 503)."""
        self.service_unavailable_count += 1

    def record_ip_blocked(self) -> None:
        """Record an IP blocked error (HTTP 403)."""
        self.ip_blocked_count += 1

    def record_slow_response(self, response_time: float) -> None:
        """Record a slow response, including its timing.

        Args:
            response_time: Response time in seconds.
        """
        self.slow_response_count += 1
        self.total_response_time += response_time
        self.total_requests += 1

    def record_empty_response(self) -> None:
        """Record an unexpected empty response."""
        self.empty_response_count += 1

    def record_invalid_response(self) -> None:
        """Record an invalid or error response."""
        self.invalid_response_count += 1

    def record_request(self, response_time: float) -> None:
        """Record a successful request.

        Args:
            response_time: Response time in seconds.
        """
        self.total_requests += 1
        self.total_response_time += response_time

    @property
    def average_response_time(self) -> float:
        """Average response time in seconds over timed responses."""
        if self.total_requests == 0:
            return 0.0
        return self.total_response_time / self.total_requests

    @property
    def total_throttling_events(self) -> int:
        """Total number of throttling events (429, 503, 403 and slow)."""
        return (
            self.rate_limit_count
            + self.service_unavailable_count
            + self.ip_blocked_count
            + self.slow_response_count
        )

    @property
    def _observed_responses(self) -> int:
        """Every recorded response, counted once.

        Slow responses are already part of ``total_requests``; the other
        recorders do not touch it, so their counters are added here.
        """
        return (
            self.total_requests
            + self.rate_limit_count
            + self.service_unavailable_count
            + self.ip_blocked_count
            + self.empty_response_count
            + self.invalid_response_count
        )

    @property
    def throttle_rate(self) -> float:
        """Percentage (0-100) of recorded responses that were throttled."""
        observed = self._observed_responses
        if observed == 0:
            return 0.0
        return (self.total_throttling_events / observed) * 100

    @property
    def elapsed_time(self) -> float:
        """Time elapsed since metrics started tracking."""
        return time.time() - self._start_time

    def summary(self) -> str:
        """Generate a multi-line, human-readable summary of the metrics."""
        return (
            f"Throttle Metrics Summary:\n"
            f" Total Requests: {self.total_requests}\n"
            f" Total Throttling Events: {self.total_throttling_events}\n"
            f" Throttle Rate: {self.throttle_rate:.2f}%\n"
            f" Rate Limit (429): {self.rate_limit_count}\n"
            f" Service Unavailable (503): {self.service_unavailable_count}\n"
            f" IP Blocked (403): {self.ip_blocked_count}\n"
            f" Slow Responses: {self.slow_response_count}\n"
            f" Empty Responses: {self.empty_response_count}\n"
            f" Invalid Responses: {self.invalid_response_count}\n"
            f" Average Response Time: {self.average_response_time:.2f}s\n"
            f" Elapsed Time: {self.elapsed_time:.2f}s"
        )
# Global metrics instance; created on first use by get_throttle_metrics()
# and replaced by reset_throttle_metrics().
_global_metrics: ThrottleMetrics | None = None
def get_throttle_metrics() -> ThrottleMetrics:
    """Return the shared ThrottleMetrics, creating it on first access.

    Returns:
        The process-wide ThrottleMetrics instance.
    """
    global _global_metrics
    current = _global_metrics
    if current is None:
        current = ThrottleMetrics()
        _global_metrics = current
    return current
def reset_throttle_metrics() -> None:
    """Discard the shared metrics and start over with a fresh instance."""
    global _global_metrics
    fresh = ThrottleMetrics()
    _global_metrics = fresh
def validate_response(
    response: aiohttp.ClientResponse,
    response_time: float,
    response_body: dict[str, Any] | None,
    slow_response_threshold: float,
    expect_data: bool = True,
) -> None:
    """Validate an API response and raise appropriate exceptions for throttling.

    Checks run in order: HTTP status (429, 503, 403), response time, then the
    body. The first failing check records its metric and raises; if all pass,
    the request is recorded as successful.

    Args:
        response: The aiohttp response object.
        response_time: Time taken for the request in seconds.
        response_body: Parsed JSON response body (if available).
        slow_response_threshold: Threshold in seconds for slow responses.
        expect_data: Whether we expect data in the response.

    Raises:
        RateLimitError: If HTTP 429 is returned.
        ServiceUnavailableError: If HTTP 503 is returned.
        IPBlockedError: If HTTP 403 is returned.
        SlowResponseError: If response time exceeds threshold.
        UnexpectedEmptyResponseError: If response is empty when data is expected.
        InvalidResponseError: If response contains error messages.
    """
    metrics = get_throttle_metrics()

    # Check HTTP status codes
    if response.status == 429:
        metrics.record_rate_limit()
        raise RateLimitError(
            f"Rate limit exceeded (HTTP 429). "
            f"Response time: {response_time:.2f}s"
        )
    if response.status == 503:
        metrics.record_service_unavailable()
        raise ServiceUnavailableError(
            f"Service unavailable (HTTP 503). "
            f"Response time: {response_time:.2f}s"
        )
    if response.status == 403:
        metrics.record_ip_blocked()
        raise IPBlockedError(
            f"Access forbidden, possible IP block (HTTP 403). "
            f"Response time: {response_time:.2f}s"
        )

    # Check response time
    if response_time > slow_response_threshold:
        metrics.record_slow_response(response_time)
        raise SlowResponseError(
            f"Slow response detected: {response_time:.2f}s "
            f"(threshold: {slow_response_threshold}s)"
        )

    # Check response body if available
    if response_body is not None:
        # Check for error messages
        if "error" in response_body or "GENERIC_ERROR" in str(response_body):
            metrics.record_invalid_response()
            raise InvalidResponseError(
                f"Error in response body: {response_body}"
            )

        # Check for unexpected empty responses
        if expect_data:
            # `or` also covers keys that are present but null, which would
            # otherwise raise TypeError in len() or the comparison below.
            properties = response_body.get("properties") or []
            total_results = response_body.get("totalAvailableResults") or 0
            # If we expect data but got none (and total shows there should be some)
            if total_results > 0 and len(properties) == 0:
                metrics.record_empty_response()
                raise UnexpectedEmptyResponseError(
                    f"Expected data but got empty response. "
                    f"Total available: {total_results}"
                )

    # Record successful request
    metrics.record_request(response_time)

21
rec/utils.py Normal file
View file

@ -0,0 +1,21 @@
from datetime import datetime, timedelta, timezone
def nextMonday():
"""
I think this function doesnt work when the day is monday itself.
Returns:
_type_: _description_
"""
now = datetime.now(timezone.utc)
days_until_monday = (0 - now.weekday() + 7) % 7
monday = now + timedelta(days=days_until_monday)
monday_9am = monday.replace(
hour=9, minute=0, second=0, microsecond=0, tzinfo=timezone.utc
)
return monday_9am
# Manual check: print the computed timestamp when run as a script.
if __name__ == "__main__":
    print(nextMonday())