Flatten repo structure: move crawler/ to root, remove vqa/ and immoweb/
The crawler subdirectory was the only active project. Moving it to the repo root simplifies paths and removes the unnecessary nesting. The vqa/ and immoweb/ directories were legacy/unused and have been removed. Updated .drone.yml, .gitignore, .claude/ docs, and skills to reflect the new flat structure.
This commit is contained in:
parent
e2247be700
commit
eafbc1ac52
221 changed files with 70 additions and 146140 deletions
0
rec/__init__.py
Normal file
0
rec/__init__.py
Normal file
137
rec/circuit_breaker.py
Normal file
137
rec/circuit_breaker.py
Normal file
|
|
@ -0,0 +1,137 @@
|
|||
"""Circuit breaker pattern for protecting against cascading failures."""
|
||||
from __future__ import annotations
|
||||
|
||||
import enum
|
||||
import logging
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
|
||||
from rec.exceptions import CircuitBreakerOpenError
|
||||
|
||||
logger = logging.getLogger("uvicorn.error")
|
||||
|
||||
|
||||
class CircuitState(enum.Enum):
    """Lifecycle states of a circuit breaker.

    Members:
        CLOSED: Normal operation.
        OPEN: Too many failures, blocking requests.
        HALF_OPEN: Testing if the service recovered.
    """

    CLOSED = "closed"
    OPEN = "open"
    HALF_OPEN = "half_open"
|
||||
|
||||
|
||||
@dataclass
class CircuitBreaker:
    """Failure-counting gate that blocks requests while a service is unhealthy.

    State machine:
    - CLOSED: requests pass; consecutive failures are counted.
    - OPEN: entered once ``failure_count`` reaches ``failure_threshold``;
      ``call()`` raises until ``recovery_timeout`` seconds have passed since
      the last recorded failure.
    - HALF_OPEN: entered by the first ``call()`` after the timeout. A success
      closes the circuit, a failure re-opens it. Note that ``call()`` itself
      does not limit how many requests pass while in this state.

    Attributes:
        failure_threshold: Number of consecutive failures before opening.
        recovery_timeout: Seconds to wait before attempting half-open state.
        state: Current circuit state.
        failure_count: Count of consecutive failures.
        last_failure_time: ``time.time()`` of the last failure (0.0 if none).
        last_state_change: ``time.time()`` of the last state change; always
            overwritten with the construction time in ``__post_init__``.
    """

    failure_threshold: int
    recovery_timeout: float
    state: CircuitState = CircuitState.CLOSED
    failure_count: int = 0
    last_failure_time: float = 0.0
    last_state_change: float = 0.0

    def __post_init__(self) -> None:
        """Stamp the creation time as the first state change."""
        self.last_state_change = time.time()

    def _set_state(self, new_state: CircuitState) -> None:
        """Store ``new_state`` and refresh ``last_state_change``."""
        self.state = new_state
        self.last_state_change = time.time()

    def call(self) -> None:
        """Gate a request: return silently if allowed, raise if blocked.

        Raises:
            CircuitBreakerOpenError: If the circuit is OPEN and the recovery
                timeout has not yet elapsed since the last failure.
        """
        now = time.time()

        # CLOSED and HALF_OPEN both let the request through.
        if self.state != CircuitState.OPEN:
            return

        elapsed = now - self.last_failure_time
        if elapsed >= self.recovery_timeout:
            # Timeout served: let this request probe the service.
            self._transition_to_half_open()
            return

        remaining = self.recovery_timeout - elapsed
        raise CircuitBreakerOpenError(
            f"Circuit breaker is open. Waiting {remaining:.1f}s before retry."
        )

    def record_success(self) -> None:
        """Register a successful request and clear the failure streak."""
        if self.is_half_open:
            # The probe request succeeded, so the service is back.
            self._transition_to_closed()
        self.failure_count = 0

    def record_failure(self) -> None:
        """Register a failed request, opening the circuit when warranted."""
        self.failure_count += 1
        self.last_failure_time = time.time()

        probe_failed = self.is_half_open
        threshold_hit = self.is_closed and self.failure_count >= self.failure_threshold
        if probe_failed or threshold_hit:
            self._transition_to_open()

    def _transition_to_open(self) -> None:
        """Switch to OPEN and log a warning."""
        self._set_state(CircuitState.OPEN)
        message = (
            f"Circuit breaker OPENED after {self.failure_count} consecutive failures. "
            f"Will retry in {self.recovery_timeout}s"
        )
        logger.warning(message)

    def _transition_to_half_open(self) -> None:
        """Switch to HALF_OPEN and log it."""
        self._set_state(CircuitState.HALF_OPEN)
        logger.info("Circuit breaker entering HALF_OPEN state, testing service recovery")

    def _transition_to_closed(self) -> None:
        """Switch to CLOSED, clear the failure streak and log it."""
        self._set_state(CircuitState.CLOSED)
        self.failure_count = 0
        logger.info("Circuit breaker CLOSED, service recovered")

    def reset(self) -> None:
        """Force the breaker back to a pristine CLOSED state."""
        self._set_state(CircuitState.CLOSED)
        self.failure_count = 0
        self.last_failure_time = 0.0
        logger.info("Circuit breaker manually reset to CLOSED state")

    @property
    def is_open(self) -> bool:
        """True while the circuit is OPEN."""
        return self.state == CircuitState.OPEN

    @property
    def is_closed(self) -> bool:
        """True while the circuit is CLOSED."""
        return self.state == CircuitState.CLOSED

    @property
    def is_half_open(self) -> bool:
        """True while the circuit is HALF_OPEN."""
        return self.state == CircuitState.HALF_OPEN
|
||||
43
rec/districts.py
Normal file
43
rec/districts.py
Normal file
|
|
@ -0,0 +1,43 @@
|
|||
def get_districts() -> dict[str, str]:
    """Return a new dict mapping district names to ``REGION^<code>`` identifiers.

    The mapping is rebuilt on every call, so callers may mutate the result
    without affecting later calls. Insertion order follows the table below.
    """
    region_codes = (
        ("Barking and Dagenham", 61400),
        ("Barnet", 93929),
        ("Bexley", 93932),
        ("Brent", 93935),
        ("Bromley", 93938),
        ("Camden", 93941),
        ("City of London", 61224),
        ("Croydon", 93944),
        ("Ealing", 93947),
        ("Enfield", 93950),
        ("Greenwich", 61226),
        ("Hackney", 93953),
        ("Hammersmith and Fulham", 61407),
        ("Haringey", 61227),
        ("Harrow", 93956),
        ("Havering", 61228),
        ("Hillingdon", 93959),
        ("Hounslow", 93962),
        ("Islington", 93965),
        ("London", 87490),
        ("Kensington and Chelsea", 61229),
        ("Kingston upon Thames", 93968),
        ("Lambeth", 93971),
        ("Lewisham", 61413),
        ("Merton", 61414),
        ("Newham", 61231),
        ("Redbridge", 61537),
        ("Richmond upon Thames", 61415),
        ("Southwark", 61518),
        ("Stratford", 85312),
        ("Sutton", 93974),
        ("Tower Hamlets", 61417),
        ("Waltham Forest", 61232),
        ("Wandsworth", 93977),
        ("Westminster", 93980),
    )
    return {name: f"REGION^{code}" for name, code in region_codes}
|
||||
|
||||
|
||||
def get_district_by_name(name: str) -> str | None:
    """Look up the region identifier for ``name``; None when it is unknown."""
    try:
        return get_districts()[name]
    except KeyError:
        return None
|
||||
85
rec/exceptions.py
Normal file
85
rec/exceptions.py
Normal file
|
|
@ -0,0 +1,85 @@
|
|||
"""Custom exceptions for Rightmove API errors."""
|
||||
|
||||
|
||||
class RightmoveAPIError(Exception):
    """Root of the exception hierarchy for Rightmove API errors."""
|
||||
|
||||
|
||||
class ThrottlingError(RightmoveAPIError):
    """Common parent for every throttling-related error.

    Signals that Rightmove is limiting our requests and that the caller
    should back off.
    """
|
||||
|
||||
|
||||
class RateLimitError(ThrottlingError):
    """HTTP 429 (Too Many Requests): Rightmove is explicitly rate limiting us."""
|
||||
|
||||
|
||||
class ServiceUnavailableError(ThrottlingError):
    """HTTP 503 (Service Unavailable).

    Rightmove's service is temporarily unavailable, possibly because it is
    overloaded.
    """
|
||||
|
||||
|
||||
class IPBlockedError(ThrottlingError):
    """HTTP 403 (Forbidden): our IP may be blocked or blacklisted by Rightmove."""
|
||||
|
||||
|
||||
class SlowResponseError(ThrottlingError):
    """The response took longer than the configured threshold.

    A very slow API is treated as a sign of potential throttling or overload.
    """
|
||||
|
||||
|
||||
class UnexpectedEmptyResponseError(RightmoveAPIError):
    """An empty response arrived although data was expected."""
|
||||
|
||||
|
||||
class InvalidResponseError(RightmoveAPIError):
    """The response carries error messages or otherwise invalid data."""
|
||||
|
||||
|
||||
class CircuitBreakerOpenError(RightmoveAPIError):
    """Raised when the circuit breaker is open and blocking requests.

    The breaker has seen too many failures and refuses further requests so
    that the service gets a chance to recover.
    """
|
||||
|
||||
|
||||
class RoutingApiError(Exception):
    """Error from the Google Routes API.

    Attributes:
        status_code: HTTP status code passed by the raiser.
        response_body: Response body passed by the raiser.
    """

    def __init__(self, status_code: int, response_body: dict) -> None:
        """Store both values and build the exception message from them."""
        message = f"Routes API returned status {status_code}: {response_body}"
        super().__init__(message)
        self.status_code = status_code
        self.response_body = response_body
|
||||
67
rec/floorplan.py
Normal file
67
rec/floorplan.py
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
import logging
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from PIL import Image
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
MIN_SQM = 30
|
||||
MAX_SQM = 160
|
||||
|
||||
|
||||
def inference(image_path: str | Path) -> tuple[str, Any]:
    """Run the ``google/deplot`` Pix2Struct model on a floor plan image.

    The processor and the model are loaded with ``from_pretrained`` on every
    call.

    Args:
        image_path: Path of the image to analyse.

    Returns:
        A tuple ``(output, predictions)``: the decoded text of the first
        prediction, and the raw value returned by ``model.generate``.
    """
    # Imported inside the function, so transformers is only needed when
    # inference actually runs.
    from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration

    # Open the image first so that an unreadable path fails before the model
    # is loaded; the context manager closes the file handle, which the
    # previous version left open.
    with Image.open(image_path) as image:
        question = "How many living rooms are displayed on this floor plan?"  # not sure if it even has an effect
        processor = Pix2StructProcessor.from_pretrained("google/deplot")
        model = Pix2StructForConditionalGeneration.from_pretrained("google/deplot")

        inputs = processor(images=image, text=question, return_tensors="pt")

    predictions = model.generate(**inputs, max_new_tokens=512)
    output = processor.decode(predictions[0], skip_special_tokens=True)

    return output, predictions
|
||||
|
||||
|
||||
def extract_total_sqm(input_str: str) -> float | None:
    """Extract the largest plausible square-metre figure from free text.

    The text is lower-cased and scanned for numbers followed by ``sqm``,
    ``sq m``, ``sq.m`` or ``sq. m``. Only values strictly between ``MIN_SQM``
    and ``MAX_SQM`` are considered.

    Args:
        input_str: Text to scan, e.g. OCR output or the output of inference().

    Returns:
        The largest in-range value, or None when nothing qualifies.
    """
    # Note: can be used on the output of inference() to extract sqm from model predictions.
    #
    # (?<!\d)(?<!\d,) - a number must start at the beginning of a digit run and
    #                   must not be the tail of a thousands-separated figure,
    #                   so "1,100 sq m" is no longer misread as 100.
    # sq\.?           - the dot is literal and optional; the previous pattern
    #                   left it unescaped, so it matched any character.
    sqm_pattern = r"(?<!\d)(?<!\d,)(\d+\.?\d*) ?sq\.? ?m"
    candidates = (float(number) for number in re.findall(sqm_pattern, input_str.lower()))
    plausible = [value for value in candidates if MIN_SQM < value < MAX_SQM]
    return max(plausible) if plausible else None
|
||||
|
||||
|
||||
def improve_img_for_ocr(img: Image.Image) -> Image.Image:
    """Return a binarised, slightly enlarged copy of ``img`` for a second OCR pass.

    Steps: convert to mode "L", scale both axes by 1.2 with cubic
    interpolation, then apply a Gaussian adaptive threshold (block size 11,
    constant 2, max value 255).
    """
    gray = np.array(img.convert("L"))
    enlarged = cv2.resize(
        gray,
        None,
        fx=1.2,
        fy=1.2,
        interpolation=cv2.INTER_CUBIC,
    )
    binarised = cv2.adaptiveThreshold(
        enlarged,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        11,
        2,
    )
    return Image.fromarray(binarised)
|
||||
|
||||
|
||||
def calculate_ocr(image_path: str | Path) -> tuple[float | None, str]:
|
||||
import pytesseract
|
||||
|
||||
path = Path(image_path)
|
||||
if not path.exists():
|
||||
raise FileNotFoundError(f"Image not found: {image_path}")
|
||||
|
||||
img = Image.open(path)
|
||||
text = pytesseract.image_to_string(img)
|
||||
estimated_sqm = extract_total_sqm(text)
|
||||
if estimated_sqm is None:
|
||||
improved_img = improve_img_for_ocr(img)
|
||||
text2 = pytesseract.image_to_string(improved_img)
|
||||
estimated_sqm2 = extract_total_sqm(text2)
|
||||
logger.debug(f"before: {estimated_sqm} after: {estimated_sqm2} - {image_path}")
|
||||
return estimated_sqm2, text2
|
||||
|
||||
return estimated_sqm, text
|
||||
507
rec/query.py
Normal file
507
rec/query.py
Normal file
|
|
@ -0,0 +1,507 @@
|
|||
import enum
|
||||
import logging
|
||||
import time
|
||||
from typing import Any
|
||||
from contextlib import asynccontextmanager
|
||||
from collections.abc import AsyncIterator
|
||||
|
||||
import aiohttp
|
||||
from models.listing import FurnishType, ListingType
|
||||
from rec import districts
|
||||
from rec.exceptions import (
|
||||
CircuitBreakerOpenError,
|
||||
ThrottlingError,
|
||||
)
|
||||
from rec.throttle_detector import get_throttle_metrics, validate_response
|
||||
from rec.circuit_breaker import CircuitBreaker
|
||||
from tenacity import (
|
||||
retry,
|
||||
retry_if_exception_type,
|
||||
stop_after_attempt,
|
||||
wait_exponential,
|
||||
wait_random,
|
||||
)
|
||||
from config.scraper_config import ScraperConfig
|
||||
|
||||
logger = logging.getLogger("uvicorn.error")
|
||||
|
||||
# Global circuit breaker instance
|
||||
_circuit_breaker: CircuitBreaker | None = None
|
||||
|
||||
# API constants
|
||||
ANDROID_APP_VERSION = "3.70.0"
|
||||
ANDROID_APP_VERSION_LISTING = "4.28.0"
|
||||
RIGHTMOVE_API_BASE = "https://api.rightmove.co.uk/api"
|
||||
PROPERTY_LISTING_ENDPOINT = f"{RIGHTMOVE_API_BASE}/property-listing"
|
||||
|
||||
DEFAULT_HEADERS = {
|
||||
"Host": "api.rightmove.co.uk",
|
||||
"User-Agent": "okhttp/4.12.0",
|
||||
"Connection": "keep-alive",
|
||||
}
|
||||
|
||||
LISTING_HEADERS = {
|
||||
**DEFAULT_HEADERS,
|
||||
"Accept-Encoding": "gzip, deflate, br",
|
||||
}
|
||||
|
||||
|
||||
class PropertyType(enum.StrEnum):
|
||||
BUNGALOW = "bungalow"
|
||||
DETACHED = "detached"
|
||||
FLAT = "flat"
|
||||
LAND = "land"
|
||||
PARK_HOME = "park-home"
|
||||
SEMI_DETACHED = "semi-detached"
|
||||
TERRACED = "terraced"
|
||||
|
||||
|
||||
@asynccontextmanager
async def create_session(
    config: ScraperConfig | None = None,
) -> AsyncIterator[aiohttp.ClientSession]:
    """Create an aiohttp session with optional proxy support.

    The session is created with ``trust_env=True`` and ``DEFAULT_HEADERS``,
    and is closed when the context exits.

    Args:
        config: Scraper configuration. Loads from environment if not provided.

    Yields:
        Configured aiohttp ClientSession.

    Raises:
        ImportError: If ``config.proxy_url`` is set but aiohttp-socks is not
            installed.
    """
    if config is None:
        config = ScraperConfig.from_env()

    connector = None
    if config.proxy_url:
        # Only the import is guarded: an ImportError raised while building
        # the connector must not be reported as a missing package.
        try:
            from aiohttp_socks import ProxyConnector
        except ImportError as err:
            raise ImportError(
                "aiohttp-socks is required for proxy support. "
                "Install with: pip install aiohttp-socks"
            ) from err

        connector = ProxyConnector.from_url(config.proxy_url)

    async with aiohttp.ClientSession(
        trust_env=True,
        connector=connector,
        headers=DEFAULT_HEADERS,
    ) as session:
        yield session
|
||||
|
||||
|
||||
def get_circuit_breaker(config: ScraperConfig | None = None) -> CircuitBreaker | None:
    """Return the module-wide circuit breaker, creating it on first use.

    The breaker is built once from the first configuration that enables it;
    later calls return that same instance whatever thresholds they carry.

    Args:
        config: Configuration for initializing the circuit breaker. Loaded
            from the environment when omitted.

    Returns:
        CircuitBreaker instance if enabled, None otherwise.
    """
    global _circuit_breaker

    settings = ScraperConfig.from_env() if config is None else config
    if not settings.enable_circuit_breaker:
        return None

    if _circuit_breaker is not None:
        return _circuit_breaker

    _circuit_breaker = CircuitBreaker(
        failure_threshold=settings.circuit_breaker_failure_threshold,
        recovery_timeout=settings.circuit_breaker_recovery_timeout,
    )
    return _circuit_breaker
|
||||
|
||||
|
||||
def reset_circuit_breaker() -> None:
    """Reset the global circuit breaker; a no-op if none has been created."""
    breaker = _circuit_breaker
    if breaker is None:
        return
    breaker.reset()
|
||||
|
||||
|
||||
def check_circuit_breaker(config: ScraperConfig | None = None) -> None:
    """Ask the circuit breaker whether a request may proceed.

    Returns silently when the breaker is disabled or lets the request through.

    Args:
        config: Configuration for the circuit breaker.

    Raises:
        CircuitBreakerOpenError: If the circuit is open.
    """
    breaker = get_circuit_breaker(config)
    if breaker is None:
        return
    breaker.call()
|
||||
|
||||
|
||||
def _build_base_params(
    *,
    channel: ListingType,
    page: int,
    page_size: int,
    radius: float,
    min_price: int,
    max_price: int,
    min_bedrooms: int,
    max_bedrooms: int,
    district: str,
) -> dict[str, str]:
    """Build the query parameters shared by listing and probe requests.

    Every value is converted to a string. ``district`` must be a key of
    ``districts.get_districts()``.

    Raises:
        KeyError: If ``district`` is not a known district name.
    """
    location_id = districts.get_districts()[district]

    params: dict[str, str] = {
        "locationIdentifier": location_id,
        "channel": str(channel).upper(),
        "page": str(page),
        "numberOfPropertiesPerPage": str(page_size),
        "radius": str(radius),
        "sortBy": "distance",
        "includeUnavailableProperties": "false",
    }
    params["minPrice"] = str(min_price)
    params["maxPrice"] = str(max_price)
    params["minBedrooms"] = str(min_bedrooms)
    params["maxBedrooms"] = str(max_bedrooms)
    params["apiApplication"] = "ANDROID"
    params["appVersion"] = ANDROID_APP_VERSION_LISTING
    return params
|
||||
|
||||
|
||||
def _build_listing_params(
    *,
    page: int,
    channel: ListingType,
    min_bedrooms: int,
    max_bedrooms: int,
    radius: float,
    min_price: int,
    max_price: int,
    district: str,
    mustNewHome: bool,
    max_days_since_added: int,
    property_type: list[PropertyType],
    page_size: int,
    furnish_types: list[FurnishType],
) -> dict[str, str]:
    """Build the query parameters for a full listing search.

    Starts from ``_build_base_params`` and adds channel-specific filters:
    BUY gets ``dontShow``, ``propertyTypes``, ``maxDaysSinceAdded`` and
    ``mustHave``; RENT gets ``furnishTypes``.

    NOTE(review): indentation was lost in the reviewed copy of this file.
    The BUY-only nesting below was reconstructed from the "(BUY only)"
    markers in listing_query's docstring; confirm against the original.

    Raises:
        ValueError: If ``max_days_since_added`` is set for a BUY search and
            is not one of 1, 3, 7 or 14. The previous version raised a bare
            ``Exception``; ``ValueError`` is a subclass, so existing
            handlers still match.
        KeyError: If ``district`` is not a known district name.
    """
    allowed_days = (1, 3, 7, 14)

    params = _build_base_params(
        channel=channel,
        page=page,
        page_size=page_size,
        radius=radius,
        min_price=min_price,
        max_price=max_price,
        min_bedrooms=min_bedrooms,
        max_bedrooms=max_bedrooms,
        district=district,
    )

    if channel is ListingType.BUY:
        params["dontShow"] = "sharedOwnership,retirement"
        if property_type:
            params["propertyTypes"] = ",".join(property_type)
        # None means "no age filter"; the previous version sent the literal
        # string "None" to the API in that case.
        if max_days_since_added is not None:
            if max_days_since_added not in allowed_days:
                raise ValueError(
                    f"Invalid max days - {max_days_since_added}. "
                    f"Must be one of {list(allowed_days)}"
                )
            params["maxDaysSinceAdded"] = str(max_days_since_added)
        if mustNewHome:
            params["mustHave"] = "newHome"

    if channel is ListingType.RENT and furnish_types:
        params["furnishTypes"] = ",".join(furnish_types)

    return params
|
||||
|
||||
|
||||
def _build_probe_params(
    *,
    channel: ListingType,
    min_bedrooms: int,
    max_bedrooms: int,
    radius: float,
    min_price: int,
    max_price: int,
    district: str,
    max_days_since_added: int,
    furnish_types: list[FurnishType],
) -> dict[str, str]:
    """Build the query parameters for a probe (count-only) request.

    Always asks for page 1 with a page size of 1. Unlike the listing builder,
    a ``max_days_since_added`` outside 1, 3, 7 and 14 is silently left out
    instead of raising.

    NOTE(review): indentation was lost in the reviewed copy of this file.
    The BUY-only nesting of ``maxDaysSinceAdded`` was reconstructed from the
    "(BUY only)" marker in probe_query's docstring; confirm against the
    original.
    """
    probe_params = _build_base_params(
        channel=channel,
        page=1,
        page_size=1,  # Minimal page size for probing
        radius=radius,
        min_price=min_price,
        max_price=max_price,
        min_bedrooms=min_bedrooms,
        max_bedrooms=max_bedrooms,
        district=district,
    )

    is_buy = channel is ListingType.BUY
    is_rent = channel is ListingType.RENT

    if is_buy:
        probe_params["dontShow"] = "sharedOwnership,retirement"
        # None is never a member of the tuple, so no separate None check is needed.
        if max_days_since_added in (1, 3, 7, 14):
            probe_params["maxDaysSinceAdded"] = str(max_days_since_added)

    if is_rent and furnish_types:
        probe_params["furnishTypes"] = ",".join(furnish_types)

    return probe_params
|
||||
|
||||
|
||||
async def _execute_api_request(
    *,
    url: str,
    params: dict[str, str],
    headers: dict[str, str],
    session: aiohttp.ClientSession | None,
    config: ScraperConfig,
    expect_data: bool = True,
    error_context: str = "",
) -> dict[str, Any]:
    """Perform one GET request guarded by the circuit breaker.

    The circuit breaker is consulted before the request. The response is
    passed to ``validate_response``; any exception raised while requesting or
    validating counts as a failure for the breaker, and a validated HTTP 200
    counts as a success.

    Args:
        url: Endpoint to call.
        params: Query string parameters.
        headers: Request headers.
        session: Session to use. When None, a temporary one is created via
            ``create_session(config)`` so that the configured proxy and
            default headers also apply to ad-hoc requests (the previous
            fallback built a bare session and ignored the proxy setting).
        config: Scraper configuration (circuit breaker settings, slow
            response threshold, proxy).
        expect_data: Forwarded to ``validate_response``.
        error_context: Prefix for the message of the generic non-200 error.

    Returns:
        The decoded JSON body of the HTTP 200 response.

    Raises:
        CircuitBreakerOpenError: If the circuit breaker is open.
        Exception: Whatever ``validate_response`` raises, or a generic
            ``Exception`` for a non-200 status that validation let through.
    """
    check_circuit_breaker(config)
    cb = get_circuit_breaker(config)

    async def do_request(s: aiohttp.ClientSession) -> dict[str, Any]:
        # Monotonic clock: elapsed time must not be distorted by wall-clock
        # adjustments, which could otherwise look like a slow response.
        start_time = time.monotonic()
        try:
            async with s.get(url, params=params, headers=headers) as response:
                response_time = time.monotonic() - start_time
                body = await response.json() if response.status == 200 else None

                validate_response(
                    response,
                    response_time,
                    body,
                    config.slow_response_threshold,
                    expect_data=expect_data,
                )

                if response.status != 200:
                    raise Exception(
                        f"{error_context}Failed due to: {await response.text()}"
                    )
        except Exception:
            # Throttling and every other failure are recorded the same way.
            if cb is not None:
                cb.record_failure()
            raise

        if cb is not None:
            cb.record_success()
        return body  # type: ignore

    if session is not None:
        return await do_request(session)

    async with create_session(config) as new_session:
        return await do_request(new_session)
|
||||
|
||||
|
||||
@retry(
    retry=retry_if_exception_type(ThrottlingError),
    wait=wait_exponential(multiplier=2, min=2, max=120),
    stop=stop_after_attempt(5),
)
async def detail_query(
    detail_id: int,
    session: aiohttp.ClientSession | None = None,
    config: ScraperConfig | None = None,
) -> dict[str, Any]:
    """Fetch the detail record of a single property.

    Calls ``{RIGHTMOVE_API_BASE}/property/{detail_id}``. A ThrottlingError
    triggers a retry with exponential backoff (waits bounded to 2-120 s),
    for at most 5 attempts.

    Args:
        detail_id: The property identifier.
        session: Optional aiohttp session. Creates new one if not provided.
        config: Scraper configuration. Loads from environment if not provided.

    Returns:
        Property details as a dictionary.

    Raises:
        CircuitBreakerOpenError: If the circuit breaker is open (not retried).
        ThrottlingError: If the request is throttled. The decorator does not
            set ``reraise=True``, so once the attempts are exhausted tenacity
            raises ``RetryError`` wrapping the last ThrottlingError.
    """
    settings = ScraperConfig.from_env() if config is None else config

    return await _execute_api_request(
        url=f"{RIGHTMOVE_API_BASE}/property/{detail_id}",
        params={
            "apiApplication": "ANDROID",
            "appVersion": ANDROID_APP_VERSION,
        },
        headers=DEFAULT_HEADERS,
        session=session,
        config=settings,
        expect_data=True,
        error_context=f"id: {detail_id}. Status Code: ",
    )
|
||||
|
||||
|
||||
@retry(
    retry=retry_if_exception_type(ThrottlingError),
    wait=wait_exponential(multiplier=2, min=2, max=120),
    stop=stop_after_attempt(5),
)
async def listing_query(
    *,
    page: int,
    channel: ListingType,
    min_bedrooms: int,
    max_bedrooms: int,
    radius: float,
    min_price: int,
    max_price: int,
    district: str,  # = "STATION^5168", # kings cross station
    mustNewHome: bool = False,
    max_days_since_added: int = 30,
    property_type: list[PropertyType] | None = None,
    page_size: int = 25,
    furnish_types: list[FurnishType] | None = None,
    session: aiohttp.ClientSession | None = None,
    config: ScraperConfig | None = None,
) -> dict[str, Any]:
    """Run one page of a listing search.

    A ThrottlingError triggers a retry with exponential backoff (waits
    bounded to 2-120 s), for at most 5 attempts. Data is only required in the
    response for the first page (``expect_data`` is ``page == 1``).

    Args:
        page: Page number to fetch (1-indexed).
        channel: Listing type (BUY or RENT).
        min_bedrooms: Minimum number of bedrooms.
        max_bedrooms: Maximum number of bedrooms.
        radius: Search radius.
        min_price: Minimum price.
        max_price: Maximum price.
        district: District name; looked up in ``districts.get_districts()``.
        mustNewHome: Filter for new homes only (BUY only).
        max_days_since_added: Maximum days since listing was added (BUY only).
            NOTE(review): _build_listing_params only accepts 1, 3, 7 or 14
            and raises otherwise, so the default of 30 appears unusable
            wherever that check applies; confirm the intended default.
        property_type: List of property types to filter (BUY only).
        page_size: Number of results per page (default 25).
        furnish_types: List of furnish types to filter (RENT only).
        session: Optional aiohttp session. Creates new one if not provided.
        config: Scraper configuration. Loads from environment if not provided.

    Returns:
        API response as a dictionary.

    Raises:
        CircuitBreakerOpenError: If the circuit breaker is open (not retried).
        ThrottlingError: If the request is throttled. The decorator does not
            set ``reraise=True``, so once the attempts are exhausted tenacity
            raises ``RetryError`` wrapping the last ThrottlingError.
    """
    settings = ScraperConfig.from_env() if config is None else config
    wanted_types = [] if property_type is None else property_type
    wanted_furnishing = [] if furnish_types is None else furnish_types

    query_params = _build_listing_params(
        page=page,
        channel=channel,
        min_bedrooms=min_bedrooms,
        max_bedrooms=max_bedrooms,
        radius=radius,
        min_price=min_price,
        max_price=max_price,
        district=district,
        mustNewHome=mustNewHome,
        max_days_since_added=max_days_since_added,
        property_type=wanted_types,
        page_size=page_size,
        furnish_types=wanted_furnishing,
    )

    first_page = page == 1
    return await _execute_api_request(
        url=PROPERTY_LISTING_ENDPOINT,
        params=query_params,
        headers=LISTING_HEADERS,
        session=session,
        config=settings,
        expect_data=first_page,
    )
|
||||
|
||||
|
||||
@retry(
    retry=retry_if_exception_type(ThrottlingError),
    wait=wait_exponential(multiplier=2, min=2, max=60),
    stop=stop_after_attempt(5),
)
async def probe_query(
    *,
    session: aiohttp.ClientSession,
    channel: ListingType,
    min_bedrooms: int,
    max_bedrooms: int,
    radius: float,
    min_price: int,
    max_price: int,
    district: str,
    max_days_since_added: int = 30,
    furnish_types: list[FurnishType] | None = None,
    config: ScraperConfig | None = None,
) -> dict[str, Any]:
    """Probe the API for the result count of a search.

    Issues a minimal request (page 1, page size 1) so that
    totalAvailableResults can be read without fetching full results. A
    ThrottlingError triggers a retry with exponential backoff (waits bounded
    to 2-60 s), for at most 5 attempts. An empty result is acceptable
    (``expect_data=False``).

    Args:
        session: aiohttp session for making requests.
        channel: Listing type (BUY or RENT).
        min_bedrooms: Minimum number of bedrooms.
        max_bedrooms: Maximum number of bedrooms.
        radius: Search radius.
        min_price: Minimum price.
        max_price: Maximum price.
        district: District name; looked up in ``districts.get_districts()``.
        max_days_since_added: Maximum days since listing was added (BUY only).
            Values other than 1, 3, 7 and 14 are left out of the request by
            _build_probe_params.
        furnish_types: List of furnish types to filter (RENT only).
        config: Scraper configuration. Loads from environment if not provided.

    Returns:
        API response containing totalAvailableResults.

    Raises:
        CircuitBreakerOpenError: If the circuit breaker is open (not retried).
        ThrottlingError: If the request is throttled. The decorator does not
            set ``reraise=True``, so once the attempts are exhausted tenacity
            raises ``RetryError`` wrapping the last ThrottlingError.
    """
    settings = ScraperConfig.from_env() if config is None else config
    wanted_furnishing = [] if furnish_types is None else furnish_types

    query_params = _build_probe_params(
        channel=channel,
        min_bedrooms=min_bedrooms,
        max_bedrooms=max_bedrooms,
        radius=radius,
        min_price=min_price,
        max_price=max_price,
        district=district,
        max_days_since_added=max_days_since_added,
        furnish_types=wanted_furnishing,
    )

    return await _execute_api_request(
        url=PROPERTY_LISTING_ENDPOINT,
        params=query_params,
        headers=LISTING_HEADERS,
        session=session,
        config=settings,
        expect_data=False,
        error_context="Probe failed: ",
    )
|
||||
47
rec/route_serializer.py
Normal file
47
rec/route_serializer.py
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
import dataclasses
|
||||
import json
|
||||
from typing import List
|
||||
|
||||
from models.listing import DestinationMode, Route, RouteLegStep
|
||||
from rec import routing
|
||||
|
||||
|
||||
class RouteSerializer:
    """Convert routing results to and from a JSON string.

    Wire format: a JSON object whose keys are JSON-encoded DestinationMode
    dicts and whose values are lists of JSON-encoded Route dicts (each key
    and each route is itself a JSON string).
    """

    @staticmethod
    def serialize(routing_info: dict[DestinationMode, list[Route]]) -> str:
        """Encode ``routing_info`` in the wire format described on the class."""
        return json.dumps(
            {
                json.dumps(dataclasses.asdict(destination_mode)): [
                    json.dumps(dataclasses.asdict(route)) for route in routes
                ]
                for destination_mode, routes in routing_info.items()
            }
        )

    @staticmethod
    def deserialize(route_data_json: str) -> dict[DestinationMode, List[Route]]:
        """Decode a string produced by ``serialize``.

        Every stored route is rebuilt. The previous version decoded only the
        first route of each destination, dropping alternatives and raising
        IndexError for an empty route list.
        """
        destination_routes: dict[DestinationMode, List[Route]] = {}
        for destination_mode_str, routes_json in json.loads(route_data_json).items():
            parsed_destination = json.loads(destination_mode_str)
            destination_mode = DestinationMode(
                destination_address=parsed_destination["destination_address"],
                travel_mode=routing.TravelMode(parsed_destination["travel_mode"]),
            )
            destination_routes[destination_mode] = [
                RouteSerializer._parse_route(json.loads(raw_route))
                for raw_route in routes_json
            ]
        return destination_routes

    @staticmethod
    def _parse_route(parsed_route: dict) -> Route:
        """Rebuild one Route, including its leg steps, from its decoded dict."""
        return Route(
            legs=[
                RouteLegStep(
                    distance_meters=step["distance_meters"],
                    duration_s=step["duration_s"],
                    travel_mode=routing.TravelMode(step["travel_mode"]),
                )
                for step in parsed_route["legs"]
            ],
            distance_meters=parsed_route["distance_meters"],
            duration_s=int(parsed_route["duration_s"]),
        )
|
||||
63
rec/routing.py
Normal file
63
rec/routing.py
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
import enum
|
||||
import os
|
||||
from typing import Any
|
||||
import requests
|
||||
from rec.utils import nextMonday
|
||||
from rec.exceptions import RoutingApiError
|
||||
|
||||
ROUTES_API_URL = "https://routes.googleapis.com/directions/v2:computeRoutes"
|
||||
API_KEY_ENVIRONMENT_VARIABLE = "ROUTING_API_KEY"
|
||||
ROUTES_FIELD_MASK = (
|
||||
"routes.distanceMeters,"
|
||||
"routes.duration,"
|
||||
"routes.staticDuration,"
|
||||
"routes.legs.steps.distanceMeters,"
|
||||
"routes.legs.steps.staticDuration,"
|
||||
"routes.legs.steps.travelMode"
|
||||
)
|
||||
|
||||
|
||||
class TravelMode(enum.StrEnum):
|
||||
TRANSIT = "TRANSIT"
|
||||
BICYCLE = "BICYCLE"
|
||||
WALK = "WALK"
|
||||
DRIVE = "DRIVE"
|
||||
|
||||
|
||||
def transit_route(
    origin_lat: float,
    origin_lon: float,
    dest_address: str,
    travel_mode: TravelMode,
    compute_alternative_routes: bool = True,
    timeout: float = 30.0,
) -> dict[str, Any]:
    """Request routes from the Google Routes API (``computeRoutes``).

    The departure time is whatever ``nextMonday()`` returns (presumably next
    Monday at 09:00, judging by the variable name; verify in rec.utils),
    formatted with a literal ``Z`` suffix.

    Args:
        origin_lat: Latitude of the origin.
        origin_lon: Longitude of the origin.
        dest_address: Destination as a free-text address.
        travel_mode: Mode of travel to request.
        compute_alternative_routes: Whether to ask for alternative routes.
        timeout: Seconds to wait for the HTTP request. The previous version
            had no timeout and could block indefinitely.

    Returns:
        The decoded JSON body of the HTTP 200 response.

    Raises:
        KeyError: If the ``ROUTING_API_KEY`` environment variable is not set.
        RoutingApiError: If the API answers with a status other than 200.
        requests.RequestException: On connection problems or timeout.
    """
    monday9am = nextMonday()

    # must be set
    api_key = os.environ[API_KEY_ENVIRONMENT_VARIABLE]

    header = {
        "X-Goog-Api-Key": api_key,
        "Content-Type": "application/json",
        "X-Goog-FieldMask": ROUTES_FIELD_MASK,
    }

    body = {
        "origin": {
            "location": {"latLng": {"latitude": origin_lat, "longitude": origin_lon}}
        },
        "destination": {"address": dest_address},
        "travelMode": travel_mode.value,
        "departureTime": monday9am.strftime("%Y-%m-%dT%H:%M:%S.%fZ"),
        "computeAlternativeRoutes": compute_alternative_routes,
        "languageCode": "en-US",
        "units": "METRIC",
    }

    r = requests.post(ROUTES_API_URL, json=body, headers=header, timeout=timeout)
    if r.status_code == 200:
        return r.json()

    # An error response is not guaranteed to be JSON (e.g. an HTML gateway
    # page); keep the HTTP status visible instead of failing on decoding.
    try:
        error_body = r.json()
    except ValueError:
        error_body = {"raw": r.text}
    raise RoutingApiError(r.status_code, error_body)
|
||||
232
rec/throttle_detector.py
Normal file
232
rec/throttle_detector.py
Normal file
|
|
@ -0,0 +1,232 @@
|
|||
"""Throttling detection and metrics for Rightmove API."""
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
import aiohttp
|
||||
|
||||
from rec.exceptions import (
|
||||
InvalidResponseError,
|
||||
IPBlockedError,
|
||||
RateLimitError,
|
||||
ServiceUnavailableError,
|
||||
SlowResponseError,
|
||||
UnexpectedEmptyResponseError,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
class ThrottleMetrics:
    """Running counters for throttling-related events.

    Only ``record_request`` and ``record_slow_response`` advance
    ``total_requests`` and ``total_response_time``; every other ``record_*``
    method bumps its own counter and nothing else.

    Attributes:
        rate_limit_count: How many HTTP 429 errors were recorded.
        service_unavailable_count: How many HTTP 503 errors were recorded.
        ip_blocked_count: How many HTTP 403 errors were recorded.
        slow_response_count: How many slow responses were recorded.
        empty_response_count: How many unexpected empty responses were recorded.
        invalid_response_count: How many invalid/error responses were recorded.
        total_requests: Number of requests counted so far.
        total_response_time: Sum of recorded response times, in seconds.
    """

    rate_limit_count: int = 0
    service_unavailable_count: int = 0
    ip_blocked_count: int = 0
    slow_response_count: int = 0
    empty_response_count: int = 0
    invalid_response_count: int = 0
    total_requests: int = 0
    total_response_time: float = 0.0
    # Wall-clock timestamp captured when the instance is created.
    _start_time: float = field(default_factory=time.time)

    def record_rate_limit(self) -> None:
        """Count one rate limit error (HTTP 429)."""
        self.rate_limit_count += 1

    def record_service_unavailable(self) -> None:
        """Count one service unavailable error (HTTP 503)."""
        self.service_unavailable_count += 1

    def record_ip_blocked(self) -> None:
        """Count one IP blocked error (HTTP 403)."""
        self.ip_blocked_count += 1

    def record_slow_response(self, response_time: float) -> None:
        """Count one slow response and include it in the request totals.

        Args:
            response_time: Duration of the request in seconds.
        """
        self.total_requests += 1
        self.total_response_time += response_time
        self.slow_response_count += 1

    def record_empty_response(self) -> None:
        """Count one unexpected empty response."""
        self.empty_response_count += 1

    def record_invalid_response(self) -> None:
        """Count one invalid or error response."""
        self.invalid_response_count += 1

    def record_request(self, response_time: float) -> None:
        """Count one successful request.

        Args:
            response_time: Duration of the request in seconds.
        """
        self.total_response_time += response_time
        self.total_requests += 1

    @property
    def average_response_time(self) -> float:
        """Mean response time in seconds, or 0.0 when nothing was counted."""
        request_count = self.total_requests
        return self.total_response_time / request_count if request_count else 0.0

    @property
    def total_throttling_events(self) -> int:
        """Sum of the 429, 503, 403 and slow-response counters."""
        return sum(
            (
                self.rate_limit_count,
                self.service_unavailable_count,
                self.ip_blocked_count,
                self.slow_response_count,
            )
        )

    @property
    def throttle_rate(self) -> float:
        """Throttling events as a percentage of ``total_requests``."""
        request_count = self.total_requests
        if request_count == 0:
            return 0.0
        return (self.total_throttling_events / request_count) * 100

    @property
    def elapsed_time(self) -> float:
        """Seconds since this metrics object was created."""
        current = time.time()
        return current - self._start_time

    def summary(self) -> str:
        """Render all counters as a multi-line, human-readable report."""
        report_lines = [
            f"Throttle Metrics Summary:",
            f" Total Requests: {self.total_requests}",
            f" Total Throttling Events: {self.total_throttling_events}",
            f" Throttle Rate: {self.throttle_rate:.2f}%",
            f" Rate Limit (429): {self.rate_limit_count}",
            f" Service Unavailable (503): {self.service_unavailable_count}",
            f" IP Blocked (403): {self.ip_blocked_count}",
            f" Slow Responses: {self.slow_response_count}",
            f" Empty Responses: {self.empty_response_count}",
            f" Invalid Responses: {self.invalid_response_count}",
            f" Average Response Time: {self.average_response_time:.2f}s",
            f" Elapsed Time: {self.elapsed_time:.2f}s",
        ]
        return "\n".join(report_lines)
|
||||
|
||||
|
||||
# Module-level metrics instance shared by all callers. It stays None until
# get_throttle_metrics() creates it on first use; reset_throttle_metrics()
# replaces it with a fresh instance.
_global_metrics: ThrottleMetrics | None = None
|
||||
|
||||
|
||||
def get_throttle_metrics() -> ThrottleMetrics:
    """Return the shared ThrottleMetrics instance, creating it on first use.

    Returns:
        The module-level ThrottleMetrics instance.
    """
    global _global_metrics

    # Fast path: the shared instance already exists.
    if _global_metrics is not None:
        return _global_metrics

    _global_metrics = ThrottleMetrics()
    return _global_metrics
|
||||
|
||||
|
||||
def reset_throttle_metrics() -> None:
    """Discard the shared metrics and start again from a fresh instance."""
    global _global_metrics

    fresh_metrics = ThrottleMetrics()
    _global_metrics = fresh_metrics
|
||||
|
||||
|
||||
def validate_response(
    response: aiohttp.ClientResponse,
    response_time: float,
    response_body: dict[str, Any] | None,
    slow_response_threshold: float,
    expect_data: bool = True,
) -> None:
    """Validate an API response and raise the matching throttling exception.

    Checks run in a fixed order and the first failing one wins: HTTP status
    (429, 503, 403), response time, error markers in the body, then an
    unexpectedly empty result set. Each failure is recorded on the shared
    metrics before the exception is raised; if nothing fails, the request is
    recorded as successful.

    Args:
        response: The aiohttp response object; only ``status`` is read.
        response_time: Time taken for the request in seconds.
        response_body: Parsed JSON response body, or None if unavailable.
            When None, the body checks are skipped entirely.
        slow_response_threshold: Threshold in seconds above which a response
            counts as slow.
        expect_data: Whether the body is expected to contain properties.

    Raises:
        RateLimitError: If HTTP 429 is returned.
        ServiceUnavailableError: If HTTP 503 is returned.
        IPBlockedError: If HTTP 403 is returned.
        SlowResponseError: If response time exceeds the threshold.
        UnexpectedEmptyResponseError: If the body reports available results
            but contains no properties while data is expected.
        InvalidResponseError: If the body contains error markers.
    """
    metrics = get_throttle_metrics()

    # Check HTTP status codes
    if response.status == 429:
        metrics.record_rate_limit()
        raise RateLimitError(
            f"Rate limit exceeded (HTTP 429). "
            f"Response time: {response_time:.2f}s"
        )

    if response.status == 503:
        metrics.record_service_unavailable()
        raise ServiceUnavailableError(
            f"Service unavailable (HTTP 503). "
            f"Response time: {response_time:.2f}s"
        )

    if response.status == 403:
        metrics.record_ip_blocked()
        raise IPBlockedError(
            f"Access forbidden, possible IP block (HTTP 403). "
            f"Response time: {response_time:.2f}s"
        )

    # Check response time
    if response_time > slow_response_threshold:
        metrics.record_slow_response(response_time)
        raise SlowResponseError(
            f"Slow response detected: {response_time:.2f}s "
            f"(threshold: {slow_response_threshold}s)"
        )

    # Check response body if available
    if response_body is not None:
        # Check for error messages
        if "error" in response_body or "GENERIC_ERROR" in str(response_body):
            metrics.record_invalid_response()
            raise InvalidResponseError(
                f"Error in response body: {response_body}"
            )

        # Check for unexpected empty responses
        if expect_data:
            # dict.get() only uses its default when the key is absent. A key
            # that is present with a JSON null yields None, which would make
            # len() and the ``> 0`` comparison raise TypeError, so null is
            # normalised to the same value as a missing key.
            properties = response_body.get("properties") or []
            total_results = response_body.get("totalAvailableResults") or 0

            # If we expect data but got none (and total shows there should be some)
            if total_results > 0 and not properties:
                metrics.record_empty_response()
                raise UnexpectedEmptyResponseError(
                    f"Expected data but got empty response. "
                    f"Total available: {total_results}"
                )

    # Record successful request
    metrics.record_request(response_time)
|
||||
21
rec/utils.py
Normal file
21
rec/utils.py
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
from datetime import datetime, timedelta, timezone
|
||||
|
||||
|
||||
def nextMonday():
|
||||
"""
|
||||
I think this function doesnt work when the day is monday itself.
|
||||
|
||||
Returns:
|
||||
_type_: _description_
|
||||
"""
|
||||
now = datetime.now(timezone.utc)
|
||||
days_until_monday = (0 - now.weekday() + 7) % 7
|
||||
monday = now + timedelta(days=days_until_monday)
|
||||
monday_9am = monday.replace(
|
||||
hour=9, minute=0, second=0, microsecond=0, tzinfo=timezone.utc
|
||||
)
|
||||
return monday_9am
|
||||
|
||||
|
||||
# Manual check when run as a script: print the timestamp nextMonday() computes.
if __name__ == "__main__":
    print(nextMonday())
|
||||
Loading…
Add table
Add a link
Reference in a new issue