Add throttling detection and circuit breaker for Rightmove scraper

This commit is contained in:
Viktor Barzin 2026-02-02 22:50:19 +00:00
parent e8293c6042
commit f880664a98
10 changed files with 1428 additions and 86 deletions

View file

@ -1,4 +1,6 @@
import enum
import logging
import time
from typing import Any
from contextlib import asynccontextmanager
from collections.abc import AsyncIterator
@ -6,9 +8,26 @@ from collections.abc import AsyncIterator
import aiohttp
from models.listing import FurnishType, ListingType
from rec import districts
from tenacity import retry, stop_after_attempt, wait_random
from rec.exceptions import (
CircuitBreakerOpenError,
ThrottlingError,
)
from rec.throttle_detector import get_throttle_metrics, validate_response
from rec.circuit_breaker import CircuitBreaker
from tenacity import (
retry,
retry_if_exception_type,
stop_after_attempt,
wait_exponential,
wait_random,
)
from config.scraper_config import ScraperConfig
logger = logging.getLogger("uvicorn.error")
# Global circuit breaker instance
_circuit_breaker: CircuitBreaker | None = None
DEFAULT_HEADERS = {
"Host": "api.rightmove.co.uk",
@ -65,20 +84,81 @@ async def create_session(
await session.close()
@retry(wait=wait_random(min=1, max=2), stop=stop_after_attempt(3))
def get_circuit_breaker(config: ScraperConfig | None = None) -> CircuitBreaker | None:
    """Return the shared module-level circuit breaker, creating it on first use.

    Args:
        config: Scraper settings. Consulted for the enable flag on every call
            and for the breaker parameters only when the shared instance does
            not exist yet. Loaded with ``ScraperConfig.from_env()`` if omitted.

    Returns:
        The shared CircuitBreaker instance, or None when
        ``config.enable_circuit_breaker`` is falsy.
    """
    global _circuit_breaker

    settings = ScraperConfig.from_env() if config is None else config
    if not settings.enable_circuit_breaker:
        return None

    breaker = _circuit_breaker
    if breaker is None:
        # First enabled call: build the breaker and remember it. Later calls
        # reuse this instance even if they pass different thresholds.
        breaker = CircuitBreaker(
            failure_threshold=settings.circuit_breaker_failure_threshold,
            recovery_timeout=settings.circuit_breaker_recovery_timeout,
        )
        _circuit_breaker = breaker
    return breaker
def reset_circuit_breaker() -> None:
    """Reset the shared circuit breaker; do nothing if none has been created."""
    breaker = _circuit_breaker
    if breaker is None:
        return
    breaker.reset()
def check_circuit_breaker(config: ScraperConfig | None = None) -> None:
    """Ask the shared circuit breaker whether a request may proceed.

    Args:
        config: Scraper settings forwarded to ``get_circuit_breaker``.

    Raises:
        CircuitBreakerOpenError: Expected from ``CircuitBreaker.call()`` when
            the circuit is open (that method is defined outside this module).
    """
    breaker = get_circuit_breaker(config)
    if breaker is None:
        # Breaker disabled by configuration: nothing to check.
        return
    breaker.call()
@retry(
retry=retry_if_exception_type(ThrottlingError),
wait=wait_exponential(multiplier=2, min=2, max=120),
stop=stop_after_attempt(5),
)
async def detail_query(
detail_id: int,
session: aiohttp.ClientSession | None = None,
config: ScraperConfig | None = None,
) -> dict[str, Any]:
"""Fetch detailed property information.
Args:
detail_id: The property identifier.
session: Optional aiohttp session. Creates new one if not provided.
config: Scraper configuration. Loads from environment if not provided.
Returns:
Property details as a dictionary.
Raises:
CircuitBreakerOpenError: If the circuit breaker is open.
ThrottlingError: If the request is throttled.
"""
if config is None:
config = ScraperConfig.from_env()
check_circuit_breaker(config)
cb = get_circuit_breaker(config)
params = {
"apiApplication": "ANDROID",
"appVersion": "3.70.0",
@ -86,13 +166,38 @@ async def detail_query(
url = f"https://api.rightmove.co.uk/api/property/{detail_id}"
async def do_request(s: aiohttp.ClientSession) -> dict[str, Any]:
async with s.get(url, params=params, headers=DEFAULT_HEADERS) as response:
if response.status != 200:
raise Exception(
f"""id: {detail_id}. Status Code: {response.status}."""
f"""Failed due to: {await response.text()}"""
start_time = time.time()
try:
async with s.get(url, params=params, headers=DEFAULT_HEADERS) as response:
response_time = time.time() - start_time
body = await response.json() if response.status == 200 else None
# Validate response for throttling
validate_response(
response,
response_time,
body,
config.slow_response_threshold,
expect_data=True,
)
return await response.json()
if response.status != 200:
raise Exception(
f"""id: {detail_id}. Status Code: {response.status}."""
f"""Failed due to: {await response.text()}"""
)
if cb is not None:
cb.record_success()
return body # type: ignore
except ThrottlingError:
if cb is not None:
cb.record_failure()
raise
except Exception as e:
if cb is not None:
cb.record_failure()
raise e
if session:
return await do_request(session)
@ -101,7 +206,11 @@ async def detail_query(
return await do_request(new_session)
@retry(wait=wait_random(min=1, max=60), stop=stop_after_attempt(3))
@retry(
retry=retry_if_exception_type(ThrottlingError),
wait=wait_exponential(multiplier=2, min=2, max=120),
stop=stop_after_attempt(5),
)
async def listing_query(
*,
page: int,
@ -118,6 +227,7 @@ async def listing_query(
page_size: int = 25,
furnish_types: list[FurnishType] = [],
session: aiohttp.ClientSession | None = None,
config: ScraperConfig | None = None,
) -> dict[str, Any]:
"""Execute a listing search query.
@ -136,10 +246,21 @@ async def listing_query(
page_size: Number of results per page (default 25).
furnish_types: List of furnish types to filter (RENT only).
session: Optional aiohttp session. Creates new one if not provided.
config: Scraper configuration. Loads from environment if not provided.
Returns:
API response as a dictionary.
Raises:
CircuitBreakerOpenError: If the circuit breaker is open.
ThrottlingError: If the request is throttled.
"""
if config is None:
config = ScraperConfig.from_env()
check_circuit_breaker(config)
cb = get_circuit_breaker(config)
params: dict[str, str] = {
"locationIdentifier": districts.get_districts()[district],
"channel": str(channel).upper(),
@ -185,14 +306,39 @@ async def listing_query(
}
async def do_request(s: aiohttp.ClientSession) -> dict[str, Any]:
async with s.get(
"https://api.rightmove.co.uk/api/property-listing",
params=params,
headers=request_headers,
) as response:
if response.status != 200:
raise Exception(f"Failed due to: {await response.text()}")
return await response.json()
start_time = time.time()
try:
async with s.get(
"https://api.rightmove.co.uk/api/property-listing",
params=params,
headers=request_headers,
) as response:
response_time = time.time() - start_time
body = await response.json() if response.status == 200 else None
# Validate response for throttling
validate_response(
response,
response_time,
body,
config.slow_response_threshold,
expect_data=(page == 1), # Only expect data on first page
)
if response.status != 200:
raise Exception(f"Failed due to: {await response.text()}")
if cb is not None:
cb.record_success()
return body # type: ignore
except ThrottlingError:
if cb is not None:
cb.record_failure()
raise
except Exception as e:
if cb is not None:
cb.record_failure()
raise e
if session:
return await do_request(session)
@ -201,7 +347,11 @@ async def listing_query(
return await do_request(new_session)
@retry(wait=wait_random(min=1, max=10), stop=stop_after_attempt(3))
@retry(
retry=retry_if_exception_type(ThrottlingError),
wait=wait_exponential(multiplier=2, min=2, max=60),
stop=stop_after_attempt(5),
)
async def probe_query(
*,
session: aiohttp.ClientSession,
@ -214,6 +364,7 @@ async def probe_query(
district: str,
max_days_since_added: int = 30,
furnish_types: list[FurnishType] = [],
config: ScraperConfig | None = None,
) -> dict[str, Any]:
"""Probe the API to get result count without fetching full results.
@ -230,10 +381,21 @@ async def probe_query(
district: District identifier string.
max_days_since_added: Maximum days since listing was added (BUY only).
furnish_types: List of furnish types to filter (RENT only).
config: Scraper configuration. Loads from environment if not provided.
Returns:
API response containing totalAvailableResults.
Raises:
CircuitBreakerOpenError: If the circuit breaker is open.
ThrottlingError: If the request is throttled.
"""
if config is None:
config = ScraperConfig.from_env()
check_circuit_breaker(config)
cb = get_circuit_breaker(config)
params: dict[str, str] = {
"locationIdentifier": districts.get_districts()[district],
"channel": str(channel).upper(),
@ -271,11 +433,36 @@ async def probe_query(
"Connection": "keep-alive",
}
async with session.get(
"https://api.rightmove.co.uk/api/property-listing",
params=params,
headers=request_headers,
) as response:
if response.status != 200:
raise Exception(f"Probe failed: {await response.text()}")
return await response.json()
start_time = time.time()
try:
async with session.get(
"https://api.rightmove.co.uk/api/property-listing",
params=params,
headers=request_headers,
) as response:
response_time = time.time() - start_time
body = await response.json() if response.status == 200 else None
# Validate response for throttling
validate_response(
response,
response_time,
body,
config.slow_response_threshold,
expect_data=False, # Probe doesn't need data, just count
)
if response.status != 200:
raise Exception(f"Probe failed: {await response.text()}")
if cb is not None:
cb.record_success()
return body # type: ignore
except ThrottlingError:
if cb is not None:
cb.record_failure()
raise
except Exception as e:
if cb is not None:
cb.record_failure()
raise e