Add throttling detection and circuit breaker for Rightmove scraper
This commit is contained in:
parent
e8293c6042
commit
f880664a98
10 changed files with 1428 additions and 86 deletions
|
|
@ -1,4 +1,6 @@
|
|||
import enum
|
||||
import logging
|
||||
import time
|
||||
from typing import Any
|
||||
from contextlib import asynccontextmanager
|
||||
from collections.abc import AsyncIterator
|
||||
|
|
@ -6,9 +8,26 @@ from collections.abc import AsyncIterator
|
|||
import aiohttp
|
||||
from models.listing import FurnishType, ListingType
|
||||
from rec import districts
|
||||
from tenacity import retry, stop_after_attempt, wait_random
|
||||
from rec.exceptions import (
|
||||
CircuitBreakerOpenError,
|
||||
ThrottlingError,
|
||||
)
|
||||
from rec.throttle_detector import get_throttle_metrics, validate_response
|
||||
from rec.circuit_breaker import CircuitBreaker
|
||||
from tenacity import (
|
||||
retry,
|
||||
retry_if_exception_type,
|
||||
stop_after_attempt,
|
||||
wait_exponential,
|
||||
wait_random,
|
||||
)
|
||||
from config.scraper_config import ScraperConfig
|
||||
|
||||
logger = logging.getLogger("uvicorn.error")
|
||||
|
||||
# Global circuit breaker instance
|
||||
_circuit_breaker: CircuitBreaker | None = None
|
||||
|
||||
|
||||
DEFAULT_HEADERS = {
|
||||
"Host": "api.rightmove.co.uk",
|
||||
|
|
@ -65,20 +84,81 @@ async def create_session(
|
|||
await session.close()
|
||||
|
||||
|
||||
@retry(wait=wait_random(min=1, max=2), stop=stop_after_attempt(3))
|
||||
def get_circuit_breaker(config: ScraperConfig | None = None) -> CircuitBreaker | None:
    """Return the module-wide circuit breaker, creating it on first use.

    The breaker is stored in the module-level ``_circuit_breaker`` variable.
    It is built only once: the thresholds of the configuration seen on the
    first enabled call are the ones used, and later calls reuse that same
    instance regardless of the thresholds they pass.

    Args:
        config: Scraper configuration. When omitted, it is loaded with
            ``ScraperConfig.from_env()``.

    Returns:
        The shared ``CircuitBreaker`` when ``config.enable_circuit_breaker``
        is truthy, otherwise ``None``.
    """
    global _circuit_breaker

    settings = ScraperConfig.from_env() if config is None else config

    if settings.enable_circuit_breaker:
        # Lazily construct the singleton the first time it is requested.
        if _circuit_breaker is None:
            _circuit_breaker = CircuitBreaker(
                failure_threshold=settings.circuit_breaker_failure_threshold,
                recovery_timeout=settings.circuit_breaker_recovery_timeout,
            )
        return _circuit_breaker

    # Circuit breaking is switched off for this configuration.
    return None
|
||||
|
||||
|
||||
def reset_circuit_breaker() -> None:
    """Reset the shared circuit breaker if one has been created.

    Does nothing when no breaker instance exists yet. The instance itself is
    kept; only its ``reset()`` method is invoked.
    """
    # Read-only access to the module global, so no ``global`` statement needed.
    breaker = _circuit_breaker
    if breaker is None:
        return
    breaker.reset()
|
||||
|
||||
|
||||
def check_circuit_breaker(config: ScraperConfig | None = None) -> None:
    """Gate a request on the shared circuit breaker.

    Looks up the breaker via ``get_circuit_breaker(config)`` and, when one is
    enabled, invokes its ``call()`` method. When circuit breaking is disabled
    (no breaker returned) this is a no-op.

    Args:
        config: Scraper configuration forwarded to ``get_circuit_breaker``.

    Raises:
        CircuitBreakerOpenError: Propagated from ``CircuitBreaker.call()``
            when the circuit is open (per the breaker's contract; its
            implementation lives outside this module).
    """
    breaker = get_circuit_breaker(config)
    if breaker is None:
        return
    breaker.call()
|
||||
|
||||
|
||||
@retry(
|
||||
retry=retry_if_exception_type(ThrottlingError),
|
||||
wait=wait_exponential(multiplier=2, min=2, max=120),
|
||||
stop=stop_after_attempt(5),
|
||||
)
|
||||
async def detail_query(
|
||||
detail_id: int,
|
||||
session: aiohttp.ClientSession | None = None,
|
||||
config: ScraperConfig | None = None,
|
||||
) -> dict[str, Any]:
|
||||
"""Fetch detailed property information.
|
||||
|
||||
Args:
|
||||
detail_id: The property identifier.
|
||||
session: Optional aiohttp session. Creates new one if not provided.
|
||||
config: Scraper configuration. Loads from environment if not provided.
|
||||
|
||||
Returns:
|
||||
Property details as a dictionary.
|
||||
|
||||
Raises:
|
||||
CircuitBreakerOpenError: If the circuit breaker is open.
|
||||
ThrottlingError: If the request is throttled.
|
||||
"""
|
||||
if config is None:
|
||||
config = ScraperConfig.from_env()
|
||||
|
||||
check_circuit_breaker(config)
|
||||
cb = get_circuit_breaker(config)
|
||||
|
||||
params = {
|
||||
"apiApplication": "ANDROID",
|
||||
"appVersion": "3.70.0",
|
||||
|
|
@ -86,13 +166,38 @@ async def detail_query(
|
|||
url = f"https://api.rightmove.co.uk/api/property/{detail_id}"
|
||||
|
||||
async def do_request(s: aiohttp.ClientSession) -> dict[str, Any]:
|
||||
async with s.get(url, params=params, headers=DEFAULT_HEADERS) as response:
|
||||
if response.status != 200:
|
||||
raise Exception(
|
||||
f"""id: {detail_id}. Status Code: {response.status}."""
|
||||
f"""Failed due to: {await response.text()}"""
|
||||
start_time = time.time()
|
||||
try:
|
||||
async with s.get(url, params=params, headers=DEFAULT_HEADERS) as response:
|
||||
response_time = time.time() - start_time
|
||||
body = await response.json() if response.status == 200 else None
|
||||
|
||||
# Validate response for throttling
|
||||
validate_response(
|
||||
response,
|
||||
response_time,
|
||||
body,
|
||||
config.slow_response_threshold,
|
||||
expect_data=True,
|
||||
)
|
||||
return await response.json()
|
||||
|
||||
if response.status != 200:
|
||||
raise Exception(
|
||||
f"""id: {detail_id}. Status Code: {response.status}."""
|
||||
f"""Failed due to: {await response.text()}"""
|
||||
)
|
||||
|
||||
if cb is not None:
|
||||
cb.record_success()
|
||||
return body # type: ignore
|
||||
except ThrottlingError:
|
||||
if cb is not None:
|
||||
cb.record_failure()
|
||||
raise
|
||||
except Exception as e:
|
||||
if cb is not None:
|
||||
cb.record_failure()
|
||||
raise e
|
||||
|
||||
if session:
|
||||
return await do_request(session)
|
||||
|
|
@ -101,7 +206,11 @@ async def detail_query(
|
|||
return await do_request(new_session)
|
||||
|
||||
|
||||
@retry(wait=wait_random(min=1, max=60), stop=stop_after_attempt(3))
|
||||
@retry(
|
||||
retry=retry_if_exception_type(ThrottlingError),
|
||||
wait=wait_exponential(multiplier=2, min=2, max=120),
|
||||
stop=stop_after_attempt(5),
|
||||
)
|
||||
async def listing_query(
|
||||
*,
|
||||
page: int,
|
||||
|
|
@ -118,6 +227,7 @@ async def listing_query(
|
|||
page_size: int = 25,
|
||||
furnish_types: list[FurnishType] = [],
|
||||
session: aiohttp.ClientSession | None = None,
|
||||
config: ScraperConfig | None = None,
|
||||
) -> dict[str, Any]:
|
||||
"""Execute a listing search query.
|
||||
|
||||
|
|
@ -136,10 +246,21 @@ async def listing_query(
|
|||
page_size: Number of results per page (default 25).
|
||||
furnish_types: List of furnish types to filter (RENT only).
|
||||
session: Optional aiohttp session. Creates new one if not provided.
|
||||
config: Scraper configuration. Loads from environment if not provided.
|
||||
|
||||
Returns:
|
||||
API response as a dictionary.
|
||||
|
||||
Raises:
|
||||
CircuitBreakerOpenError: If the circuit breaker is open.
|
||||
ThrottlingError: If the request is throttled.
|
||||
"""
|
||||
if config is None:
|
||||
config = ScraperConfig.from_env()
|
||||
|
||||
check_circuit_breaker(config)
|
||||
cb = get_circuit_breaker(config)
|
||||
|
||||
params: dict[str, str] = {
|
||||
"locationIdentifier": districts.get_districts()[district],
|
||||
"channel": str(channel).upper(),
|
||||
|
|
@ -185,14 +306,39 @@ async def listing_query(
|
|||
}
|
||||
|
||||
async def do_request(s: aiohttp.ClientSession) -> dict[str, Any]:
|
||||
async with s.get(
|
||||
"https://api.rightmove.co.uk/api/property-listing",
|
||||
params=params,
|
||||
headers=request_headers,
|
||||
) as response:
|
||||
if response.status != 200:
|
||||
raise Exception(f"Failed due to: {await response.text()}")
|
||||
return await response.json()
|
||||
start_time = time.time()
|
||||
try:
|
||||
async with s.get(
|
||||
"https://api.rightmove.co.uk/api/property-listing",
|
||||
params=params,
|
||||
headers=request_headers,
|
||||
) as response:
|
||||
response_time = time.time() - start_time
|
||||
body = await response.json() if response.status == 200 else None
|
||||
|
||||
# Validate response for throttling
|
||||
validate_response(
|
||||
response,
|
||||
response_time,
|
||||
body,
|
||||
config.slow_response_threshold,
|
||||
expect_data=(page == 1), # Only expect data on first page
|
||||
)
|
||||
|
||||
if response.status != 200:
|
||||
raise Exception(f"Failed due to: {await response.text()}")
|
||||
|
||||
if cb is not None:
|
||||
cb.record_success()
|
||||
return body # type: ignore
|
||||
except ThrottlingError:
|
||||
if cb is not None:
|
||||
cb.record_failure()
|
||||
raise
|
||||
except Exception as e:
|
||||
if cb is not None:
|
||||
cb.record_failure()
|
||||
raise e
|
||||
|
||||
if session:
|
||||
return await do_request(session)
|
||||
|
|
@ -201,7 +347,11 @@ async def listing_query(
|
|||
return await do_request(new_session)
|
||||
|
||||
|
||||
@retry(wait=wait_random(min=1, max=10), stop=stop_after_attempt(3))
|
||||
@retry(
|
||||
retry=retry_if_exception_type(ThrottlingError),
|
||||
wait=wait_exponential(multiplier=2, min=2, max=60),
|
||||
stop=stop_after_attempt(5),
|
||||
)
|
||||
async def probe_query(
|
||||
*,
|
||||
session: aiohttp.ClientSession,
|
||||
|
|
@ -214,6 +364,7 @@ async def probe_query(
|
|||
district: str,
|
||||
max_days_since_added: int = 30,
|
||||
furnish_types: list[FurnishType] = [],
|
||||
config: ScraperConfig | None = None,
|
||||
) -> dict[str, Any]:
|
||||
"""Probe the API to get result count without fetching full results.
|
||||
|
||||
|
|
@ -230,10 +381,21 @@ async def probe_query(
|
|||
district: District identifier string.
|
||||
max_days_since_added: Maximum days since listing was added (BUY only).
|
||||
furnish_types: List of furnish types to filter (RENT only).
|
||||
config: Scraper configuration. Loads from environment if not provided.
|
||||
|
||||
Returns:
|
||||
API response containing totalAvailableResults.
|
||||
|
||||
Raises:
|
||||
CircuitBreakerOpenError: If the circuit breaker is open.
|
||||
ThrottlingError: If the request is throttled.
|
||||
"""
|
||||
if config is None:
|
||||
config = ScraperConfig.from_env()
|
||||
|
||||
check_circuit_breaker(config)
|
||||
cb = get_circuit_breaker(config)
|
||||
|
||||
params: dict[str, str] = {
|
||||
"locationIdentifier": districts.get_districts()[district],
|
||||
"channel": str(channel).upper(),
|
||||
|
|
@ -271,11 +433,36 @@ async def probe_query(
|
|||
"Connection": "keep-alive",
|
||||
}
|
||||
|
||||
async with session.get(
|
||||
"https://api.rightmove.co.uk/api/property-listing",
|
||||
params=params,
|
||||
headers=request_headers,
|
||||
) as response:
|
||||
if response.status != 200:
|
||||
raise Exception(f"Probe failed: {await response.text()}")
|
||||
return await response.json()
|
||||
start_time = time.time()
|
||||
try:
|
||||
async with session.get(
|
||||
"https://api.rightmove.co.uk/api/property-listing",
|
||||
params=params,
|
||||
headers=request_headers,
|
||||
) as response:
|
||||
response_time = time.time() - start_time
|
||||
body = await response.json() if response.status == 200 else None
|
||||
|
||||
# Validate response for throttling
|
||||
validate_response(
|
||||
response,
|
||||
response_time,
|
||||
body,
|
||||
config.slow_response_threshold,
|
||||
expect_data=False, # Probe doesn't need data, just count
|
||||
)
|
||||
|
||||
if response.status != 200:
|
||||
raise Exception(f"Probe failed: {await response.text()}")
|
||||
|
||||
if cb is not None:
|
||||
cb.record_success()
|
||||
return body # type: ignore
|
||||
except ThrottlingError:
|
||||
if cb is not None:
|
||||
cb.record_failure()
|
||||
raise
|
||||
except Exception as e:
|
||||
if cb is not None:
|
||||
cb.record_failure()
|
||||
raise e
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue