Requests to Rightmove API previously had no explicit timeout, causing hung connections to block workers indefinitely. Add a configurable request_timeout (default 30s) to ScraperConfig and apply it to all aiohttp sessions. Also retry on TimeoutError in addition to ThrottlingError for all API query functions.
512 lines
14 KiB
Python
512 lines
14 KiB
Python
import enum
|
|
import logging
|
|
import time
|
|
from typing import Any
|
|
from contextlib import asynccontextmanager
|
|
from collections.abc import AsyncIterator
|
|
|
|
import aiohttp
|
|
from models.listing import FurnishType, ListingType
|
|
from rec import districts
|
|
from rec.exceptions import (
|
|
CircuitBreakerOpenError,
|
|
RightmoveAPIError,
|
|
ThrottlingError,
|
|
)
|
|
from rec.throttle_detector import get_throttle_metrics, validate_response
|
|
from rec.circuit_breaker import CircuitBreaker
|
|
from tenacity import (
|
|
retry,
|
|
retry_if_exception_type,
|
|
stop_after_attempt,
|
|
wait_exponential,
|
|
wait_random,
|
|
)
|
|
from config.scraper_config import ScraperConfig
|
|
|
|
# Module logger, registered under the "uvicorn.error" logger name
# (presumably so messages share the ASGI server's handlers -- confirm).
logger = logging.getLogger("uvicorn.error")

# Process-wide circuit breaker shared by every request helper in this module.
# Starts unset; get_circuit_breaker() creates it on first use when enabled.
_circuit_breaker: CircuitBreaker | None = None
|
|
|
|
# API constants
# App version reported by detail_query() requests.
ANDROID_APP_VERSION = "3.70.0"
# App version reported by listing/probe requests (see _build_base_params()).
ANDROID_APP_VERSION_LISTING = "4.28.0"
RIGHTMOVE_API_BASE = "https://api.rightmove.co.uk/api"
PROPERTY_LISTING_ENDPOINT = RIGHTMOVE_API_BASE + "/property-listing"

# Headers sent with every request.
DEFAULT_HEADERS = {
    "Host": "api.rightmove.co.uk",
    "User-Agent": "okhttp/4.12.0",
    "Connection": "keep-alive",
}

# Listing/probe requests send the default headers plus an explicit
# Accept-Encoding; built as a new dict so DEFAULT_HEADERS is left untouched.
LISTING_HEADERS = DEFAULT_HEADERS | {
    "Accept-Encoding": "gzip, deflate, br",
}
|
|
|
|
|
|
class PropertyType(enum.StrEnum):
|
|
BUNGALOW = "bungalow"
|
|
DETACHED = "detached"
|
|
FLAT = "flat"
|
|
LAND = "land"
|
|
PARK_HOME = "park-home"
|
|
SEMI_DETACHED = "semi-detached"
|
|
TERRACED = "terraced"
|
|
|
|
|
|
@asynccontextmanager
async def create_session(
    config: ScraperConfig | None = None,
) -> AsyncIterator[aiohttp.ClientSession]:
    """Create an aiohttp session with optional proxy support.

    The session sends ``DEFAULT_HEADERS`` on every request, honours proxy
    settings from the environment (``trust_env=True``) and applies
    ``config.request_timeout`` as the total per-request timeout.

    Args:
        config: Scraper configuration. Loads from environment if not provided.

    Yields:
        Configured aiohttp ClientSession. It is closed when the context exits,
        whether or not the body raised.

    Raises:
        ImportError: If ``config.proxy_url`` is set but ``aiohttp-socks`` is
            not installed. The original import failure is attached as the
            cause.
    """
    if config is None:
        config = ScraperConfig.from_env()

    connector = None
    if config.proxy_url:
        # Guard only the optional import, so that a failure while building the
        # connector is not misreported as a missing dependency.
        try:
            from aiohttp_socks import ProxyConnector
        except ImportError as exc:
            raise ImportError(
                "aiohttp-socks is required for proxy support. "
                "Install with: pip install aiohttp-socks"
            ) from exc
        connector = ProxyConnector.from_url(config.proxy_url)

    # ``async with`` closes the session on exit, replacing the manual
    # try/finally around ``session.close()``.
    async with aiohttp.ClientSession(
        trust_env=True,
        connector=connector,
        headers=DEFAULT_HEADERS,
        timeout=aiohttp.ClientTimeout(total=config.request_timeout),
    ) as session:
        yield session
|
|
|
|
|
|
def get_circuit_breaker(config: ScraperConfig | None = None) -> CircuitBreaker | None:
    """Return the shared circuit breaker, creating it on first use.

    The thresholds are read from ``config`` only when the instance is first
    created; later calls return the existing instance unchanged.

    Args:
        config: Configuration for initializing the circuit breaker. Loaded
            from the environment when omitted.

    Returns:
        The module-wide CircuitBreaker when ``config.enable_circuit_breaker``
        is truthy, otherwise None.
    """
    global _circuit_breaker

    settings = ScraperConfig.from_env() if config is None else config
    if not settings.enable_circuit_breaker:
        return None

    if _circuit_breaker is not None:
        return _circuit_breaker

    _circuit_breaker = CircuitBreaker(
        failure_threshold=settings.circuit_breaker_failure_threshold,
        recovery_timeout=settings.circuit_breaker_recovery_timeout,
    )
    return _circuit_breaker
|
|
|
|
|
|
def reset_circuit_breaker() -> None:
    """Reset the shared circuit breaker, if one has been created.

    The instance itself is kept; only its ``reset()`` method is invoked.
    """
    breaker = _circuit_breaker
    if breaker is None:
        return
    breaker.reset()
|
|
|
|
|
|
def check_circuit_breaker(config: ScraperConfig | None = None) -> None:
    """Ask the circuit breaker whether a request may proceed.

    Does nothing when the circuit breaker is disabled.

    Args:
        config: Configuration for the circuit breaker.

    Raises:
        CircuitBreakerOpenError: If the circuit is open (raised by the
            breaker's ``call()``).
    """
    breaker = get_circuit_breaker(config)
    if breaker is None:
        return
    breaker.call()
|
|
|
|
|
|
def _build_base_params(
    *,
    channel: ListingType,
    page: int,
    page_size: int,
    radius: float,
    min_price: int,
    max_price: int,
    min_bedrooms: int,
    max_bedrooms: int,
    district: str,
) -> dict[str, str]:
    """Build the query parameters shared by listing and probe requests.

    ``district`` is translated to a location identifier through
    ``districts.get_districts()``; a failed lookup propagates to the caller.
    All values are stringified, and results are always sorted by distance
    with unavailable properties excluded.

    Returns:
        A new dict of query-string parameters, in a fixed key order.
    """
    # Resolve the location first so an unknown district fails before any
    # other work is done.
    query: dict[str, str] = {
        "locationIdentifier": districts.get_districts()[district],
    }
    query["channel"] = str(channel).upper()
    query.update(
        page=str(page),
        numberOfPropertiesPerPage=str(page_size),
        radius=str(radius),
        sortBy="distance",
        includeUnavailableProperties="false",
        minPrice=str(min_price),
        maxPrice=str(max_price),
        minBedrooms=str(min_bedrooms),
        maxBedrooms=str(max_bedrooms),
        apiApplication="ANDROID",
        appVersion=ANDROID_APP_VERSION_LISTING,
    )
    return query
|
|
|
|
|
|
def _build_listing_params(
    *,
    page: int,
    channel: ListingType,
    min_bedrooms: int,
    max_bedrooms: int,
    radius: float,
    min_price: int,
    max_price: int,
    district: str,
    mustNewHome: bool,
    max_days_since_added: int | None,
    property_type: list[PropertyType],
    page_size: int,
    furnish_types: list[FurnishType],
) -> dict[str, str]:
    """Build the full query parameters for a listing search.

    Starts from ``_build_base_params`` and adds channel-specific filters:

    * BUY: always excludes shared-ownership and retirement listings, and
      optionally adds property types, a recency filter and the new-home
      requirement.
    * RENT: optionally adds furnish types.

    NOTE(review): the BUY-only scoping of the property-type, recency and
    new-home filters follows the "(BUY only)" notes in ``listing_query``'s
    documentation -- confirm against the canonical file.

    Args:
        max_days_since_added: One of 1, 3, 7 or 14, or None to omit the
            recency filter altogether.

    Returns:
        Query-string parameters for the property-listing endpoint.

    Raises:
        ValueError: If ``max_days_since_added`` is given but is not one of
            1, 3, 7 or 14.
    """
    params = _build_base_params(
        channel=channel,
        page=page,
        page_size=page_size,
        radius=radius,
        min_price=min_price,
        max_price=max_price,
        min_bedrooms=min_bedrooms,
        max_bedrooms=max_bedrooms,
        district=district,
    )

    if channel is ListingType.BUY:
        params["dontShow"] = "sharedOwnership,retirement"
        if property_type:
            params["propertyTypes"] = ",".join(property_type)
        # Only emit the recency filter when a value was supplied; previously
        # None passed validation and was then sent as the string "None".
        if max_days_since_added is not None:
            if max_days_since_added not in (1, 3, 7, 14):
                raise ValueError(
                    f"Invalid max_days_since_added={max_days_since_added}, "
                    f"must be one of [1, 3, 7, 14]"
                )
            params["maxDaysSinceAdded"] = str(max_days_since_added)

        if mustNewHome:
            params["mustHave"] = "newHome"
    elif channel is ListingType.RENT and furnish_types:
        params["furnishTypes"] = ",".join(furnish_types)
    return params
|
|
|
|
|
|
def _build_probe_params(
    *,
    channel: ListingType,
    min_bedrooms: int,
    max_bedrooms: int,
    radius: float,
    min_price: int,
    max_price: int,
    district: str,
    max_days_since_added: int,
    furnish_types: list[FurnishType],
) -> dict[str, str]:
    """Build query parameters for a minimal "probe" request.

    The probe always asks for page 1 with a page size of 1. Unlike the full
    listing builder, a ``max_days_since_added`` outside 1, 3, 7, 14 is
    silently dropped rather than rejected.

    Returns:
        Query-string parameters for the property-listing endpoint.
    """
    shared = dict(
        channel=channel,
        page=1,
        page_size=1,  # Minimal page size for probing
        radius=radius,
        min_price=min_price,
        max_price=max_price,
        min_bedrooms=min_bedrooms,
        max_bedrooms=max_bedrooms,
        district=district,
    )
    params = _build_base_params(**shared)

    if channel is ListingType.BUY:
        params["dontShow"] = "sharedOwnership,retirement"
        # None is never a member of the tuple, so no separate None check is
        # needed here.
        if max_days_since_added in (1, 3, 7, 14):
            params["maxDaysSinceAdded"] = str(max_days_since_added)

    if channel is ListingType.RENT and furnish_types:
        params["furnishTypes"] = ",".join(furnish_types)
    return params
|
|
|
|
|
|
async def _execute_api_request(
    *,
    url: str,
    params: dict[str, str],
    headers: dict[str, str],
    session: aiohttp.ClientSession | None,
    config: ScraperConfig,
    expect_data: bool = True,
    error_context: str = "",
) -> dict[str, Any]:
    """Perform one GET against the API with circuit-breaker bookkeeping.

    The circuit breaker is consulted before the request. The response is then
    passed to ``validate_response`` (expected to raise ThrottlingError when
    throttling is detected), and the outcome is recorded on the breaker.

    Args:
        url: Endpoint to request.
        params: Query-string parameters.
        headers: Per-request headers.
        session: Session to use. When None, a temporary one is created via
            ``create_session(config)`` so proxy and timeout settings match
            caller-managed sessions.
        config: Scraper configuration (timeouts, thresholds, breaker).
        expect_data: Forwarded to ``validate_response``.
        error_context: Prefix for the error message on non-200 responses.

    Returns:
        The decoded JSON body of a 200 response.

    Raises:
        CircuitBreakerOpenError: If the circuit breaker refuses the request.
        ThrottlingError: If ``validate_response`` flags the response.
        RightmoveAPIError: If the response status is not 200.
    """
    check_circuit_breaker(config)
    cb = get_circuit_breaker(config)

    async def do_request(s: aiohttp.ClientSession) -> dict[str, Any]:
        # A monotonic clock keeps the measured latency immune to wall-clock
        # adjustments.
        started = time.monotonic()
        try:
            async with s.get(url, params=params, headers=headers) as response:
                # Measured once the response has arrived; reading the body
                # below is not included.
                response_time = time.monotonic() - started
                body = await response.json() if response.status == 200 else None

                validate_response(
                    response,
                    response_time,
                    body,
                    config.slow_response_threshold,
                    expect_data=expect_data,
                )

                if response.status != 200:
                    # Include the HTTP status so the message is actionable
                    # whatever prefix the caller supplied.
                    raise RightmoveAPIError(
                        f"{error_context}HTTP {response.status}. "
                        f"Failed due to: {await response.text()}"
                    )

                if cb is not None:
                    cb.record_success()
                return body  # type: ignore
        except Exception:
            # Throttling, timeouts and API errors all count as failures.
            if cb is not None:
                cb.record_failure()
            raise

    if session is not None:
        return await do_request(session)

    # Go through the shared factory so the fallback session gets the same
    # proxy, default headers and timeout as caller-managed sessions.
    async with create_session(config) as new_session:
        return await do_request(new_session)
|
|
|
|
|
|
@retry(
    retry=retry_if_exception_type((ThrottlingError, TimeoutError)),
    wait=wait_exponential(multiplier=2, min=2, max=120),
    stop=stop_after_attempt(5),
)
async def detail_query(
    detail_id: int,
    session: aiohttp.ClientSession | None = None,
    config: ScraperConfig | None = None,
) -> dict[str, Any]:
    """Fetch the detail record for a single property.

    Throttled or timed-out attempts are retried up to 5 times in total, with
    exponential backoff bounded between 2 and 120 seconds.

    Args:
        detail_id: The property identifier.
        session: Optional aiohttp session. Creates new one if not provided.
        config: Scraper configuration. Loads from environment if not provided.

    Returns:
        Property details as a dictionary.

    Raises:
        CircuitBreakerOpenError: If the circuit breaker is open (not retried).
        RightmoveAPIError: If the API answers with a non-200 status (not
            retried).
        tenacity.RetryError: If every attempt ended in ThrottlingError or
            TimeoutError; the retry policy does not set ``reraise``, so the
            last error is wrapped rather than raised directly.
    """
    settings = ScraperConfig.from_env() if config is None else config

    return await _execute_api_request(
        url=f"{RIGHTMOVE_API_BASE}/property/{detail_id}",
        params={
            "apiApplication": "ANDROID",
            "appVersion": ANDROID_APP_VERSION,
        },
        headers=DEFAULT_HEADERS,
        session=session,
        config=settings,
        expect_data=True,
        error_context=f"id: {detail_id}. Status Code: ",
    )
|
|
|
|
|
|
@retry(
    retry=retry_if_exception_type((ThrottlingError, TimeoutError)),
    wait=wait_exponential(multiplier=2, min=2, max=120),
    stop=stop_after_attempt(5),
)
async def listing_query(
    *,
    page: int,
    channel: ListingType,
    min_bedrooms: int,
    max_bedrooms: int,
    radius: float,
    min_price: int,
    max_price: int,
    district: str,  # = "STATION^5168", # kings cross station
    mustNewHome: bool = False,
    max_days_since_added: int = 30,
    property_type: list[PropertyType] | None = None,
    page_size: int = 25,
    furnish_types: list[FurnishType] | None = None,
    session: aiohttp.ClientSession | None = None,
    config: ScraperConfig | None = None,
) -> dict[str, Any]:
    """Run one page of a listing search.

    Throttled or timed-out attempts are retried up to 5 times in total, with
    exponential backoff bounded between 2 and 120 seconds. Only the first
    page is validated as "expected to contain data".

    Args:
        page: Page number to fetch (1-indexed).
        channel: Listing type (BUY or RENT).
        min_bedrooms: Minimum number of bedrooms.
        max_bedrooms: Maximum number of bedrooms.
        radius: Search radius.
        min_price: Minimum price.
        max_price: Maximum price.
        district: District identifier string.
        mustNewHome: Filter for new homes only (BUY only).
        max_days_since_added: Maximum days since listing was added (BUY
            only). ``_build_listing_params`` accepts only 1, 3, 7 or 14 when
            it validates this value; the default of 30 is outside that set.
        property_type: List of property types to filter (BUY only).
        page_size: Number of results per page (default 25).
        furnish_types: List of furnish types to filter (RENT only).
        session: Optional aiohttp session. Creates new one if not provided.
        config: Scraper configuration. Loads from environment if not provided.

    Returns:
        API response as a dictionary.

    Raises:
        ValueError: If ``_build_listing_params`` rejects
            ``max_days_since_added`` (not retried).
        CircuitBreakerOpenError: If the circuit breaker is open (not retried).
        RightmoveAPIError: If the API answers with a non-200 status (not
            retried).
        tenacity.RetryError: If every attempt ended in ThrottlingError or
            TimeoutError; the retry policy does not set ``reraise``, so the
            last error is wrapped rather than raised directly.
    """
    settings = ScraperConfig.from_env() if config is None else config

    search = dict(
        page=page,
        channel=channel,
        min_bedrooms=min_bedrooms,
        max_bedrooms=max_bedrooms,
        radius=radius,
        min_price=min_price,
        max_price=max_price,
        district=district,
        mustNewHome=mustNewHome,
        max_days_since_added=max_days_since_added,
        # Missing filter lists are normalised to empty lists for the builder.
        property_type=[] if property_type is None else property_type,
        page_size=page_size,
        furnish_types=[] if furnish_types is None else furnish_types,
    )
    query = _build_listing_params(**search)

    # Later pages may legitimately be empty, so only page 1 must carry data.
    first_page = page == 1
    return await _execute_api_request(
        url=PROPERTY_LISTING_ENDPOINT,
        params=query,
        headers=LISTING_HEADERS,
        session=session,
        config=settings,
        expect_data=first_page,
    )
|
|
|
|
|
|
@retry(
    retry=retry_if_exception_type((ThrottlingError, TimeoutError)),
    wait=wait_exponential(multiplier=2, min=2, max=60),
    stop=stop_after_attempt(5),
)
async def probe_query(
    *,
    session: aiohttp.ClientSession,
    channel: ListingType,
    min_bedrooms: int,
    max_bedrooms: int,
    radius: float,
    min_price: int,
    max_price: int,
    district: str,
    max_days_since_added: int = 30,
    furnish_types: list[FurnishType] | None = None,
    config: ScraperConfig | None = None,
) -> dict[str, Any]:
    """Probe the API for a search's result count without fetching results.

    Issues a minimal request (page 1, page_size=1) so the caller can read
    totalAvailableResults cheaply. Throttled or timed-out attempts are
    retried up to 5 times in total, with exponential backoff bounded between
    2 and 60 seconds.

    Args:
        session: aiohttp session for making requests.
        channel: Listing type (BUY or RENT).
        min_bedrooms: Minimum number of bedrooms.
        max_bedrooms: Maximum number of bedrooms.
        radius: Search radius.
        min_price: Minimum price.
        max_price: Maximum price.
        district: District identifier string.
        max_days_since_added: Maximum days since listing was added (BUY
            only). Values other than 1, 3, 7 or 14 are ignored by
            ``_build_probe_params``.
        furnish_types: List of furnish types to filter (RENT only).
        config: Scraper configuration. Loads from environment if not provided.

    Returns:
        API response containing totalAvailableResults.

    Raises:
        CircuitBreakerOpenError: If the circuit breaker is open (not retried).
        RightmoveAPIError: If the API answers with a non-200 status (not
            retried).
        tenacity.RetryError: If every attempt ended in ThrottlingError or
            TimeoutError; the retry policy does not set ``reraise``, so the
            last error is wrapped rather than raised directly.
    """
    settings = ScraperConfig.from_env() if config is None else config

    criteria = dict(
        channel=channel,
        min_bedrooms=min_bedrooms,
        max_bedrooms=max_bedrooms,
        radius=radius,
        min_price=min_price,
        max_price=max_price,
        district=district,
        max_days_since_added=max_days_since_added,
        furnish_types=[] if furnish_types is None else furnish_types,
    )
    query = _build_probe_params(**criteria)

    # A probe may legitimately match nothing, so an empty result is not
    # treated as suspicious.
    return await _execute_api_request(
        url=PROPERTY_LISTING_ENDPOINT,
        params=query,
        headers=LISTING_HEADERS,
        session=session,
        config=settings,
        expect_data=False,
        error_context="Probe failed: ",
    )
|