Requests to Rightmove API previously had no explicit timeout, causing hung connections to block workers indefinitely. Add a configurable request_timeout (default 30s) to ScraperConfig and apply it to all aiohttp sessions. Also retry on TimeoutError in addition to ThrottlingError for all API query functions.
99 lines
4.5 KiB
Python
99 lines
4.5 KiB
Python
"""Scraper configuration with environment variable loading."""
|
|
from __future__ import annotations
|
|
|
|
import multiprocessing
|
|
import os
|
|
from dataclasses import dataclass
|
|
from typing import Self
|
|
|
|
# OCR may use at most a quarter of the available cores (but always at least
# one worker) so that it does not starve other work.
MAX_OCR_WORKERS = max(multiprocessing.cpu_count() // 4, 1)
|
|
|
|
|
|
@dataclass(frozen=True)
|
|
class ScraperConfig:
|
|
"""Configuration for the Rightmove scraper.
|
|
|
|
Attributes:
|
|
max_concurrent_requests: Maximum number of concurrent HTTP requests.
|
|
request_delay_ms: Delay between requests in milliseconds.
|
|
result_cap: Maximum results Rightmove returns per query (their limit).
|
|
split_threshold: When results exceed this, split the query further.
|
|
min_price_band: Minimum width of a price band (won't split below this).
|
|
max_pages_per_query: Maximum pages to fetch per subquery (60 * 25 = 1500).
|
|
proxy_url: Optional SOCKS proxy URL (e.g., socks5://localhost:9050 for Tor).
|
|
slow_response_threshold: Response time threshold in seconds for throttle detection.
|
|
request_timeout: Total timeout per HTTP request in seconds.
|
|
enable_circuit_breaker: Whether to enable circuit breaker protection.
|
|
circuit_breaker_failure_threshold: Number of consecutive failures to open circuit.
|
|
circuit_breaker_recovery_timeout: Seconds to wait before testing recovery.
|
|
"""
|
|
|
|
max_concurrent_requests: int = 5
|
|
request_delay_ms: int = 100
|
|
result_cap: int = 1500
|
|
split_threshold: int = 1200 # Split when approaching cap
|
|
min_price_band: int = 100 # Minimum band width in currency units
|
|
max_pages_per_query: int = 60 # 60 * 25 = 1500 results max
|
|
proxy_url: str | None = None
|
|
slow_response_threshold: float = 10.0 # seconds
|
|
request_timeout: float = 30.0 # seconds
|
|
enable_circuit_breaker: bool = True
|
|
circuit_breaker_failure_threshold: int = 5
|
|
circuit_breaker_recovery_timeout: float = 60.0
|
|
|
|
@classmethod
|
|
def from_env(cls) -> Self:
|
|
"""Load configuration from environment variables.
|
|
|
|
Environment variables:
|
|
RIGHTMOVE_MAX_CONCURRENT: Max concurrent requests (default: 5)
|
|
RIGHTMOVE_REQUEST_DELAY_MS: Request delay in ms (default: 100)
|
|
RIGHTMOVE_RESULT_CAP: Result cap per query (default: 1500)
|
|
RIGHTMOVE_SPLIT_THRESHOLD: Split threshold (default: 1200)
|
|
RIGHTMOVE_MIN_PRICE_BAND: Minimum price band width (default: 100)
|
|
RIGHTMOVE_MAX_PAGES: Max pages per query (default: 60)
|
|
RIGHTMOVE_PROXY_URL: SOCKS proxy URL (default: None)
|
|
RIGHTMOVE_SLOW_RESPONSE_THRESHOLD: Slow response threshold in seconds (default: 10.0)
|
|
RIGHTMOVE_REQUEST_TIMEOUT: Total timeout per HTTP request in seconds (default: 30.0)
|
|
RIGHTMOVE_ENABLE_CIRCUIT_BREAKER: Enable circuit breaker (default: True)
|
|
RIGHTMOVE_CIRCUIT_BREAKER_FAILURES: Failures to open circuit (default: 5)
|
|
RIGHTMOVE_CIRCUIT_BREAKER_TIMEOUT: Recovery timeout in seconds (default: 60.0)
|
|
|
|
Returns:
|
|
ScraperConfig instance with values from environment or defaults.
|
|
"""
|
|
return cls(
|
|
max_concurrent_requests=int(
|
|
os.environ.get("RIGHTMOVE_MAX_CONCURRENT", "5")
|
|
),
|
|
request_delay_ms=int(
|
|
os.environ.get("RIGHTMOVE_REQUEST_DELAY_MS", "100")
|
|
),
|
|
result_cap=int(os.environ.get("RIGHTMOVE_RESULT_CAP", "1500")),
|
|
split_threshold=int(
|
|
os.environ.get("RIGHTMOVE_SPLIT_THRESHOLD", "1200")
|
|
),
|
|
min_price_band=int(
|
|
os.environ.get("RIGHTMOVE_MIN_PRICE_BAND", "100")
|
|
),
|
|
max_pages_per_query=int(
|
|
os.environ.get("RIGHTMOVE_MAX_PAGES", "60")
|
|
),
|
|
proxy_url=os.environ.get("RIGHTMOVE_PROXY_URL") or None,
|
|
slow_response_threshold=float(
|
|
os.environ.get("RIGHTMOVE_SLOW_RESPONSE_THRESHOLD", "10.0")
|
|
),
|
|
request_timeout=float(
|
|
os.environ.get("RIGHTMOVE_REQUEST_TIMEOUT", "30.0")
|
|
),
|
|
enable_circuit_breaker=os.environ.get(
|
|
"RIGHTMOVE_ENABLE_CIRCUIT_BREAKER", "true"
|
|
).lower() in ("true", "1", "yes"),
|
|
circuit_breaker_failure_threshold=int(
|
|
os.environ.get("RIGHTMOVE_CIRCUIT_BREAKER_FAILURES", "5")
|
|
),
|
|
circuit_breaker_recovery_timeout=float(
|
|
os.environ.get("RIGHTMOVE_CIRCUIT_BREAKER_TIMEOUT", "60.0")
|
|
),
|
|
)
|