Add throttling detection and circuit breaker for Rightmove scraper

This commit is contained in:
Viktor Barzin 2026-02-02 22:50:19 +00:00
parent e8293c6042
commit f880664a98
10 changed files with 1428 additions and 86 deletions

View file

@ -18,6 +18,10 @@ class ScraperConfig:
min_price_band: Minimum width of a price band (won't split below this).
max_pages_per_query: Maximum pages to fetch per subquery (60 * 25 = 1500).
proxy_url: Optional SOCKS proxy URL (e.g., socks5://localhost:9050 for Tor).
slow_response_threshold: Response time threshold in seconds for throttle detection.
enable_circuit_breaker: Whether to enable circuit breaker protection.
circuit_breaker_failure_threshold: Number of consecutive failures to open circuit.
circuit_breaker_recovery_timeout: Seconds to wait before testing recovery.
"""
max_concurrent_requests: int = 5
@ -27,6 +31,10 @@ class ScraperConfig:
min_price_band: int = 100 # Minimum band width in currency units
max_pages_per_query: int = 60 # 60 * 25 = 1500 results max
proxy_url: str | None = None
slow_response_threshold: float = 10.0 # seconds
enable_circuit_breaker: bool = True
circuit_breaker_failure_threshold: int = 5
circuit_breaker_recovery_timeout: float = 60.0
@classmethod
def from_env(cls) -> Self:
@ -40,6 +48,10 @@ class ScraperConfig:
RIGHTMOVE_MIN_PRICE_BAND: Minimum price band width (default: 100)
RIGHTMOVE_MAX_PAGES: Max pages per query (default: 60)
RIGHTMOVE_PROXY_URL: SOCKS proxy URL (default: None)
RIGHTMOVE_SLOW_RESPONSE_THRESHOLD: Slow response threshold in seconds (default: 10.0)
RIGHTMOVE_ENABLE_CIRCUIT_BREAKER: Enable circuit breaker; 'true', '1' or 'yes' (case-insensitive) enable it, any other value disables it (default: true)
RIGHTMOVE_CIRCUIT_BREAKER_FAILURES: Consecutive failures before the circuit opens (default: 5)
RIGHTMOVE_CIRCUIT_BREAKER_TIMEOUT: Recovery timeout in seconds (default: 60.0)
Returns:
ScraperConfig instance with values from environment or defaults.
@ -62,4 +74,16 @@ class ScraperConfig:
os.environ.get("RIGHTMOVE_MAX_PAGES", "60")
),
proxy_url=os.environ.get("RIGHTMOVE_PROXY_URL") or None,
slow_response_threshold=float(
os.environ.get("RIGHTMOVE_SLOW_RESPONSE_THRESHOLD", "10.0")
),
enable_circuit_breaker=os.environ.get(
"RIGHTMOVE_ENABLE_CIRCUIT_BREAKER", "true"
).lower() in ("true", "1", "yes"),
circuit_breaker_failure_threshold=int(
os.environ.get("RIGHTMOVE_CIRCUIT_BREAKER_FAILURES", "5")
),
circuit_breaker_recovery_timeout=float(
os.environ.get("RIGHTMOVE_CIRCUIT_BREAKER_TIMEOUT", "60.0")
),
)