"""Scraper configuration with environment variable loading.""" from __future__ import annotations import os from dataclasses import dataclass from typing import Self @dataclass(frozen=True) class ScraperConfig: """Configuration for the Rightmove scraper. Attributes: max_concurrent_requests: Maximum number of concurrent HTTP requests. request_delay_ms: Delay between requests in milliseconds. result_cap: Maximum results Rightmove returns per query (their limit). split_threshold: When results exceed this, split the query further. min_price_band: Minimum width of a price band (won't split below this). max_pages_per_query: Maximum pages to fetch per subquery (60 * 25 = 1500). proxy_url: Optional SOCKS proxy URL (e.g., socks5://localhost:9050 for Tor). slow_response_threshold: Response time threshold in seconds for throttle detection. enable_circuit_breaker: Whether to enable circuit breaker protection. circuit_breaker_failure_threshold: Number of consecutive failures to open circuit. circuit_breaker_recovery_timeout: Seconds to wait before testing recovery. """ max_concurrent_requests: int = 5 request_delay_ms: int = 100 result_cap: int = 1500 split_threshold: int = 1200 # Split when approaching cap min_price_band: int = 100 # Minimum band width in currency units max_pages_per_query: int = 60 # 60 * 25 = 1500 results max proxy_url: str | None = None slow_response_threshold: float = 10.0 # seconds enable_circuit_breaker: bool = True circuit_breaker_failure_threshold: int = 5 circuit_breaker_recovery_timeout: float = 60.0 @classmethod def from_env(cls) -> Self: """Load configuration from environment variables. Environment variables: RIGHTMOVE_MAX_CONCURRENT: Max concurrent requests (default: 5) RIGHTMOVE_REQUEST_DELAY_MS: Request delay in ms (default: 100) RIGHTMOVE_RESULT_CAP: Result cap per query (default: 1500) RIGHTMOVE_SPLIT_THRESHOLD: Split threshold (default: 1200) RIGHTMOVE_MIN_PRICE_BAND: Minimum price band width (default: 100) RIGHTMOVE_MAX_PAGES: Max pages per query (default: 60) RIGHTMOVE_PROXY_URL: SOCKS proxy URL (default: None) RIGHTMOVE_SLOW_RESPONSE_THRESHOLD: Slow response threshold in seconds (default: 10.0) RIGHTMOVE_ENABLE_CIRCUIT_BREAKER: Enable circuit breaker (default: True) RIGHTMOVE_CIRCUIT_BREAKER_FAILURES: Failures to open circuit (default: 5) RIGHTMOVE_CIRCUIT_BREAKER_TIMEOUT: Recovery timeout in seconds (default: 60.0) Returns: ScraperConfig instance with values from environment or defaults. """ return cls( max_concurrent_requests=int( os.environ.get("RIGHTMOVE_MAX_CONCURRENT", "5") ), request_delay_ms=int( os.environ.get("RIGHTMOVE_REQUEST_DELAY_MS", "100") ), result_cap=int(os.environ.get("RIGHTMOVE_RESULT_CAP", "1500")), split_threshold=int( os.environ.get("RIGHTMOVE_SPLIT_THRESHOLD", "1200") ), min_price_band=int( os.environ.get("RIGHTMOVE_MIN_PRICE_BAND", "100") ), max_pages_per_query=int( os.environ.get("RIGHTMOVE_MAX_PAGES", "60") ), proxy_url=os.environ.get("RIGHTMOVE_PROXY_URL") or None, slow_response_threshold=float( os.environ.get("RIGHTMOVE_SLOW_RESPONSE_THRESHOLD", "10.0") ), enable_circuit_breaker=os.environ.get( "RIGHTMOVE_ENABLE_CIRCUIT_BREAKER", "true" ).lower() in ("true", "1", "yes"), circuit_breaker_failure_threshold=int( os.environ.get("RIGHTMOVE_CIRCUIT_BREAKER_FAILURES", "5") ), circuit_breaker_recovery_timeout=float( os.environ.get("RIGHTMOVE_CIRCUIT_BREAKER_TIMEOUT", "60.0") ), )