Add intelligent query splitting to maximize Rightmove data extraction
This commit is contained in:
parent
29ba739063
commit
e8293c6042
11 changed files with 1970 additions and 113 deletions
|
|
@ -1,4 +1,5 @@
|
|||
"""Configuration modules."""
|
||||
from config.schedule_config import ScheduleConfig, SchedulesConfig
|
||||
from config.scraper_config import ScraperConfig
|
||||
|
||||
__all__ = ["ScheduleConfig", "SchedulesConfig"]
|
||||
__all__ = ["ScheduleConfig", "SchedulesConfig", "ScraperConfig"]
|
||||
|
|
|
|||
65
crawler/config/scraper_config.py
Normal file
65
crawler/config/scraper_config.py
Normal file
|
|
@ -0,0 +1,65 @@
|
|||
"""Scraper configuration with environment variable loading."""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from typing import Self
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ScraperConfig:
|
||||
"""Configuration for the Rightmove scraper.
|
||||
|
||||
Attributes:
|
||||
max_concurrent_requests: Maximum number of concurrent HTTP requests.
|
||||
request_delay_ms: Delay between requests in milliseconds.
|
||||
result_cap: Maximum results Rightmove returns per query (their limit).
|
||||
split_threshold: When results exceed this, split the query further.
|
||||
min_price_band: Minimum width of a price band (won't split below this).
|
||||
max_pages_per_query: Maximum pages to fetch per subquery (60 * 25 = 1500).
|
||||
proxy_url: Optional SOCKS proxy URL (e.g., socks5://localhost:9050 for Tor).
|
||||
"""
|
||||
|
||||
max_concurrent_requests: int = 5
|
||||
request_delay_ms: int = 100
|
||||
result_cap: int = 1500
|
||||
split_threshold: int = 1200 # Split when approaching cap
|
||||
min_price_band: int = 100 # Minimum band width in currency units
|
||||
max_pages_per_query: int = 60 # 60 * 25 = 1500 results max
|
||||
proxy_url: str | None = None
|
||||
|
||||
@classmethod
|
||||
def from_env(cls) -> Self:
|
||||
"""Load configuration from environment variables.
|
||||
|
||||
Environment variables:
|
||||
RIGHTMOVE_MAX_CONCURRENT: Max concurrent requests (default: 5)
|
||||
RIGHTMOVE_REQUEST_DELAY_MS: Request delay in ms (default: 100)
|
||||
RIGHTMOVE_RESULT_CAP: Result cap per query (default: 1500)
|
||||
RIGHTMOVE_SPLIT_THRESHOLD: Split threshold (default: 1200)
|
||||
RIGHTMOVE_MIN_PRICE_BAND: Minimum price band width (default: 100)
|
||||
RIGHTMOVE_MAX_PAGES: Max pages per query (default: 60)
|
||||
RIGHTMOVE_PROXY_URL: SOCKS proxy URL (default: None)
|
||||
|
||||
Returns:
|
||||
ScraperConfig instance with values from environment or defaults.
|
||||
"""
|
||||
return cls(
|
||||
max_concurrent_requests=int(
|
||||
os.environ.get("RIGHTMOVE_MAX_CONCURRENT", "5")
|
||||
),
|
||||
request_delay_ms=int(
|
||||
os.environ.get("RIGHTMOVE_REQUEST_DELAY_MS", "100")
|
||||
),
|
||||
result_cap=int(os.environ.get("RIGHTMOVE_RESULT_CAP", "1500")),
|
||||
split_threshold=int(
|
||||
os.environ.get("RIGHTMOVE_SPLIT_THRESHOLD", "1200")
|
||||
),
|
||||
min_price_band=int(
|
||||
os.environ.get("RIGHTMOVE_MIN_PRICE_BAND", "100")
|
||||
),
|
||||
max_pages_per_query=int(
|
||||
os.environ.get("RIGHTMOVE_MAX_PAGES", "60")
|
||||
),
|
||||
proxy_url=os.environ.get("RIGHTMOVE_PROXY_URL") or None,
|
||||
)
|
||||
Loading…
Add table
Add a link
Reference in a new issue