Flatten repo structure: move crawler/ to root, remove vqa/ and immoweb/

The crawler subdirectory was the only active project. Moving it to the repo root simplifies paths and removes the unnecessary nesting. The vqa/ and immoweb/ directories were legacy/unused and have been removed. Updated .drone.yml, .gitignore, .claude/ docs, and skills to reflect the new flat structure.
2026-02-07 23:01:20 +00:00 · 2026-02-07 23:01:20 +00:00 · eafbc1ac52
commit eafbc1ac52
parent e2247be700
221 changed files with 70 additions and 146140 deletions
--- a/config/__init__.py
+++ b/config/__init__.py
@ -0,0 +1,5 @@
+"""Configuration modules."""
+from config.schedule_config import ScheduleConfig, SchedulesConfig
+from config.scraper_config import ScraperConfig
+
+__all__ = ["ScheduleConfig", "SchedulesConfig", "ScraperConfig"]
--- a/config/schedule_config.py
+++ b/config/schedule_config.py
@ -0,0 +1,122 @@
+"""Schedule configuration for periodic scraping tasks."""
+from __future__ import annotations
+
+import json
+import logging
+import os
+import re
+from typing import Self
+
+from pydantic import BaseModel, field_validator
+
+from models.listing import FurnishType, ListingType, QueryParameters
+
+logger = logging.getLogger("uvicorn.error")
+
+# Cron field validation patterns. Each one accepts "*", a single value, a
+# comma-separated list of values, or a "*/N" step. Ranges such as "1-5" are
+# not accepted.
+# Minute: 0-59, with an optional leading zero ("5" or "05").
+CRON_MINUTE_PATTERN = re.compile(r"^(\*|([0-5]?\d)(,[0-5]?\d)*|\*/[1-9]\d*)$")
+# Hour: 0-23, with an optional leading zero ("2" or "02") so that padding is
+# treated the same way as in the minute field.
+CRON_HOUR_PATTERN = re.compile(r"^(\*|([01]?\d|2[0-3])(,([01]?\d|2[0-3]))*|\*/[1-9]\d*)$")
+# Day of week: 0-6; the step in "*/N" is limited to 1-6.
+CRON_DAY_OF_WEEK_PATTERN = re.compile(r"^(\*|[0-6](,[0-6])*|\*/[1-6])$")
+
+
+class ScheduleConfig(BaseModel):
+    """Settings describing one recurring scrape job.
+
+    The ``minute``, ``hour`` and ``day_of_week`` fields hold cron-style
+    expressions and are checked against the module's cron patterns when the
+    model is validated. The remaining fields are the search filters that
+    ``to_query_parameters`` hands to ``QueryParameters``.
+    """
+
+    name: str
+    enabled: bool = True
+    # Cron timing fields; the defaults are minute "0", hour "2", any day of week.
+    minute: str = "0"
+    hour: str = "2"
+    day_of_week: str = "*"
+    # Search filters forwarded to the scrape query.
+    listing_type: ListingType
+    min_bedrooms: int = 1
+    max_bedrooms: int = 999
+    min_price: int = 0
+    max_price: int = 10_000_000
+    district_names: list[str] = []
+    # Raw furnish type values; converted to FurnishType in to_query_parameters.
+    furnish_types: list[str] | None = None
+
+    @field_validator("minute")
+    @classmethod
+    def validate_minute(cls, v: str) -> str:
+        """Return ``v`` unchanged if it matches the cron minute pattern, else raise ValueError."""
+        if CRON_MINUTE_PATTERN.match(v) is not None:
+            return v
+        raise ValueError(
+            f"Invalid cron minute '{v}'. Must be 0-59, *, */N, or comma-separated values."
+        )
+
+    @field_validator("hour")
+    @classmethod
+    def validate_hour(cls, v: str) -> str:
+        """Return ``v`` unchanged if it matches the cron hour pattern, else raise ValueError."""
+        if CRON_HOUR_PATTERN.match(v) is not None:
+            return v
+        raise ValueError(
+            f"Invalid cron hour '{v}'. Must be 0-23, *, */N, or comma-separated values."
+        )
+
+    @field_validator("day_of_week")
+    @classmethod
+    def validate_day_of_week(cls, v: str) -> str:
+        """Return ``v`` unchanged if it matches the cron day-of-week pattern, else raise ValueError."""
+        if CRON_DAY_OF_WEEK_PATTERN.match(v) is not None:
+            return v
+        raise ValueError(
+            f"Invalid cron day_of_week '{v}'. Must be 0-6, *, */N, or comma-separated values."
+        )
+
+    def to_query_parameters(self) -> QueryParameters:
+        """Build the ``QueryParameters`` used by this schedule's scrape task.
+
+        ``district_names`` is passed as a set, and each entry of
+        ``furnish_types`` is converted to a ``FurnishType``.
+        """
+        # A missing or empty furnish_types list is forwarded as None.
+        furnish_filter: list[FurnishType] | None = (
+            [FurnishType(ft) for ft in self.furnish_types]
+            if self.furnish_types
+            else None
+        )
+        unique_districts = set(self.district_names)
+
+        return QueryParameters(
+            listing_type=self.listing_type,
+            min_bedrooms=self.min_bedrooms,
+            max_bedrooms=self.max_bedrooms,
+            min_price=self.min_price,
+            max_price=self.max_price,
+            district_names=unique_districts,
+            furnish_types=furnish_filter,
+        )
+
+
+class SchedulesConfig(BaseModel):
+    """Holds the full list of configured scrape schedules."""
+
+    schedules: list[ScheduleConfig] = []
+
+    @classmethod
+    def from_env(cls, env_var: str = "SCRAPE_SCHEDULES") -> Self:
+        """Build the schedule list from a JSON array stored in an environment variable.
+
+        An unset, empty or whitespace-only variable yields an instance with
+        no schedules and logs an informational message.
+
+        Args:
+            env_var: Name of the environment variable holding the JSON array.
+
+        Returns:
+            SchedulesConfig populated with one ScheduleConfig per array item.
+
+        Raises:
+            ValueError: If the value is not valid JSON or is not a JSON array.
+                Errors from validating an individual schedule propagate from
+                ``ScheduleConfig.model_validate``.
+        """
+        payload = os.environ.get(env_var, "")
+        payload = payload.strip()
+
+        if payload == "":
+            logger.info(f"No {env_var} configured, no periodic scrapes will be scheduled")
+            return cls(schedules=[])
+
+        try:
+            decoded = json.loads(payload)
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Invalid JSON in {env_var}: {e}") from e
+
+        # Only a top-level array is accepted; each element becomes one schedule.
+        if isinstance(decoded, list):
+            return cls(schedules=list(map(ScheduleConfig.model_validate, decoded)))
+        raise ValueError(f"{env_var} must be a JSON array")
+
+    def get_enabled_schedules(self) -> list[ScheduleConfig]:
+        """Return the schedules whose ``enabled`` flag is truthy, in their original order."""
+        active: list[ScheduleConfig] = []
+        for schedule in self.schedules:
+            if schedule.enabled:
+                active.append(schedule)
+        return active
--- a/config/scraper_config.py
+++ b/config/scraper_config.py
@ -0,0 +1,89 @@
+"""Scraper configuration with environment variable loading."""
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass
+from typing import Self
+
+
+@dataclass(frozen=True)
+class ScraperConfig:
+    """Configuration for the Rightmove scraper.
+
+    Attributes:
+        max_concurrent_requests: Maximum number of concurrent HTTP requests.
+        request_delay_ms: Delay between requests in milliseconds.
+        result_cap: Maximum results Rightmove returns per query (their limit).
+        split_threshold: When results exceed this, split the query further.
+        min_price_band: Minimum width of a price band (won't split below this).
+        max_pages_per_query: Maximum pages to fetch per subquery (60 * 25 = 1500).
+        proxy_url: Optional SOCKS proxy URL (e.g., socks5://localhost:9050 for Tor).
+        slow_response_threshold: Response time threshold in seconds for throttle detection.
+        enable_circuit_breaker: Whether to enable circuit breaker protection.
+        circuit_breaker_failure_threshold: Number of consecutive failures to open circuit.
+        circuit_breaker_recovery_timeout: Seconds to wait before testing recovery.
+    """
+
+    max_concurrent_requests: int = 5
+    request_delay_ms: int = 100
+    result_cap: int = 1500
+    split_threshold: int = 1200  # Split when approaching cap
+    min_price_band: int = 100  # Minimum band width in currency units
+    max_pages_per_query: int = 60  # 60 * 25 = 1500 results max
+    proxy_url: str | None = None
+    slow_response_threshold: float = 10.0  # seconds
+    enable_circuit_breaker: bool = True
+    circuit_breaker_failure_threshold: int = 5
+    circuit_breaker_recovery_timeout: float = 60.0
+
+    @classmethod
+    def from_env(cls) -> Self:
+        """Load configuration from environment variables.
+
+        A variable that is unset, empty or whitespace-only is ignored, so the
+        field keeps the default declared on the dataclass. The defaults are
+        therefore defined in one place only.
+
+        Environment variables:
+            RIGHTMOVE_MAX_CONCURRENT: Max concurrent requests (default: 5)
+            RIGHTMOVE_REQUEST_DELAY_MS: Request delay in ms (default: 100)
+            RIGHTMOVE_RESULT_CAP: Result cap per query (default: 1500)
+            RIGHTMOVE_SPLIT_THRESHOLD: Split threshold (default: 1200)
+            RIGHTMOVE_MIN_PRICE_BAND: Minimum price band width (default: 100)
+            RIGHTMOVE_MAX_PAGES: Max pages per query (default: 60)
+            RIGHTMOVE_PROXY_URL: SOCKS proxy URL (default: None)
+            RIGHTMOVE_SLOW_RESPONSE_THRESHOLD: Slow response threshold in seconds (default: 10.0)
+            RIGHTMOVE_ENABLE_CIRCUIT_BREAKER: Enable circuit breaker; "true", "1" or
+                "yes" (case-insensitive) enable it, any other non-empty value
+                disables it (default: True)
+            RIGHTMOVE_CIRCUIT_BREAKER_FAILURES: Failures to open circuit (default: 5)
+            RIGHTMOVE_CIRCUIT_BREAKER_TIMEOUT: Recovery timeout in seconds (default: 60.0)
+
+        Returns:
+            ScraperConfig instance with values from environment or defaults.
+
+        Raises:
+            ValueError: If a numeric variable holds a value that cannot be
+                parsed; the message names the offending variable.
+        """
+
+        def parse_bool(raw: str) -> bool:
+            return raw.lower() in ("true", "1", "yes")
+
+        # (dataclass field, environment variable, parser) for every setting.
+        env_spec = (
+            ("max_concurrent_requests", "RIGHTMOVE_MAX_CONCURRENT", int),
+            ("request_delay_ms", "RIGHTMOVE_REQUEST_DELAY_MS", int),
+            ("result_cap", "RIGHTMOVE_RESULT_CAP", int),
+            ("split_threshold", "RIGHTMOVE_SPLIT_THRESHOLD", int),
+            ("min_price_band", "RIGHTMOVE_MIN_PRICE_BAND", int),
+            ("max_pages_per_query", "RIGHTMOVE_MAX_PAGES", int),
+            ("proxy_url", "RIGHTMOVE_PROXY_URL", str),
+            ("slow_response_threshold", "RIGHTMOVE_SLOW_RESPONSE_THRESHOLD", float),
+            ("enable_circuit_breaker", "RIGHTMOVE_ENABLE_CIRCUIT_BREAKER", parse_bool),
+            ("circuit_breaker_failure_threshold", "RIGHTMOVE_CIRCUIT_BREAKER_FAILURES", int),
+            ("circuit_breaker_recovery_timeout", "RIGHTMOVE_CIRCUIT_BREAKER_TIMEOUT", float),
+        )
+
+        overrides: dict[str, object] = {}
+        for field_name, env_name, parse in env_spec:
+            raw = os.environ.get(env_name, "").strip()
+            if not raw:
+                # Unset or blank: fall back to the dataclass default.
+                continue
+            try:
+                overrides[field_name] = parse(raw)
+            except ValueError as exc:
+                raise ValueError(f"Invalid value for {env_name}: {raw!r}") from exc
+
+        return cls(**overrides)