"""Numbeo HTML scraper — parses the public `cost-of-living/in/` pages directly.

No LLM interpretation — uses regex against the table structure. The page
format is stable enough across cities that a single parser works for all of
them.

We extract:
  - The headline ex-rent total (one number, EUR-prefixed)
  - The 1-bed center / outside rent (two rows in the rent table)

Per-category breakdown is intentionally NOT extracted by the live scraper —
the headline two numbers are what the simulator uses for ratios, and the
breakdown rows are noisy (averages of varying-sample sizes). The hand-curated
`baseline.py` carries the breakdowns where they exist; the cache layer falls
back to baseline.py if a breakdown is needed for the UI.

ToS posture: Numbeo's robots.txt allows /cost-of-living/* for major crawlers.
We send a polite UA, ≤1 req/sec, 30s timeout, exponential backoff on 429/5xx,
and never re-scrape within the cache TTL.
"""
from __future__ import annotations

import asyncio
import logging
import re
from datetime import UTC, date, datetime, timedelta
from decimal import Decimal
from typing import Final

import httpx

from fire_planner.col.models import CategoryBreakdown, CityCostIndex, ColSource

log = logging.getLogger(__name__)

# Every city page lives directly under this prefix: BASE_URL/<City-Segment>.
BASE_URL: Final = "https://www.numbeo.com/cost-of-living/in"

# Identifies the scraper and its purpose to the site operator.
USER_AGENT: Final = (
    "fire-planner/0.1 "
    "(+https://forgejo.viktorbarzin.me/viktor/code; "
    "non-commercial personal use; 1-year cache)"
)

# Per-phase limits in seconds; only the read phase gets the longer 30s budget.
DEFAULT_TIMEOUT: Final = httpx.Timeout(
    read=30.0,
    connect=10.0,
    write=10.0,
    pool=10.0,
)

# Seconds between requests — polite (just over one second).
MIN_REQUEST_INTERVAL: Final = 1.1

# Currency-to-GBP rates for common Numbeo source pages. Snapshot once at
# scraper init; refresh by editing this map (rare — within ±5% over a
# year). When a city's local currency isn't here, the scraper falls back
# to the EUR amount Numbeo always prints alongside (€-prefixed) — that
# requires only one rate (EUR_TO_GBP) which is universally present.
# GBP value of one euro. Every figure the scraper reads is the €-prefixed
# amount, so this is the one rate the conversion always needs.
EUR_TO_GBP: Final = Decimal("0.862")

# GBP value of one unit of each local currency, keyed by currency code.
# Rates are written as strings so Decimal keeps them exact; insertion order
# is EUR first, then the pairs below in the order listed.
LOCAL_TO_GBP: Final[dict[str, Decimal]] = {
    "EUR": EUR_TO_GBP,
    **{
        code: Decimal(rate)
        for code, rate in (
            ("GBP", "1.0"),
            ("USD", "0.787"),
            ("BGN", "0.435"),
            ("RON", "0.173"),
            ("GEL", "0.295"),
            ("AED", "0.21505"),
            ("MYR", "0.171"),
            ("THB", "0.02198"),
            ("IDR", "0.0000485"),
            ("SGD", "0.585"),
            ("TWD", "0.0246"),
            ("VND", "0.0000316"),
            ("MXN", "0.0394"),
            ("COP", "0.000195"),
            ("PYG", "0.000099"),
            ("UYU", "0.0197"),
            ("PAB", "0.787"),  # Panamanian Balboa pegged to USD
            ("QAR", "0.216"),  # Qatari Riyal
            ("BHD", "2.09"),
            ("JPY", "0.00520"),
            ("KRW", "0.000565"),
            ("HKD", "0.101"),
            ("TRY", "0.0204"),  # volatile — refresh more often
            ("RSD", "0.00737"),
            ("HRK", "0.114"),
            ("HUF", "0.00213"),
            ("CZK", "0.0345"),
            ("PLN", "0.196"),
            ("ALL", "0.00859"),
        )
    },
}

# --- Regex patterns for the Numbeo page ---

# The "Estimated monthly costs for a single person" headline appears as:
#   "Estimated monthly costs for a single person are €X.X"
# with the EUR figure always quoted (Numbeo's site currency is EUR).
# Group 1 is the amount, thousands separators included.
_HEADLINE_EUR_RE = re.compile(
    r"single\s+person[^<]*?(?:are|=)\s*€\s*([0-9,]+(?:\.[0-9]+)?)",
    flags=re.IGNORECASE,
)

# The rent rows look like:
#   Apartment (1 bedroom) in City Centre...€2,317.19...
# Both rent patterns accept the "Centre" and "Center" spellings, and both stop
# scanning at the next "Apartment (" row label: a row that carries no € figure
# then yields no match instead of silently borrowing a later row's price.
# Group 1 is the amount, thousands separators included.
_RENT_CENTER_EUR_RE = re.compile(
    r"Apartment\s*\(1\s*bedroom\)\s*in\s*City\s*Cent(?:re|er)"
    r"(?:(?!Apartment\s*\().)*?€\s*([0-9,]+(?:\.[0-9]+)?)",
    re.IGNORECASE | re.DOTALL,
)
_RENT_OUTSIDE_EUR_RE = re.compile(
    r"Apartment\s*\(1\s*bedroom\)\s*Outside\s*of\s*Cent(?:re|er)"
    r"(?:(?!Apartment\s*\().)*?€\s*([0-9,]+(?:\.[0-9]+)?)",
    re.IGNORECASE | re.DOTALL,
)


class NumbeoFetchError(RuntimeError):
    """Wraps any HTTP / parsing failure so the cache layer can fall back."""


def _parse_num(s: str) -> Decimal:
    """Convert a comma-grouped figure such as ``"2,317.19"`` to a Decimal.

    Raises decimal.InvalidOperation when nothing numeric is left once the
    commas are removed.
    """
    return Decimal(s.replace(",", ""))


def _slug_to_url_segment(slug: str) -> str:
    """`ho-chi-minh-city` → `Ho-Chi-Minh-City` (Numbeo capitalises words)."""
    return "-".join(part.capitalize() for part in slug.split("-"))


class NumbeoScraper:
    """Async Numbeo fetcher with per-instance polite rate-limiting.

    Use as a context manager so the httpx client is cleanly closed:

        async with NumbeoScraper() as scraper:
            idx = await scraper.fetch("sofia")

    A client passed in by the caller is used as-is and is NOT closed on exit;
    only a client created here is owned (and closed) by the scraper.
    """

    def __init__(
        self,
        *,
        client: httpx.AsyncClient | None = None,
        min_interval: float = MIN_REQUEST_INTERVAL,
    ) -> None:
        """Set up the HTTP client and the rate-limiter state.

        Args:
            client: Optional pre-built client. When omitted, one is created
                with the module's User-Agent, timeout and redirect-following.
            min_interval: Minimum number of seconds between two requests made
                through this instance.
        """
        self._owns_client = client is None
        self._client = client or httpx.AsyncClient(
            headers={"User-Agent": USER_AGENT, "Accept-Language": "en-GB,en;q=0.9"},
            timeout=DEFAULT_TIMEOUT,
            follow_redirects=True,
        )
        self._min_interval = min_interval
        # Event-loop clock reading taken when the last request slot was handed out.
        self._last_request_at: float = 0.0
        self._lock = asyncio.Lock()

    async def __aenter__(self) -> NumbeoScraper:
        return self

    async def __aexit__(self, *_: object) -> None:
        # Only close a client this instance created itself.
        if self._owns_client:
            await self._client.aclose()

    async def _polite_wait(self) -> None:
        """Block until `min_interval` seconds have passed since the last slot.

        The lock is held while sleeping, so concurrent callers queue up and
        are released one per interval.
        """
        async with self._lock:
            now = asyncio.get_running_loop().time()
            elapsed = now - self._last_request_at
            if elapsed < self._min_interval:
                await asyncio.sleep(self._min_interval - elapsed)
            self._last_request_at = asyncio.get_running_loop().time()

    async def fetch(
        self,
        city_slug: str,
        *,
        country: str = "",
        raw_currency: str = "EUR",
    ) -> CityCostIndex:
        """Scrape one city's headline numbers from Numbeo.

        Args:
            city_slug: Lower-case hyphenated city name, e.g. ``"sofia"``.
            country: Copied unchanged into the result.
            raw_currency: Currency code recorded in the result's source. A
                code missing from LOCAL_TO_GBP does not fail the fetch: the
                EUR rate is recorded instead and a warning is logged.

        Raises:
            NumbeoFetchError: on a transport error, an error HTTP status, or
                when the page cannot be parsed. The caller (cache layer)
                should catch it and fall back to baseline.py.
        """
        url_segment = _slug_to_url_segment(city_slug)
        url = f"{BASE_URL}/{url_segment}"
        await self._polite_wait()
        try:
            resp = await self._client.get(url)
            resp.raise_for_status()
        except httpx.HTTPError as e:
            raise NumbeoFetchError(f"HTTP error for {url}: {e}") from e
        return self._parse(city_slug, country, raw_currency, url, resp.text)

    @staticmethod
    def _parse(
        city_slug: str,
        country: str,
        raw_currency: str,
        url: str,
        html: str,
    ) -> CityCostIndex:
        """Turn a fetched city page into a CityCostIndex (all amounts in GBP).

        The headline and the city-centre rent are mandatory; the
        outside-of-centre rent is optional and becomes None when absent.

        Raises:
            NumbeoFetchError: when a mandatory figure is missing, or when a
                captured figure cannot be converted to a number.
        """
        # Function-scope import: only the error path below needs this name.
        from decimal import InvalidOperation

        headline_match = _HEADLINE_EUR_RE.search(html)
        rent_center_match = _RENT_CENTER_EUR_RE.search(html)
        rent_outside_match = _RENT_OUTSIDE_EUR_RE.search(html)
        if not (headline_match and rent_center_match):
            raise NumbeoFetchError(
                f"could not locate headline or rent rows on {url}"
            )

        pence = Decimal("0.01")
        try:
            no_rent_eur = _parse_num(headline_match.group(1))
            rent_center_eur = _parse_num(rent_center_match.group(1))
            rent_outside_eur = (
                _parse_num(rent_outside_match.group(1)) if rent_outside_match else None
            )

            no_rent_gbp = no_rent_eur * EUR_TO_GBP
            rent_center_gbp = rent_center_eur * EUR_TO_GBP
            with_rent_gbp = no_rent_gbp + rent_center_gbp

            no_rent_rounded = no_rent_gbp.quantize(pence)
            with_rent_rounded = with_rent_gbp.quantize(pence)
            rent_center_rounded = rent_center_gbp.quantize(pence)
            rent_outside_rounded = (
                (rent_outside_eur * EUR_TO_GBP).quantize(pence)
                if rent_outside_eur is not None
                else None
            )
        except InvalidOperation as e:
            # e.g. the amount pattern captured only separators (","), or the
            # figure is too large to quantize. Surface it as the error type
            # the cache layer already handles rather than a raw decimal error.
            raise NumbeoFetchError(f"malformed amount on {url}: {e}") from e

        # `gbp_per_unit` reflects the conversion FROM the underlying
        # local currency, not the EUR-side intermediate. When the page
        # quotes a non-EUR local currency, downstream code may want the
        # local→GBP rate for display; we record what we know.
        gbp_per_unit = LOCAL_TO_GBP.get(raw_currency)
        if gbp_per_unit is None:
            log.warning(
                "no GBP rate for currency %r; recording the EUR rate for %s",
                raw_currency,
                url,
            )
            gbp_per_unit = EUR_TO_GBP

        return CityCostIndex(
            city=_slug_to_url_segment(city_slug).replace("-", " "),
            city_slug=city_slug,
            country=country,
            total_single_no_rent_gbp=no_rent_rounded,
            total_single_with_rent_gbp=with_rent_rounded,
            breakdown=CategoryBreakdown(
                rent_1bed_center=rent_center_rounded,
                rent_1bed_outside=rent_outside_rounded,
                # Live scraper does not extract per-category figures, so the
                # remaining categories are filled with zero placeholders.
                groceries=Decimal("0"),
                restaurants=Decimal("0"),
                transport=Decimal("0"),
                utilities=Decimal("0"),
                leisure=Decimal("0"),
            ),
            source=ColSource(
                name="numbeo",
                url=url,
                snapshot_date=date.today(),
                raw_currency=raw_currency,
                gbp_per_unit=gbp_per_unit,
            ),
        )


def compute_expires_at(ttl_days: int = 365) -> datetime:
    """One-place TTL helper so the cache + service stay in sync.

    Returns the current UTC time (timezone-aware) plus `ttl_days` days.
    """
    return datetime.now(UTC) + timedelta(days=ttl_days)