fire-planner/fire_planner/col/numbeo.py

"""Numbeo HTML scraper — parses the public `cost-of-living/in/<city>`
pages directly.

No LLM interpretation — uses regex against the table structure. The
page format is stable enough across cities that a single parser works
for all of them.

We extract:
- The headline ex-rent total (one number, EUR-prefixed)
- The 1-bed center / outside rent (two rows in the rent table)

Per-category breakdown is intentionally NOT extracted by the live
scraper — the headline figures are what the simulator uses for
ratios, and the breakdown rows are noisy (averages over varying
sample sizes). The hand-curated `baseline.py` carries the breakdowns
where they exist; the cache layer falls back to baseline.py if a
breakdown is needed for the UI.

ToS posture: Numbeo's robots.txt allows /cost-of-living/* for major
crawlers. We send a polite UA, ≤1 req/sec, 30s timeout, exponential
backoff on 429/5xx, and never re-scrape within the cache TTL.
"""
from __future__ import annotations

import asyncio
import logging
import re
from datetime import UTC, date, datetime, timedelta
from decimal import Decimal
from typing import Final

import httpx

from fire_planner.col.models import CategoryBreakdown, CityCostIndex, ColSource

log = logging.getLogger(__name__)

# Root of Numbeo's per-city cost-of-living pages; the capitalised city
# segment is appended as one more path component.
BASE_URL: Final = "https://www.numbeo.com/cost-of-living/in"
# Sent as the User-Agent header on every request made by a client that
# this module creates itself: project URL plus a short usage note.
USER_AGENT: Final = (
    "fire-planner/0.1 (+https://forgejo.viktorbarzin.me/viktor/code; "
    "non-commercial personal use; 1-year cache)"
)
# Per-phase httpx timeouts in seconds; only the read phase gets the long
# 30s budget, the other phases fail after 10s.
DEFAULT_TIMEOUT: Final = httpx.Timeout(connect=10.0, read=30.0, write=10.0, pool=10.0)
MIN_REQUEST_INTERVAL: Final = 1.1  # seconds between requests — polite

# Currency-to-GBP rates for common Numbeo source pages. These are a
# hard-coded snapshot — nothing in this module refreshes them at runtime;
# refresh by editing this map (rare — within ±5% over a year).
#
# In this module every monetary amount is converted from the €-prefixed
# figure on the page using EUR_TO_GBP alone. LOCAL_TO_GBP is only looked
# up to fill the `gbp_per_unit` recorded on the result; when a city's
# local currency isn't here, that lookup falls back to EUR_TO_GBP.
EUR_TO_GBP: Final = Decimal("0.862")
# Keys are ISO-4217-style currency codes; values are GBP per one unit.
LOCAL_TO_GBP: Final[dict[str, Decimal]] = {
    "EUR": EUR_TO_GBP,
    "GBP": Decimal("1.0"),
    "USD": Decimal("0.787"),
    "BGN": Decimal("0.435"),
    "RON": Decimal("0.173"),
    "GEL": Decimal("0.295"),
    "AED": Decimal("0.21505"),
    "MYR": Decimal("0.171"),
    "THB": Decimal("0.02198"),
    "IDR": Decimal("0.0000485"),
    "SGD": Decimal("0.585"),
    "TWD": Decimal("0.0246"),
    "VND": Decimal("0.0000316"),
    "MXN": Decimal("0.0394"),
    "COP": Decimal("0.000195"),
    "PYG": Decimal("0.000099"),
    "UYU": Decimal("0.0197"),
    "PAB": Decimal("0.787"),  # Panamanian Balboa pegged to USD
    "QAR": Decimal("0.216"),  # Qatari Riyal
    "BHD": Decimal("2.09"),
    "JPY": Decimal("0.00520"),
    "KRW": Decimal("0.000565"),
    "HKD": Decimal("0.101"),
    "TRY": Decimal("0.0204"),  # volatile — refresh more often
    "RSD": Decimal("0.00737"),
    # NOTE(review): Croatia moved to the euro in 2023 — confirm whether any
    # caller still passes "HRK" before keeping this entry.
    "HRK": Decimal("0.114"),
    "HUF": Decimal("0.00213"),
    "CZK": Decimal("0.0345"),
    "PLN": Decimal("0.196"),
    "ALL": Decimal("0.00859"),
}

# --- Regex patterns for the Numbeo page ---
# A €-prefixed amount such as "€2,317.19"; group 1 is the digits with their
# thousands separators. The leading digit is mandatory so that a stray run
# of commas after the € sign can never be captured as a "number".
_EUR_AMOUNT = r"€\s*([0-9][0-9,]*(?:\.[0-9]+)?)"
# Lazy filler between a row label and its amount. The negative lookahead
# stops it at the closing </tr>, so a row without a € figure yields no match
# instead of silently borrowing the price from a later row of the table.
# Markup without </tr> tags behaves like a plain `.*?`.
_SAME_ROW = r"(?:(?!</tr>).)*?"

# The "Estimated monthly costs for a single person" headline appears as:
#   "<strong>Estimated monthly costs for a single person are €X.X</strong>"
# with the EUR figure always quoted (Numbeo's site currency is EUR).
_HEADLINE_EUR_RE = re.compile(
    r"single\s+person[^<]*?(?:are|=)\s*" + _EUR_AMOUNT,
    re.IGNORECASE,
)
# The rent rows look like:
#   <td>Apartment (1 bedroom) in City Centre</td><td>...€2,317.19...</td>
# Both spellings (Centre / Center) are accepted for both rows.
_RENT_CENTER_EUR_RE = re.compile(
    r"Apartment\s*\(1\s*bedroom\)\s*in\s*City\s*Cent(?:re|er)" + _SAME_ROW + _EUR_AMOUNT,
    re.IGNORECASE | re.DOTALL,
)
_RENT_OUTSIDE_EUR_RE = re.compile(
    r"Apartment\s*\(1\s*bedroom\)\s*Outside\s*of\s*Cent(?:re|er)" + _SAME_ROW + _EUR_AMOUNT,
    re.IGNORECASE | re.DOTALL,
)


class NumbeoFetchError(RuntimeError):
    """Single failure type for anything that goes wrong while scraping.

    Wraps any HTTP / parsing failure so the cache layer can catch one
    exception type and fall back.
    """


def _parse_num(s: str) -> Decimal:
    """Turn a page-formatted figure such as `2,317.19` into a Decimal.

    Comma thousands separators are dropped before conversion; the rest
    of the string is handed to `Decimal` unchanged.
    """
    without_separators = "".join(s.split(","))
    return Decimal(without_separators)


def _slug_to_url_segment(slug: str) -> str:
    """Build the capitalised path segment Numbeo uses for a city.

    Every hyphen-separated word gets an upper-case first letter and a
    lower-case remainder, e.g. `ho-chi-minh-city` → `Ho-Chi-Minh-City`.
    """
    words = slug.split("-")
    return "-".join(map(str.capitalize, words))


class NumbeoScraper:
    """Async Numbeo fetcher with per-instance polite rate-limiting.

    Use as a context manager so the httpx client is cleanly closed:
        async with NumbeoScraper() as scraper:
            idx = await scraper.fetch("sofia")

    Requests issued through one instance are serialised and spaced at
    least `min_interval` seconds apart. Responses with status 429 or 5xx
    are retried up to `max_retries` times, sleeping
    `backoff_base * 2**attempt` seconds between attempts; every other
    failure surfaces immediately as NumbeoFetchError.
    """

    def __init__(
        self,
        *,
        client: httpx.AsyncClient | None = None,
        min_interval: float = MIN_REQUEST_INTERVAL,
        max_retries: int = 2,
        backoff_base: float = 2.0,
    ) -> None:
        """Set up the HTTP client and the rate-limit / retry state.

        Args:
            client: Optional pre-built client (e.g. with a mock transport).
                When omitted, a client with the module's User-Agent and
                timeouts is created and closed again on context exit.
            min_interval: Minimum number of seconds between two requests.
            max_retries: How many times a 429/5xx response is retried
                before giving up. Negative values are treated as 0.
            backoff_base: Delay in seconds before the first retry; it
                doubles on each further retry.
        """
        # An injected client belongs to the caller, so we must not close it.
        self._owns_client = client is None
        self._client = client or httpx.AsyncClient(
            headers={"User-Agent": USER_AGENT, "Accept-Language": "en-GB,en;q=0.9"},
            timeout=DEFAULT_TIMEOUT,
            follow_redirects=True,
        )
        self._min_interval = min_interval
        self._max_retries = max(0, max_retries)
        self._backoff_base = backoff_base
        self._last_request_at: float = 0.0
        self._lock = asyncio.Lock()

    async def __aenter__(self) -> NumbeoScraper:
        return self

    async def __aexit__(self, *_: object) -> None:
        # Close only the client this instance created itself.
        if self._owns_client:
            await self._client.aclose()

    async def _polite_wait(self) -> None:
        """Sleep until `min_interval` has passed since the previous request.

        The lock is held while sleeping, so concurrent callers queue up
        and are released one interval apart rather than all at once.
        """
        async with self._lock:
            now = asyncio.get_running_loop().time()
            elapsed = now - self._last_request_at
            if elapsed < self._min_interval:
                await asyncio.sleep(self._min_interval - elapsed)
            # Re-read the clock: the sleep above moved it forward.
            self._last_request_at = asyncio.get_running_loop().time()

    async def fetch(
        self,
        city_slug: str,
        *,
        country: str = "",
        raw_currency: str = "EUR",
    ) -> CityCostIndex:
        """Scrape one city's headline numbers from Numbeo.

        Args:
            city_slug: Lower-case hyphenated city name, e.g. `sofia`.
            country: Passed through unchanged onto the result.
            raw_currency: Code of the city's local currency. It is only
                used to record `gbp_per_unit`; when no rate is known for
                it, the EUR rate is recorded (see `_parse`).

        Raises NumbeoFetchError on HTTP error (after retries for 429/5xx)
        or parse failure. The caller (cache layer) should catch and fall
        back to baseline.py.
        """
        url_segment = _slug_to_url_segment(city_slug)
        url = f"{BASE_URL}/{url_segment}"
        attempt = 0
        while True:
            # Every attempt, including retries, honours the request spacing.
            await self._polite_wait()
            try:
                resp = await self._client.get(url)
                resp.raise_for_status()
            except httpx.HTTPStatusError as e:
                status = e.response.status_code
                retryable = status == 429 or 500 <= status < 600
                if not retryable or attempt >= self._max_retries:
                    raise NumbeoFetchError(f"HTTP error for {url}: {e}") from e
                delay = self._backoff_base * (2 ** attempt)
                log.warning(
                    "Numbeo returned %s for %s; retry %d/%d in %.1fs",
                    status, url, attempt + 1, self._max_retries, delay,
                )
                await asyncio.sleep(delay)
                attempt += 1
            except httpx.HTTPError as e:
                # Transport-level problems (timeouts, DNS, ...) are not retried.
                raise NumbeoFetchError(f"HTTP error for {url}: {e}") from e
            else:
                return self._parse(city_slug, country, raw_currency, url, resp.text)

    @staticmethod
    def _parse(
        city_slug: str,
        country: str,
        raw_currency: str,
        url: str,
        html: str,
    ) -> CityCostIndex:
        """Extract the headline and rent figures from a page's HTML.

        All amounts are read from the €-prefixed figures and converted
        with EUR_TO_GBP, then rounded to pence. The ex-rent headline and
        the city-centre rent are mandatory; the outside-centre rent is
        optional and becomes None when absent.

        Raises NumbeoFetchError when a mandatory figure is missing or a
        captured amount is not a valid decimal number.
        """
        headline_match = _HEADLINE_EUR_RE.search(html)
        rent_center_match = _RENT_CENTER_EUR_RE.search(html)
        rent_outside_match = _RENT_OUTSIDE_EUR_RE.search(html)
        if not (headline_match and rent_center_match):
            raise NumbeoFetchError(
                f"could not locate headline or rent rows on {url}"
            )
        try:
            no_rent_eur = _parse_num(headline_match.group(1))
            rent_center_eur = _parse_num(rent_center_match.group(1))
            rent_outside_eur = (
                _parse_num(rent_outside_match.group(1)) if rent_outside_match else None
            )
        except ArithmeticError as e:
            # decimal.InvalidOperation is an ArithmeticError; keep the
            # "only NumbeoFetchError escapes" contract for the cache layer.
            raise NumbeoFetchError(f"unparseable amount on {url}: {e}") from e

        no_rent_gbp = no_rent_eur * EUR_TO_GBP
        rent_center_gbp = rent_center_eur * EUR_TO_GBP
        rent_outside_gbp = (
            rent_outside_eur * EUR_TO_GBP if rent_outside_eur is not None else None
        )
        with_rent_gbp = no_rent_gbp + rent_center_gbp

        # `gbp_per_unit` reflects the conversion FROM the underlying
        # local currency, not the EUR-side intermediate. When we have no
        # rate for that currency we record the EUR pair instead — the
        # figures above really were derived from EUR — so the stored
        # currency code and rate always belong together.
        gbp_per_unit = LOCAL_TO_GBP.get(raw_currency)
        if gbp_per_unit is None:
            log.warning(
                "no GBP rate for currency %r on %s; recording EUR instead",
                raw_currency, url,
            )
            raw_currency = "EUR"
            gbp_per_unit = EUR_TO_GBP

        pence = Decimal("0.01")
        return CityCostIndex(
            city=_slug_to_url_segment(city_slug).replace("-", " "),
            city_slug=city_slug,
            country=country,
            total_single_no_rent_gbp=no_rent_gbp.quantize(pence),
            total_single_with_rent_gbp=with_rent_gbp.quantize(pence),
            breakdown=CategoryBreakdown(
                rent_1bed_center=rent_center_gbp.quantize(pence),
                rent_1bed_outside=(rent_outside_gbp.quantize(pence)
                                   if rent_outside_gbp is not None else None),
                # Live scraper does not extract per-category — see module docstring.
                groceries=Decimal("0"),
                restaurants=Decimal("0"),
                transport=Decimal("0"),
                utilities=Decimal("0"),
                leisure=Decimal("0"),
            ),
            source=ColSource(
                name="numbeo",
                url=url,
                # UTC, to agree with compute_expires_at() in this module.
                snapshot_date=datetime.now(UTC).date(),
                raw_currency=raw_currency,
                gbp_per_unit=gbp_per_unit,
            ),
        )


def compute_expires_at(ttl_days: int = 365) -> datetime:
    """One-place TTL helper so the cache + service stay in sync."""
    return datetime.now(UTC) + timedelta(days=ttl_days)