fire-planner/fire_planner/col/expatistan.py

"""Expatistan HTML scraper — secondary COL source.

Used by the cache layer to cross-check Numbeo. Expatistan's page format
is different (price-of-living-index based, not absolute monthly figures),
so the headline we extract is their "single person, monthly cost"
estimate from the "Cost of Living in <city>" landing page.

Lower fidelity than Numbeo but ToS-friendlier — Expatistan publishes
their data under CC and explicitly allows non-commercial scraping.

Source-of-truth precedence (set in service.reconcile):
  1. numbeo  — primary, most data points
  2. expatistan — secondary, cross-check
  3. baseline — hand-curated fallback
"""
from __future__ import annotations

import asyncio
import logging
import re
from datetime import date
from decimal import Decimal, InvalidOperation
from typing import Final

import httpx

from fire_planner.col.models import CategoryBreakdown, CityCostIndex, ColSource

# Module-level logger, named after this module's import path.
log = logging.getLogger(__name__)

# Root of every city page URL; the city slug is appended as a path segment.
BASE_URL: Final[str] = "https://www.expatistan.com/cost-of-living"

# Sent with every request made through an internally created client.
USER_AGENT: Final[str] = (
    "fire-planner/0.1 (+https://forgejo.viktorbarzin.me/viktor/code; "
    "non-commercial personal use; 1-year cache)"
)

# 10 s for connect/write/pool, with a longer 30 s allowance for reads.
DEFAULT_TIMEOUT: Final[httpx.Timeout] = httpx.Timeout(10.0, read=30.0)

# Minimum spacing, in seconds, between two consecutive requests; used as the
# default for ``ExpatistanScraper(min_interval=...)``.
MIN_REQUEST_INTERVAL: Final[float] = 1.1

# Expatistan publishes prices in USD by default. Convert to GBP.
# NOTE: this is a fixed, hard-coded rate — it is not refreshed at runtime.
USD_TO_GBP: Final[Decimal] = Decimal("0.787")

# Single-person monthly estimate appears in the page text as:
#   "Cost of living in <City>, <Country> for an expat is $X" or similar
# Format varies; capture both "$X,XXX" and "$X" patterns.
_SINGLE_PERSON_USD_RE = re.compile(
    r"(?:single\s+person|expat)[^$]*?\$\s*([0-9,]+(?:\.[0-9]+)?)",
    re.IGNORECASE | re.DOTALL,
)
# Apartment rent (1 bedroom) appears on the "Prices" table line:
#   "Rent for a furnished single room (1 bedroom) in city centre  $X,XXX"
_RENT_CENTER_USD_RE = re.compile(
    r"(?:1\s*bedroom|one[-\s]?bedroom)[^$<]*?(?:cent|expensive)[^$]*?"
    r"\$\s*([0-9,]+(?:\.[0-9]+)?)",
    re.IGNORECASE | re.DOTALL,
)


class ExpatistanFetchError(RuntimeError):
    """Signal that an Expatistan page could not be fetched or parsed.

    A single exception type covers both HTTP failures and parse failures so
    that the cache layer can catch it and fall back to another source.
    """


def _parse_num(s: str) -> Decimal:
    return Decimal(s.replace(",", ""))


class ExpatistanScraper:
    """Rate-limited async scraper for Expatistan city cost-of-living pages.

    Use it as an async context manager, or call :meth:`aclose` when done, so
    that an internally created ``httpx.AsyncClient`` gets closed. A client
    supplied by the caller is never closed by this class.
    """

    def __init__(
        self,
        *,
        client: httpx.AsyncClient | None = None,
        min_interval: float = MIN_REQUEST_INTERVAL,
    ) -> None:
        """Set up the HTTP client and the request-spacing state.

        Args:
            client: Optional pre-configured client. When omitted, a client
                with this module's User-Agent, timeout and redirect-following
                is created and owned (closed) by the scraper.
            min_interval: Minimum number of seconds between two requests.
        """
        self._owns_client = client is None
        # Explicit ``is None`` test keeps this consistent with the ownership
        # flag above instead of relying on the client's truthiness.
        self._client = (
            client
            if client is not None
            else httpx.AsyncClient(
                headers={
                    "User-Agent": USER_AGENT,
                    "Accept-Language": "en-GB,en;q=0.9",
                },
                timeout=DEFAULT_TIMEOUT,
                follow_redirects=True,
            )
        )
        self._min_interval = min_interval
        # Event-loop timestamp of the most recent request slot handed out.
        self._last_request_at: float = 0.0
        self._lock = asyncio.Lock()

    async def __aenter__(self) -> ExpatistanScraper:
        return self

    async def __aexit__(self, *_: object) -> None:
        await self.aclose()

    async def aclose(self) -> None:
        """Close the HTTP client, but only if this scraper created it."""
        if self._owns_client:
            await self._client.aclose()

    async def _polite_wait(self) -> None:
        """Sleep as needed so requests are at least ``min_interval`` apart.

        The lock is held across the sleep, so concurrent callers queue up and
        each one receives its own slot spaced from the previous one.
        """
        async with self._lock:
            loop = asyncio.get_running_loop()
            elapsed = loop.time() - self._last_request_at
            if elapsed < self._min_interval:
                await asyncio.sleep(self._min_interval - elapsed)
            # Re-read the clock: time has moved on if we slept.
            self._last_request_at = loop.time()

    async def fetch(
        self,
        city_slug: str,
        *,
        country: str = "",
    ) -> CityCostIndex:
        """Download and parse the cost-of-living page for ``city_slug``.

        Args:
            city_slug: Lowercase, hyphen-separated city slug; appended to
                ``BASE_URL`` unchanged.
            country: Passed through to the returned ``CityCostIndex``.

        Returns:
            The parsed cost index, with amounts converted to GBP.

        Raises:
            ExpatistanFetchError: On any ``httpx.HTTPError`` (transport
                failure or error status) or when the page cannot be parsed.
        """
        # Expatistan uses lowercase city slugs separated by hyphens —
        # same convention as our internal slugs.
        url = f"{BASE_URL}/{city_slug}"
        await self._polite_wait()
        try:
            resp = await self._client.get(url)
            resp.raise_for_status()
        except httpx.HTTPError as e:
            raise ExpatistanFetchError(f"HTTP error for {url}: {e}") from e
        return self._parse(city_slug, country, url, resp.text)

    @staticmethod
    def _parse(
        city_slug: str,
        country: str,
        url: str,
        html: str,
    ) -> CityCostIndex:
        """Extract the single-person total and city-centre rent from ``html``.

        Args:
            city_slug: Slug of the city; also used to derive the display name.
            country: Stored as-is on the result.
            url: Page URL, used for the source record and error messages.
            html: Raw page text to search.

        Returns:
            A ``CityCostIndex`` whose totals and rent are in GBP, rounded to
            two decimal places. Categories other than rent are set to zero
            because this parser does not extract them.

        Raises:
            ExpatistanFetchError: If either figure is missing, is not a valid
                number, or the rent is not smaller than the total.
        """
        single_match = _SINGLE_PERSON_USD_RE.search(html)
        rent_match = _RENT_CENTER_USD_RE.search(html)
        if not (single_match and rent_match):
            raise ExpatistanFetchError(
                f"could not locate single-person or rent figure on {url}"
            )
        # Expatistan's "single person" headline is total with rent —
        # different convention from Numbeo. Use it as `total_with_rent`
        # directly; derive no_rent by subtracting their rent figure.
        try:
            with_rent_usd = _parse_num(single_match.group(1))
            rent_usd = _parse_num(rent_match.group(1))
        except InvalidOperation as e:
            # A capture without digits (e.g. only commas) must surface as a
            # fetch error so callers' fallback handling still applies.
            raise ExpatistanFetchError(
                f"unparseable price on {url}: "
                f"single={single_match.group(1)!r}, "
                f"rent={rent_match.group(1)!r}"
            ) from e
        with_rent_gbp = with_rent_usd * USD_TO_GBP
        rent_gbp = rent_usd * USD_TO_GBP
        no_rent_gbp = with_rent_gbp - rent_gbp
        # Guard against malformed pages where rent > total (unusual but
        # possible if the regex grabs the wrong row).
        if no_rent_gbp <= 0:
            raise ExpatistanFetchError(
                f"derived no_rent <= 0 ({no_rent_gbp}) on {url}; "
                f"with_rent={with_rent_gbp}, rent={rent_gbp}"
            )
        pence = Decimal("0.01")
        return CityCostIndex(
            city=city_slug.replace("-", " ").title(),
            city_slug=city_slug,
            country=country,
            total_single_no_rent_gbp=no_rent_gbp.quantize(pence),
            total_single_with_rent_gbp=with_rent_gbp.quantize(pence),
            breakdown=CategoryBreakdown(
                rent_1bed_center=rent_gbp.quantize(pence),
                rent_1bed_outside=None,
                groceries=Decimal("0"),
                restaurants=Decimal("0"),
                transport=Decimal("0"),
                utilities=Decimal("0"),
                leisure=Decimal("0"),
            ),
            source=ColSource(
                name="expatistan",
                url=url,
                snapshot_date=date.today(),
                raw_currency="USD",
                gbp_per_unit=USD_TO_GBP,
            ),
        )