"""Expatistan HTML scraper — secondary COL source. Used by the cache layer to cross-check Numbeo. Expatistan's page format is different (price-of-living-index based, not absolute monthly figures), so the headline we extract is their "single person, monthly cost" estimate from the "Cost of Living in " landing page. Lower fidelity than Numbeo but ToS-friendlier — Expatistan publishes their data under CC and explicitly allows non-commercial scraping. Source-of-truth precedence (set in service.reconcile): 1. numbeo — primary, most data points 2. expatistan — secondary, cross-check 3. baseline — hand-curated fallback """ from __future__ import annotations import asyncio import logging import re from datetime import date from decimal import Decimal from typing import Final import httpx from fire_planner.col.models import CategoryBreakdown, CityCostIndex, ColSource log = logging.getLogger(__name__) BASE_URL: Final = "https://www.expatistan.com/cost-of-living" USER_AGENT: Final = ( "fire-planner/0.1 (+https://forgejo.viktorbarzin.me/viktor/code; " "non-commercial personal use; 1-year cache)" ) DEFAULT_TIMEOUT: Final = httpx.Timeout(connect=10.0, read=30.0, write=10.0, pool=10.0) MIN_REQUEST_INTERVAL: Final = 1.1 # Expatistan publishes prices in USD by default. Convert to GBP. USD_TO_GBP: Final = Decimal("0.787") # Single-person monthly estimate appears in the page text as: # "Cost of living in , for an expat is $X" or similar # Format varies; capture both "$X,XXX" and "$X" patterns. _SINGLE_PERSON_USD_RE = re.compile( r"(?:single\s+person|expat)[^$]*?\$\s*([0-9,]+(?:\.[0-9]+)?)", re.IGNORECASE | re.DOTALL, ) # Apartment rent (1 bedroom) appears on the "Prices" table line: # "Rent for a furnished single room (1 bedroom) in city centre $X,XXX" _RENT_CENTER_USD_RE = re.compile( r"(?:1\s*bedroom|one[-\s]?bedroom)[^$<]*?(?:cent|expensive)[^$]*?" r"\$\s*([0-9,]+(?:\.[0-9]+)?)", re.IGNORECASE | re.DOTALL, ) class ExpatistanFetchError(RuntimeError): """HTTP/parse failures so the cache layer can fall back.""" def _parse_num(s: str) -> Decimal: return Decimal(s.replace(",", "")) class ExpatistanScraper: def __init__( self, *, client: httpx.AsyncClient | None = None, min_interval: float = MIN_REQUEST_INTERVAL, ) -> None: self._owns_client = client is None self._client = client or httpx.AsyncClient( headers={"User-Agent": USER_AGENT, "Accept-Language": "en-GB,en;q=0.9"}, timeout=DEFAULT_TIMEOUT, follow_redirects=True, ) self._min_interval = min_interval self._last_request_at: float = 0.0 self._lock = asyncio.Lock() async def __aenter__(self) -> ExpatistanScraper: return self async def __aexit__(self, *_: object) -> None: if self._owns_client: await self._client.aclose() async def _polite_wait(self) -> None: async with self._lock: now = asyncio.get_running_loop().time() elapsed = now - self._last_request_at if elapsed < self._min_interval: await asyncio.sleep(self._min_interval - elapsed) self._last_request_at = asyncio.get_running_loop().time() async def fetch( self, city_slug: str, *, country: str = "", ) -> CityCostIndex: # Expatistan uses lowercase city slugs separated by hyphens — # same convention as our internal slugs. url = f"{BASE_URL}/{city_slug}" await self._polite_wait() try: resp = await self._client.get(url) resp.raise_for_status() except httpx.HTTPError as e: raise ExpatistanFetchError(f"HTTP error for {url}: {e}") from e return self._parse(city_slug, country, url, resp.text) @staticmethod def _parse( city_slug: str, country: str, url: str, html: str, ) -> CityCostIndex: single_match = _SINGLE_PERSON_USD_RE.search(html) rent_match = _RENT_CENTER_USD_RE.search(html) if not (single_match and rent_match): raise ExpatistanFetchError( f"could not locate single-person or rent figure on {url}" ) # Expatistan's "single person" headline is total with rent — # different convention from Numbeo. Use it as `total_with_rent` # directly; derive no_rent by subtracting their rent figure. with_rent_usd = _parse_num(single_match.group(1)) rent_usd = _parse_num(rent_match.group(1)) with_rent_gbp = with_rent_usd * USD_TO_GBP rent_gbp = rent_usd * USD_TO_GBP no_rent_gbp = with_rent_gbp - rent_gbp # Guard against malformed pages where rent > total (unusual but # possible if the regex grabs the wrong row). if no_rent_gbp <= 0: raise ExpatistanFetchError( f"derived no_rent <= 0 ({no_rent_gbp}) on {url}; " f"with_rent={with_rent_gbp}, rent={rent_gbp}" ) return CityCostIndex( city=city_slug.replace("-", " ").title(), city_slug=city_slug, country=country, total_single_no_rent_gbp=no_rent_gbp.quantize(Decimal("0.01")), total_single_with_rent_gbp=with_rent_gbp.quantize(Decimal("0.01")), breakdown=CategoryBreakdown( rent_1bed_center=rent_gbp.quantize(Decimal("0.01")), rent_1bed_outside=None, groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"), utilities=Decimal("0"), leisure=Decimal("0"), ), source=ColSource( name="expatistan", url=url, snapshot_date=date.today(), raw_currency="USD", gbp_per_unit=USD_TO_GBP, ), )