"""Expatistan HTML scraper — secondary COL source.
Used by the cache layer to cross-check Numbeo. Expatistan's page format
is different (price-of-living-index based, not absolute monthly figures),
so the headline we extract is their "single person, monthly cost"
estimate from the "Cost of Living in <City>" landing page.
Lower fidelity than Numbeo but ToS-friendlier — Expatistan publishes
their data under CC and explicitly allows non-commercial scraping.
Source-of-truth precedence (set in service.reconcile):
1. numbeo — primary, most data points
2. expatistan — secondary, cross-check
3. baseline — hand-curated fallback
"""
from __future__ import annotations

import asyncio
import logging
import re
from datetime import date
from decimal import Decimal, InvalidOperation
from typing import Final

import httpx

from fire_planner.col.models import CategoryBreakdown, CityCostIndex, ColSource

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Root of every city page; the city slug is appended as the last path segment.
BASE_URL: Final[str] = "https://www.expatistan.com/cost-of-living"

# Sent with every request made by a client this module creates, so the site
# operator can see who is scraping and for what purpose.
USER_AGENT: Final[str] = (
    "fire-planner/0.1 (+https://forgejo.viktorbarzin.me/viktor/code; "
    "non-commercial personal use; 1-year cache)"
)

# Per-phase HTTP timeouts; reading the response gets the most headroom.
DEFAULT_TIMEOUT: Final[httpx.Timeout] = httpx.Timeout(
    connect=10.0,
    read=30.0,
    write=10.0,
    pool=10.0,
)

# Minimum spacing, in seconds, between consecutive outbound requests.
MIN_REQUEST_INTERVAL: Final[float] = 1.1

# Expatistan figures are scraped in USD; this fixed rate converts them to GBP.
# NOTE(review): the rate is hard-coded, so it drifts from the market rate.
USD_TO_GBP: Final[Decimal] = Decimal("0.787")
# Tail shared by both patterns: a dollar sign, optional whitespace, then the
# captured amount (digits and commas, with an optional decimal part).
_USD_AMOUNT = r"\$\s*([0-9,]+(?:\.[0-9]+)?)"

# Headline monthly estimate. The page text reads roughly like
#   "Cost of living in <city>, for an expat is $X"
# but the wording varies, so anchor on "single person" / "expat" and take the
# first dollar amount that follows ("$X,XXX" and "$X" both match).
_SINGLE_PERSON_USD_RE = re.compile(
    r"(?:single\s+person|expat)[^$]*?" + _USD_AMOUNT,
    flags=re.IGNORECASE | re.DOTALL,
)

# One-bedroom rent from the "Prices" table, e.g.
#   "Rent for a furnished single room (1 bedroom) in city centre $X,XXX"
# The gap between "bedroom" and "cent"/"expensive" may not cross a "<" or a
# "$"; the gap before the amount may only not cross a "$".
_RENT_CENTER_USD_RE = re.compile(
    r"(?:1\s*bedroom|one[-\s]?bedroom)[^$<]*?(?:cent|expensive)[^$]*?"
    + _USD_AMOUNT,
    flags=re.IGNORECASE | re.DOTALL,
)
class ExpatistanFetchError(RuntimeError):
    """Raised when an Expatistan page cannot be fetched or parsed.

    A dedicated exception type lets the cache layer catch exactly these
    failures and fall back to another source.
    """
def _parse_num(s: str) -> Decimal:
    """Convert a scraped price such as ``"1,234.50"`` to a ``Decimal``.

    Commas are treated as thousands separators and dropped before parsing.

    Raises:
        decimal.InvalidOperation: if what remains is not a valid decimal
            literal (for example an empty string).
    """
    without_separators = "".join(s.split(","))
    return Decimal(without_separators)
class ExpatistanScraper:
    """Rate-limited async scraper for Expatistan city cost-of-living pages.

    ``fetch`` downloads ``BASE_URL/<city_slug>``, pulls two USD figures out of
    the HTML with the module-level regexes (the single-person monthly total
    and the one-bedroom city-centre rent), and converts them to GBP using
    ``USD_TO_GBP``.

    The scraper can be used as an async context manager. On exit the HTTP
    client is closed only if this instance created it; an injected client is
    left open for its owner to manage.
    """

    def __init__(
        self,
        *,
        client: httpx.AsyncClient | None = None,
        min_interval: float = MIN_REQUEST_INTERVAL,
    ) -> None:
        """Set up the HTTP client and the request throttle.

        Args:
            client: Pre-configured client to use. When omitted, a client is
                created with this module's User-Agent, an English
                Accept-Language header, ``DEFAULT_TIMEOUT`` and redirect
                following enabled.
            min_interval: Minimum number of seconds between the start of one
                request and the start of the next.
        """
        # Remember who created the client so __aexit__ only closes our own.
        self._owns_client = client is None
        if client is not None:
            self._client = client
        else:
            self._client = httpx.AsyncClient(
                headers={"User-Agent": USER_AGENT, "Accept-Language": "en-GB,en;q=0.9"},
                timeout=DEFAULT_TIMEOUT,
                follow_redirects=True,
            )
        self._min_interval = min_interval
        # Event-loop clock reading taken just before the most recent request.
        self._last_request_at: float = 0.0
        # Serialises the throttle so concurrent fetches queue up behind it.
        self._lock = asyncio.Lock()

    async def __aenter__(self) -> ExpatistanScraper:
        """Return the scraper itself; no setup is needed on entry."""
        return self

    async def __aexit__(self, *_: object) -> None:
        """Close the HTTP client if, and only if, this instance created it."""
        if self._owns_client:
            await self._client.aclose()

    async def _polite_wait(self) -> None:
        """Sleep until at least ``min_interval`` has passed since the last request.

        The lock is held across the sleep, so concurrent callers are released
        one at a time, each spaced by the minimum interval.
        """
        async with self._lock:
            loop = asyncio.get_running_loop()
            elapsed = loop.time() - self._last_request_at
            if elapsed < self._min_interval:
                await asyncio.sleep(self._min_interval - elapsed)
            # Stamp after sleeping: the caller issues its request right away.
            self._last_request_at = loop.time()

    async def fetch(
        self,
        city_slug: str,
        *,
        country: str = "",
    ) -> CityCostIndex:
        """Download and parse the cost-of-living page for one city.

        Args:
            city_slug: Lowercase, hyphen-separated city identifier; appended
                to ``BASE_URL`` as the final path segment.
            country: Copied unchanged into the returned record.

        Returns:
            The parsed figures, converted to GBP.

        Raises:
            ExpatistanFetchError: on any ``httpx.HTTPError`` (including the
                one raised by ``raise_for_status``), or if the page cannot
                be parsed.
        """
        # Expatistan uses lowercase city slugs separated by hyphens —
        # same convention as our internal slugs.
        url = f"{BASE_URL}/{city_slug}"
        await self._polite_wait()
        try:
            resp = await self._client.get(url)
            resp.raise_for_status()
        except httpx.HTTPError as e:
            raise ExpatistanFetchError(f"HTTP error for {url}: {e}") from e
        return self._parse(city_slug, country, url, resp.text)

    @staticmethod
    def _parse(
        city_slug: str,
        country: str,
        url: str,
        html: str,
    ) -> CityCostIndex:
        """Extract the headline and rent figures from a page's HTML.

        Args:
            city_slug: Slug the page was fetched for; also used to derive a
                title-cased display name.
            country: Copied unchanged into the returned record.
            url: Page URL, used in error messages and recorded as the source.
            html: Raw page markup.

        Returns:
            A ``CityCostIndex`` whose totals and centre rent are in GBP,
            rounded to pence. The other category fields are not extracted by
            this scraper and are reported as zero (``rent_1bed_outside`` as
            ``None``).

        Raises:
            ExpatistanFetchError: if either figure is missing, if a captured
                figure is not a valid number, or if the rent is not smaller
                than the total.
        """
        single_match = _SINGLE_PERSON_USD_RE.search(html)
        rent_match = _RENT_CENTER_USD_RE.search(html)
        if not (single_match and rent_match):
            raise ExpatistanFetchError(
                f"could not locate single-person or rent figure on {url}"
            )
        # Expatistan's "single person" headline is total with rent —
        # different convention from Numbeo. Use it as `total_with_rent`
        # directly; derive no_rent by subtracting their rent figure.
        single_raw = single_match.group(1)
        rent_raw = rent_match.group(1)
        try:
            with_rent_usd = _parse_num(single_raw)
            rent_usd = _parse_num(rent_raw)
        except InvalidOperation as e:
            # The amount pattern accepts runs of commas with no digits, which
            # Decimal rejects. Report that as a parse failure so the cache
            # layer falls back instead of seeing an arithmetic exception.
            raise ExpatistanFetchError(
                f"unparseable price figure on {url}: "
                f"single={single_raw!r}, rent={rent_raw!r}"
            ) from e
        with_rent_gbp = with_rent_usd * USD_TO_GBP
        rent_gbp = rent_usd * USD_TO_GBP
        no_rent_gbp = with_rent_gbp - rent_gbp
        # Guard against malformed pages where rent > total (unusual but
        # possible if the regex grabs the wrong row).
        if no_rent_gbp <= 0:
            raise ExpatistanFetchError(
                f"derived no_rent <= 0 ({no_rent_gbp}) on {url}; "
                f"with_rent={with_rent_gbp}, rent={rent_gbp}"
            )
        return CityCostIndex(
            city=city_slug.replace("-", " ").title(),
            city_slug=city_slug,
            country=country,
            total_single_no_rent_gbp=no_rent_gbp.quantize(Decimal("0.01")),
            total_single_with_rent_gbp=with_rent_gbp.quantize(Decimal("0.01")),
            breakdown=CategoryBreakdown(
                rent_1bed_center=rent_gbp.quantize(Decimal("0.01")),
                rent_1bed_outside=None,
                # Only the total and centre rent are scraped.
                groceries=Decimal("0"),
                restaurants=Decimal("0"),
                transport=Decimal("0"),
                utilities=Decimal("0"),
                leisure=Decimal("0"),
            ),
            source=ColSource(
                name="expatistan",
                url=url,
                snapshot_date=date.today(),
                raw_currency="USD",
                gbp_per_unit=USD_TO_GBP,
            ),
        )