`
pages directly.
No LLM interpretation — uses regex against the table structure. The
page format is stable enough across cities that a single parser works
for all of them.
We extract:
- The headline ex-rent total (one number, EUR-prefixed)
- The 1-bed center / outside rent (two rows in the rent table)
Per-category breakdown is intentionally NOT extracted by the live
scraper — the two headline numbers are what the simulator uses for
ratios, and the breakdown rows are noisy (averages over varying sample
sizes). The hand-curated `baseline.py` carries the breakdowns where
they exist; the cache layer falls back to baseline.py if a breakdown
is needed for the UI.
ToS posture: Numbeo's robots.txt allows /cost-of-living/* for major
crawlers. We send a polite UA, ≤1 req/sec, 30s timeout, exponential
backoff on 429/5xx, and never re-scrape within the cache TTL.
"""
from __future__ import annotations

import asyncio
import logging
import re
from datetime import UTC, date, datetime, timedelta
from decimal import Decimal, InvalidOperation
from typing import Final

import httpx

from fire_planner.col.models import CategoryBreakdown, CityCostIndex, ColSource
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Root of the per-city pages; the capitalised city segment is appended.
BASE_URL: Final = "https://www.numbeo.com/cost-of-living/in"

# Sent as the User-Agent header on clients this module creates itself.
USER_AGENT: Final = (
    "fire-planner/0.1 "
    "(+https://forgejo.viktorbarzin.me/viktor/code; "
    "non-commercial personal use; 1-year cache)"
)

# 10s for connect / write / pool acquisition, 30s for reading the response.
DEFAULT_TIMEOUT: Final = httpx.Timeout(10.0, read=30.0)

# Minimum spacing between requests, in seconds — polite.
MIN_REQUEST_INTERVAL: Final = 1.1
# Currency-to-GBP rates for common Numbeo source pages. The values are a
# hand-maintained snapshot: refresh them by editing this map (expected to
# be rare — drift is roughly within ±5% over a year). Each value is the
# GBP worth of one unit of the keyed currency.
#
# A currency missing from the map is not an error: the scraper works from
# the €-prefixed amount Numbeo prints alongside, so the only rate that is
# strictly required is EUR_TO_GBP.
EUR_TO_GBP: Final = Decimal("0.862")

LOCAL_TO_GBP: Final[dict[str, Decimal]] = {
    "EUR": EUR_TO_GBP,
    **{
        code: Decimal(rate)
        for code, rate in (
            ("GBP", "1.0"),
            ("USD", "0.787"),
            ("BGN", "0.435"),
            ("RON", "0.173"),
            ("GEL", "0.295"),
            ("AED", "0.21505"),
            ("MYR", "0.171"),
            ("THB", "0.02198"),
            ("IDR", "0.0000485"),
            ("SGD", "0.585"),
            ("TWD", "0.0246"),
            ("VND", "0.0000316"),
            ("MXN", "0.0394"),
            ("COP", "0.000195"),
            ("PYG", "0.000099"),
            ("UYU", "0.0197"),
            ("PAB", "0.787"),  # Panamanian Balboa pegged to USD
            ("QAR", "0.216"),  # Qatari Riyal
            ("BHD", "2.09"),
            ("JPY", "0.00520"),
            ("KRW", "0.000565"),
            ("HKD", "0.101"),
            ("TRY", "0.0204"),  # volatile — refresh more often
            ("RSD", "0.00737"),
            ("HRK", "0.114"),
            ("HUF", "0.00213"),
            ("CZK", "0.0345"),
            ("PLN", "0.196"),
            ("ALL", "0.00859"),
        )
    },
}
# --- Regex patterns for the Numbeo page ---
# Every pattern captures the EUR figure as group 1. The amount must start
# with a digit, followed by digits / thousands commas and an optional
# decimal part. Requiring the leading digit stops a stray "€," from being
# captured as an "amount" made only of commas, which cannot be converted
# to a Decimal once the commas are stripped.
_EUR_AMOUNT = r"€\s*([0-9][0-9,]*(?:\.[0-9]+)?)"

# The "Estimated monthly costs for a single person" headline appears as:
#   "Estimated monthly costs for a single person are €X.X"
# with the EUR figure always quoted (Numbeo's site currency is EUR).
# `[^<]*?` keeps the match inside the current text node.
_HEADLINE_EUR_RE = re.compile(
    r"single\s+person[^<]*?(?:are|=)\s*" + _EUR_AMOUNT,
    re.IGNORECASE,
)

# The rent rows look like:
#   | Apartment (1 bedroom) in City Centre | ...€2,317.19... |
# DOTALL lets the lazy `.*?` cross line breaks / markup between the row
# label and its first EUR figure. Both rent patterns accept the "Centre"
# and "Center" spellings.
_RENT_CENTER_EUR_RE = re.compile(
    r"Apartment\s*\(1\s*bedroom\)\s*in\s*City\s*Cent(?:re|er).*?" + _EUR_AMOUNT,
    re.IGNORECASE | re.DOTALL,
)
_RENT_OUTSIDE_EUR_RE = re.compile(
    r"Apartment\s*\(1\s*bedroom\)\s*Outside\s*of\s*Cent(?:re|er).*?" + _EUR_AMOUNT,
    re.IGNORECASE | re.DOTALL,
)
class NumbeoFetchError(RuntimeError):
    """Raised when a Numbeo page cannot be fetched or parsed.

    HTTP and parsing failures are wrapped in this one type so the cache
    layer can catch it and fall back.
    """
def _parse_num(s: str) -> Decimal:
    """Convert a comma-grouped figure such as ``"2,317.19"`` to a Decimal.

    Every comma is dropped before conversion; the rest of the text is
    handed to ``Decimal`` unchanged.
    """
    without_commas = "".join(s.split(","))
    return Decimal(without_commas)
def _slug_to_url_segment(slug: str) -> str:
    """Turn a hyphenated slug into Numbeo's capitalised URL segment.

    Each hyphen-separated word gets an upper-case first letter and a
    lower-case remainder, e.g. `ho-chi-minh-city` → `Ho-Chi-Minh-City`.
    """
    words = slug.split("-")
    return "-".join(map(str.capitalize, words))
class NumbeoScraper:
    """Async Numbeo fetcher with per-instance polite rate-limiting.

    Each `fetch` call waits until at least `min_interval` seconds have
    passed since this instance's previous request, then issues a single
    GET (no retry is performed inside this class) and parses the page.

    Use as a context manager so the httpx client is cleanly closed:

        async with NumbeoScraper() as scraper:
            idx = await scraper.fetch("sofia")
    """

    def __init__(
        self,
        *,
        client: httpx.AsyncClient | None = None,
        min_interval: float = MIN_REQUEST_INTERVAL,
    ) -> None:
        """Set up the HTTP client and the rate-limiter state.

        Args:
            client: Optional pre-built client. When omitted, a client is
                created with this module's User-Agent, timeout and
                redirect settings and is closed on context exit. An
                injected client is used as-is and is never closed here.
            min_interval: Minimum spacing, in seconds, between requests
                issued through this instance.
        """
        # Ownership and construction use the same `is None` test, so the
        # scraper closes exactly the client it created itself.
        self._owns_client = client is None
        if client is None:
            client = httpx.AsyncClient(
                headers={"User-Agent": USER_AGENT, "Accept-Language": "en-GB,en;q=0.9"},
                timeout=DEFAULT_TIMEOUT,
                follow_redirects=True,
            )
        self._client = client
        self._min_interval = min_interval
        # Event-loop clock reading taken when the last request was released.
        self._last_request_at: float = 0.0
        # Held for the whole wait so concurrent fetches are released one at a time.
        self._lock = asyncio.Lock()

    async def __aenter__(self) -> NumbeoScraper:
        """Return the scraper itself; no setup happens on entry."""
        return self

    async def __aexit__(self, *_: object) -> None:
        """Close the HTTP client, but only if this instance created it."""
        if self._owns_client:
            await self._client.aclose()

    async def _polite_wait(self) -> None:
        """Sleep until `min_interval` has elapsed since the previous request."""
        async with self._lock:
            now = asyncio.get_running_loop().time()
            elapsed = now - self._last_request_at
            if elapsed < self._min_interval:
                await asyncio.sleep(self._min_interval - elapsed)
            # Re-read the clock: the sleep above may have moved it on.
            self._last_request_at = asyncio.get_running_loop().time()

    async def fetch(
        self,
        city_slug: str,
        *,
        country: str = "",
        raw_currency: str = "EUR",
    ) -> CityCostIndex:
        """Scrape one city's headline numbers from Numbeo.

        Args:
            city_slug: Hyphenated slug, e.g. ``"ho-chi-minh-city"``; it
                is capitalised word by word to build the page URL.
            country: Copied onto the result; not used for the request.
            raw_currency: Currency code recorded on the result's source
                and used to look up `gbp_per_unit`. A code that is not in
                `LOCAL_TO_GBP` does NOT raise — the EUR rate is recorded
                instead.

        Returns:
            A `CityCostIndex` whose amounts are converted from the page's
            EUR figures to GBP.

        Raises:
            NumbeoFetchError: on an HTTP error or a parse failure. The
                caller (cache layer) should catch and fall back to
                baseline.py.
        """
        url_segment = _slug_to_url_segment(city_slug)
        url = f"{BASE_URL}/{url_segment}"
        await self._polite_wait()
        try:
            resp = await self._client.get(url)
            resp.raise_for_status()
        except httpx.HTTPError as e:
            raise NumbeoFetchError(f"HTTP error for {url}: {e}") from e
        return self._parse(city_slug, country, raw_currency, url, resp.text)

    @staticmethod
    def _parse(
        city_slug: str,
        country: str,
        raw_currency: str,
        url: str,
        html: str,
    ) -> CityCostIndex:
        """Extract the headline and rent figures from a page's HTML.

        The headline total and the city-centre rent are mandatory; the
        outside-of-centre rent is optional and becomes ``None`` when its
        row is not found.

        Raises:
            NumbeoFetchError: when a mandatory figure is missing, or when
                a matched figure cannot be converted to a Decimal.
        """
        headline_match = _HEADLINE_EUR_RE.search(html)
        rent_center_match = _RENT_CENTER_EUR_RE.search(html)
        rent_outside_match = _RENT_OUTSIDE_EUR_RE.search(html)
        if not (headline_match and rent_center_match):
            raise NumbeoFetchError(
                f"could not locate headline or rent rows on {url}"
            )

        # A matched token can still be non-numeric (e.g. only commas).
        # Surface that as the module's own error type so the caller's
        # fallback path handles it like any other parse failure.
        try:
            no_rent_eur = _parse_num(headline_match.group(1))
            rent_center_eur = _parse_num(rent_center_match.group(1))
            rent_outside_eur = (
                _parse_num(rent_outside_match.group(1)) if rent_outside_match else None
            )
        except InvalidOperation as e:
            raise NumbeoFetchError(f"could not parse an amount on {url}: {e}") from e

        # All amounts are read in EUR, so one rate converts everything.
        no_rent_gbp = no_rent_eur * EUR_TO_GBP
        rent_center_gbp = rent_center_eur * EUR_TO_GBP
        rent_outside_gbp = (
            rent_outside_eur * EUR_TO_GBP if rent_outside_eur is not None else None
        )
        # "With rent" uses the city-centre 1-bed figure.
        with_rent_gbp = no_rent_gbp + rent_center_gbp

        # `gbp_per_unit` reflects the conversion FROM the underlying
        # local currency, not the EUR-side intermediate. When the page
        # quotes a non-EUR local currency, downstream code may want the
        # local→GBP rate for display; we record what we know. Unknown
        # codes fall back to the EUR rate.
        gbp_per_unit = LOCAL_TO_GBP.get(raw_currency, EUR_TO_GBP)

        pence = Decimal("0.01")  # round stored amounts to two decimal places
        return CityCostIndex(
            city=_slug_to_url_segment(city_slug).replace("-", " "),
            city_slug=city_slug,
            country=country,
            total_single_no_rent_gbp=no_rent_gbp.quantize(pence),
            total_single_with_rent_gbp=with_rent_gbp.quantize(pence),
            breakdown=CategoryBreakdown(
                rent_1bed_center=rent_center_gbp.quantize(pence),
                rent_1bed_outside=(
                    rent_outside_gbp.quantize(pence)
                    if rent_outside_gbp is not None
                    else None
                ),
                # Live scraper does not extract per-category figures;
                # these zeros are placeholders, not measurements.
                groceries=Decimal("0"),
                restaurants=Decimal("0"),
                transport=Decimal("0"),
                utilities=Decimal("0"),
                leisure=Decimal("0"),
            ),
            source=ColSource(
                name="numbeo",
                url=url,
                # NOTE(review): this is the local calendar date, while
                # compute_expires_at() works in UTC — confirm that the
                # mismatch near midnight is acceptable.
                snapshot_date=date.today(),
                raw_currency=raw_currency,
                gbp_per_unit=gbp_per_unit,
            ),
        )
def compute_expires_at(ttl_days: int = 365) -> datetime:
"""One-place TTL helper so the cache + service stay in sync."""
return datetime.now(UTC) + timedelta(days=ttl_days)