The Monte Carlo used to compare jurisdictions at a flat London-equivalent spend, which silently overstated the cost-of-living for any move to a cheaper region. Now every cross-jurisdiction simulation auto-scales spending_gbp by the real Numbeo/Expatistan ratio between the user's baseline city and the target city.

Architecture:
- fire_planner/col/baseline.py — 22 cities with headline Numbeo data (source URLs + snapshot dates embedded) — fallback when scraper fails
- col/numbeo.py + col/expatistan.py — httpx async scrapers, regex-parsed, polite 1.1s rate-limit, EUR/USD anchored
- col/cache.py — PG-backed cache (col_snapshot table, 1-year TTL)
- col/service.py — sync compute_col_ratio() for the simulator; async lookup_city_cached() with source reconciliation for the refresh CronJob
- alembic 0005 — col_snapshot table, UNIQUE(city_slug, source_name)

Simulator wiring:
- SimulateRequest gains col_auto_adjust=True (default), col_baseline_city, col_target_city. Defaults pick the jurisdiction's representative city.
- _resolve_col_adjustment scales spending_gbp before path-building.
- SimulateResult surfaces col_multiplier_applied + col_adjusted_spending_gbp.

CLIs:
- python -m fire_planner col-seed — loads BASELINES into col_snapshot (post-migration seed step)
- python -m fire_planner col-refresh-stale --within-days 7 — used by the weekly fire-planner-col-refresh CronJob

268 tests pass. Mypy strict + ruff clean.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
240 lines
8.9 KiB
Python
240 lines
8.9 KiB
Python
"""Numbeo HTML scraper — parses the public `cost-of-living/in/<city>`
pages directly.

No LLM interpretation — uses regex against the table structure. The
page format is stable enough across cities that a single parser works
for all of them.

We extract:
- The headline ex-rent total (one number, EUR-prefixed)
- The 1-bed center / outside rent (two rows in the rent table)

Per-category breakdown is intentionally NOT extracted by the live
scraper — the two headline numbers are what the simulator uses for
ratios, and the breakdown rows are noisy (averages over varying sample
sizes). The hand-curated `baseline.py` carries the breakdowns where
they exist; the cache layer falls back to baseline.py if a breakdown
is needed for the UI.

ToS posture: Numbeo's robots.txt allows /cost-of-living/* for major
crawlers. We send a polite UA, ≤1 req/sec, 30s timeout, exponential
backoff on 429/5xx, and never re-scrape within the cache TTL.
"""
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import logging
|
|
import re
|
|
from datetime import UTC, date, datetime, timedelta
|
|
from decimal import Decimal
|
|
from typing import Final
|
|
|
|
import httpx
|
|
|
|
from fire_planner.col.models import CategoryBreakdown, CityCostIndex, ColSource
|
|
|
|
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Prefix of every Numbeo city page; the capitalised city segment is appended.
BASE_URL: Final = "https://www.numbeo.com/cost-of-living/in"

# User-Agent header sent with every request made by a client we create.
USER_AGENT: Final = (
    "fire-planner/0.1 (+https://forgejo.viktorbarzin.me/viktor/code; "
    "non-commercial personal use; 1-year cache)"
)

# Per-phase HTTP timeouts in seconds; reads get the longest allowance.
DEFAULT_TIMEOUT: Final = httpx.Timeout(
    connect=10.0,
    read=30.0,
    write=10.0,
    pool=10.0,
)

# Minimum gap, in seconds, between two requests from one scraper — polite.
MIN_REQUEST_INTERVAL: Final = 1.1
|
|
|
|
# Currency-to-GBP rates for common Numbeo source pages. These are a
# hard-coded snapshot: refresh by editing this map (rare — within ±5% over
# a year). The parser in this module always converts the €-prefixed figures
# with EUR_TO_GBP; this table is only consulted for the `gbp_per_unit`
# recorded on the result, and a currency that is missing here falls back
# to EUR_TO_GBP.
EUR_TO_GBP: Final = Decimal("0.862")

# GBP value of one unit of each currency, keyed by ISO 4217 code.
LOCAL_TO_GBP: Final[dict[str, Decimal]] = {
    "EUR": EUR_TO_GBP,
    "GBP": Decimal("1.0"),
    "USD": Decimal("0.787"),
    "BGN": Decimal("0.435"),  # Bulgarian lev
    "RON": Decimal("0.173"),  # Romanian leu
    "GEL": Decimal("0.295"),  # Georgian lari
    "AED": Decimal("0.21505"),  # UAE dirham
    "MYR": Decimal("0.171"),  # Malaysian ringgit
    "THB": Decimal("0.02198"),  # Thai baht
    "IDR": Decimal("0.0000485"),  # Indonesian rupiah
    "SGD": Decimal("0.585"),  # Singapore dollar
    "TWD": Decimal("0.0246"),  # New Taiwan dollar
    "VND": Decimal("0.0000316"),  # Vietnamese dong
    "MXN": Decimal("0.0394"),  # Mexican peso
    "COP": Decimal("0.000195"),  # Colombian peso
    "PYG": Decimal("0.000099"),  # Paraguayan guarani
    "UYU": Decimal("0.0197"),  # Uruguayan peso
    "PAB": Decimal("0.787"),  # Panamanian Balboa pegged to USD
    "QAR": Decimal("0.216"),  # Qatari Riyal
    "BHD": Decimal("2.09"),  # Bahraini dinar
    "JPY": Decimal("0.00520"),  # Japanese yen
    "KRW": Decimal("0.000565"),  # South Korean won
    "HKD": Decimal("0.101"),  # Hong Kong dollar
    "TRY": Decimal("0.0204"),  # Turkish lira — volatile, refresh more often
    "RSD": Decimal("0.00737"),  # Serbian dinar
    "HRK": Decimal("0.114"),  # Croatian kuna
    "HUF": Decimal("0.00213"),  # Hungarian forint
    "CZK": Decimal("0.0345"),  # Czech koruna
    "PLN": Decimal("0.196"),  # Polish zloty
    "ALL": Decimal("0.00859"),  # Albanian lek
}
|
|
|
|
# --- Regex patterns for the Numbeo page ---
# Each pattern captures one comma-grouped EUR figure in group 1. The capture
# must start with a digit so that a stray "," after the euro sign is never
# captured (it would become an empty string once the commas are stripped).

# The "Estimated monthly costs for a single person" headline appears as:
#   "<strong>Estimated monthly costs for a single person are €X.X</strong>"
# with the EUR figure always quoted (Numbeo's site currency is EUR).
_HEADLINE_EUR_RE = re.compile(
    r"single\s+person[^<]*?(?:are|=)\s*€\s*([0-9][0-9,]*(?:\.[0-9]+)?)",
    re.IGNORECASE,
)

# The rent rows look like:
#   <td>Apartment (1 bedroom) in City Centre</td><td>...€2,317.19...</td>
# DOTALL lets the lazy `.*?` cross line breaks between the label cell and
# the first euro amount that follows it. Both spellings ("Centre" and
# "Center") are accepted, matching the outside-of-centre pattern below.
_RENT_CENTER_EUR_RE = re.compile(
    r"Apartment\s*\(1\s*bedroom\)\s*in\s*City\s*Cent(?:re|er).*?€\s*([0-9][0-9,]*(?:\.[0-9]+)?)",
    re.IGNORECASE | re.DOTALL,
)
_RENT_OUTSIDE_EUR_RE = re.compile(
    r"Apartment\s*\(1\s*bedroom\)\s*Outside\s*of\s*Cent(?:re|er).*?€\s*([0-9][0-9,]*(?:\.[0-9]+)?)",
    re.IGNORECASE | re.DOTALL,
)
|
|
|
|
|
|
class NumbeoFetchError(RuntimeError):
    """Raised when a Numbeo page cannot be fetched or parsed.

    HTTP and parsing failures are wrapped in this one type so the cache
    layer has a single exception to catch before falling back.
    """
|
|
|
|
|
|
def _parse_num(s: str) -> Decimal:
    """Turn a comma-grouped figure such as ``"2,317.19"`` into a Decimal."""
    ungrouped = "".join(s.split(","))
    return Decimal(ungrouped)
|
|
|
|
|
|
def _slug_to_url_segment(slug: str) -> str:
    """Build the Numbeo URL segment for a city slug.

    Every hyphen-separated word is capitalised (first letter upper-cased,
    the rest lower-cased), e.g. `ho-chi-minh-city` → `Ho-Chi-Minh-City`.
    """
    words = slug.split("-")
    return "-".join(map(str.capitalize, words))
|
|
|
|
|
|
class NumbeoScraper:
    """Async Numbeo fetcher with per-instance polite rate-limiting.

    Each `fetch` call issues a single GET (this class does not retry) and
    parses the EUR headline and 1-bed rent figures out of the returned HTML
    with the module-level regexes.

    Use as a context manager so the httpx client is cleanly closed:
        async with NumbeoScraper() as scraper:
            idx = await scraper.fetch("sofia")
    """

    def __init__(
        self,
        *,
        client: httpx.AsyncClient | None = None,
        min_interval: float = MIN_REQUEST_INTERVAL,
    ) -> None:
        """Set up the HTTP client and the rate-limit state.

        Args:
            client: Existing client to reuse. When omitted, a client with
                the module's User-Agent, timeouts and redirect-following is
                created and closed again on context exit.
            min_interval: Minimum number of seconds between two requests
                made through this instance.
        """
        # Ownership and client selection use the same `is None` test, so a
        # caller-supplied client is never replaced and never closed by us.
        self._owns_client = client is None
        if client is None:
            client = httpx.AsyncClient(
                headers={"User-Agent": USER_AGENT, "Accept-Language": "en-GB,en;q=0.9"},
                timeout=DEFAULT_TIMEOUT,
                follow_redirects=True,
            )
        self._client = client
        self._min_interval = min_interval
        # Event-loop timestamp of the most recent request slot handed out.
        self._last_request_at: float = 0.0
        # Serialises the interval check across concurrent fetch() calls.
        self._lock = asyncio.Lock()

    async def __aenter__(self) -> NumbeoScraper:
        """Return the scraper itself; nothing is opened lazily."""
        return self

    async def __aexit__(self, *_: object) -> None:
        """Close the HTTP client, but only if this instance created it."""
        if self._owns_client:
            await self._client.aclose()

    async def _polite_wait(self) -> None:
        """Sleep until at least `min_interval` has passed since the last request."""
        async with self._lock:
            now = asyncio.get_running_loop().time()
            elapsed = now - self._last_request_at
            if elapsed < self._min_interval:
                await asyncio.sleep(self._min_interval - elapsed)
            # Re-read the clock: the sleep above may have moved it forward.
            self._last_request_at = asyncio.get_running_loop().time()

    async def fetch(
        self,
        city_slug: str,
        *,
        country: str = "",
        raw_currency: str = "EUR",
    ) -> CityCostIndex:
        """Scrape one city's headline numbers from Numbeo.

        Args:
            city_slug: Lower-case hyphenated city name, e.g. ``"sofia"``.
            country: Copied onto the result unchanged.
            raw_currency: Currency code recorded on the result's source. It
                only selects the `gbp_per_unit` value; a code missing from
                LOCAL_TO_GBP falls back to the EUR rate and does not raise.

        Returns:
            The parsed cost index, with amounts converted from EUR to GBP.

        Raises:
            NumbeoFetchError: on an HTTP error or when the page cannot be
                parsed. The caller (cache layer) should catch and fall back
                to baseline.py.
        """
        url_segment = _slug_to_url_segment(city_slug)
        url = f"{BASE_URL}/{url_segment}"
        await self._polite_wait()
        try:
            resp = await self._client.get(url)
            resp.raise_for_status()
        except httpx.HTTPError as e:
            raise NumbeoFetchError(f"HTTP error for {url}: {e}") from e
        return self._parse(city_slug, country, raw_currency, url, resp.text)

    @staticmethod
    def _parse(
        city_slug: str,
        country: str,
        raw_currency: str,
        url: str,
        html: str,
    ) -> CityCostIndex:
        """Extract the headline and rent figures from a Numbeo page.

        The headline total and the city-centre rent are required; the
        outside-of-centre rent is optional and reported as None when absent.

        Raises:
            NumbeoFetchError: when a required figure is missing or a
                captured figure is not a valid decimal number.
        """
        headline_match = _HEADLINE_EUR_RE.search(html)
        rent_center_match = _RENT_CENTER_EUR_RE.search(html)
        rent_outside_match = _RENT_OUTSIDE_EUR_RE.search(html)
        if headline_match is None or rent_center_match is None:
            raise NumbeoFetchError(
                f"could not locate headline or rent rows on {url}"
            )
        try:
            no_rent_eur = _parse_num(headline_match.group(1))
            rent_center_eur = _parse_num(rent_center_match.group(1))
            rent_outside_eur = (
                _parse_num(rent_outside_match.group(1)) if rent_outside_match else None
            )
        except ArithmeticError as e:
            # decimal.InvalidOperation is an ArithmeticError; re-raise it as
            # the module's own error type so the caller's fallback still runs.
            raise NumbeoFetchError(f"could not parse amounts on {url}: {e}") from e

        no_rent_gbp = no_rent_eur * EUR_TO_GBP
        rent_center_gbp = rent_center_eur * EUR_TO_GBP
        rent_outside_gbp = (
            rent_outside_eur * EUR_TO_GBP if rent_outside_eur is not None else None
        )
        # Summed before rounding so the total is rounded only once.
        with_rent_gbp = no_rent_gbp + rent_center_gbp
        # `gbp_per_unit` reflects the conversion FROM the underlying
        # local currency, not the EUR-side intermediate. When the page
        # quotes a non-EUR local currency, downstream code may want the
        # local→GBP rate for display; we record what we know.
        gbp_per_unit = LOCAL_TO_GBP.get(raw_currency, EUR_TO_GBP)
        pence = Decimal("0.01")
        return CityCostIndex(
            city=_slug_to_url_segment(city_slug).replace("-", " "),
            city_slug=city_slug,
            country=country,
            total_single_no_rent_gbp=no_rent_gbp.quantize(pence),
            total_single_with_rent_gbp=with_rent_gbp.quantize(pence),
            breakdown=CategoryBreakdown(
                rent_1bed_center=rent_center_gbp.quantize(pence),
                rent_1bed_outside=(
                    rent_outside_gbp.quantize(pence)
                    if rent_outside_gbp is not None
                    else None
                ),
                # Live scraper does not extract per-category figures, so
                # these are zero placeholders rather than measured values.
                groceries=Decimal("0"),
                restaurants=Decimal("0"),
                transport=Decimal("0"),
                utilities=Decimal("0"),
                leisure=Decimal("0"),
            ),
            source=ColSource(
                name="numbeo",
                url=url,
                snapshot_date=date.today(),
                raw_currency=raw_currency,
                gbp_per_unit=gbp_per_unit,
            ),
        )
|
|
|
|
|
|
def compute_expires_at(ttl_days: int = 365) -> datetime:
|
|
"""One-place TTL helper so the cache + service stay in sync."""
|
|
return datetime.now(UTC) + timedelta(days=ttl_days)
|