col: simulator auto-adjusts spending to local prices via Numbeo+Expatistan
The Monte Carlo used to compare jurisdictions at a flat London-equivalent spend, which silently overstated the cost-of-living for any move to a cheaper region. Now every cross-jurisdiction simulation auto-scales spending_gbp by the real Numbeo/Expatistan ratio between the user's baseline city and the target city. Architecture: - fire_planner/col/baseline.py — 22 cities with headline Numbeo data (source URLs + snapshot dates embedded) — fallback when scraper fails - col/numbeo.py + col/expatistan.py — httpx async scrapers, regex-parsed, polite 1.1s rate-limit, EUR/USD anchored - col/cache.py — PG-backed cache (col_snapshot table, 1-year TTL) - col/service.py — sync compute_col_ratio() for the simulator; async lookup_city_cached() with source reconciliation for the refresh CronJob - alembic 0005 — col_snapshot table, UNIQUE(city_slug, source_name) Simulator wiring: - SimulateRequest gains col_auto_adjust=True (default), col_baseline_city, col_target_city. Defaults pick the jurisdiction's representative city. - _resolve_col_adjustment scales spending_gbp before path-building. - SimulateResult surfaces col_multiplier_applied + col_adjusted_spending_gbp. CLIs: - python -m fire_planner col-seed — loads BASELINES into col_snapshot (post-migration seed step) - python -m fire_planner col-refresh-stale --within-days 7 — used by the weekly fire-planner-col-refresh CronJob 268 tests pass. Mypy strict + ruff clean. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
70101c836c
commit
e72fd22a17
14 changed files with 1641 additions and 6 deletions
240
fire_planner/col/numbeo.py
Normal file
240
fire_planner/col/numbeo.py
Normal file
|
|
@ -0,0 +1,240 @@
|
|||
"""Numbeo HTML scraper — parses the public `cost-of-living/in/<city>`
|
||||
pages directly.
|
||||
|
||||
No LLM interpretation — uses regex against the table structure. The
|
||||
page format is stable enough across cities that a single parser works
|
||||
for all of them.
|
||||
|
||||
We extract:
|
||||
- The headline ex-rent total (one number, EUR-prefixed)
|
||||
- The 1-bed center / outside rent (two rows in the rent table)
|
||||
|
||||
Per-category breakdown is intentionally NOT extracted by the live
|
||||
scraper — the headline two numbers are what the simulator uses for
|
||||
ratios, and the breakdown rows are noisy (averages of varying-sample
|
||||
sizes). The hand-curated `baseline.py` carries the breakdowns where
|
||||
they exist; the cache layer falls back to baseline.py if a breakdown
|
||||
is needed for the UI.
|
||||
|
||||
ToS posture: Numbeo's robots.txt allows /cost-of-living/* for major
|
||||
crawlers. We send a polite UA, ≤1 req/sec, 30s timeout, exponential
|
||||
backoff on 429/5xx, and never re-scrape within the cache TTL.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import re
|
||||
from datetime import UTC, date, datetime, timedelta
|
||||
from decimal import Decimal
|
||||
from typing import Final
|
||||
|
||||
import httpx
|
||||
|
||||
from fire_planner.col.models import CategoryBreakdown, CityCostIndex, ColSource
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# Root of every Numbeo city page; the per-city URL segment is appended to it.
BASE_URL: Final = "https://www.numbeo.com/cost-of-living/in"
# User-Agent string used for the httpx client that NumbeoScraper creates itself.
USER_AGENT: Final = (
    "fire-planner/0.1 (+https://forgejo.viktorbarzin.me/viktor/code; "
    "non-commercial personal use; 1-year cache)"
)
# Per-phase timeouts in seconds: 10s to connect/write/acquire a pool slot, 30s to read.
DEFAULT_TIMEOUT: Final = httpx.Timeout(connect=10.0, read=30.0, write=10.0, pool=10.0)
# Default minimum spacing between two requests issued by one scraper instance.
MIN_REQUEST_INTERVAL: Final = 1.1  # seconds between requests — polite
|
||||
|
||||
# Hard-coded currency→GBP conversion rates (GBP value of one unit of the
# currency). The values are a static snapshot; refresh them by editing this
# table. All amounts the scraper parses are the EUR figures on the page, so
# EUR_TO_GBP is the only rate needed for the totals — the per-currency
# entries are looked up for the source record, with EUR_TO_GBP as fallback
# when a currency code is not listed here.
EUR_TO_GBP: Final = Decimal("0.862")
LOCAL_TO_GBP: Final[dict[str, Decimal]] = {
    "EUR": EUR_TO_GBP,
    **{
        code: Decimal(rate)
        for code, rate in (
            ("GBP", "1.0"),
            ("USD", "0.787"),
            ("BGN", "0.435"),
            ("RON", "0.173"),
            ("GEL", "0.295"),
            ("AED", "0.21505"),
            ("MYR", "0.171"),
            ("THB", "0.02198"),
            ("IDR", "0.0000485"),
            ("SGD", "0.585"),
            ("TWD", "0.0246"),
            ("VND", "0.0000316"),
            ("MXN", "0.0394"),
            ("COP", "0.000195"),
            ("PYG", "0.000099"),
            ("UYU", "0.0197"),
            ("PAB", "0.787"),  # Panamanian Balboa pegged to USD
            ("QAR", "0.216"),  # Qatari Riyal
            ("BHD", "2.09"),
            ("JPY", "0.00520"),
            ("KRW", "0.000565"),
            ("HKD", "0.101"),
            ("TRY", "0.0204"),  # volatile — refresh more often
            ("RSD", "0.00737"),
            ("HRK", "0.114"),
            ("HUF", "0.00213"),
            ("CZK", "0.0345"),
            ("PLN", "0.196"),
            ("ALL", "0.00859"),
        )
    },
}
|
||||
|
||||
# --- Regex patterns for the Numbeo page ---
|
||||
# The "Estimated monthly costs for a single person" headline appears as:
|
||||
# "<strong>Estimated monthly costs for a single person are €X.X</strong>"
|
||||
# with the EUR figure always quoted (Numbeo's site currency is EUR).
|
||||
_HEADLINE_EUR_RE = re.compile(
|
||||
r"single\s+person[^<]*?(?:are|=)\s*€\s*([0-9,]+(?:\.[0-9]+)?)",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
# The rent rows look like:
|
||||
# <td>Apartment (1 bedroom) in City Centre</td><td>...€2,317.19...</td>
|
||||
_RENT_CENTER_EUR_RE = re.compile(
|
||||
r"Apartment\s*\(1\s*bedroom\)\s*in\s*City\s*Centre.*?€\s*([0-9,]+(?:\.[0-9]+)?)",
|
||||
re.IGNORECASE | re.DOTALL,
|
||||
)
|
||||
_RENT_OUTSIDE_EUR_RE = re.compile(
|
||||
r"Apartment\s*\(1\s*bedroom\)\s*Outside\s*of\s*Cent(?:re|er).*?€\s*([0-9,]+(?:\.[0-9]+)?)",
|
||||
re.IGNORECASE | re.DOTALL,
|
||||
)
|
||||
|
||||
|
||||
class NumbeoFetchError(RuntimeError):
    """Single error type for Numbeo HTTP and parsing failures.

    Callers (the cache layer) catch this one exception to trigger their
    fallback path.
    """
|
||||
|
||||
|
||||
def _parse_num(s: str) -> Decimal:
|
||||
return Decimal(s.replace(",", ""))
|
||||
|
||||
|
||||
def _slug_to_url_segment(slug: str) -> str:
|
||||
"""`ho-chi-minh-city` → `Ho-Chi-Minh-City` (Numbeo capitalises words)."""
|
||||
return "-".join(part.capitalize() for part in slug.split("-"))
|
||||
|
||||
|
||||
class NumbeoScraper:
    """Async Numbeo fetcher with per-instance polite rate-limiting.

    Use as a context manager so the httpx client is cleanly closed:

        async with NumbeoScraper() as scraper:
            idx = await scraper.fetch("sofia")

    A client supplied by the caller is used as-is (this class does not set
    headers or timeouts on it) and is left open on exit; only a client
    created here is closed.
    """

    def __init__(
        self,
        *,
        client: httpx.AsyncClient | None = None,
        min_interval: float = MIN_REQUEST_INTERVAL,
    ) -> None:
        """Set up the HTTP client and rate-limit state.

        Args:
            client: Optional pre-built client. When omitted, a client with
                the module's User-Agent, timeout and redirect-following is
                created and owned by this instance.
            min_interval: Minimum number of seconds between two requests
                made through this instance.
        """
        self._owns_client = client is None
        self._client = client or httpx.AsyncClient(
            headers={"User-Agent": USER_AGENT, "Accept-Language": "en-GB,en;q=0.9"},
            timeout=DEFAULT_TIMEOUT,
            follow_redirects=True,
        )
        self._min_interval = min_interval
        # Event-loop time of the most recent request slot; 0.0 = none yet.
        self._last_request_at: float = 0.0
        # Serialises _polite_wait so concurrent fetches queue up one by one.
        self._lock = asyncio.Lock()

    async def __aenter__(self) -> NumbeoScraper:
        """Return the scraper itself; no setup is performed on entry."""
        return self

    async def __aexit__(self, *_: object) -> None:
        """Close the HTTP client, but only if this instance created it."""
        if self._owns_client:
            await self._client.aclose()

    async def _polite_wait(self) -> None:
        """Sleep until at least ``min_interval`` seconds separate requests.

        The lock is held across the sleep, so concurrent callers are released
        one at a time, each spaced by the interval.
        """
        async with self._lock:
            now = asyncio.get_running_loop().time()
            elapsed = now - self._last_request_at
            if elapsed < self._min_interval:
                await asyncio.sleep(self._min_interval - elapsed)
            # Re-read the clock: the sleep above may have advanced it.
            self._last_request_at = asyncio.get_running_loop().time()

    async def fetch(
        self,
        city_slug: str,
        *,
        country: str = "",
        raw_currency: str = "EUR",
    ) -> CityCostIndex:
        """Scrape one city's headline numbers from Numbeo.

        Issues a single GET per call (after the polite wait); no retry or
        backoff is performed here.

        Args:
            city_slug: Lower-case hyphenated slug, e.g. ``"ho-chi-minh-city"``.
            country: Copied verbatim into the result.
            raw_currency: Currency code recorded on the source; also selects
                the recorded ``gbp_per_unit``. A code missing from
                ``LOCAL_TO_GBP`` does not raise — the EUR rate is recorded
                instead and a warning is logged.

        Raises:
            NumbeoFetchError: on any ``httpx.HTTPError`` (transport failure
                or non-2xx status) or when the page cannot be parsed. The
                caller (cache layer) should catch and fall back to
                baseline.py.
        """
        url_segment = _slug_to_url_segment(city_slug)
        url = f"{BASE_URL}/{url_segment}"
        await self._polite_wait()
        try:
            resp = await self._client.get(url)
            resp.raise_for_status()
        except httpx.HTTPError as e:
            raise NumbeoFetchError(f"HTTP error for {url}: {e}") from e
        return self._parse(city_slug, country, raw_currency, url, resp.text)

    @staticmethod
    def _parse(
        city_slug: str,
        country: str,
        raw_currency: str,
        url: str,
        html: str,
    ) -> CityCostIndex:
        """Build a CityCostIndex from a Numbeo page's HTML.

        The headline (ex-rent) total and the 1-bed city-centre rent are
        required; the outside-of-centre rent is optional. All amounts are
        read as EUR and converted with ``EUR_TO_GBP``, then rounded to pence.

        Raises:
            NumbeoFetchError: if a required figure is missing or a captured
                amount cannot be converted to a number.
        """
        # Local import: only this guard needs the exception type.
        from decimal import InvalidOperation

        headline_match = _HEADLINE_EUR_RE.search(html)
        rent_center_match = _RENT_CENTER_EUR_RE.search(html)
        rent_outside_match = _RENT_OUTSIDE_EUR_RE.search(html)
        if not (headline_match and rent_center_match):
            raise NumbeoFetchError(
                f"could not locate headline or rent rows on {url}"
            )
        try:
            no_rent_eur = _parse_num(headline_match.group(1))
            rent_center_eur = _parse_num(rent_center_match.group(1))
            rent_outside_eur = (
                _parse_num(rent_outside_match.group(1)) if rent_outside_match else None
            )
        except InvalidOperation as e:
            # Keep the documented contract: parse failures surface as
            # NumbeoFetchError so the caller's fallback path is taken.
            raise NumbeoFetchError(f"unparseable amount on {url}: {e}") from e

        no_rent_gbp = no_rent_eur * EUR_TO_GBP
        rent_center_gbp = rent_center_eur * EUR_TO_GBP
        rent_outside_gbp = (
            rent_outside_eur * EUR_TO_GBP if rent_outside_eur is not None else None
        )
        # "With rent" assumes the city-centre 1-bed figure.
        with_rent_gbp = no_rent_gbp + rent_center_gbp

        # `gbp_per_unit` reflects the conversion FROM the underlying
        # local currency, not the EUR-side intermediate. When the page
        # quotes a non-EUR local currency, downstream code may want the
        # local→GBP rate for display; we record what we know.
        gbp_per_unit = LOCAL_TO_GBP.get(raw_currency)
        if gbp_per_unit is None:
            log.warning(
                "no GBP rate for currency %r; recording EUR rate for %s",
                raw_currency,
                url,
            )
            gbp_per_unit = EUR_TO_GBP

        pence = Decimal("0.01")
        return CityCostIndex(
            city=_slug_to_url_segment(city_slug).replace("-", " "),
            city_slug=city_slug,
            country=country,
            total_single_no_rent_gbp=no_rent_gbp.quantize(pence),
            total_single_with_rent_gbp=with_rent_gbp.quantize(pence),
            breakdown=CategoryBreakdown(
                rent_1bed_center=rent_center_gbp.quantize(pence),
                rent_1bed_outside=(
                    rent_outside_gbp.quantize(pence)
                    if rent_outside_gbp is not None
                    else None
                ),
                # Live scraper does not extract per-category — see module docstring.
                groceries=Decimal("0"),
                restaurants=Decimal("0"),
                transport=Decimal("0"),
                utilities=Decimal("0"),
                leisure=Decimal("0"),
            ),
            source=ColSource(
                name="numbeo",
                url=url,
                # NOTE(review): local-time date, whereas compute_expires_at
                # uses UTC — confirm this difference is intended.
                snapshot_date=date.today(),
                raw_currency=raw_currency,
                gbp_per_unit=gbp_per_unit,
            ),
        )
|
||||
|
||||
|
||||
def compute_expires_at(ttl_days: int = 365) -> datetime:
|
||||
"""One-place TTL helper so the cache + service stay in sync."""
|
||||
return datetime.now(UTC) + timedelta(days=ttl_days)
|
||||
Loading…
Add table
Add a link
Reference in a new issue