fire-planner/fire_planner/col/expatistan.py
Viktor Barzin e72fd22a17 col: simulator auto-adjusts spending to local prices via Numbeo+Expatistan
The Monte Carlo used to compare jurisdictions at a flat London-equivalent
spend, which silently overstated the cost-of-living for any move to a
cheaper region. Now every cross-jurisdiction simulation auto-scales
spending_gbp by the real Numbeo/Expatistan ratio between the user's
baseline city and the target city.

Architecture:
- fire_planner/col/baseline.py — 22 cities with headline Numbeo data
  (source URLs + snapshot dates embedded) — fallback when scraper fails
- col/numbeo.py + col/expatistan.py — httpx async scrapers, regex-parsed,
  polite 1.1s rate-limit, EUR/USD anchored
- col/cache.py — PG-backed cache (col_snapshot table, 1-year TTL)
- col/service.py — sync compute_col_ratio() for the simulator; async
  lookup_city_cached() with source reconciliation for the refresh CronJob
- alembic 0005 — col_snapshot table, UNIQUE(city_slug, source_name)

Simulator wiring:
- SimulateRequest gains col_auto_adjust=True (default), col_baseline_city,
  col_target_city. Defaults pick the jurisdiction's representative city.
- _resolve_col_adjustment scales spending_gbp before path-building.
- SimulateResult surfaces col_multiplier_applied + col_adjusted_spending_gbp.

CLIs:
- python -m fire_planner col-seed — loads BASELINES into col_snapshot
  (post-migration seed step)
- python -m fire_planner col-refresh-stale --within-days 7 — used by the
  weekly fire-planner-col-refresh CronJob

268 tests pass. Mypy strict + ruff clean.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-22 14:14:57 +00:00

165 lines
5.9 KiB
Python

"""Expatistan HTML scraper — secondary COL source.
Used by the cache layer to cross-check Numbeo. Expatistan's page format
is different (price-of-living-index based, not absolute monthly figures),
so the headline we extract is their "single person, monthly cost"
estimate from the "Cost of Living in <city>" landing page.
Lower fidelity than Numbeo but ToS-friendlier — Expatistan publishes
their data under CC and explicitly allows non-commercial scraping.
Source-of-truth precedence (set in service.reconcile):
1. numbeo — primary, most data points
2. expatistan — secondary, cross-check
3. baseline — hand-curated fallback
"""
from __future__ import annotations
import asyncio
import logging
import re
from datetime import date
from decimal import Decimal
from typing import Final
import httpx
from fire_planner.col.models import CategoryBreakdown, CityCostIndex, ColSource
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Root of Expatistan's per-city pages; ExpatistanScraper.fetch appends
# "/<city_slug>" to build the request URL.
BASE_URL: Final = "https://www.expatistan.com/cost-of-living"
# Identifies this scraper to the site. Sent as the User-Agent header by the
# httpx client that ExpatistanScraper creates when none is injected.
USER_AGENT: Final = (
"fire-planner/0.1 (+https://forgejo.viktorbarzin.me/viktor/code; "
"non-commercial personal use; 1-year cache)"
)
# Per-phase httpx timeouts (seconds) for the scraper-owned client.
DEFAULT_TIMEOUT: Final = httpx.Timeout(connect=10.0, read=30.0, write=10.0, pool=10.0)
# Default minimum spacing, in seconds, between consecutive requests issued by
# one ExpatistanScraper instance (enforced in _polite_wait).
MIN_REQUEST_INTERVAL: Final = 1.1
# Expatistan publishes prices in USD by default. Convert to GBP.
# NOTE(review): this is a fixed, hard-coded rate rather than a live quote, so
# every GBP figure produced by this module inherits its staleness — confirm
# that is acceptable for the cache's lifetime.
USD_TO_GBP: Final = Decimal("0.787")
# Single-person monthly estimate appears in the page text as:
# "Cost of living in <City>, <Country> for an expat is $X" or similar
# Format varies; capture both "$X,XXX" and "$X" patterns.
_SINGLE_PERSON_USD_RE = re.compile(
r"(?:single\s+person|expat)[^$]*?\$\s*([0-9,]+(?:\.[0-9]+)?)",
re.IGNORECASE | re.DOTALL,
)
# Apartment rent (1 bedroom) appears on the "Prices" table line:
# "Rent for a furnished single room (1 bedroom) in city centre $X,XXX"
_RENT_CENTER_USD_RE = re.compile(
r"(?:1\s*bedroom|one[-\s]?bedroom)[^$<]*?(?:cent|expensive)[^$]*?"
r"\$\s*([0-9,]+(?:\.[0-9]+)?)",
re.IGNORECASE | re.DOTALL,
)
class ExpatistanFetchError(RuntimeError):
    """Raised for HTTP or parse failures so the cache layer can fall back.

    Callers that want a fallback source only need to catch this one type.
    """
def _parse_num(s: str) -> Decimal:
return Decimal(s.replace(",", ""))
class ExpatistanScraper:
    """Rate-limited async scraper for Expatistan's per-city pages.

    ``fetch`` downloads ``{BASE_URL}/{city_slug}``, extracts the single-person
    monthly total and the 1-bedroom city-centre rent with the module-level
    regexes, and converts both from USD to GBP using ``USD_TO_GBP``.

    The scraper can be used as an async context manager. On exit the HTTP
    client is closed only when the scraper created it itself; an injected
    client is left open for its owner to close.
    """

    def __init__(
        self,
        *,
        client: httpx.AsyncClient | None = None,
        min_interval: float = MIN_REQUEST_INTERVAL,
    ) -> None:
        """Set up the HTTP client and the request throttle.

        Args:
            client: Optional pre-built client to use. When ``None`` a client
                with this module's User-Agent, timeouts and redirect-following
                is created and owned by the scraper.
            min_interval: Minimum number of seconds between two consecutive
                requests made through this instance.
        """
        self._owns_client = client is None
        # Select the client with the same identity test as the ownership flag
        # so the two can never disagree (truthiness could, for a falsy client
        # object, leaving a self-created client that is never closed).
        self._client = (
            client
            if client is not None
            else httpx.AsyncClient(
                headers={"User-Agent": USER_AGENT, "Accept-Language": "en-GB,en;q=0.9"},
                timeout=DEFAULT_TIMEOUT,
                follow_redirects=True,
            )
        )
        self._min_interval = min_interval
        # Event-loop timestamp of the most recent request; 0.0 until the
        # first request has been made.
        self._last_request_at: float = 0.0
        # Serialises throttling so concurrent fetches queue up instead of
        # all passing the interval check at once.
        self._lock = asyncio.Lock()

    async def __aenter__(self) -> ExpatistanScraper:
        """Enter the async context; returns the scraper itself."""
        return self

    async def __aexit__(self, *_: object) -> None:
        """Close the HTTP client on exit, but only if this scraper owns it."""
        if self._owns_client:
            await self._client.aclose()

    async def _polite_wait(self) -> None:
        """Sleep as needed so requests are at least ``min_interval`` apart.

        The lock is held across the sleep, so concurrent callers are released
        one at a time, each spaced from the previous one.
        """
        async with self._lock:
            now = asyncio.get_running_loop().time()
            elapsed = now - self._last_request_at
            if elapsed < self._min_interval:
                await asyncio.sleep(self._min_interval - elapsed)
            # Re-read the clock: after sleeping, `now` is stale.
            self._last_request_at = asyncio.get_running_loop().time()

    async def fetch(
        self,
        city_slug: str,
        *,
        country: str = "",
    ) -> CityCostIndex:
        """Download and parse the Expatistan page for one city.

        Args:
            city_slug: Lowercase, hyphen-separated city identifier; appended
                to ``BASE_URL`` as-is.
            country: Country label copied verbatim into the result.

        Returns:
            The parsed cost index for the city, in GBP.

        Raises:
            ExpatistanFetchError: On any ``httpx.HTTPError`` (including
                non-2xx responses) or when the page cannot be parsed.
        """
        # Expatistan uses lowercase city slugs separated by hyphens —
        # same convention as our internal slugs.
        url = f"{BASE_URL}/{city_slug}"
        await self._polite_wait()
        try:
            resp = await self._client.get(url)
            resp.raise_for_status()
        except httpx.HTTPError as e:
            raise ExpatistanFetchError(f"HTTP error for {url}: {e}") from e
        return self._parse(city_slug, country, url, resp.text)

    @staticmethod
    def _parse(
        city_slug: str,
        country: str,
        url: str,
        html: str,
    ) -> CityCostIndex:
        """Extract the headline figures from a city page and convert to GBP.

        Args:
            city_slug: Slug of the city; also title-cased into a display name.
            country: Country label copied verbatim into the result.
            url: Page URL, recorded as the source and used in error messages.
            html: Raw page body to search.

        Returns:
            A ``CityCostIndex`` whose totals and centre rent are populated;
            the remaining breakdown categories are zero-filled because only
            the two headline figures are extracted here.

        Raises:
            ExpatistanFetchError: If either figure is missing, is not a valid
                number, or the derived no-rent total is not positive.
        """
        single_match = _SINGLE_PERSON_USD_RE.search(html)
        rent_match = _RENT_CENTER_USD_RE.search(html)
        if not (single_match and rent_match):
            raise ExpatistanFetchError(
                f"could not locate single-person or rent figure on {url}"
            )
        # Expatistan's "single person" headline is total with rent —
        # different convention from Numbeo. Use it as `total_with_rent`
        # directly; derive no_rent by subtracting their rent figure.
        try:
            with_rent_usd = _parse_num(single_match.group(1))
            rent_usd = _parse_num(rent_match.group(1))
        except ArithmeticError as e:
            # decimal.InvalidOperation derives from ArithmeticError. A token
            # with no digits (e.g. a lone ",") must surface as a fetch error
            # so the cache layer falls back instead of crashing.
            raise ExpatistanFetchError(
                f"unparseable price figure on {url}: "
                f"single={single_match.group(1)!r}, rent={rent_match.group(1)!r}"
            ) from e
        with_rent_gbp = with_rent_usd * USD_TO_GBP
        rent_gbp = rent_usd * USD_TO_GBP
        no_rent_gbp = with_rent_gbp - rent_gbp
        # Guard against malformed pages where rent > total (unusual but
        # possible if the regex grabs the wrong row).
        if no_rent_gbp <= 0:
            raise ExpatistanFetchError(
                f"derived no_rent <= 0 ({no_rent_gbp}) on {url}; "
                f"with_rent={with_rent_gbp}, rent={rent_gbp}"
            )
        return CityCostIndex(
            city=city_slug.replace("-", " ").title(),
            city_slug=city_slug,
            country=country,
            total_single_no_rent_gbp=no_rent_gbp.quantize(Decimal("0.01")),
            total_single_with_rent_gbp=with_rent_gbp.quantize(Decimal("0.01")),
            breakdown=CategoryBreakdown(
                rent_1bed_center=rent_gbp.quantize(Decimal("0.01")),
                rent_1bed_outside=None,
                groceries=Decimal("0"),
                restaurants=Decimal("0"),
                transport=Decimal("0"),
                utilities=Decimal("0"),
                leisure=Decimal("0"),
            ),
            source=ColSource(
                name="expatistan",
                url=url,
                snapshot_date=date.today(),
                raw_currency="USD",
                gbp_per_unit=USD_TO_GBP,
            ),
        )