col: simulator auto-adjusts spending to local prices via Numbeo+Expatistan
The Monte Carlo used to compare jurisdictions at a flat London-equivalent spend, which silently overstated the cost-of-living for any move to a cheaper region. Now every cross-jurisdiction simulation auto-scales spending_gbp by the real Numbeo/Expatistan ratio between the user's baseline city and the target city. Architecture: - fire_planner/col/baseline.py — 22 cities with headline Numbeo data (source URLs + snapshot dates embedded) — fallback when scraper fails - col/numbeo.py + col/expatistan.py — httpx async scrapers, regex-parsed, polite 1.1s rate-limit, EUR/USD anchored - col/cache.py — PG-backed cache (col_snapshot table, 1-year TTL) - col/service.py — sync compute_col_ratio() for the simulator; async lookup_city_cached() with source reconciliation for the refresh CronJob - alembic 0005 — col_snapshot table, UNIQUE(city_slug, source_name) Simulator wiring: - SimulateRequest gains col_auto_adjust=True (default), col_baseline_city, col_target_city. Defaults pick the jurisdiction's representative city. - _resolve_col_adjustment scales spending_gbp before path-building. - SimulateResult surfaces col_multiplier_applied + col_adjusted_spending_gbp. CLIs: - python -m fire_planner col-seed — loads BASELINES into col_snapshot (post-migration seed step) - python -m fire_planner col-refresh-stale --within-days 7 — used by the weekly fire-planner-col-refresh CronJob 268 tests pass. Mypy strict + ruff clean. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
70101c836c
commit
e72fd22a17
14 changed files with 1641 additions and 6 deletions
165
fire_planner/col/expatistan.py
Normal file
165
fire_planner/col/expatistan.py
Normal file
|
|
@ -0,0 +1,165 @@
|
|||
"""Expatistan HTML scraper — secondary COL source.
|
||||
|
||||
Used by the cache layer to cross-check Numbeo. Expatistan's page format
|
||||
is different (price-of-living-index based, not absolute monthly figures),
|
||||
so the headline we extract is their "single person, monthly cost"
|
||||
estimate from the "Cost of Living in <city>" landing page.
|
||||
|
||||
Lower fidelity than Numbeo but ToS-friendlier — Expatistan publishes
|
||||
their data under CC and explicitly allows non-commercial scraping.
|
||||
|
||||
Source-of-truth precedence (set in service.reconcile):
|
||||
1. numbeo — primary, most data points
|
||||
2. expatistan — secondary, cross-check
|
||||
3. baseline — hand-curated fallback
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import re
|
||||
from datetime import date
|
||||
from decimal import Decimal
|
||||
from typing import Final
|
||||
|
||||
import httpx
|
||||
|
||||
from fire_planner.col.models import CategoryBreakdown, CityCostIndex, ColSource
|
||||
|
||||
log = logging.getLogger(__name__)

# Root of Expatistan's per-city pages; the city slug is appended to it.
BASE_URL: Final = "https://www.expatistan.com/cost-of-living"

# Sent on every request made by a scraper-owned client so the site operator
# can see who is calling and for what purpose.
USER_AGENT: Final = (
    "fire-planner/0.1 "
    "(+https://forgejo.viktorbarzin.me/viktor/code; "
    "non-commercial personal use; 1-year cache)"
)

# Per-phase HTTP timeouts, in seconds, for scraper-owned clients.
DEFAULT_TIMEOUT: Final = httpx.Timeout(connect=10.0, read=30.0, write=10.0, pool=10.0)

# Default minimum spacing, in seconds, between consecutive requests issued
# by one scraper instance.
MIN_REQUEST_INTERVAL: Final = 1.1

# Expatistan publishes prices in USD by default. Convert to GBP.
# NOTE(review): this is a hard-coded rate, not a live FX lookup — confirm it
# is refreshed whenever the baseline data is re-snapshotted.
USD_TO_GBP: Final = Decimal("0.787")
|
||||
|
||||
# Single-person monthly estimate appears in the page text as:
# "Cost of living in <City>, <Country> for an expat is $X" or similar
# Format varies; capture both "$X,XXX" and "$X" patterns.
#
# The keyword is wrapped in word boundaries so that the site's own name
# ("Expatistan", present in the title and navigation of every page) cannot
# act as the anchor and make the pattern grab the first dollar amount on the
# page. The captured number must start with a digit so a stray "," after the
# "$" is never handed to Decimal().
_SINGLE_PERSON_USD_RE = re.compile(
    r"\b(?:single\s+person|expats?)\b[^$]*?\$\s*([0-9][0-9,]*(?:\.[0-9]+)?)",
    re.IGNORECASE | re.DOTALL,
)

# Apartment rent (1 bedroom) appears on the "Prices" table line:
# "Rent for a furnished single room (1 bedroom) in city centre $X,XXX"
#
# "cent" deliberately stays a prefix (centre / center / central) but must
# begin a word, so "percent" or "recent" no longer qualify; likewise
# "inexpensive" no longer satisfies the "expensive" alternative.
# NOTE(review): the wording above is taken from the original comment —
# confirm it against a live Expatistan page.
_RENT_CENTER_USD_RE = re.compile(
    r"\b(?:1\s*bedroom|one[-\s]?bedroom)[^$<]*?\b(?:cent|expensive)[^$]*?"
    r"\$\s*([0-9][0-9,]*(?:\.[0-9]+)?)",
    re.IGNORECASE | re.DOTALL,
)
|
||||
|
||||
|
||||
class ExpatistanFetchError(RuntimeError):
    """Raised when an Expatistan page cannot be fetched or parsed.

    A single exception type covers both HTTP and parse failures so the cache
    layer can catch it and fall back to another source.
    """
|
||||
|
||||
|
||||
def _parse_num(s: str) -> Decimal:
    """Convert a figure such as ``"1,234.50"`` to a ``Decimal``.

    Thousands separators (commas) are dropped before conversion; everything
    else is passed to ``Decimal`` unchanged.
    """
    without_separators = "".join(s.split(","))
    return Decimal(without_separators)
|
||||
|
||||
|
||||
class ExpatistanScraper:
    """Rate-limited async client for Expatistan city cost-of-living pages.

    Intended to be used as an async context manager. On exit the underlying
    HTTP client is closed only when this scraper created it itself; a client
    supplied by the caller is left open.
    """

    def __init__(
        self,
        *,
        client: httpx.AsyncClient | None = None,
        min_interval: float = MIN_REQUEST_INTERVAL,
    ) -> None:
        """Prepare the HTTP client and the throttle state.

        Args:
            client: Existing client to reuse. When not given, a client with
                this module's User-Agent, timeout and redirect-following is
                created and owned by the scraper.
            min_interval: Minimum number of seconds between the start of two
                consecutive requests.
        """
        self._owns_client = client is None
        if client:
            self._client = client
        else:
            self._client = httpx.AsyncClient(
                headers={"User-Agent": USER_AGENT, "Accept-Language": "en-GB,en;q=0.9"},
                timeout=DEFAULT_TIMEOUT,
                follow_redirects=True,
            )
        self._min_interval = min_interval
        # Event-loop timestamp of the most recent request slot handed out.
        self._last_request_at: float = 0.0
        # Serialises throttle bookkeeping across concurrent fetch() calls.
        self._lock = asyncio.Lock()

    async def __aenter__(self) -> ExpatistanScraper:
        """Enter the context; the scraper itself is the managed object."""
        return self

    async def __aexit__(self, *exc_info: object) -> None:
        """Close the HTTP client on exit, but only if the scraper owns it."""
        if self._owns_client:
            await self._client.aclose()

    async def _polite_wait(self) -> None:
        """Sleep as needed so requests are at least ``min_interval`` apart.

        The lock is held while sleeping, so concurrent callers queue up and
        are released one interval at a time.
        """
        async with self._lock:
            loop = asyncio.get_running_loop()
            elapsed = loop.time() - self._last_request_at
            wait_for = self._min_interval - elapsed
            if wait_for > 0:
                await asyncio.sleep(wait_for)
            # Re-read the clock: the sleep above may have moved it forward.
            self._last_request_at = asyncio.get_running_loop().time()

    async def fetch(
        self,
        city_slug: str,
        *,
        country: str = "",
    ) -> CityCostIndex:
        """Download and parse the Expatistan page for one city.

        Args:
            city_slug: Lowercase, hyphen-separated city identifier; appended
                to ``BASE_URL`` as-is.
            country: Country name copied verbatim into the result.

        Returns:
            The parsed cost index for the city.

        Raises:
            ExpatistanFetchError: On any ``httpx.HTTPError`` (including a
                non-success status) or when the page cannot be parsed.
        """
        # Expatistan uses lowercase city slugs separated by hyphens —
        # same convention as our internal slugs.
        page_url = f"{BASE_URL}/{city_slug}"
        await self._polite_wait()
        try:
            response = await self._client.get(page_url)
            response.raise_for_status()
        except httpx.HTTPError as exc:
            raise ExpatistanFetchError(f"HTTP error for {page_url}: {exc}") from exc
        return self._parse(city_slug, country, page_url, response.text)

    @staticmethod
    def _parse(
        city_slug: str,
        country: str,
        url: str,
        html: str,
    ) -> CityCostIndex:
        """Extract the headline and rent figures from a page and build the index.

        Args:
            city_slug: Slug the page was requested for; also used to derive
                the display name of the city.
            country: Country name copied verbatim into the result.
            url: Page address, recorded as the source and used in errors.
            html: Raw page text to search.

        Returns:
            A ``CityCostIndex`` in GBP, rounded to two decimal places. Only
            the city-centre rent is populated in the breakdown; the other
            categories are set to zero.

        Raises:
            ExpatistanFetchError: If either figure is missing, or if the rent
                is not smaller than the headline total.
        """
        headline_match = _SINGLE_PERSON_USD_RE.search(html)
        rent_match = _RENT_CENTER_USD_RE.search(html)
        if headline_match is None or rent_match is None:
            raise ExpatistanFetchError(
                f"could not locate single-person or rent figure on {url}"
            )

        # Expatistan's "single person" headline is total with rent —
        # different convention from Numbeo. It is used as the with-rent
        # total directly; the no-rent total is derived by subtracting rent.
        headline_usd = _parse_num(headline_match.group(1))
        rent_usd = _parse_num(rent_match.group(1))
        with_rent_gbp = headline_usd * USD_TO_GBP
        rent_gbp = rent_usd * USD_TO_GBP
        no_rent_gbp = with_rent_gbp - rent_gbp

        # Guard against malformed pages where rent > total (unusual but
        # possible if the regex grabs the wrong row).
        if no_rent_gbp <= 0:
            raise ExpatistanFetchError(
                f"derived no_rent <= 0 ({no_rent_gbp}) on {url}; "
                f"with_rent={with_rent_gbp}, rent={rent_gbp}"
            )

        pence = Decimal("0.01")
        zero = Decimal("0")
        display_name = " ".join(city_slug.split("-")).title()
        return CityCostIndex(
            city=display_name,
            city_slug=city_slug,
            country=country,
            total_single_no_rent_gbp=no_rent_gbp.quantize(pence),
            total_single_with_rent_gbp=with_rent_gbp.quantize(pence),
            breakdown=CategoryBreakdown(
                rent_1bed_center=rent_gbp.quantize(pence),
                rent_1bed_outside=None,
                groceries=zero,
                restaurants=zero,
                transport=zero,
                utilities=zero,
                leisure=zero,
            ),
            source=ColSource(
                name="expatistan",
                url=url,
                snapshot_date=date.today(),
                raw_currency="USD",
                gbp_per_unit=USD_TO_GBP,
            ),
        )
|
||||
Loading…
Add table
Add a link
Reference in a new issue