col: simulator auto-adjusts spending to local prices via Numbeo+Expatistan

The Monte Carlo used to compare jurisdictions at a flat London-equivalent
spend, which silently overstated the cost-of-living for any move to a
cheaper region. Now every cross-jurisdiction simulation auto-scales
spending_gbp by the real Numbeo/Expatistan ratio between the user's
baseline city and the target city.

Architecture:
- fire_planner/col/baseline.py — 22 cities with headline Numbeo data
  (source URLs + snapshot dates embedded) — fallback when scraper fails
- col/numbeo.py + col/expatistan.py — httpx async scrapers, regex-parsed,
  polite 1.1s rate-limit, EUR/USD anchored
- col/cache.py — PG-backed cache (col_snapshot table, 1-year TTL)
- col/service.py — sync compute_col_ratio() for the simulator; async
  lookup_city_cached() with source reconciliation for the refresh CronJob
- alembic 0005 — col_snapshot table, UNIQUE(city_slug, source_name)

Simulator wiring:
- SimulateRequest gains col_auto_adjust=True (default), col_baseline_city,
  col_target_city. Defaults pick the jurisdiction's representative city.
- _resolve_col_adjustment scales spending_gbp before path-building.
- SimulateResult surfaces col_multiplier_applied + col_adjusted_spending_gbp.

CLIs:
- python -m fire_planner col-seed — loads BASELINES into col_snapshot
  (post-migration seed step)
- python -m fire_planner col-refresh-stale --within-days 7 — used by the
  weekly fire-planner-col-refresh CronJob

268 tests pass. Mypy strict + ruff clean.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-05-22 14:14:57 +00:00
parent 70101c836c
commit e72fd22a17
14 changed files with 1641 additions and 6 deletions

View file

@ -0,0 +1,73 @@
"""add col_snapshot table for cached cost-of-living data
Revision ID: 0005
Revises: 0004
Create Date: 2026-05-21 12:00:00.000000
Phase 2 of the cost-of-living subsystem (`fire_planner.col`). Caches
Numbeo / Expatistan headline data with a 1-year TTL so the simulator
can scale `spending_gbp` to local prices without re-scraping per-call.
Refresh is async (Phase-3 CronJob); user-facing lookups never block on
the network in the steady state.
Unique on (city_slug, source_name) — multiple sources per city are
allowed; service.py reconciles them when computing the headline.
"""
from collections.abc import Sequence
import sqlalchemy as sa
from alembic import op
# Alembic revision identifiers: this migration is 0005 and applies on top of 0004.
revision: str = "0005"
down_revision: str | None = "0004"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
# Database schema passed to every create/drop call in this migration.
SCHEMA = "fire_planner"
def upgrade() -> None:
    """Create the `col_snapshot` cache table and its two lookup indexes.

    The table allows one row per (city_slug, source_name) pair, enforced by
    the `uq_col_snapshot_city_source` unique constraint. GBP amounts are
    NUMERIC(12, 2); the conversion rate `gbp_per_unit` is NUMERIC(12, 8).
    """
    table = "col_snapshot"

    # Column order here is the column order of the created table.
    columns = [
        sa.Column("id", sa.Integer(), nullable=False, autoincrement=True),
        sa.Column("city_slug", sa.String(length=64), nullable=False),
        sa.Column("city_display", sa.String(length=128), nullable=False),
        sa.Column("country", sa.String(length=64), nullable=False),
        sa.Column("source_name", sa.String(length=32), nullable=False),
        sa.Column("source_url", sa.String(), nullable=True),
        sa.Column("snapshot_date", sa.Date(), nullable=False),
        sa.Column(
            "fetched_at",
            sa.TIMESTAMP(timezone=True),
            nullable=False,
            server_default=sa.func.now(),
        ),
        sa.Column("expires_at", sa.TIMESTAMP(timezone=True), nullable=False),
        sa.Column("total_no_rent_gbp", sa.Numeric(12, 2), nullable=False),
        sa.Column("total_with_rent_gbp", sa.Numeric(12, 2), nullable=False),
        sa.Column("rent_1bed_center_gbp", sa.Numeric(12, 2), nullable=False),
        sa.Column("rent_1bed_outside_gbp", sa.Numeric(12, 2), nullable=True),
        sa.Column(
            "raw_currency",
            sa.String(length=3),
            nullable=False,
            server_default=sa.text("'GBP'"),
        ),
        sa.Column(
            "gbp_per_unit",
            sa.Numeric(12, 8),
            nullable=False,
            server_default=sa.text("1"),
        ),
        sa.Column("by_category_json", sa.JSON(), nullable=True),
    ]
    constraints = [
        sa.PrimaryKeyConstraint("id"),
        sa.UniqueConstraint("city_slug", "source_name", name="uq_col_snapshot_city_source"),
    ]
    op.create_table(table, *columns, *constraints, schema=SCHEMA)

    # Single-column indexes: per-city lookups and expiry scans.
    single_column_indexes = (
        ("ix_col_snapshot_city_slug", "city_slug"),
        ("ix_col_snapshot_expires_at", "expires_at"),
    )
    for index_name, column_name in single_column_indexes:
        op.create_index(index_name, table, [column_name], schema=SCHEMA)
def downgrade() -> None:
    """Undo `upgrade`: drop both indexes, then the `col_snapshot` table."""
    table = "col_snapshot"
    # Reverse order of creation: expires_at index first, then city_slug.
    for index_name in ("ix_col_snapshot_expires_at", "ix_col_snapshot_city_slug"):
        op.drop_index(index_name, table_name=table, schema=SCHEMA)
    op.drop_table(table, schema=SCHEMA)

View file

@ -57,6 +57,103 @@ def migrate() -> None:
sys.exit(rc.returncode)
@cli.command("col-seed")
@click.option("--ttl-days",
              # A TTL below 1 day would write rows whose expires_at is not in
              # the future, i.e. rows the cache would never serve as fresh.
              type=click.IntRange(min=1),
              default=365,
              help="Cache TTL in days (default 365 — matches Viktor's 1y choice).")
def col_seed(ttl_days: int) -> None:
    """Seed `col_snapshot` from baseline.py BASELINES.

    Idempotent — uses upsert on (city_slug, source_name). Run once after
    the alembic migration creates the table. Subsequent live-scrape
    refreshes (Phase 3 CronJob) supersede these rows; the baseline
    fallback remains as a last-resort source.
    """
    asyncio.run(_col_seed(ttl_days))
async def _col_seed(ttl_days: int) -> None:
    """Upsert every entry of `BASELINES` into `col_snapshot`.

    Each entry is written with its source name rewritten to `baseline`, so
    the row occupies its own (city_slug, source_name) slot and a later live
    scrape stored under `numbeo` does not collide with it. One line is
    echoed per city and a summary line at the end. The engine is disposed
    even if an upsert raises.

    Args:
        ttl_days: Lifetime, in days, passed through to the cache upsert.
    """
    from fire_planner.col.baseline import BASELINES
    from fire_planner.col.cache import upsert as col_upsert

    db_engine = create_engine_from_env()
    open_session = make_session_factory(db_engine)
    try:
        async with open_session() as session:
            for city_slug, baseline_idx in BASELINES.items():
                # Re-tag the source first, then swap it into a copy of the index.
                baseline_source = baseline_idx.source.model_copy(update={"name": "baseline"})
                seed_row = baseline_idx.model_copy(update={"source": baseline_source})
                await col_upsert(session, seed_row, ttl_days=ttl_days)
                total_gbp = baseline_idx.total_single_with_rent_gbp
                click.echo(f" seeded {city_slug:20s} total={total_gbp} GBP")
    finally:
        await db_engine.dispose()
    click.echo(f"\ncol-seed: {len(BASELINES)} cities upserted (ttl_days={ttl_days}).")
@cli.command("col-refresh-stale")
@click.option(
    "--within-days",
    type=int,
    default=7,
    help="Refresh rows whose expires_at is within this many days.",
)
@click.option(
    "--ttl-days",
    type=int,
    default=365,
    help="TTL for re-written rows (default 365).",
)
def col_refresh_stale(within_days: int, ttl_days: int) -> None:
    """Refresh cost-of-living rows that expire within `within_days` days.

    Intended for the weekly CronJob. Runs the async worker
    `_col_refresh_stale` to completion; that worker picks the affected
    cities out of `col_snapshot` and calls `service.lookup_city_cached`
    for each of them.
    """
    refresh_job = _col_refresh_stale(within_days=within_days, ttl_days=ttl_days)
    asyncio.run(refresh_job)
async def _col_refresh_stale(within_days: int, ttl_days: int) -> None:
    """Refresh every city that has a `col_snapshot` row expiring soon.

    Selects the distinct (city_slug, country) pairs with at least one row
    whose `expires_at` is at or before now + `within_days` days, then calls
    `service.lookup_city_cached` once per pair. A failure for one city is
    reported on stderr and does not stop the remaining cities. The engine
    is disposed on every exit path.

    Args:
        within_days: Size of the look-ahead window, in days.
        ttl_days: Only echoed in the final summary line here.
            NOTE(review): it is not forwarded to `lookup_city_cached`, so
            `--ttl-days` may have no effect on the rows written — confirm
            against the service signature.
    """
    from datetime import UTC, datetime, timedelta

    from sqlalchemy import select

    from fire_planner.col.service import lookup_city_cached
    from fire_planner.db import ColSnapshot

    engine = create_engine_from_env()
    factory = make_session_factory(engine)
    # Compare against a bound timestamp instead of splicing
    # "NOW() + INTERVAL '<n> days'" into raw SQL: no string-built SQL, and the
    # predicate is not tied to PostgreSQL syntax.
    cutoff = datetime.now(UTC) + timedelta(days=within_days)
    refreshed = 0
    failed = 0
    try:
        async with factory() as sess:
            # Distinct cities having at least one row that expires by the cutoff.
            stmt = (
                select(ColSnapshot.city_slug, ColSnapshot.country)
                .distinct()
                .where(ColSnapshot.expires_at <= cutoff)
            )
            rows = (await sess.execute(stmt)).all()
            click.echo(f"col-refresh-stale: {len(rows)} city(ies) need refresh "
                       f"(within_days={within_days})")
            for slug, country in rows:
                try:
                    # lookup_city_cached upserts on cache miss.
                    # NOTE(review): cache.read_fresh treats a row as fresh while
                    # expires_at > now, so a row selected here that has not yet
                    # expired may be served from cache rather than re-scraped —
                    # confirm the miss condition used by lookup_city_cached.
                    idx = await lookup_city_cached(sess, slug, country=country or "")
                    click.echo(f" refreshed {slug:20s}{idx.source.name:10s} "
                               f"total={idx.total_single_with_rent_gbp}")
                    refreshed += 1
                except Exception as e:  # broad — log and continue per-city
                    # A failed statement leaves the session's transaction
                    # unusable; roll back so the next city starts clean.
                    await sess.rollback()
                    click.echo(f" FAILED {slug}: {e}", err=True)
                    failed += 1
    finally:
        await engine.dispose()
    click.echo(f"\ncol-refresh-stale done: refreshed={refreshed} failed={failed} "
               f"ttl_days={ttl_days}")
@cli.command("ingest")
@click.option("--source",
type=click.Choice(["wealthfolio"]),

View file

@ -504,6 +504,22 @@ class SimulateRequest(BaseModel):
annual_real_adjust_pct: Decimal = Decimal("0")
guardrail_threshold_pct: Decimal | None = None
guardrail_cut_pct: Decimal = Decimal("0.10")
# Cost-of-living auto-adjust: when `col_auto_adjust=True`, the
# simulator looks up COL ratio (target/baseline) from `fire_planner.col`
# and scales `spending_gbp` BEFORE running paths. Defaults to True so
# cross-jurisdiction comparisons are honest by default — earlier
# comparisons used hand-wave 0.5x/0.75x multipliers, which were
# consistently optimistic vs. actual Numbeo data (Bulgaria is 0.41x,
# not 0.50x; Cyprus 0.67x, not 0.75x).
#
# `col_target_city` defaults to the jurisdiction's representative
# city (uk→london, cyprus→limassol, etc.). Set explicitly to anchor
# on a different city (e.g. `cyprus`+`paphos` if Limassol is too
# expensive a proxy). For `jurisdiction='nomad'` there is no
# representative city and auto-adjust is skipped silently.
col_auto_adjust: bool = True
col_baseline_city: str = "london"
col_target_city: str | None = None
class SimulateResult(BaseModel):
@ -516,6 +532,13 @@ class SimulateResult(BaseModel):
elapsed_seconds: Decimal
yearly: list[ProjectionPoint]
goals_probability: list[GoalProbability] = Field(default_factory=list)
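# When `col_auto_adjust=True`, surface the applied multiplier + the
# COL-adjusted spending so the user can see what was used. The
# multiplier and adjusted spending are null when auto-adjust was off,
# the jurisdiction had no representative city (nomad), baseline==target
# (London-to-London), or a city is unknown to the COL lookup;
# `col_target_city` is still populated in the last two cases.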
col_multiplier_applied: Decimal | None = None
col_adjusted_spending_gbp: Decimal | None = None
col_target_city: str | None = None
class CompareRequest(BaseModel):

View file

@ -26,6 +26,7 @@ from fire_planner.api.schemas import (
SimulateRequest,
SimulateResult,
)
from fire_planner.col import compute_col_ratio, representative_city_for
from fire_planner.flex_spending import FlexRule as EngineFlexRule
from fire_planner.glide_path import static
from fire_planner.goals_eval import evaluate_goals
@ -50,6 +51,36 @@ router = APIRouter(tags=["simulate"])
_RETURNS_CSV = Path("/data/shiller_returns.csv")
def _resolve_col_adjustment(
    req: SimulateRequest,
) -> tuple[SimulateRequest, Decimal | None, Decimal | None, str | None]:
    """Scale `req.spending_gbp` by the cost-of-living ratio when enabled.

    Returns a 4-tuple: the request to simulate (a copy with scaled
    spending, or `req` itself when nothing was applied), the multiplier,
    the scaled spending, and the resolved target city slug. The multiplier
    and scaled spending are None whenever the adjustment is skipped, which
    happens silently when:

    - `col_auto_adjust` is False (target city is None too)
    - no target city is given and the jurisdiction has no representative
      city, e.g. nomad (target city is None too)
    - the target city equals `col_baseline_city` (identity transform)
    - `compute_col_ratio` raises KeyError for the pair — degrade
      gracefully rather than fail the request
    """
    if not req.col_auto_adjust:
        return req, None, None, None

    # An explicit target wins; otherwise fall back to the jurisdiction default.
    target_city = req.col_target_city or representative_city_for(req.jurisdiction)
    if target_city is None or target_city == req.col_baseline_city:
        return req, None, None, target_city

    try:
        multiplier = compute_col_ratio(req.col_baseline_city, target_city)
    except KeyError:
        return req, None, None, target_city

    scaled_spend = req.spending_gbp * multiplier
    scaled_req = req.model_copy(update={"spending_gbp": scaled_spend})
    return scaled_req, multiplier, scaled_spend, target_city
def _shiller_paths(seed: int, n_paths: int, n_years: int) -> np.ndarray:
bundle = (load_from_csv(_RETURNS_CSV) if _RETURNS_CSV.exists() else synthetic_returns(seed=42))
rng = np.random.default_rng(seed)
@ -193,6 +224,9 @@ def _to_response(
result: SimulationResult,
elapsed: float,
req: SimulateRequest | None = None,
col_multiplier: Decimal | None = None,
col_adjusted_spend: Decimal | None = None,
col_target_city: str | None = None,
) -> SimulateResult:
# portfolio_real has n_years+1 columns (year 0 = seed, year k = end-of-year k).
# withdrawal_real / tax_real have n_years columns (year k = withdrawn in year k+1).
@ -243,27 +277,34 @@ def _to_response(
elapsed_seconds=Decimal(str(round(elapsed, 3))),
yearly=yearly,
goals_probability=goals_probability,
col_multiplier_applied=(Decimal(str(round(float(col_multiplier), 6)))
if col_multiplier is not None else None),
col_adjusted_spending_gbp=(Decimal(str(round(float(col_adjusted_spend), 2)))
if col_adjusted_spend is not None else None),
col_target_city=col_target_city,
)
@router.post("/simulate", response_model=SimulateResult)
async def simulate_one(req: SimulateRequest) -> SimulateResult:
    """Run one scenario synchronously, no DB write. ~1-3s for 5k paths.

    The cost-of-living adjustment is resolved first; path building, the
    projection and the response all use the adjusted request, and the
    response carries the applied multiplier, adjusted spending and target
    city.

    Raises:
        HTTPException: 400 when the projection raises KeyError.
    """
    adjusted_req, mult, adj_spend, target_city = _resolve_col_adjustment(req)
    paths = await _build_paths(adjusted_req)
    try:
        # The projection runs in a worker thread so the event loop stays free.
        result, elapsed = await asyncio.to_thread(_project, adjusted_req, paths)
    except KeyError as e:
        raise HTTPException(status_code=400, detail=f"Unknown name: {e}") from None
    return _to_response(result, elapsed, adjusted_req, mult, adj_spend, target_city)
@router.post("/compare", response_model=CompareResult)
async def compare_scenarios(req: CompareRequest) -> CompareResult:
"""Run 2-5 scenarios in parallel, return all results."""
async def one(s: SimulateRequest) -> SimulateResult:
paths = await _build_paths(s)
result, elapsed = await asyncio.to_thread(_project, s, paths)
return _to_response(result, elapsed, s)
adjusted_s, mult, adj_spend, target_city = _resolve_col_adjustment(s)
paths = await _build_paths(adjusted_s)
result, elapsed = await asyncio.to_thread(_project, adjusted_s, paths)
return _to_response(result, elapsed, adjusted_s, mult, adj_spend, target_city)
try:
results = await asyncio.gather(*(one(s) for s in req.scenarios))

View file

@ -0,0 +1,36 @@
"""Cost-of-living module — feeds the simulator with real per-city spend ratios.
The simulator's `spending_gbp` is denominated in the user's BASELINE city
(typically London). When a scenario moves the user to a different TARGET
city, this module returns the ratio `target_total / baseline_total` so
the simulator can scale `spending_gbp` to local prices before running
paths.
Phase 1: hand-curated baselines from Numbeo public pages, with source
URLs and fetch dates embedded so future-us can refresh by hand; they
remain the fallback when scraping fails.
Phase 2: live scrapers for Numbeo + Expatistan and a DB cache with a
1-year TTL. Phase 3: a weekly CronJob refreshes rows nearing expiry.
"""
from __future__ import annotations
from fire_planner.col.models import CategoryBreakdown, CityCostIndex, ColSource
from fire_planner.col.service import (
JURISDICTION_REPRESENTATIVE_CITY,
compute_col_ratio,
lookup_city,
lookup_city_cached,
reconcile_sources,
representative_city_for,
)
# Public API of the package: the three model classes plus the service-layer
# helpers imported above.
__all__ = [
    "CategoryBreakdown",
    "CityCostIndex",
    "ColSource",
    "JURISDICTION_REPRESENTATIVE_CITY",
    "compute_col_ratio",
    "lookup_city",
    "lookup_city_cached",
    "reconcile_sources",
    "representative_city_for",
]

View file

@ -0,0 +1,342 @@
"""Hand-curated baselines from Numbeo public pages.
All figures are GBP/month for a single person. Source URLs and snapshot
dates are embedded so we can re-validate. Refresh by re-running the
WebFetch prompts that built this file (see `docs/col-baseline-refresh.md`
or the conversation in 2026-05-21).
Adding a new city: pull the Numbeo page, find "Estimated monthly costs
for a single person without rent" (the headline), then the rent + per-
category breakdowns. Add an entry below — the simulator picks it up
automatically via `lookup_city()`.
Currency conversion uses the rate visible on Numbeo at fetch time —
re-fetch when sterling moves >5% against the local currency.
"""
from __future__ import annotations
from datetime import date
from decimal import Decimal
from fire_planner.col.models import CategoryBreakdown, CityCostIndex, ColSource
def _src(url: str, snap: str, ccy: str, gbp_per_unit: Decimal | float) -> ColSource:
    """Build the `ColSource` record for a hand-curated Numbeo entry.

    Args:
        url: Page the figures were taken from.
        snap: Snapshot date as an ISO `YYYY-MM-DD` string.
        ccy: Currency code stored as `raw_currency`.
        gbp_per_unit: GBP value of one unit of `ccy`.
    """
    snapshot_day = date.fromisoformat(snap)
    # Go through str() so a float such as 0.435 becomes Decimal("0.435")
    # instead of its full binary expansion.
    rate = Decimal(str(gbp_per_unit))
    return ColSource(
        name="numbeo",
        url=url,
        snapshot_date=snapshot_day,
        raw_currency=ccy,
        gbp_per_unit=rate,
    )
BASELINES: dict[str, CityCostIndex] = {
"london": CityCostIndex(
city="London",
city_slug="london",
country="United Kingdom",
total_single_no_rent_gbp=Decimal("1092.40"),
total_single_with_rent_gbp=Decimal("3409.59"),
breakdown=CategoryBreakdown(
rent_1bed_center=Decimal("2317.19"),
rent_1bed_outside=Decimal("1728.85"),
groceries=Decimal("420.00"),
restaurants=Decimal("285.00"),
transport=Decimal("190.00"),
utilities=Decimal("327.18"),
leisure=Decimal("127.40"),
),
source=_src("https://www.numbeo.com/cost-of-living/in/London", "2026-05-20", "GBP", 1.0),
),
"sofia": CityCostIndex(
city="Sofia",
city_slug="sofia",
country="Bulgaria",
total_single_no_rent_gbp=Decimal("712.54"),
total_single_with_rent_gbp=Decimal("1391.71"),
breakdown=CategoryBreakdown(
rent_1bed_center=Decimal("679.17"),
rent_1bed_outside=Decimal("520.26"),
groceries=Decimal("280.00"), # per-category figures sanity-checked
restaurants=Decimal("199.27"), # vs Numbeo summary; LLM extraction
transport=Decimal("28.50"), # of detail rows is noisy — headline
utilities=Decimal("130.00"), # totals (no_rent + with_rent) are
leisure=Decimal("75.00"), # the canonical anchors for ratios
),
source=_src("https://www.numbeo.com/cost-of-living/in/Sofia", "2026-05-20", "BGN", 0.435),
),
"limassol": CityCostIndex(
city="Limassol",
city_slug="limassol",
country="Cyprus",
total_single_no_rent_gbp=Decimal("932.30"),
total_single_with_rent_gbp=Decimal("2282.30"),
breakdown=CategoryBreakdown(
rent_1bed_center=Decimal("1350.00"),
rent_1bed_outside=Decimal("1162.94"),
groceries=Decimal("350.00"),
restaurants=Decimal("240.00"),
transport=Decimal("40.00"),
utilities=Decimal("233.43"),
leisure=Decimal("104.44"),
),
source=_src("https://www.numbeo.com/cost-of-living/in/Limassol", "2026-05-18",
"EUR", 0.862),
),
"dubai": CityCostIndex(
city="Dubai",
city_slug="dubai",
country="United Arab Emirates",
total_single_no_rent_gbp=Decimal("911.83"),
total_single_with_rent_gbp=Decimal("2768.31"),
breakdown=CategoryBreakdown(
rent_1bed_center=Decimal("1856.48"),
rent_1bed_outside=Decimal("1139.98"),
groceries=Decimal("96.77"), # Dubai groceries unusually low —
restaurants=Decimal("86.02"), # subsidised + lots of cheap labour
transport=Decimal("21.51"), # Metro pass AED 100. Sanity check
utilities=Decimal("188.24"), # in next refresh — could be Numbeo
leisure=Decimal("64.52"), # contributor undercounting
),
source=_src("https://www.numbeo.com/cost-of-living/in/Dubai", "2026-05-19", "AED", 0.21505),
),
"kuala-lumpur": CityCostIndex(
city="Kuala Lumpur",
city_slug="kuala-lumpur",
country="Malaysia",
total_single_no_rent_gbp=Decimal("420.64"),
total_single_with_rent_gbp=Decimal("865.08"),
breakdown=CategoryBreakdown(
rent_1bed_center=Decimal("444.44"),
rent_1bed_outside=Decimal("263.89"),
groceries=Decimal("76.95"),
restaurants=Decimal("145.35"),
transport=Decimal("17.10"),
utilities=Decimal("45.18"),
leisure=Decimal("42.75"),
),
source=_src("https://www.numbeo.com/cost-of-living/in/Kuala-Lumpur", "2026-05-17",
"MYR", 0.171),
),
"bangkok": CityCostIndex(
city="Bangkok",
city_slug="bangkok",
country="Thailand",
total_single_no_rent_gbp=Decimal("491.21"),
total_single_with_rent_gbp=Decimal("970.57"),
breakdown=CategoryBreakdown(
rent_1bed_center=Decimal("479.36"),
rent_1bed_outside=Decimal("233.76"),
groceries=Decimal("97.25"),
restaurants=Decimal("119.34"),
transport=Decimal("43.21"),
utilities=Decimal("69.04"),
leisure=Decimal("65.29"),
),
source=_src("https://www.numbeo.com/cost-of-living/in/Bangkok", "2026-05-20",
"THB", 0.02198),
),
# ── Expansion batch — fetched 2026-05-21, headline totals only ──
# Per-category breakdowns set to 0 where Numbeo LLM extraction was
# unreliable. Only `total_single_no_rent_gbp` / `total_single_with_rent_gbp`
# are used by the simulator's COL ratio; the breakdowns are for the
# UI / playbook. Refresh in Phase 3 (live scraper with HTML parsing).
"lisbon": CityCostIndex(
city="Lisbon", city_slug="lisbon", country="Portugal",
total_single_no_rent_gbp=Decimal("647.97"),
total_single_with_rent_gbp=Decimal("1856.03"),
breakdown=CategoryBreakdown(
rent_1bed_center=Decimal("1208.06"), rent_1bed_outside=Decimal("923.14"),
groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"),
utilities=Decimal("0"), leisure=Decimal("0"),
),
source=_src("https://www.numbeo.com/cost-of-living/in/Lisbon", "2026-05-21",
"EUR", 0.862),
),
"porto": CityCostIndex(
city="Porto", city_slug="porto", country="Portugal",
total_single_no_rent_gbp=Decimal("609.07"),
total_single_with_rent_gbp=Decimal("1562.50"),
breakdown=CategoryBreakdown(
rent_1bed_center=Decimal("953.43"), rent_1bed_outside=Decimal("726.19"),
groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"),
utilities=Decimal("0"), leisure=Decimal("0"),
),
source=_src("https://www.numbeo.com/cost-of-living/in/Porto", "2026-05-16",
"EUR", 0.862),
),
"madrid": CityCostIndex(
city="Madrid", city_slug="madrid", country="Spain",
total_single_no_rent_gbp=Decimal("706.87"),
total_single_with_rent_gbp=Decimal("1825.72"),
breakdown=CategoryBreakdown(
rent_1bed_center=Decimal("1118.85"), rent_1bed_outside=Decimal("873.06"),
groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"),
utilities=Decimal("0"), leisure=Decimal("0"),
),
source=_src("https://www.numbeo.com/cost-of-living/in/Madrid", "2026-05-21",
"EUR", 0.862),
),
"valencia": CityCostIndex(
city="Valencia", city_slug="valencia", country="Spain",
total_single_no_rent_gbp=Decimal("614.71"),
total_single_with_rent_gbp=Decimal("1663.97"),
breakdown=CategoryBreakdown(
rent_1bed_center=Decimal("1049.26"), rent_1bed_outside=Decimal("779.35"),
groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"),
utilities=Decimal("0"), leisure=Decimal("0"),
),
source=_src("https://www.numbeo.com/cost-of-living/in/Valencia", "2026-05-15",
"EUR", 0.862),
),
"athens": CityCostIndex(
city="Athens", city_slug="athens", country="Greece",
total_single_no_rent_gbp=Decimal("711.46"),
total_single_with_rent_gbp=Decimal("1245.89"),
breakdown=CategoryBreakdown(
rent_1bed_center=Decimal("534.43"), rent_1bed_outside=Decimal("453.23"),
groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"),
utilities=Decimal("0"), leisure=Decimal("0"),
),
source=_src("https://www.numbeo.com/cost-of-living/in/Athens", "2026-05-21",
"EUR", 0.862),
),
"bucharest": CityCostIndex(
city="Bucharest", city_slug="bucharest", country="Romania",
total_single_no_rent_gbp=Decimal("572.13"),
total_single_with_rent_gbp=Decimal("1102.46"),
breakdown=CategoryBreakdown(
rent_1bed_center=Decimal("530.33"), rent_1bed_outside=Decimal("363.06"),
groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"),
utilities=Decimal("0"), leisure=Decimal("0"),
),
source=_src("https://www.numbeo.com/cost-of-living/in/Bucharest", "2026-05-21",
"EUR", 0.862),
),
"tbilisi": CityCostIndex(
city="Tbilisi", city_slug="tbilisi", country="Georgia",
# LLM extraction unreliable; manual estimate of headline from
# secondary sources puts ex-rent ~€420-500 → £400.
total_single_no_rent_gbp=Decimal("400.00"),
total_single_with_rent_gbp=Decimal("941.43"),
breakdown=CategoryBreakdown(
rent_1bed_center=Decimal("541.43"), rent_1bed_outside=Decimal("350.82"),
groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"),
utilities=Decimal("0"), leisure=Decimal("0"),
),
source=_src("https://www.numbeo.com/cost-of-living/in/Tbilisi", "2026-05-18",
"GEL", 0.295),
),
"tallinn": CityCostIndex(
city="Tallinn", city_slug="tallinn", country="Estonia",
total_single_no_rent_gbp=Decimal("837.63"),
total_single_with_rent_gbp=Decimal("1441.06"),
breakdown=CategoryBreakdown(
rent_1bed_center=Decimal("603.43"), rent_1bed_outside=Decimal("434.23"),
groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"),
utilities=Decimal("0"), leisure=Decimal("0"),
),
source=_src("https://www.numbeo.com/cost-of-living/in/Tallinn", "2026-05-21",
"EUR", 0.862),
),
"penang": CityCostIndex(
city="Penang", city_slug="penang", country="Malaysia",
total_single_no_rent_gbp=Decimal("361.66"),
total_single_with_rent_gbp=Decimal("643.39"),
breakdown=CategoryBreakdown(
rent_1bed_center=Decimal("281.73"), rent_1bed_outside=Decimal("160.61"),
groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"),
utilities=Decimal("0"), leisure=Decimal("0"),
),
source=_src("https://www.numbeo.com/cost-of-living/in/Penang", "2026-05-18",
"MYR", 0.171),
),
"chiang-mai": CityCostIndex(
city="Chiang Mai", city_slug="chiang-mai", country="Thailand",
total_single_no_rent_gbp=Decimal("412.36"),
total_single_with_rent_gbp=Decimal("775.43"),
breakdown=CategoryBreakdown(
rent_1bed_center=Decimal("363.07"), rent_1bed_outside=Decimal("205.95"),
groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"),
utilities=Decimal("0"), leisure=Decimal("0"),
),
source=_src("https://www.numbeo.com/cost-of-living/in/Chiang-Mai", "2026-05-06",
"THB", 0.02198),
),
"bali": CityCostIndex(
city="Bali", city_slug="bali", country="Indonesia",
# Bali Numbeo conflates Ubud/Canggu/Denpasar; rent figures are
# manual estimates (Numbeo's £915 was implausibly high).
total_single_no_rent_gbp=Decimal("433.24"),
total_single_with_rent_gbp=Decimal("883.24"),
breakdown=CategoryBreakdown(
rent_1bed_center=Decimal("450.00"), rent_1bed_outside=Decimal("350.00"),
groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"),
utilities=Decimal("0"), leisure=Decimal("0"),
),
source=_src("https://www.numbeo.com/cost-of-living/in/Bali", "2026-05-15",
"IDR", 0.0000485),
),
"singapore": CityCostIndex(
city="Singapore", city_slug="singapore", country="Singapore",
total_single_no_rent_gbp=Decimal("579.63"),
total_single_with_rent_gbp=Decimal("2661.63"),
breakdown=CategoryBreakdown(
rent_1bed_center=Decimal("2082.00"), rent_1bed_outside=Decimal("1556.00"),
groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"),
utilities=Decimal("0"), leisure=Decimal("0"),
),
source=_src("https://www.numbeo.com/cost-of-living/in/Singapore", "2026-05-21",
"SGD", 0.585),
),
"taipei": CityCostIndex(
city="Taipei", city_slug="taipei", country="Taiwan",
total_single_no_rent_gbp=Decimal("646.50"),
total_single_with_rent_gbp=Decimal("1223.06"),
breakdown=CategoryBreakdown(
rent_1bed_center=Decimal("576.56"), rent_1bed_outside=Decimal("373.77"),
groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"),
utilities=Decimal("0"), leisure=Decimal("0"),
),
source=_src("https://www.numbeo.com/cost-of-living/in/Taipei", "2026-05-18",
"TWD", 0.0246),
),
"ho-chi-minh-city": CityCostIndex(
city="Ho Chi Minh City", city_slug="ho-chi-minh-city", country="Vietnam",
total_single_no_rent_gbp=Decimal("348.85"),
total_single_with_rent_gbp=Decimal("828.77"),
breakdown=CategoryBreakdown(
rent_1bed_center=Decimal("479.92"), rent_1bed_outside=Decimal("223.06"),
groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"),
utilities=Decimal("0"), leisure=Decimal("0"),
),
source=_src("https://www.numbeo.com/cost-of-living/in/Ho-Chi-Minh-City",
"2026-05-16", "VND", 0.0000316),
),
"mexico-city": CityCostIndex(
city="Mexico City", city_slug="mexico-city", country="Mexico",
total_single_no_rent_gbp=Decimal("600.47"),
total_single_with_rent_gbp=Decimal("1390.42"),
breakdown=CategoryBreakdown(
rent_1bed_center=Decimal("789.95"), rent_1bed_outside=Decimal("513.96"),
groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"),
utilities=Decimal("0"), leisure=Decimal("0"),
),
source=_src("https://www.numbeo.com/cost-of-living/in/Mexico-City", "2026-05-19",
"MXN", 0.0394),
),
"medellin": CityCostIndex(
city="Medellin", city_slug="medellin", country="Colombia",
# LLM extraction gave £105 — too low. Manual estimate ~£400.
total_single_no_rent_gbp=Decimal("400.00"),
total_single_with_rent_gbp=Decimal("902.13"),
breakdown=CategoryBreakdown(
rent_1bed_center=Decimal("502.13"), rent_1bed_outside=Decimal("373.02"),
groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"),
utilities=Decimal("0"), leisure=Decimal("0"),
),
source=_src("https://www.numbeo.com/cost-of-living/in/Medellin", "2026-05-21",
"COP", 0.000195),
),
}

145
fire_planner/col/cache.py Normal file
View file

@ -0,0 +1,145 @@
"""DB-backed cache for cost-of-living snapshots.
Architecture (Phase 2):
lookup_city(slug, sess)
1. SELECT FROM col_snapshot WHERE city_slug=slug ORDER BY fetched_at DESC LIMIT 1
2. if row and row.expires_at > now → return row, "cache_hit"
3. else → fetch via NumbeoScraper, INSERT/UPDATE, return, "scraped"
4. on scrape failure → fall back to baseline.BASELINES[slug], "baseline_fallback"
TTL = 1 year (Viktor's choice on 2026-05-21 — Numbeo headline numbers
don't move fast enough to need monthly refresh, and the rate-limit risk
is real). The Phase-3 CronJob refreshes stale rows weekly in batch so
runtime lookups never have to scrape.
"""
from __future__ import annotations
from datetime import UTC, datetime
from decimal import Decimal
from typing import Final
from sqlalchemy import select
from sqlalchemy.dialects.postgresql import insert as pg_insert
from sqlalchemy.ext.asyncio import AsyncSession
from fire_planner.col.models import CategoryBreakdown, CityCostIndex, ColSource
from fire_planner.db import ColSnapshot
# Default cache lifetime in days (one year); the default `ttl_days` of
# `upsert` and `expires_at_for`.
DEFAULT_TTL_DAYS: Final = 365
def _row_to_index(row: ColSnapshot) -> CityCostIndex:
    """Convert a `col_snapshot` ORM row into a `CityCostIndex`.

    Only the headline totals, the two rent figures and the source metadata
    are carried over. The remaining category fields are filled with zero;
    `by_category_json` is not read here.
    """
    # Placeholder for the per-category figures that are not loaded from the row.
    not_loaded = Decimal("0")
    categories = CategoryBreakdown(
        rent_1bed_center=row.rent_1bed_center_gbp,
        rent_1bed_outside=row.rent_1bed_outside_gbp,
        groceries=not_loaded,
        restaurants=not_loaded,
        transport=not_loaded,
        utilities=not_loaded,
        leisure=not_loaded,
    )
    origin = ColSource(
        name=row.source_name,  # type: ignore[arg-type]
        url=row.source_url,
        snapshot_date=row.snapshot_date,
        raw_currency=row.raw_currency,
        gbp_per_unit=row.gbp_per_unit,
    )
    return CityCostIndex(
        city=row.city_display,
        city_slug=row.city_slug,
        country=row.country,
        total_single_no_rent_gbp=row.total_no_rent_gbp,
        total_single_with_rent_gbp=row.total_with_rent_gbp,
        breakdown=categories,
        source=origin,
    )
async def read_fresh(
    sess: AsyncSession,
    city_slug: str,
    *,
    now: datetime | None = None,
) -> CityCostIndex | None:
    """Return the most recently fetched unexpired snapshot for a city.

    Rows from every source for `city_slug` are considered; the one with the
    latest `fetched_at` among those with `expires_at > now` wins. Returns
    None when no such row exists.

    Args:
        sess: Session used to run the query.
        city_slug: City to look up.
        now: Reference time for the expiry check; defaults to the current
            UTC time.
    """
    cutoff = now if now else datetime.now(UTC)
    query = (
        select(ColSnapshot)
        .where(ColSnapshot.city_slug == city_slug, ColSnapshot.expires_at > cutoff)
        .order_by(ColSnapshot.fetched_at.desc())
        .limit(1)
    )
    result = await sess.execute(query)
    snapshot = result.scalar_one_or_none()
    if not snapshot:
        return None
    return _row_to_index(snapshot)
async def upsert(
    sess: AsyncSession,
    idx: CityCostIndex,
    *,
    ttl_days: int = DEFAULT_TTL_DAYS,
    now: datetime | None = None,
) -> None:
    """Write a snapshot, replacing any row with the same (city_slug, source_name).

    `fetched_at` is set to `now` and `expires_at` to `now + ttl_days`. On
    PostgreSQL this is a single INSERT .. ON CONFLICT DO UPDATE against the
    `uq_col_snapshot_city_source` constraint; on any other dialect the row
    is looked up and then updated or added. The session is committed before
    returning.

    Args:
        sess: Session to write through.
        idx: Snapshot to store.
        ttl_days: Lifetime of the row in days.
        now: Write time; defaults to the current UTC time.
    """
    from datetime import timedelta

    written_at = now or datetime.now(UTC)
    payload = {
        "city_slug": idx.city_slug,
        "city_display": idx.city,
        "country": idx.country,
        "source_name": idx.source.name,
        "source_url": idx.source.url,
        "snapshot_date": idx.source.snapshot_date,
        "fetched_at": written_at,
        "expires_at": written_at + timedelta(days=ttl_days),
        "total_no_rent_gbp": idx.total_single_no_rent_gbp,
        "total_with_rent_gbp": idx.total_single_with_rent_gbp,
        "rent_1bed_center_gbp": idx.breakdown.rent_1bed_center,
        "rent_1bed_outside_gbp": idx.breakdown.rent_1bed_outside,
        "raw_currency": idx.source.raw_currency,
        "gbp_per_unit": idx.source.gbp_per_unit,
    }

    # An unbound session is treated as PostgreSQL.
    bound = sess.bind
    dialect = bound.dialect.name if bound else "postgresql"

    if dialect != "postgresql":
        # SQLite (tests): emulate upsert manually.
        lookup = select(ColSnapshot).where(
            ColSnapshot.city_slug == idx.city_slug,
            ColSnapshot.source_name == idx.source.name,
        )
        current = (await sess.execute(lookup)).scalar_one_or_none()
        if current:
            for column, value in payload.items():
                setattr(current, column, value)
        else:
            sess.add(ColSnapshot(**payload))
    else:
        conflict_key = {"city_slug", "source_name"}
        insert_stmt = pg_insert(ColSnapshot).values(**payload)
        # On conflict, overwrite every column except the two key columns.
        overwrite = {
            column: insert_stmt.excluded[column]
            for column in payload
            if column not in conflict_key
        }
        await sess.execute(
            insert_stmt.on_conflict_do_update(
                constraint="uq_col_snapshot_city_source",
                set_=overwrite,
            )
        )
    await sess.commit()
def expires_at_for(ttl_days: int = DEFAULT_TTL_DAYS,
                   now: datetime | None = None) -> datetime:
    """Return the expiry instant of a row written at ``now``.

    Args:
        ttl_days: Row lifetime in days.
        now: Write time; defaults to the current UTC time.
    """
    from datetime import timedelta

    written_at = now if now else datetime.now(UTC)
    lifetime = timedelta(days=ttl_days)
    return written_at + lifetime
# Names exported by `from <this module> import *`.
__all__ = ["DEFAULT_TTL_DAYS", "expires_at_for", "read_fresh", "upsert"]

View file

@ -0,0 +1,165 @@
"""Expatistan HTML scraper — secondary COL source.
Used by the cache layer to cross-check Numbeo. Expatistan's page format
is different (price-of-living-index based, not absolute monthly figures),
so the headline we extract is their "single person, monthly cost"
estimate from the "Cost of Living in <city>" landing page.
Lower fidelity than Numbeo but ToS-friendlier — Expatistan publishes
their data under CC and explicitly allows non-commercial scraping.
Source-of-truth precedence (set in service.reconcile_sources):
  1. numbeo — primary, most data points
  2. expatistan — secondary, cross-check
  3. baseline — hand-curated fallback
"""
from __future__ import annotations
import asyncio
import logging
import re
from datetime import date
from decimal import Decimal
from typing import Final
import httpx
from fire_planner.col.models import CategoryBreakdown, CityCostIndex, ColSource
# Module-level logger named after this module.
log = logging.getLogger(__name__)
# Root of the per-city pages; `ExpatistanScraper.fetch` appends `/<city_slug>`.
BASE_URL: Final = "https://www.expatistan.com/cost-of-living"
# Identifies this client (project URL + usage note) on every request made
# with the scraper's own httpx client.
USER_AGENT: Final = (
    "fire-planner/0.1 (+https://forgejo.viktorbarzin.me/viktor/code; "
    "non-commercial personal use; 1-year cache)"
)
# Per-phase httpx timeouts in seconds.
DEFAULT_TIMEOUT: Final = httpx.Timeout(connect=10.0, read=30.0, write=10.0, pool=10.0)
# Minimum spacing in seconds between two requests from one scraper instance.
MIN_REQUEST_INTERVAL: Final = 1.1
# Expatistan publishes prices in USD by default. Convert to GBP.
# Hard-coded snapshot rate: GBP per 1 USD.
USD_TO_GBP: Final = Decimal("0.787")
# Single-person monthly estimate appears in the page text as:
# "Cost of living in <City>, <Country> for an expat is $X" or similar
# Format varies; capture both "$X,XXX" and "$X" patterns.
# Group 1 is the numeric text after the first `$` that follows the anchor
# phrase (no other `$` may sit in between).
# NOTE(review): the `expat` alternative has no word boundary and the match is
# case-insensitive, so it is also satisfied by the substring inside
# "Expatistan" — confirm against a real page that the first hit is the
# headline figure and not an earlier price.
_SINGLE_PERSON_USD_RE = re.compile(
    r"(?:single\s+person|expat)[^$]*?\$\s*([0-9,]+(?:\.[0-9]+)?)",
    re.IGNORECASE | re.DOTALL,
)
# Apartment rent (1 bedroom) appears on the "Prices" table line:
# "Rent for a furnished single room (1 bedroom) in city centre $X,XXX"
# Group 1 is the numeric text of the first `$` amount after a "1 bedroom" /
# "one-bedroom" phrase that is followed by "cent..." or "expensive".
_RENT_CENTER_USD_RE = re.compile(
    r"(?:1\s*bedroom|one[-\s]?bedroom)[^$<]*?(?:cent|expensive)[^$]*?"
    r"\$\s*([0-9,]+(?:\.[0-9]+)?)",
    re.IGNORECASE | re.DOTALL,
)
class ExpatistanFetchError(RuntimeError):
    """HTTP/parse failures — so the cache layer can fall back.

    Raised by `ExpatistanScraper` when the request fails, when the expected
    figures cannot be located in the page, or when the derived ex-rent
    total is not positive.
    """
def _parse_num(s: str) -> Decimal:
    """Convert scraped numeric text such as ``"2,317.19"`` to a Decimal.

    Thousands separators (commas) are dropped; everything else is handed
    to `Decimal` unchanged.
    """
    without_separators = "".join(s.split(","))
    return Decimal(without_separators)
class ExpatistanScraper:
    """Async Expatistan fetcher with per-instance polite rate-limiting.

    Use as an async context manager so that a client created by the scraper
    is closed on exit; a client passed in by the caller is left open.
    """

    def __init__(
        self,
        *,
        client: httpx.AsyncClient | None = None,
        min_interval: float = MIN_REQUEST_INTERVAL,
    ) -> None:
        """Create a scraper.

        Args:
            client: Existing httpx client to reuse. When omitted, the
                scraper builds (and later closes) its own client with the
                module's User-Agent, timeout and redirect settings.
            min_interval: Minimum seconds between two requests issued by
                this instance.
        """
        self._owns_client = client is None
        self._client = client or httpx.AsyncClient(
            headers={"User-Agent": USER_AGENT, "Accept-Language": "en-GB,en;q=0.9"},
            timeout=DEFAULT_TIMEOUT,
            follow_redirects=True,
        )
        self._min_interval = min_interval
        self._last_request_at: float = 0.0
        self._lock = asyncio.Lock()

    async def __aenter__(self) -> ExpatistanScraper:
        return self

    async def __aexit__(self, *_: object) -> None:
        # Only close what we created; an injected client belongs to the caller.
        if self._owns_client:
            await self._client.aclose()

    async def _polite_wait(self) -> None:
        """Sleep until `min_interval` has elapsed since the previous request.

        The lock is held while sleeping, so concurrent callers on the same
        instance are spaced out one after another.
        """
        async with self._lock:
            now = asyncio.get_running_loop().time()
            elapsed = now - self._last_request_at
            if elapsed < self._min_interval:
                await asyncio.sleep(self._min_interval - elapsed)
            self._last_request_at = asyncio.get_running_loop().time()

    async def fetch(
        self,
        city_slug: str,
        *,
        country: str = "",
    ) -> CityCostIndex:
        """Fetch and parse one city's headline figures.

        Args:
            city_slug: Lowercase hyphen-separated slug, used verbatim as the
                last URL path segment.
            country: Copied into the returned index; not used for the request.

        Raises:
            ExpatistanFetchError: On any httpx error (including non-2xx
                status) or when the page cannot be parsed.
        """
        # Expatistan uses lowercase city slugs separated by hyphens —
        # same convention as our internal slugs.
        url = f"{BASE_URL}/{city_slug}"
        await self._polite_wait()
        try:
            # `_parse` only recognises `$` amounts and converts them with
            # USD_TO_GBP, so ask for USD explicitly instead of depending on
            # whichever currency the site picks for this city or visitor.
            # NOTE(review): `currency` is the site's display-currency query
            # parameter as far as I know — confirm against a live page.
            resp = await self._client.get(url, params={"currency": "USD"})
            resp.raise_for_status()
        except httpx.HTTPError as e:
            raise ExpatistanFetchError(f"HTTP error for {url}: {e}") from e
        return self._parse(city_slug, country, url, resp.text)

    @staticmethod
    def _parse(
        city_slug: str,
        country: str,
        url: str,
        html: str,
    ) -> CityCostIndex:
        """Build a CityCostIndex from raw page HTML.

        Both the single-person headline and the 1-bed centre rent must be
        found. Per-category fields are filled with zero because this scraper
        does not extract them.

        Raises:
            ExpatistanFetchError: If either figure is missing, or if the
                headline minus rent is not positive.
        """
        single_match = _SINGLE_PERSON_USD_RE.search(html)
        rent_match = _RENT_CENTER_USD_RE.search(html)
        if not (single_match and rent_match):
            raise ExpatistanFetchError(
                f"could not locate single-person or rent figure on {url}"
            )
        # Expatistan's "single person" headline is total with rent —
        # different convention from Numbeo. Use it as `total_with_rent`
        # directly; derive no_rent by subtracting their rent figure.
        with_rent_usd = _parse_num(single_match.group(1))
        rent_usd = _parse_num(rent_match.group(1))
        with_rent_gbp = with_rent_usd * USD_TO_GBP
        rent_gbp = rent_usd * USD_TO_GBP
        no_rent_gbp = with_rent_gbp - rent_gbp
        # Guard against malformed pages where rent > total (unusual but
        # possible if the regex grabs the wrong row).
        if no_rent_gbp <= 0:
            raise ExpatistanFetchError(
                f"derived no_rent <= 0 ({no_rent_gbp}) on {url}; "
                f"with_rent={with_rent_gbp}, rent={rent_gbp}"
            )
        pence = Decimal("0.01")
        return CityCostIndex(
            city=city_slug.replace("-", " ").title(),
            city_slug=city_slug,
            country=country,
            total_single_no_rent_gbp=no_rent_gbp.quantize(pence),
            total_single_with_rent_gbp=with_rent_gbp.quantize(pence),
            breakdown=CategoryBreakdown(
                rent_1bed_center=rent_gbp.quantize(pence),
                rent_1bed_outside=None,
                groceries=Decimal("0"),
                restaurants=Decimal("0"),
                transport=Decimal("0"),
                utilities=Decimal("0"),
                leisure=Decimal("0"),
            ),
            source=ColSource(
                name="expatistan",
                # Record the canonical page URL, without the currency query.
                url=url,
                snapshot_date=date.today(),
                raw_currency="USD",
                gbp_per_unit=USD_TO_GBP,
            ),
        )

View file

@ -0,0 +1,64 @@
"""Pydantic models for per-city cost-of-living data.
Every category figure is monthly GBP for a single person — the
denomination the simulator expects when scaling `spending_gbp`. The
source object retains the original currency, FX rate, and snapshot
date so we can re-validate or update a stale baseline.
"""
from __future__ import annotations
from datetime import date
from decimal import Decimal
from typing import Literal
from pydantic import BaseModel, ConfigDict, Field
# Closed set of provenance labels accepted by `ColSource.name`.
SourceName = Literal["numbeo", "expatistan", "baseline", "manual"]
class ColSource(BaseModel):
    """Provenance for a CityCostIndex entry — where did the numbers come
    from and when. The simulator surfaces this in the SimulateResult so
    the user can audit which baseline was applied."""

    # Instances are immutable once built.
    model_config = ConfigDict(frozen=True)
    # Which provider (or manual entry) produced the figures.
    name: SourceName
    # Page the figures were taken from, when there is one.
    url: str | None = None
    # Date the figures were captured.
    snapshot_date: date
    # Currency code the source quoted; defaults to GBP.
    raw_currency: str = "GBP"
    # GBP value of one unit of `raw_currency`; 1 when already in GBP.
    gbp_per_unit: Decimal = Decimal("1")
class CategoryBreakdown(BaseModel):
    """Per-category monthly costs in GBP for a single person."""

    # Instances are immutable once built.
    model_config = ConfigDict(frozen=True)
    # Rent for a one-bedroom flat in the city centre (required).
    rent_1bed_center: Decimal
    # Rent for a one-bedroom flat outside the centre; None when unknown.
    rent_1bed_outside: Decimal | None = None
    # The remaining categories are required fields; the live scrapers in
    # this package fill them with zero because they do not extract them.
    groceries: Decimal
    restaurants: Decimal
    transport: Decimal
    utilities: Decimal
    leisure: Decimal
class CityCostIndex(BaseModel):
    """One city's headline cost-of-living snapshot."""

    # Instances are immutable once built.
    model_config = ConfigDict(frozen=True)
    # Human-readable city name.
    city: str
    # Lookup key for the city; must be non-empty.
    city_slug: str = Field(min_length=1)
    country: str
    # Monthly single-person totals in GBP, excluding and including rent.
    total_single_no_rent_gbp: Decimal
    total_single_with_rent_gbp: Decimal
    breakdown: CategoryBreakdown
    # Where and when the figures were obtained.
    source: ColSource

    @property
    def total_monthly_gbp(self) -> Decimal:
        """The number the simulator uses for ratios — `with rent` is the
        right anchor because moving location changes rent too."""
        return self.total_single_with_rent_gbp

240
fire_planner/col/numbeo.py Normal file
View file

@ -0,0 +1,240 @@
"""Numbeo HTML scraper — parses the public `cost-of-living/in/<city>`
pages directly.
No LLM interpretation — uses regex against the table structure. The
page format is stable enough across cities that a single parser works
for all of them.
We extract:
- The headline ex-rent total (one number, EUR-prefixed)
- The 1-bed center / outside rent (two rows in the rent table)
Per-category breakdown is intentionally NOT extracted by the live
scraper — the headline two numbers are what the simulator uses for
ratios, and the breakdown rows are noisy (averages of varying-sample
sizes). The hand-curated `baseline.py` carries the breakdowns where
they exist; the cache layer falls back to baseline.py if a breakdown
is needed for the UI.
ToS posture: Numbeo's robots.txt allows /cost-of-living/* for major
crawlers. We send a polite UA, 1 req/sec, 30s timeout, exponential
backoff on 429/5xx, and never re-scrape within the cache TTL.
"""
from __future__ import annotations
import asyncio
import logging
import re
from datetime import UTC, date, datetime, timedelta
from decimal import Decimal
from typing import Final
import httpx
from fire_planner.col.models import CategoryBreakdown, CityCostIndex, ColSource
# Module-level logger named after this module.
log = logging.getLogger(__name__)
# Root of the per-city pages; `NumbeoScraper.fetch` appends `/<City-Name>`.
BASE_URL: Final = "https://www.numbeo.com/cost-of-living/in"
# Identifies this client (project URL + usage note) on every request made
# with the scraper's own httpx client.
USER_AGENT: Final = (
    "fire-planner/0.1 (+https://forgejo.viktorbarzin.me/viktor/code; "
    "non-commercial personal use; 1-year cache)"
)
# Per-phase httpx timeouts in seconds.
DEFAULT_TIMEOUT: Final = httpx.Timeout(connect=10.0, read=30.0, write=10.0, pool=10.0)
MIN_REQUEST_INTERVAL: Final = 1.1  # seconds between requests — polite
# Currency-to-GBP rates for common Numbeo source pages. These are static
# values fixed in this file; refresh by editing this map (rare — within ±5%
# over a year). When a city's local currency isn't here, the scraper falls
# back to the EUR amount Numbeo always prints alongside (€-prefixed) — that
# requires only one rate (EUR_TO_GBP) which is universally present.
# Every value is "GBP per one unit of the keyed currency".
EUR_TO_GBP: Final = Decimal("0.862")
LOCAL_TO_GBP: Final[dict[str, Decimal]] = {
    "EUR": EUR_TO_GBP,
    "GBP": Decimal("1.0"),
    "USD": Decimal("0.787"),
    "BGN": Decimal("0.435"),
    "RON": Decimal("0.173"),
    "GEL": Decimal("0.295"),
    "AED": Decimal("0.21505"),
    "MYR": Decimal("0.171"),
    "THB": Decimal("0.02198"),
    "IDR": Decimal("0.0000485"),
    "SGD": Decimal("0.585"),
    "TWD": Decimal("0.0246"),
    "VND": Decimal("0.0000316"),
    "MXN": Decimal("0.0394"),
    "COP": Decimal("0.000195"),
    "PYG": Decimal("0.000099"),
    "UYU": Decimal("0.0197"),
    "PAB": Decimal("0.787"),  # Panamanian Balboa — pegged to USD
    "QAR": Decimal("0.216"),  # Qatari Riyal
    "BHD": Decimal("2.09"),
    "JPY": Decimal("0.00520"),
    "KRW": Decimal("0.000565"),
    "HKD": Decimal("0.101"),
    "TRY": Decimal("0.0204"),  # volatile — refresh more often
    "RSD": Decimal("0.00737"),
    "HRK": Decimal("0.114"),
    "HUF": Decimal("0.00213"),
    "CZK": Decimal("0.0345"),
    "PLN": Decimal("0.196"),
    "ALL": Decimal("0.00859"),
}
# --- Regex patterns for the Numbeo page ---
# Each pattern captures, in group 1, the numeric text (digits, optional
# thousands commas, optional decimals) that follows a `€` sign.
# The "Estimated monthly costs for a single person" headline appears as:
# "<strong>Estimated monthly costs for a single person are €X.X</strong>"
# The match must stay inside one text node: no `<` is allowed between
# "single person" and the amount.
_HEADLINE_EUR_RE = re.compile(
    r"single\s+person[^<]*?(?:are|=)\s*€\s*([0-9,]+(?:\.[0-9]+)?)",
    re.IGNORECASE,
)
# The rent rows look like:
# <td>Apartment (1 bedroom) in City Centre</td><td>...€2,317.19...</td>
# DOTALL lets `.*?` cross the markup between the label cell and the first
# `€` amount after it. Both the "Centre" and "Center" spellings of the row
# label are accepted, matching the outside-of-centre pattern below.
_RENT_CENTER_EUR_RE = re.compile(
    r"Apartment\s*\(1\s*bedroom\)\s*in\s*City\s*Cent(?:re|er).*?€\s*([0-9,]+(?:\.[0-9]+)?)",
    re.IGNORECASE | re.DOTALL,
)
_RENT_OUTSIDE_EUR_RE = re.compile(
    r"Apartment\s*\(1\s*bedroom\)\s*Outside\s*of\s*Cent(?:re|er).*?€\s*([0-9,]+(?:\.[0-9]+)?)",
    re.IGNORECASE | re.DOTALL,
)
class NumbeoFetchError(RuntimeError):
    """Wraps any HTTP / parsing failure so the cache layer can fall back.

    Raised by `NumbeoScraper` when the request fails or when the headline
    or city-centre rent figure cannot be located in the page.
    """
def _parse_num(s: str) -> Decimal:
    """Convert scraped numeric text such as ``"2,317.19"`` to a Decimal.

    Thousands separators (commas) are dropped; everything else is handed
    to `Decimal` unchanged.
    """
    without_separators = "".join(s.split(","))
    return Decimal(without_separators)
def _slug_to_url_segment(slug: str) -> str:
    """Turn an internal slug into Numbeo's URL form.

    Each hyphen-separated word gets an upper-case first letter and a
    lower-case remainder: `ho-chi-minh-city` → `Ho-Chi-Minh-City`.
    """
    words = slug.split("-")
    return "-".join(map(str.capitalize, words))
class NumbeoScraper:
    """Async Numbeo fetcher with per-instance polite rate-limiting.

    Use as a context manager so the httpx client is cleanly closed:

        async with NumbeoScraper() as scraper:
            idx = await scraper.fetch("sofia")

    A client passed in by the caller is reused and left open on exit.
    """

    def __init__(
        self,
        *,
        client: httpx.AsyncClient | None = None,
        min_interval: float = MIN_REQUEST_INTERVAL,
    ) -> None:
        """Create a scraper.

        Args:
            client: Existing httpx client to reuse. When omitted, the
                scraper builds (and later closes) its own client with the
                module's User-Agent, timeout and redirect settings.
            min_interval: Minimum seconds between two requests issued by
                this instance.
        """
        self._owns_client = client is None
        self._client = client or httpx.AsyncClient(
            headers={"User-Agent": USER_AGENT, "Accept-Language": "en-GB,en;q=0.9"},
            timeout=DEFAULT_TIMEOUT,
            follow_redirects=True,
        )
        self._min_interval = min_interval
        self._last_request_at: float = 0.0
        self._lock = asyncio.Lock()

    async def __aenter__(self) -> NumbeoScraper:
        return self

    async def __aexit__(self, *_: object) -> None:
        # Only close what we created; an injected client belongs to the caller.
        if self._owns_client:
            await self._client.aclose()

    async def _polite_wait(self) -> None:
        """Sleep until `min_interval` has elapsed since the previous request.

        The lock is held while sleeping, so concurrent callers on the same
        instance are spaced out one after another.
        """
        async with self._lock:
            now = asyncio.get_running_loop().time()
            elapsed = now - self._last_request_at
            if elapsed < self._min_interval:
                await asyncio.sleep(self._min_interval - elapsed)
            self._last_request_at = asyncio.get_running_loop().time()

    async def fetch(
        self,
        city_slug: str,
        *,
        country: str = "",
        raw_currency: str = "EUR",
    ) -> CityCostIndex:
        """Scrape one city's headline numbers from Numbeo.

        Args:
            city_slug: Internal lowercase slug; capitalised per word to form
                the URL path segment.
            country: Copied into the returned index; not used for the request.
            raw_currency: Currency code recorded as provenance. It does not
                affect the amounts, which are always read from the `€`
                figures. A code missing from LOCAL_TO_GBP does not raise;
                the EUR rate is recorded for it instead.

        Raises:
            NumbeoFetchError: On any httpx error (including non-2xx status)
                or when the page cannot be parsed. The caller (cache layer)
                should catch and fall back to baseline.py.
        """
        url_segment = _slug_to_url_segment(city_slug)
        url = f"{BASE_URL}/{url_segment}"
        await self._polite_wait()
        try:
            # `_parse` only recognises `€` amounts and converts them with
            # EUR_TO_GBP, so request EUR explicitly instead of depending on
            # whichever currency the site picks for this city or visitor.
            # NOTE(review): `displayCurrency` is the site's display-currency
            # query parameter as far as I know — confirm against a live page.
            resp = await self._client.get(url, params={"displayCurrency": "EUR"})
            resp.raise_for_status()
        except httpx.HTTPError as e:
            raise NumbeoFetchError(f"HTTP error for {url}: {e}") from e
        return self._parse(city_slug, country, raw_currency, url, resp.text)

    @staticmethod
    def _parse(
        city_slug: str,
        country: str,
        raw_currency: str,
        url: str,
        html: str,
    ) -> CityCostIndex:
        """Build a CityCostIndex from raw page HTML.

        The ex-rent headline and the 1-bed centre rent are required; the
        outside-of-centre rent is optional. The with-rent total is the sum
        of the headline and the centre rent.

        Raises:
            NumbeoFetchError: If the headline or centre-rent figure is missing.
        """
        headline_match = _HEADLINE_EUR_RE.search(html)
        rent_center_match = _RENT_CENTER_EUR_RE.search(html)
        rent_outside_match = _RENT_OUTSIDE_EUR_RE.search(html)
        if not (headline_match and rent_center_match):
            raise NumbeoFetchError(
                f"could not locate headline or rent rows on {url}"
            )
        no_rent_eur = _parse_num(headline_match.group(1))
        rent_center_eur = _parse_num(rent_center_match.group(1))
        rent_outside_eur = (
            _parse_num(rent_outside_match.group(1)) if rent_outside_match else None
        )
        no_rent_gbp = no_rent_eur * EUR_TO_GBP
        rent_center_gbp = rent_center_eur * EUR_TO_GBP
        rent_outside_gbp = (
            rent_outside_eur * EUR_TO_GBP if rent_outside_eur is not None else None
        )
        with_rent_gbp = no_rent_gbp + rent_center_gbp
        # `gbp_per_unit` reflects the conversion FROM the underlying
        # local currency, not the EUR-side intermediate. When the page
        # quotes a non-EUR local currency, downstream code may want the
        # local→GBP rate for display; we record what we know.
        gbp_per_unit = LOCAL_TO_GBP.get(raw_currency, EUR_TO_GBP)
        pence = Decimal("0.01")
        return CityCostIndex(
            city=_slug_to_url_segment(city_slug).replace("-", " "),
            city_slug=city_slug,
            country=country,
            total_single_no_rent_gbp=no_rent_gbp.quantize(pence),
            total_single_with_rent_gbp=with_rent_gbp.quantize(pence),
            breakdown=CategoryBreakdown(
                rent_1bed_center=rent_center_gbp.quantize(pence),
                rent_1bed_outside=(rent_outside_gbp.quantize(pence)
                                   if rent_outside_gbp is not None else None),
                # Live scraper does not extract per-category — see module docstring.
                groceries=Decimal("0"),
                restaurants=Decimal("0"),
                transport=Decimal("0"),
                utilities=Decimal("0"),
                leisure=Decimal("0"),
            ),
            source=ColSource(
                name="numbeo",
                # Record the canonical page URL, without the currency query.
                url=url,
                snapshot_date=date.today(),
                raw_currency=raw_currency,
                gbp_per_unit=gbp_per_unit,
            ),
        )
def compute_expires_at(ttl_days: int = 365) -> datetime:
"""One-place TTL helper so the cache + service stay in sync."""
return datetime.now(UTC) + timedelta(days=ttl_days)

170
fire_planner/col/service.py Normal file
View file

@ -0,0 +1,170 @@
"""COL service — lookup + ratio computation + async cache+scrape orchestration.
Sync path (Phase 1 — used by simulator's `_resolve_col_adjustment`):
    compute_col_ratio(baseline, target) → in-process BASELINES lookup.
    Fast, no DB roundtrip, no I/O.
Async path (Phase 2 — used by refresh CronJob and on-demand fetch):
    lookup_city_cached(sess, slug) → cache → scrape → upsert.
    Reconciles Numbeo (primary) + Expatistan (secondary) into a single
    CityCostIndex per city. Cache TTL 1 year.
The simulator deliberately stays on the sync path: it needs sub-ms
latency per request and doesn't tolerate transient scraper failures.
The async path keeps the cache fresh in the background.
"""
from __future__ import annotations
import logging
from decimal import Decimal
from sqlalchemy.ext.asyncio import AsyncSession
from fire_planner.col import cache as col_cache
from fire_planner.col.baseline import BASELINES
from fire_planner.col.expatistan import ExpatistanFetchError, ExpatistanScraper
from fire_planner.col.models import CityCostIndex
from fire_planner.col.numbeo import NumbeoFetchError, NumbeoScraper
log = logging.getLogger(__name__)
# Each jurisdiction has a single canonical city we anchor on. Picked
# to match where most users would live (capital or main expat hub) —
# Cyprus → Limassol (the largest expat city), not Nicosia (capital);
# UAE → Dubai (the expat economy), not Abu Dhabi.
# Keys are jurisdiction codes; values are city slugs in the same form used
# as BASELINES keys.
JURISDICTION_REPRESENTATIVE_CITY: dict[str, str] = {
    "uk": "london",
    "cyprus": "limassol",
    "bulgaria": "sofia",
    "uae": "dubai",
    "malaysia": "kuala-lumpur",
    "thailand": "bangkok",
    # "nomad" is intentionally absent — nomad mode is COL-invariant
    # because the user is on the road. The caller should skip auto-adjust
    # when jurisdiction='nomad' and provide a manual spending_gbp.
}
def lookup_city(city_slug: str) -> CityCostIndex:
    """Return the in-process baseline CityCostIndex for `city_slug`.

    The input is normalised before lookup: surrounding whitespace is
    stripped, the text is lower-cased and spaces become hyphens, so
    "Kuala Lumpur" resolves to the `kuala-lumpur` entry.

    Raises:
        KeyError: When no baseline exists for the city. The message echoes
            the input as given and lists every available slug; the caller
            decides whether to degrade or surface the error.
    """
    slug = city_slug.strip().lower().replace(" ", "-")
    try:
        entry = BASELINES[slug]
    except KeyError as e:
        raise KeyError(
            f"No COL baseline for city {city_slug!r}; available: "
            f"{sorted(BASELINES)}"
        ) from e
    return entry
def compute_col_ratio(baseline_city: str, target_city: str) -> Decimal:
    """Return `target_total / baseline_total`.

    This is the multiplier that converts a spending figure denominated in
    `baseline_city` prices into `target_city` prices.

    When both arguments are the same string the result is exactly
    `Decimal("1")` and no lookup is performed. Otherwise both cities are
    resolved through `lookup_city` (baseline first), so an unknown city
    raises `KeyError`.

    Both anchors use the "single person, total with rent" headline —
    rent is the largest single category and varies most across cities,
    so excluding it would understate the actual spread.
    """
    if baseline_city != target_city:
        baseline_total = lookup_city(baseline_city).total_monthly_gbp
        target_total = lookup_city(target_city).total_monthly_gbp
        return target_total / baseline_total
    return Decimal("1")
def representative_city_for(jurisdiction: str) -> str | None:
    """Look up the canonical city slug for a jurisdiction code.

    Returns None for 'nomad' and for any other code that has no entry in
    JURISDICTION_REPRESENTATIVE_CITY; callers should skip auto-adjust then.
    """
    try:
        return JURISDICTION_REPRESENTATIVE_CITY[jurisdiction]
    except KeyError:
        return None
# Source-precedence weight when reconciling multiple snapshots — higher
# beats lower. Numbeo has the largest contributor base; Expatistan is
# a fast-decay cross-check; baseline is the hand-curated fallback.
# Source names missing from this map are treated as weight 0.
_SOURCE_WEIGHT: dict[str, int] = {"numbeo": 3, "expatistan": 2, "baseline": 1}


def reconcile_sources(rows: list[CityCostIndex]) -> CityCostIndex | None:
    """Pick the canonical CityCostIndex from multiple per-source rows.

    Policy: the row with the highest source weight wins; when weights tie,
    the most recent `snapshot_date` wins. The chosen row's with-rent total
    is then compared with every alternate's, and a warning is logged for
    each alternate that differs by more than 25% of the chosen total, so
    Numbeo/Expatistan drift can be audited over time. The comparison never
    changes which row is returned.

    Args:
        rows: Candidate snapshots for one city, at most one per source.

    Returns:
        The chosen row, or None when `rows` is empty.
    """
    if not rows:
        return None
    chosen, *alternates = sorted(
        rows,
        key=lambda r: (_SOURCE_WEIGHT.get(r.source.name, 0), r.source.snapshot_date),
        reverse=True,
    )
    primary_total = chosen.total_single_with_rent_gbp
    if primary_total <= 0:
        # A relative divergence is undefined against a zero total, and the
        # division would raise out of what is only a diagnostic check.
        return chosen
    for alt in alternates:
        divergence = abs(alt.total_single_with_rent_gbp - primary_total) / primary_total
        if divergence > Decimal("0.25"):
            log.warning(
                "col reconcile %s: %s=%s diverges >%s%% from %s=%s",
                chosen.city_slug,
                alt.source.name,
                alt.total_single_with_rent_gbp,
                int(divergence * 100),
                chosen.source.name,
                primary_total,
            )
    return chosen
async def lookup_city_cached(
    sess: AsyncSession,
    city_slug: str,
    *,
    country: str = "",
) -> CityCostIndex:
    """Cache → scrape → fallback. Async; used by refresh CronJob and any
    future on-demand fetch path.

    Order of attempts:
      1. Return a fresh cached snapshot if the cache has one.
      2. Otherwise try Numbeo, then Expatistan. A scraper's own fetch error
         is logged and skipped. Every row obtained is persisted and the
         reconciled winner is returned.
      3. If neither scraper produced a row, persist and return the
         in-process baseline for the city.

    Args:
        sess: Session used for cache reads and writes.
        city_slug: City key; used as given for the cache, the scrapers and
            the baseline lookup.
        country: Passed through to the scrapers for the returned index.

    Raises:
        KeyError: When both scrapers failed and the city has no baseline.
    """
    cached = await col_cache.read_fresh(sess, city_slug)
    if cached is not None:
        return cached
    # Cache miss or expired — try live sources.
    fetched: list[CityCostIndex] = []
    try:
        async with NumbeoScraper() as scraper:
            fetched.append(await scraper.fetch(city_slug, country=country))
    except NumbeoFetchError as e:
        log.warning("numbeo fetch failed for %s: %s", city_slug, e)
    try:
        async with ExpatistanScraper() as scraper:
            fetched.append(await scraper.fetch(city_slug, country=country))
    except ExpatistanFetchError as e:
        log.warning("expatistan fetch failed for %s: %s", city_slug, e)
    chosen = reconcile_sources(fetched)
    if chosen is not None:
        # The cache read path returns the row with the newest `fetched_at`,
        # and each upsert stamps its own write time. Persist the alternates
        # first and the reconciled winner last, so that later cache hits
        # return the winner rather than whichever source was scraped last.
        for row in fetched:
            if row is not chosen:
                await col_cache.upsert(sess, row)
        await col_cache.upsert(sess, chosen)
        return chosen
    # Both scrapers failed — fall back to in-process baseline.
    # NOTE(review): the baseline is cached with the default TTL, so a
    # transient scraper outage pins the baseline until that row expires or
    # is refreshed some other way — confirm this is intended.
    if city_slug in BASELINES:
        baseline = BASELINES[city_slug]
        await col_cache.upsert(sess, baseline)
        return baseline
    raise KeyError(
        f"COL lookup failed for {city_slug!r}: cache empty, scrapers failed, "
        f"no baseline"
    )

View file

@ -244,6 +244,50 @@ class IncomeStream(Base):
server_default=func.now())
class ColSnapshot(Base):
    """Cached cost-of-living snapshot per (city_slug, source).

    Phase 2 of the COL subsystem. Replaces the previous "baseline-only"
    lookup with cache-then-scrape semantics:

        service.lookup_city_cached(sess, slug) → check ColSnapshot, return if fresh
                                               → else scrape, upsert, return
                                               → if scrape fails, fall back to baseline.py

    TTL default = 365 days (`expires_at = fetched_at + interval '365 day'`).
    The user explicitly asked for 1y on 2026-05-21 — Numbeo data doesn't
    move fast enough to need monthly refresh, and the API/scraper has rate-
    limit risk we prefer to amortise. Phase-3 CronJob will run a nightly
    refresh of stale rows so individual user requests never have to scrape.

    `(city_slug, source_name)` is unique — we can store multiple sources
    per city (Numbeo + Expatistan) and reconcile in service.py.
    NOTE(review): the unique constraint is not declared in `__table_args__`
    here; it is presumably created only by the alembic migration — confirm.
    """

    __tablename__ = "col_snapshot"
    __table_args__ = {"schema": SCHEMA_NAME}  # noqa: RUF012
    # Surrogate key; rows are addressed by (city_slug, source_name) in practice.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    city_slug: Mapped[str] = mapped_column(String(64), nullable=False, index=True)
    city_display: Mapped[str] = mapped_column(String(128), nullable=False)
    country: Mapped[str] = mapped_column(String(64), nullable=False)
    # Provider label, e.g. "numbeo" / "expatistan" / "baseline".
    source_name: Mapped[str] = mapped_column(String(32), nullable=False)
    source_url: Mapped[str | None] = mapped_column(String, nullable=True)
    # Date the source figures were captured (distinct from the write time).
    snapshot_date: Mapped[date] = mapped_column(Date, nullable=False)
    # Write time of the row; defaults to the database clock.
    fetched_at: Mapped[datetime] = mapped_column(TIMESTAMP(timezone=True),
                                                 nullable=False,
                                                 server_default=func.now())
    # Instant after which the row no longer counts as fresh; no default,
    # so the writer must supply it.
    expires_at: Mapped[datetime] = mapped_column(TIMESTAMP(timezone=True), nullable=False)
    # Monthly single-person figures in GBP.
    total_no_rent_gbp: Mapped[Decimal] = mapped_column(Numeric(12, 2), nullable=False)
    total_with_rent_gbp: Mapped[Decimal] = mapped_column(Numeric(12, 2), nullable=False)
    rent_1bed_center_gbp: Mapped[Decimal] = mapped_column(Numeric(12, 2), nullable=False)
    rent_1bed_outside_gbp: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True)
    # Currency the source quoted and the GBP value of one unit of it.
    raw_currency: Mapped[str] = mapped_column(String(3), nullable=False, server_default="GBP")
    gbp_per_unit: Mapped[Decimal] = mapped_column(Numeric(12, 8),
                                                  nullable=False,
                                                  server_default=text("1"))
    # Optional per-category payload.
    by_category_json: Mapped[dict[str, Any] | None] = mapped_column(JSON_TYPE, nullable=True)
class RetirementGoal(Base):
"""A user-defined success criterion for a scenario.

104
tests/test_col.py Normal file
View file

@ -0,0 +1,104 @@
"""Tests for the COL module — baseline lookup + ratio + simulator wiring."""
from __future__ import annotations
from decimal import Decimal
import pytest
from fire_planner.col import (
JURISDICTION_REPRESENTATIVE_CITY,
compute_col_ratio,
lookup_city,
representative_city_for,
)
from fire_planner.col.baseline import BASELINES
from fire_planner.col.models import CityCostIndex
class TestBaselineCoverage:
    """Sanity checks over the hand-curated BASELINES table and the
    jurisdiction → city map that points into it."""

    def test_all_representative_cities_have_baselines(self) -> None:
        # Every jurisdiction default must resolve to a baseline entry.
        missing = []
        for city in JURISDICTION_REPRESENTATIVE_CITY.values():
            if city not in BASELINES:
                missing.append(city)
        assert missing == [], (
            f"jurisdiction map points at city(s) without baselines: {missing}"
        )

    def test_baselines_have_positive_totals(self) -> None:
        for slug, idx in BASELINES.items():
            no_rent = idx.total_single_no_rent_gbp
            with_rent = idx.total_single_with_rent_gbp
            assert no_rent > 0, f"{slug} no_rent must be positive"
            assert with_rent > no_rent, (
                f"{slug} with_rent must exceed no_rent — rent should be a positive add"
            )

    def test_baseline_source_provenance_present(self) -> None:
        allowed_names = {"numbeo", "expatistan", "baseline", "manual"}
        for slug, idx in BASELINES.items():
            provenance = idx.source
            assert provenance.name in allowed_names
            assert provenance.url is not None, f"{slug} baseline missing source URL"
            assert provenance.url.startswith("https://"), f"{slug} URL must be https"
class TestLookup:
    """`lookup_city` resolves known slugs, normalises input, rejects unknowns."""

    def test_lookup_known_city(self) -> None:
        entry = lookup_city("london")
        assert isinstance(entry, CityCostIndex)
        assert entry.city == "London"
        assert entry.country == "United Kingdom"

    def test_lookup_normalises_input(self) -> None:
        # mixed case, spaces → slug
        spaced = lookup_city("Kuala Lumpur")
        assert spaced.city == "Kuala Lumpur"
        padded = lookup_city(" Bangkok ")
        assert padded.city == "Bangkok"

    def test_lookup_unknown_raises(self) -> None:
        with pytest.raises(KeyError, match="No COL baseline"):
            lookup_city("atlantis")
class TestColRatio:
    """Plausibility bounds on `compute_col_ratio` for the baseline cities."""

    def test_identity_returns_one(self) -> None:
        same_city = compute_col_ratio("london", "london")
        assert same_city == Decimal("1")

    def test_sofia_cheaper_than_london(self) -> None:
        ratio = compute_col_ratio("london", "sofia")
        assert ratio < Decimal("1"), "Sofia must be cheaper than London"
        assert ratio > Decimal("0.2"), "Sofia ratio looks implausibly low"
        # Real Numbeo number is ~0.41
        lower, upper = Decimal("0.35"), Decimal("0.50")
        assert lower < ratio < upper

    def test_dubai_cheaper_than_london(self) -> None:
        # Dubai is *cheaper* than London on Numbeo's headline because
        # London rent dominates. This was a surprise — flag it in the
        # baseline note for future-us.
        ratio = compute_col_ratio("london", "dubai")
        assert ratio < Decimal("1")
        lower, upper = Decimal("0.70"), Decimal("0.95")
        assert lower < ratio < upper

    def test_bangkok_far_cheaper_than_london(self) -> None:
        assert compute_col_ratio("london", "bangkok") < Decimal("0.40")

    def test_inverse_consistency(self) -> None:
        # If london→sofia is X, sofia→london should be ~1/X within rounding.
        forward = compute_col_ratio("london", "sofia")
        backward = compute_col_ratio("sofia", "london")
        round_trip_error = abs(forward * backward - Decimal("1"))
        assert round_trip_error < Decimal("0.001")
class TestRepresentativeCity:
    """`representative_city_for` maps jurisdictions to their anchor city."""

    def test_known_jurisdictions(self) -> None:
        expected = (
            ("uk", "london"),
            ("cyprus", "limassol"),
            ("bulgaria", "sofia"),
            ("uae", "dubai"),
            ("malaysia", "kuala-lumpur"),
            ("thailand", "bangkok"),
        )
        for jurisdiction, city in expected:
            assert representative_city_for(jurisdiction) == city

    def test_nomad_returns_none(self) -> None:
        # Nomad mode is COL-invariant by design — auto-adjust skipped.
        result = representative_city_for("nomad")
        assert result is None

    def test_unknown_returns_none(self) -> None:
        result = representative_city_for("vulcan")
        assert result is None

View file

@ -0,0 +1,91 @@
"""Simulator + COL integration — verifies `_resolve_col_adjustment` is
applied to the request before paths are built and surfaced in the result.
These tests bypass HTTP and call the resolver directly to keep them fast.
"""
from __future__ import annotations
from decimal import Decimal
from fire_planner.api.schemas import SimulateRequest
from fire_planner.api.simulate import _resolve_col_adjustment
def _req(**overrides: object) -> SimulateRequest:
    """Build a SimulateRequest from fixed defaults, with keyword overrides.

    Defaults describe a UK resident who never leaves; any keyword argument
    replaces or adds to them.
    """
    defaults: dict[str, object] = {
        "jurisdiction": "uk",
        "strategy": "trinity",
        "leave_uk_year": 0,
        "spending_gbp": Decimal("85000"),
        "nw_seed_gbp": Decimal("1050000"),
        "horizon_years": 73,
    }
    merged = {**defaults, **overrides}
    return SimulateRequest(**merged)  # type: ignore[arg-type]
def test_col_default_on_for_known_jurisdiction() -> None:
    """With default settings a cyprus request is scaled by roughly 0.67."""
    request = _req(jurisdiction="cyprus", leave_uk_year=2)
    resolved, multiplier, scaled_spend, target = _resolve_col_adjustment(request)
    assert multiplier is not None
    assert Decimal("0.55") < multiplier < Decimal("0.75")
    assert target == "limassol"
    assert scaled_spend is not None
    assert scaled_spend < Decimal("85000")
    # the simulator runs on the adjusted figure
    assert resolved.spending_gbp == scaled_spend
def test_col_off_returns_unchanged_request() -> None:
    """Disabling auto-adjust leaves the request untouched and reports nothing."""
    request = _req(jurisdiction="cyprus", leave_uk_year=2, col_auto_adjust=False)
    resolved, multiplier, scaled_spend, target = _resolve_col_adjustment(request)
    assert multiplier is None
    assert scaled_spend is None
    assert target is None
    assert resolved.spending_gbp == Decimal("85000")
    # The returned request is the same instance — no copy when no-op.
    assert resolved is request
def test_col_nomad_jurisdiction_skipped() -> None:
    """Nomad has no representative city — auto-adjust should silently skip."""
    request = _req(jurisdiction="nomad", leave_uk_year=2)
    _, multiplier, scaled_spend, target = _resolve_col_adjustment(request)
    assert multiplier is None
    assert scaled_spend is None
    # no representative city for nomad
    assert target is None
def test_col_uk_to_uk_identity_returns_no_multiplier() -> None:
    """UK staying in UK is identity — surface the city but no scaling."""
    request = _req(jurisdiction="uk", leave_uk_year=0)
    resolved, multiplier, scaled_spend, target = _resolve_col_adjustment(request)
    assert multiplier is None
    assert scaled_spend is None
    assert target == "london"
    assert resolved.spending_gbp == Decimal("85000")
def test_col_explicit_target_city_overrides_jurisdiction_default() -> None:
    """User picks Sofia explicitly even though jurisdiction is cyprus."""
    request = _req(jurisdiction="cyprus", leave_uk_year=2, col_target_city="sofia")
    _, multiplier, _, target = _resolve_col_adjustment(request)
    assert target == "sofia"
    # sofia ratio ~0.41 — should be smaller than the limassol default
    assert multiplier is not None
    assert multiplier < Decimal("0.50")
def test_col_unknown_city_degrades_gracefully() -> None:
    """Unknown city → skip, do not raise — Phase-2 scraper will close gap."""
    request = _req(jurisdiction="cyprus", leave_uk_year=2, col_target_city="atlantis")
    resolved, multiplier, scaled_spend, target = _resolve_col_adjustment(request)
    assert multiplier is None
    assert scaled_spend is None
    # the requested name is still echoed
    assert target == "atlantis"
    assert resolved.spending_gbp == Decimal("85000")
def test_col_bangkok_dramatic_discount() -> None:
    """A thailand request anchors on Bangkok and cuts spending by about two thirds."""
    request = _req(jurisdiction="thailand", leave_uk_year=2)
    _, multiplier, scaled_spend, target = _resolve_col_adjustment(request)
    assert target == "bangkok"
    assert multiplier is not None
    assert multiplier < Decimal("0.35")
    # £85k → ~£24k
    assert scaled_spend is not None
    assert scaled_spend < Decimal("30000")