diff --git a/alembic/versions/0005_col_snapshot.py b/alembic/versions/0005_col_snapshot.py new file mode 100644 index 0000000..47f04c8 --- /dev/null +++ b/alembic/versions/0005_col_snapshot.py @@ -0,0 +1,73 @@ +"""add col_snapshot table for cached cost-of-living data + +Revision ID: 0005 +Revises: 0004 +Create Date: 2026-05-21 12:00:00.000000 + +Phase 2 of the cost-of-living subsystem (`fire_planner.col`). Caches +Numbeo / Expatistan headline data with a 1-year TTL so the simulator +can scale `spending_gbp` to local prices without re-scraping per-call. +Refresh is async (Phase-3 CronJob); user-facing lookups never block on +the network in the steady state. + +Unique on (city_slug, source_name) — multiple sources per city are +allowed; service.py reconciles them when computing the headline. +""" +from collections.abc import Sequence + +import sqlalchemy as sa + +from alembic import op + +revision: str = "0005" +down_revision: str | None = "0004" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + +SCHEMA = "fire_planner" + + +def upgrade() -> None: + op.create_table( + "col_snapshot", + sa.Column("id", sa.Integer(), nullable=False, autoincrement=True), + sa.Column("city_slug", sa.String(length=64), nullable=False), + sa.Column("city_display", sa.String(length=128), nullable=False), + sa.Column("country", sa.String(length=64), nullable=False), + sa.Column("source_name", sa.String(length=32), nullable=False), + sa.Column("source_url", sa.String(), nullable=True), + sa.Column("snapshot_date", sa.Date(), nullable=False), + sa.Column("fetched_at", sa.TIMESTAMP(timezone=True), nullable=False, + server_default=sa.func.now()), + sa.Column("expires_at", sa.TIMESTAMP(timezone=True), nullable=False), + sa.Column("total_no_rent_gbp", sa.Numeric(12, 2), nullable=False), + sa.Column("total_with_rent_gbp", sa.Numeric(12, 2), nullable=False), + sa.Column("rent_1bed_center_gbp", sa.Numeric(12, 2), nullable=False), + sa.Column("rent_1bed_outside_gbp", sa.Numeric(12, 2), nullable=True), + sa.Column("raw_currency", sa.String(length=3), nullable=False, + server_default=sa.text("'GBP'")), + sa.Column("gbp_per_unit", sa.Numeric(12, 8), nullable=False, + server_default=sa.text("1")), + sa.Column("by_category_json", sa.JSON(), nullable=True), + sa.PrimaryKeyConstraint("id"), + sa.UniqueConstraint("city_slug", "source_name", name="uq_col_snapshot_city_source"), + schema=SCHEMA, + ) + op.create_index( + "ix_col_snapshot_city_slug", + "col_snapshot", + ["city_slug"], + schema=SCHEMA, + ) + op.create_index( + "ix_col_snapshot_expires_at", + "col_snapshot", + ["expires_at"], + schema=SCHEMA, + ) + + +def downgrade() -> None: + op.drop_index("ix_col_snapshot_expires_at", table_name="col_snapshot", schema=SCHEMA) + op.drop_index("ix_col_snapshot_city_slug", table_name="col_snapshot", schema=SCHEMA) + op.drop_table("col_snapshot", schema=SCHEMA) diff --git a/fire_planner/__main__.py b/fire_planner/__main__.py index 8a2c452..4aa7992 100644 --- a/fire_planner/__main__.py +++ b/fire_planner/__main__.py @@ -57,6 +57,103 @@ def migrate() -> None: sys.exit(rc.returncode) +@cli.command("col-seed") +@click.option("--ttl-days", + type=int, + default=365, + help="Cache TTL in days (default 365 — matches Viktor's 1y choice).") +def col_seed(ttl_days: int) -> None: + """Seed `col_snapshot` from baseline.py BASELINES. + + Idempotent — uses upsert on (city_slug, source_name). Run once after + the alembic migration creates the table. Subsequent live-scrape + refreshes (Phase 3 CronJob) supersede these rows; the baseline + fallback remains as a last-resort source. + """ + asyncio.run(_col_seed(ttl_days)) + + +async def _col_seed(ttl_days: int) -> None: + from fire_planner.col.baseline import BASELINES + from fire_planner.col.cache import upsert as col_upsert + + engine = create_engine_from_env() + factory = make_session_factory(engine) + try: + async with factory() as sess: + for slug, idx in BASELINES.items(): + # Tag the source as `baseline` rather than `numbeo` so a + # later live scrape (source_name='numbeo') doesn't conflict + # on the (city_slug, source_name) unique constraint. + tagged = idx.model_copy( + update={"source": idx.source.model_copy(update={"name": "baseline"})} + ) + await col_upsert(sess, tagged, ttl_days=ttl_days) + click.echo(f" seeded {slug:20s} total={idx.total_single_with_rent_gbp} GBP") + finally: + await engine.dispose() + click.echo(f"\ncol-seed: {len(BASELINES)} cities upserted (ttl_days={ttl_days}).") + + +@cli.command("col-refresh-stale") +@click.option("--within-days", + type=int, + default=7, + help="Refresh rows whose expires_at is within this many days.") +@click.option("--ttl-days", + type=int, + default=365, + help="TTL for re-written rows (default 365).") +def col_refresh_stale(within_days: int, ttl_days: int) -> None: + """Re-scrape COL rows that are within `within_days` of expiry. + + Designed for the weekly CronJob. Walks every distinct city_slug in + `col_snapshot` whose newest row will expire within the window, + calls Numbeo+Expatistan via `service.lookup_city_cached`, which + upserts the result. Idempotent — no-op for fresh rows. + """ + asyncio.run(_col_refresh_stale(within_days, ttl_days)) + + +async def _col_refresh_stale(within_days: int, ttl_days: int) -> None: + from sqlalchemy import select, text + + from fire_planner.col.service import lookup_city_cached + from fire_planner.db import ColSnapshot + + engine = create_engine_from_env() + factory = make_session_factory(engine) + threshold = f"NOW() + INTERVAL '{int(within_days)} days'" + refreshed = 0 + failed = 0 + try: + async with factory() as sess: + # Find distinct city_slug whose freshest row expires within window. + stmt = ( + select(ColSnapshot.city_slug, ColSnapshot.country) + .distinct() + .where(text(f"expires_at <= {threshold}")) + ) + rows = (await sess.execute(stmt)).all() + click.echo(f"col-refresh-stale: {len(rows)} city(ies) need refresh " + f"(within_days={within_days})") + for slug, country in rows: + try: + # lookup_city_cached upserts on cache miss, which is + # what "stale" means here — read_fresh returns None. + idx = await lookup_city_cached(sess, slug, country=country or "") + click.echo(f" refreshed {slug:20s} → {idx.source.name:10s} " + f"total={idx.total_single_with_rent_gbp}") + refreshed += 1 + except Exception as e: # broad — log and continue per-city + click.echo(f" FAILED {slug}: {e}", err=True) + failed += 1 + finally: + await engine.dispose() + click.echo(f"\ncol-refresh-stale done: refreshed={refreshed} failed={failed} " + f"ttl_days={ttl_days}") + + @cli.command("ingest") @click.option("--source", type=click.Choice(["wealthfolio"]), diff --git a/fire_planner/api/schemas.py b/fire_planner/api/schemas.py index 3a968f1..e2b2e01 100644 --- a/fire_planner/api/schemas.py +++ b/fire_planner/api/schemas.py @@ -504,6 +504,22 @@ class SimulateRequest(BaseModel): annual_real_adjust_pct: Decimal = Decimal("0") guardrail_threshold_pct: Decimal | None = None guardrail_cut_pct: Decimal = Decimal("0.10") + # Cost-of-living auto-adjust: when `col_auto_adjust=True`, the + # simulator looks up COL ratio (target/baseline) from `fire_planner.col` + # and scales `spending_gbp` BEFORE running paths. Defaults to True so + # cross-jurisdiction comparisons are honest by default — earlier + # comparisons used hand-wave 0.5x/0.75x multipliers, which were + # consistently optimistic vs. actual Numbeo data (Bulgaria is 0.41x, + # not 0.50x; Cyprus 0.67x, not 0.75x). + # + # `col_target_city` defaults to the jurisdiction's representative + # city (uk→london, cyprus→limassol, etc.). Set explicitly to anchor + # on a different city (e.g. `cyprus`+`paphos` if Limassol is too + # expensive a proxy). For `jurisdiction='nomad'` there is no + # representative city and auto-adjust is skipped silently. + col_auto_adjust: bool = True + col_baseline_city: str = "london" + col_target_city: str | None = None class SimulateResult(BaseModel): @@ -516,6 +532,13 @@ class SimulateResult(BaseModel): elapsed_seconds: Decimal yearly: list[ProjectionPoint] goals_probability: list[GoalProbability] = Field(default_factory=list) + # When `col_auto_adjust=True`, surface the applied multiplier + the + # COL-adjusted spending so the user can see what was used. Null when + # auto-adjust was off, jurisdiction had no representative city + # (nomad), or baseline==target (London-to-London). + col_multiplier_applied: Decimal | None = None + col_adjusted_spending_gbp: Decimal | None = None + col_target_city: str | None = None class CompareRequest(BaseModel): diff --git a/fire_planner/api/simulate.py b/fire_planner/api/simulate.py index ebc48be..2003416 100644 --- a/fire_planner/api/simulate.py +++ b/fire_planner/api/simulate.py @@ -26,6 +26,7 @@ from fire_planner.api.schemas import ( SimulateRequest, SimulateResult, ) +from fire_planner.col import compute_col_ratio, representative_city_for from fire_planner.flex_spending import FlexRule as EngineFlexRule from fire_planner.glide_path import static from fire_planner.goals_eval import evaluate_goals @@ -50,6 +51,36 @@ router = APIRouter(tags=["simulate"]) _RETURNS_CSV = Path("/data/shiller_returns.csv") +def _resolve_col_adjustment( + req: SimulateRequest, +) -> tuple[SimulateRequest, Decimal | None, Decimal | None, str | None]: + """Apply cost-of-living adjustment to `req.spending_gbp` when enabled. + + Returns the (possibly modified) request, the multiplier applied (or + None), the post-adjustment spending GBP (or None), and the resolved + target city slug (or None). Skipped silently when: + - col_auto_adjust is False + - the jurisdiction has no representative city (e.g. nomad) + - baseline_city == resolved target city (identity transform) + - either city is unknown to the baseline lookup (degrade gracefully + rather than 400 — a future Phase-2 scraper will close the gap) + """ + if not req.col_auto_adjust: + return req, None, None, None + target = req.col_target_city or representative_city_for(req.jurisdiction) + if target is None: + return req, None, None, None + if target == req.col_baseline_city: + return req, None, None, target + try: + ratio = compute_col_ratio(req.col_baseline_city, target) + except KeyError: + return req, None, None, target + adjusted_spend = req.spending_gbp * ratio + adjusted_req = req.model_copy(update={"spending_gbp": adjusted_spend}) + return adjusted_req, ratio, adjusted_spend, target + + def _shiller_paths(seed: int, n_paths: int, n_years: int) -> np.ndarray: bundle = (load_from_csv(_RETURNS_CSV) if _RETURNS_CSV.exists() else synthetic_returns(seed=42)) rng = np.random.default_rng(seed) @@ -193,6 +224,9 @@ def _to_response( result: SimulationResult, elapsed: float, req: SimulateRequest | None = None, + col_multiplier: Decimal | None = None, + col_adjusted_spend: Decimal | None = None, + col_target_city: str | None = None, ) -> SimulateResult: # portfolio_real has n_years+1 columns (year 0 = seed, year k = end-of-year k). # withdrawal_real / tax_real have n_years columns (year k = withdrawn in year k+1). @@ -243,27 +277,34 @@ def _to_response( elapsed_seconds=Decimal(str(round(elapsed, 3))), yearly=yearly, goals_probability=goals_probability, + col_multiplier_applied=(Decimal(str(round(float(col_multiplier), 6))) + if col_multiplier is not None else None), + col_adjusted_spending_gbp=(Decimal(str(round(float(col_adjusted_spend), 2))) + if col_adjusted_spend is not None else None), + col_target_city=col_target_city, ) @router.post("/simulate", response_model=SimulateResult) async def simulate_one(req: SimulateRequest) -> SimulateResult: """Run one scenario synchronously, no DB write. ~1-3s for 5k paths.""" - paths = await _build_paths(req) + adjusted_req, mult, adj_spend, target_city = _resolve_col_adjustment(req) + paths = await _build_paths(adjusted_req) try: - result, elapsed = await asyncio.to_thread(_project, req, paths) + result, elapsed = await asyncio.to_thread(_project, adjusted_req, paths) except KeyError as e: raise HTTPException(status_code=400, detail=f"Unknown name: {e}") from None - return _to_response(result, elapsed, req) + return _to_response(result, elapsed, adjusted_req, mult, adj_spend, target_city) @router.post("/compare", response_model=CompareResult) async def compare_scenarios(req: CompareRequest) -> CompareResult: """Run 2-5 scenarios in parallel, return all results.""" async def one(s: SimulateRequest) -> SimulateResult: - paths = await _build_paths(s) - result, elapsed = await asyncio.to_thread(_project, s, paths) - return _to_response(result, elapsed, s) + adjusted_s, mult, adj_spend, target_city = _resolve_col_adjustment(s) + paths = await _build_paths(adjusted_s) + result, elapsed = await asyncio.to_thread(_project, adjusted_s, paths) + return _to_response(result, elapsed, adjusted_s, mult, adj_spend, target_city) try: results = await asyncio.gather(*(one(s) for s in req.scenarios)) diff --git a/fire_planner/col/__init__.py b/fire_planner/col/__init__.py new file mode 100644 index 0000000..aed3b50 --- /dev/null +++ b/fire_planner/col/__init__.py @@ -0,0 +1,36 @@ +"""Cost-of-living module — feeds the simulator with real per-city spend ratios. + +The simulator's `spending_gbp` is denominated in the user's BASELINE city +(typically London). When a scenario moves the user to a different TARGET +city, this module returns the ratio `target_total / baseline_total` so +the simulator can scale `spending_gbp` to local prices before running +paths. + +Phase 1 (current): hand-curated baselines from Numbeo public pages, with +source URLs and fetch dates embedded so future-us can refresh by hand. +Phase 2 (planned): live scrapers for Numbeo + Expatistan, DB cache with +30-day TTL, nightly refresh CronJob. +""" +from __future__ import annotations + +from fire_planner.col.models import CategoryBreakdown, CityCostIndex, ColSource +from fire_planner.col.service import ( + JURISDICTION_REPRESENTATIVE_CITY, + compute_col_ratio, + lookup_city, + lookup_city_cached, + reconcile_sources, + representative_city_for, +) + +__all__ = [ + "CategoryBreakdown", + "CityCostIndex", + "ColSource", + "JURISDICTION_REPRESENTATIVE_CITY", + "compute_col_ratio", + "lookup_city", + "lookup_city_cached", + "reconcile_sources", + "representative_city_for", +] diff --git a/fire_planner/col/baseline.py b/fire_planner/col/baseline.py new file mode 100644 index 0000000..f0642af --- /dev/null +++ b/fire_planner/col/baseline.py @@ -0,0 +1,342 @@ +"""Hand-curated baselines from Numbeo public pages. + +All figures are GBP/month for a single person. Source URLs and snapshot +dates are embedded so we can re-validate. Refresh by re-running the +WebFetch prompts that built this file (see `docs/col-baseline-refresh.md` +or the conversation in 2026-05-21). + +Adding a new city: pull the Numbeo page, find "Estimated monthly costs +for a single person without rent" (the headline), then the rent + per- +category breakdowns. Add an entry below — the simulator picks it up +automatically via `lookup_city()`. + +Currency conversion uses the rate visible on Numbeo at fetch time — +re-fetch when sterling moves >5% against the local currency. +""" +from __future__ import annotations + +from datetime import date +from decimal import Decimal + +from fire_planner.col.models import CategoryBreakdown, CityCostIndex, ColSource + + +def _src(url: str, snap: str, ccy: str, gbp_per_unit: Decimal | float) -> ColSource: + return ColSource( + name="numbeo", + url=url, + snapshot_date=date.fromisoformat(snap), + raw_currency=ccy, + gbp_per_unit=Decimal(str(gbp_per_unit)), + ) + + +BASELINES: dict[str, CityCostIndex] = { + "london": CityCostIndex( + city="London", + city_slug="london", + country="United Kingdom", + total_single_no_rent_gbp=Decimal("1092.40"), + total_single_with_rent_gbp=Decimal("3409.59"), + breakdown=CategoryBreakdown( + rent_1bed_center=Decimal("2317.19"), + rent_1bed_outside=Decimal("1728.85"), + groceries=Decimal("420.00"), + restaurants=Decimal("285.00"), + transport=Decimal("190.00"), + utilities=Decimal("327.18"), + leisure=Decimal("127.40"), + ), + source=_src("https://www.numbeo.com/cost-of-living/in/London", "2026-05-20", "GBP", 1.0), + ), + "sofia": CityCostIndex( + city="Sofia", + city_slug="sofia", + country="Bulgaria", + total_single_no_rent_gbp=Decimal("712.54"), + total_single_with_rent_gbp=Decimal("1391.71"), + breakdown=CategoryBreakdown( + rent_1bed_center=Decimal("679.17"), + rent_1bed_outside=Decimal("520.26"), + groceries=Decimal("280.00"), # per-category figures sanity-checked + restaurants=Decimal("199.27"), # vs Numbeo summary; LLM extraction + transport=Decimal("28.50"), # of detail rows is noisy — headline + utilities=Decimal("130.00"), # totals (no_rent + with_rent) are + leisure=Decimal("75.00"), # the canonical anchors for ratios + ), + source=_src("https://www.numbeo.com/cost-of-living/in/Sofia", "2026-05-20", "BGN", 0.435), + ), + "limassol": CityCostIndex( + city="Limassol", + city_slug="limassol", + country="Cyprus", + total_single_no_rent_gbp=Decimal("932.30"), + total_single_with_rent_gbp=Decimal("2282.30"), + breakdown=CategoryBreakdown( + rent_1bed_center=Decimal("1350.00"), + rent_1bed_outside=Decimal("1162.94"), + groceries=Decimal("350.00"), + restaurants=Decimal("240.00"), + transport=Decimal("40.00"), + utilities=Decimal("233.43"), + leisure=Decimal("104.44"), + ), + source=_src("https://www.numbeo.com/cost-of-living/in/Limassol", "2026-05-18", + "EUR", 0.862), + ), + "dubai": CityCostIndex( + city="Dubai", + city_slug="dubai", + country="United Arab Emirates", + total_single_no_rent_gbp=Decimal("911.83"), + total_single_with_rent_gbp=Decimal("2768.31"), + breakdown=CategoryBreakdown( + rent_1bed_center=Decimal("1856.48"), + rent_1bed_outside=Decimal("1139.98"), + groceries=Decimal("96.77"), # Dubai groceries unusually low — + restaurants=Decimal("86.02"), # subsidised + lots of cheap labour + transport=Decimal("21.51"), # Metro pass AED 100. Sanity check + utilities=Decimal("188.24"), # in next refresh — could be Numbeo + leisure=Decimal("64.52"), # contributor undercounting + ), + source=_src("https://www.numbeo.com/cost-of-living/in/Dubai", "2026-05-19", "AED", 0.21505), + ), + "kuala-lumpur": CityCostIndex( + city="Kuala Lumpur", + city_slug="kuala-lumpur", + country="Malaysia", + total_single_no_rent_gbp=Decimal("420.64"), + total_single_with_rent_gbp=Decimal("865.08"), + breakdown=CategoryBreakdown( + rent_1bed_center=Decimal("444.44"), + rent_1bed_outside=Decimal("263.89"), + groceries=Decimal("76.95"), + restaurants=Decimal("145.35"), + transport=Decimal("17.10"), + utilities=Decimal("45.18"), + leisure=Decimal("42.75"), + ), + source=_src("https://www.numbeo.com/cost-of-living/in/Kuala-Lumpur", "2026-05-17", + "MYR", 0.171), + ), + "bangkok": CityCostIndex( + city="Bangkok", + city_slug="bangkok", + country="Thailand", + total_single_no_rent_gbp=Decimal("491.21"), + total_single_with_rent_gbp=Decimal("970.57"), + breakdown=CategoryBreakdown( + rent_1bed_center=Decimal("479.36"), + rent_1bed_outside=Decimal("233.76"), + groceries=Decimal("97.25"), + restaurants=Decimal("119.34"), + transport=Decimal("43.21"), + utilities=Decimal("69.04"), + leisure=Decimal("65.29"), + ), + source=_src("https://www.numbeo.com/cost-of-living/in/Bangkok", "2026-05-20", + "THB", 0.02198), + ), + # ── Expansion batch — fetched 2026-05-21, headline totals only ── + # Per-category breakdowns set to 0 where Numbeo LLM extraction was + # unreliable. Only `total_single_no_rent_gbp` / `total_single_with_rent_gbp` + # are used by the simulator's COL ratio; the breakdowns are for the + # UI / playbook. Refresh in Phase 3 (live scraper with HTML parsing). + "lisbon": CityCostIndex( + city="Lisbon", city_slug="lisbon", country="Portugal", + total_single_no_rent_gbp=Decimal("647.97"), + total_single_with_rent_gbp=Decimal("1856.03"), + breakdown=CategoryBreakdown( + rent_1bed_center=Decimal("1208.06"), rent_1bed_outside=Decimal("923.14"), + groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"), + utilities=Decimal("0"), leisure=Decimal("0"), + ), + source=_src("https://www.numbeo.com/cost-of-living/in/Lisbon", "2026-05-21", + "EUR", 0.862), + ), + "porto": CityCostIndex( + city="Porto", city_slug="porto", country="Portugal", + total_single_no_rent_gbp=Decimal("609.07"), + total_single_with_rent_gbp=Decimal("1562.50"), + breakdown=CategoryBreakdown( + rent_1bed_center=Decimal("953.43"), rent_1bed_outside=Decimal("726.19"), + groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"), + utilities=Decimal("0"), leisure=Decimal("0"), + ), + source=_src("https://www.numbeo.com/cost-of-living/in/Porto", "2026-05-16", + "EUR", 0.862), + ), + "madrid": CityCostIndex( + city="Madrid", city_slug="madrid", country="Spain", + total_single_no_rent_gbp=Decimal("706.87"), + total_single_with_rent_gbp=Decimal("1825.72"), + breakdown=CategoryBreakdown( + rent_1bed_center=Decimal("1118.85"), rent_1bed_outside=Decimal("873.06"), + groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"), + utilities=Decimal("0"), leisure=Decimal("0"), + ), + source=_src("https://www.numbeo.com/cost-of-living/in/Madrid", "2026-05-21", + "EUR", 0.862), + ), + "valencia": CityCostIndex( + city="Valencia", city_slug="valencia", country="Spain", + total_single_no_rent_gbp=Decimal("614.71"), + total_single_with_rent_gbp=Decimal("1663.97"), + breakdown=CategoryBreakdown( + rent_1bed_center=Decimal("1049.26"), rent_1bed_outside=Decimal("779.35"), + groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"), + utilities=Decimal("0"), leisure=Decimal("0"), + ), + source=_src("https://www.numbeo.com/cost-of-living/in/Valencia", "2026-05-15", + "EUR", 0.862), + ), + "athens": CityCostIndex( + city="Athens", city_slug="athens", country="Greece", + total_single_no_rent_gbp=Decimal("711.46"), + total_single_with_rent_gbp=Decimal("1245.89"), + breakdown=CategoryBreakdown( + rent_1bed_center=Decimal("534.43"), rent_1bed_outside=Decimal("453.23"), + groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"), + utilities=Decimal("0"), leisure=Decimal("0"), + ), + source=_src("https://www.numbeo.com/cost-of-living/in/Athens", "2026-05-21", + "EUR", 0.862), + ), + "bucharest": CityCostIndex( + city="Bucharest", city_slug="bucharest", country="Romania", + total_single_no_rent_gbp=Decimal("572.13"), + total_single_with_rent_gbp=Decimal("1102.46"), + breakdown=CategoryBreakdown( + rent_1bed_center=Decimal("530.33"), rent_1bed_outside=Decimal("363.06"), + groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"), + utilities=Decimal("0"), leisure=Decimal("0"), + ), + source=_src("https://www.numbeo.com/cost-of-living/in/Bucharest", "2026-05-21", + "EUR", 0.862), + ), + "tbilisi": CityCostIndex( + city="Tbilisi", city_slug="tbilisi", country="Georgia", + # LLM extraction unreliable; manual estimate of headline from + # secondary sources puts ex-rent ~€420-500 → £400. + total_single_no_rent_gbp=Decimal("400.00"), + total_single_with_rent_gbp=Decimal("941.43"), + breakdown=CategoryBreakdown( + rent_1bed_center=Decimal("541.43"), rent_1bed_outside=Decimal("350.82"), + groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"), + utilities=Decimal("0"), leisure=Decimal("0"), + ), + source=_src("https://www.numbeo.com/cost-of-living/in/Tbilisi", "2026-05-18", + "GEL", 0.295), + ), + "tallinn": CityCostIndex( + city="Tallinn", city_slug="tallinn", country="Estonia", + total_single_no_rent_gbp=Decimal("837.63"), + total_single_with_rent_gbp=Decimal("1441.06"), + breakdown=CategoryBreakdown( + rent_1bed_center=Decimal("603.43"), rent_1bed_outside=Decimal("434.23"), + groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"), + utilities=Decimal("0"), leisure=Decimal("0"), + ), + source=_src("https://www.numbeo.com/cost-of-living/in/Tallinn", "2026-05-21", + "EUR", 0.862), + ), + "penang": CityCostIndex( + city="Penang", city_slug="penang", country="Malaysia", + total_single_no_rent_gbp=Decimal("361.66"), + total_single_with_rent_gbp=Decimal("643.39"), + breakdown=CategoryBreakdown( + rent_1bed_center=Decimal("281.73"), rent_1bed_outside=Decimal("160.61"), + groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"), + utilities=Decimal("0"), leisure=Decimal("0"), + ), + source=_src("https://www.numbeo.com/cost-of-living/in/Penang", "2026-05-18", + "MYR", 0.171), + ), + "chiang-mai": CityCostIndex( + city="Chiang Mai", city_slug="chiang-mai", country="Thailand", + total_single_no_rent_gbp=Decimal("412.36"), + total_single_with_rent_gbp=Decimal("775.43"), + breakdown=CategoryBreakdown( + rent_1bed_center=Decimal("363.07"), rent_1bed_outside=Decimal("205.95"), + groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"), + utilities=Decimal("0"), leisure=Decimal("0"), + ), + source=_src("https://www.numbeo.com/cost-of-living/in/Chiang-Mai", "2026-05-06", + "THB", 0.02198), + ), + "bali": CityCostIndex( + city="Bali", city_slug="bali", country="Indonesia", + # Bali Numbeo conflates Ubud/Canggu/Denpasar; rent figures are + # manual estimates (Numbeo's £915 was implausibly high). + total_single_no_rent_gbp=Decimal("433.24"), + total_single_with_rent_gbp=Decimal("883.24"), + breakdown=CategoryBreakdown( + rent_1bed_center=Decimal("450.00"), rent_1bed_outside=Decimal("350.00"), + groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"), + utilities=Decimal("0"), leisure=Decimal("0"), + ), + source=_src("https://www.numbeo.com/cost-of-living/in/Bali", "2026-05-15", + "IDR", 0.0000485), + ), + "singapore": CityCostIndex( + city="Singapore", city_slug="singapore", country="Singapore", + total_single_no_rent_gbp=Decimal("579.63"), + total_single_with_rent_gbp=Decimal("2661.63"), + breakdown=CategoryBreakdown( + rent_1bed_center=Decimal("2082.00"), rent_1bed_outside=Decimal("1556.00"), + groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"), + utilities=Decimal("0"), leisure=Decimal("0"), + ), + source=_src("https://www.numbeo.com/cost-of-living/in/Singapore", "2026-05-21", + "SGD", 0.585), + ), + "taipei": CityCostIndex( + city="Taipei", city_slug="taipei", country="Taiwan", + total_single_no_rent_gbp=Decimal("646.50"), + total_single_with_rent_gbp=Decimal("1223.06"), + breakdown=CategoryBreakdown( + rent_1bed_center=Decimal("576.56"), rent_1bed_outside=Decimal("373.77"), + groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"), + utilities=Decimal("0"), leisure=Decimal("0"), + ), + source=_src("https://www.numbeo.com/cost-of-living/in/Taipei", "2026-05-18", + "TWD", 0.0246), + ), + "ho-chi-minh-city": CityCostIndex( + city="Ho Chi Minh City", city_slug="ho-chi-minh-city", country="Vietnam", + total_single_no_rent_gbp=Decimal("348.85"), + total_single_with_rent_gbp=Decimal("828.77"), + breakdown=CategoryBreakdown( + rent_1bed_center=Decimal("479.92"), rent_1bed_outside=Decimal("223.06"), + groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"), + utilities=Decimal("0"), leisure=Decimal("0"), + ), + source=_src("https://www.numbeo.com/cost-of-living/in/Ho-Chi-Minh-City", + "2026-05-16", "VND", 0.0000316), + ), + "mexico-city": CityCostIndex( + city="Mexico City", city_slug="mexico-city", country="Mexico", + total_single_no_rent_gbp=Decimal("600.47"), + total_single_with_rent_gbp=Decimal("1390.42"), + breakdown=CategoryBreakdown( + rent_1bed_center=Decimal("789.95"), rent_1bed_outside=Decimal("513.96"), + groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"), + utilities=Decimal("0"), leisure=Decimal("0"), + ), + source=_src("https://www.numbeo.com/cost-of-living/in/Mexico-City", "2026-05-19", + "MXN", 0.0394), + ), + "medellin": CityCostIndex( + city="Medellin", city_slug="medellin", country="Colombia", + # LLM extraction gave £105 — too low. Manual estimate ~£400. + total_single_no_rent_gbp=Decimal("400.00"), + total_single_with_rent_gbp=Decimal("902.13"), + breakdown=CategoryBreakdown( + rent_1bed_center=Decimal("502.13"), rent_1bed_outside=Decimal("373.02"), + groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"), + utilities=Decimal("0"), leisure=Decimal("0"), + ), + source=_src("https://www.numbeo.com/cost-of-living/in/Medellin", "2026-05-21", + "COP", 0.000195), + ), +} diff --git a/fire_planner/col/cache.py b/fire_planner/col/cache.py new file mode 100644 index 0000000..9abdbaa --- /dev/null +++ b/fire_planner/col/cache.py @@ -0,0 +1,145 @@ +"""DB-backed cache for cost-of-living snapshots. + +Architecture (Phase 2): + + lookup_city(slug, sess) → + 1. SELECT FROM col_snapshot WHERE city_slug=slug ORDER BY fetched_at DESC LIMIT 1 + 2. if row and row.expires_at > now → return row, "cache_hit" + 3. else fetch via NumbeoScraper, INSERT/UPDATE, return, "scraped" + 4. on scrape failure → fall back to baseline.BASELINES[slug], "baseline_fallback" + +TTL = 1 year (Viktor's choice on 2026-05-21 — Numbeo headline numbers +don't move fast enough to need monthly refresh, and the rate-limit risk +is real). The Phase-3 CronJob refreshes stale rows nightly in batch so +runtime lookups never have to scrape. +""" +from __future__ import annotations + +from datetime import UTC, datetime +from decimal import Decimal +from typing import Final + +from sqlalchemy import select +from sqlalchemy.dialects.postgresql import insert as pg_insert +from sqlalchemy.ext.asyncio import AsyncSession + +from fire_planner.col.models import CategoryBreakdown, CityCostIndex, ColSource +from fire_planner.db import ColSnapshot + +DEFAULT_TTL_DAYS: Final = 365 + + +def _row_to_index(row: ColSnapshot) -> CityCostIndex: + return CityCostIndex( + city=row.city_display, + city_slug=row.city_slug, + country=row.country, + total_single_no_rent_gbp=row.total_no_rent_gbp, + total_single_with_rent_gbp=row.total_with_rent_gbp, + breakdown=CategoryBreakdown( + rent_1bed_center=row.rent_1bed_center_gbp, + rent_1bed_outside=row.rent_1bed_outside_gbp, + # by_category_json optional — not loaded into the Pydantic + # model in Phase 2; the simulator only needs the headlines. + groceries=Decimal("0"), + restaurants=Decimal("0"), + transport=Decimal("0"), + utilities=Decimal("0"), + leisure=Decimal("0"), + ), + source=ColSource( + name=row.source_name, # type: ignore[arg-type] + url=row.source_url, + snapshot_date=row.snapshot_date, + raw_currency=row.raw_currency, + gbp_per_unit=row.gbp_per_unit, + ), + ) + + +async def read_fresh( + sess: AsyncSession, + city_slug: str, + *, + now: datetime | None = None, +) -> CityCostIndex | None: + """Return the freshest non-expired snapshot, or None. + + Picks the most-recently-fetched row across all sources for the city + (Numbeo + Expatistan etc.) — service-layer reconciliation runs when + writing, so the cache stores already-reconciled values. + """ + now = now or datetime.now(UTC) + stmt = ( + select(ColSnapshot) + .where(ColSnapshot.city_slug == city_slug) + .where(ColSnapshot.expires_at > now) + .order_by(ColSnapshot.fetched_at.desc()) + .limit(1) + ) + row = (await sess.execute(stmt)).scalar_one_or_none() + return _row_to_index(row) if row else None + + +async def upsert( + sess: AsyncSession, + idx: CityCostIndex, + *, + ttl_days: int = DEFAULT_TTL_DAYS, + now: datetime | None = None, +) -> None: + """Insert or update a snapshot. Unique on (city_slug, source_name).""" + now = now or datetime.now(UTC) + from datetime import timedelta + expires = now + timedelta(days=ttl_days) + values = { + "city_slug": idx.city_slug, + "city_display": idx.city, + "country": idx.country, + "source_name": idx.source.name, + "source_url": idx.source.url, + "snapshot_date": idx.source.snapshot_date, + "fetched_at": now, + "expires_at": expires, + "total_no_rent_gbp": idx.total_single_no_rent_gbp, + "total_with_rent_gbp": idx.total_single_with_rent_gbp, + "rent_1bed_center_gbp": idx.breakdown.rent_1bed_center, + "rent_1bed_outside_gbp": idx.breakdown.rent_1bed_outside, + "raw_currency": idx.source.raw_currency, + "gbp_per_unit": idx.source.gbp_per_unit, + } + dialect_name = sess.bind.dialect.name if sess.bind else "postgresql" + if dialect_name == "postgresql": + stmt = pg_insert(ColSnapshot).values(**values) + update_cols = {k: stmt.excluded[k] for k in values if k not in {"city_slug", + "source_name"}} + stmt = stmt.on_conflict_do_update( + constraint="uq_col_snapshot_city_source", + set_=update_cols, + ) + await sess.execute(stmt) + else: + # SQLite (tests): emulate upsert manually. + existing = await sess.execute( + select(ColSnapshot).where( + ColSnapshot.city_slug == idx.city_slug, + ColSnapshot.source_name == idx.source.name, + ) + ) + row = existing.scalar_one_or_none() + if row: + for k, v in values.items(): + setattr(row, k, v) + else: + sess.add(ColSnapshot(**values)) + await sess.commit() + + +def expires_at_for(ttl_days: int = DEFAULT_TTL_DAYS, + now: datetime | None = None) -> datetime: + """Public helper: when would a row written `now` expire.""" + from datetime import timedelta + return (now or datetime.now(UTC)) + timedelta(days=ttl_days) + + +__all__ = ["DEFAULT_TTL_DAYS", "expires_at_for", "read_fresh", "upsert"] diff --git a/fire_planner/col/expatistan.py b/fire_planner/col/expatistan.py new file mode 100644 index 0000000..023bb89 --- /dev/null +++ b/fire_planner/col/expatistan.py @@ -0,0 +1,165 @@ +"""Expatistan HTML scraper — secondary COL source. + +Used by the cache layer to cross-check Numbeo. Expatistan's page format +is different (price-of-living-index based, not absolute monthly figures), +so the headline we extract is their "single person, monthly cost" +estimate from the "Cost of Living in " landing page. + +Lower fidelity than Numbeo but ToS-friendlier — Expatistan publishes +their data under CC and explicitly allows non-commercial scraping. + +Source-of-truth precedence (set in service.reconcile): + 1. numbeo — primary, most data points + 2. expatistan — secondary, cross-check + 3. baseline — hand-curated fallback +""" +from __future__ import annotations + +import asyncio +import logging +import re +from datetime import date +from decimal import Decimal +from typing import Final + +import httpx + +from fire_planner.col.models import CategoryBreakdown, CityCostIndex, ColSource + +log = logging.getLogger(__name__) + +BASE_URL: Final = "https://www.expatistan.com/cost-of-living" +USER_AGENT: Final = ( + "fire-planner/0.1 (+https://forgejo.viktorbarzin.me/viktor/code; " + "non-commercial personal use; 1-year cache)" +) +DEFAULT_TIMEOUT: Final = httpx.Timeout(connect=10.0, read=30.0, write=10.0, pool=10.0) +MIN_REQUEST_INTERVAL: Final = 1.1 + +# Expatistan publishes prices in USD by default. Convert to GBP. +USD_TO_GBP: Final = Decimal("0.787") + +# Single-person monthly estimate appears in the page text as: +# "Cost of living in , for an expat is $X" or similar +# Format varies; capture both "$X,XXX" and "$X" patterns. +_SINGLE_PERSON_USD_RE = re.compile( + r"(?:single\s+person|expat)[^$]*?\$\s*([0-9,]+(?:\.[0-9]+)?)", + re.IGNORECASE | re.DOTALL, +) +# Apartment rent (1 bedroom) appears on the "Prices" table line: +# "Rent for a furnished single room (1 bedroom) in city centre $X,XXX" +_RENT_CENTER_USD_RE = re.compile( + r"(?:1\s*bedroom|one[-\s]?bedroom)[^$<]*?(?:cent|expensive)[^$]*?" + r"\$\s*([0-9,]+(?:\.[0-9]+)?)", + re.IGNORECASE | re.DOTALL, +) + + +class ExpatistanFetchError(RuntimeError): + """HTTP/parse failures so the cache layer can fall back.""" + + +def _parse_num(s: str) -> Decimal: + return Decimal(s.replace(",", "")) + + +class ExpatistanScraper: + def __init__( + self, + *, + client: httpx.AsyncClient | None = None, + min_interval: float = MIN_REQUEST_INTERVAL, + ) -> None: + self._owns_client = client is None + self._client = client or httpx.AsyncClient( + headers={"User-Agent": USER_AGENT, "Accept-Language": "en-GB,en;q=0.9"}, + timeout=DEFAULT_TIMEOUT, + follow_redirects=True, + ) + self._min_interval = min_interval + self._last_request_at: float = 0.0 + self._lock = asyncio.Lock() + + async def __aenter__(self) -> ExpatistanScraper: + return self + + async def __aexit__(self, *_: object) -> None: + if self._owns_client: + await self._client.aclose() + + async def _polite_wait(self) -> None: + async with self._lock: + now = asyncio.get_running_loop().time() + elapsed = now - self._last_request_at + if elapsed < self._min_interval: + await asyncio.sleep(self._min_interval - elapsed) + self._last_request_at = asyncio.get_running_loop().time() + + async def fetch( + self, + city_slug: str, + *, + country: str = "", + ) -> CityCostIndex: + # Expatistan uses lowercase city slugs separated by hyphens — + # same convention as our internal slugs. + url = f"{BASE_URL}/{city_slug}" + await self._polite_wait() + try: + resp = await self._client.get(url) + resp.raise_for_status() + except httpx.HTTPError as e: + raise ExpatistanFetchError(f"HTTP error for {url}: {e}") from e + return self._parse(city_slug, country, url, resp.text) + + @staticmethod + def _parse( + city_slug: str, + country: str, + url: str, + html: str, + ) -> CityCostIndex: + single_match = _SINGLE_PERSON_USD_RE.search(html) + rent_match = _RENT_CENTER_USD_RE.search(html) + if not (single_match and rent_match): + raise ExpatistanFetchError( + f"could not locate single-person or rent figure on {url}" + ) + # Expatistan's "single person" headline is total with rent — + # different convention from Numbeo. Use it as `total_with_rent` + # directly; derive no_rent by subtracting their rent figure. + with_rent_usd = _parse_num(single_match.group(1)) + rent_usd = _parse_num(rent_match.group(1)) + with_rent_gbp = with_rent_usd * USD_TO_GBP + rent_gbp = rent_usd * USD_TO_GBP + no_rent_gbp = with_rent_gbp - rent_gbp + # Guard against malformed pages where rent > total (unusual but + # possible if the regex grabs the wrong row). + if no_rent_gbp <= 0: + raise ExpatistanFetchError( + f"derived no_rent <= 0 ({no_rent_gbp}) on {url}; " + f"with_rent={with_rent_gbp}, rent={rent_gbp}" + ) + return CityCostIndex( + city=city_slug.replace("-", " ").title(), + city_slug=city_slug, + country=country, + total_single_no_rent_gbp=no_rent_gbp.quantize(Decimal("0.01")), + total_single_with_rent_gbp=with_rent_gbp.quantize(Decimal("0.01")), + breakdown=CategoryBreakdown( + rent_1bed_center=rent_gbp.quantize(Decimal("0.01")), + rent_1bed_outside=None, + groceries=Decimal("0"), + restaurants=Decimal("0"), + transport=Decimal("0"), + utilities=Decimal("0"), + leisure=Decimal("0"), + ), + source=ColSource( + name="expatistan", + url=url, + snapshot_date=date.today(), + raw_currency="USD", + gbp_per_unit=USD_TO_GBP, + ), + ) diff --git a/fire_planner/col/models.py b/fire_planner/col/models.py new file mode 100644 index 0000000..6295bdd --- /dev/null +++ b/fire_planner/col/models.py @@ -0,0 +1,64 @@ +"""Pydantic models for per-city cost-of-living data. + +Every category figure is monthly GBP for a single person — the +denomination the simulator expects when scaling `spending_gbp`. The +source object retains the original currency, FX rate, and snapshot +date so we can re-validate or update a stale baseline. +""" +from __future__ import annotations + +from datetime import date +from decimal import Decimal +from typing import Literal + +from pydantic import BaseModel, ConfigDict, Field + +SourceName = Literal["numbeo", "expatistan", "baseline", "manual"] + + +class ColSource(BaseModel): + """Provenance for a CityCostIndex entry — where did the numbers come + from and when. The simulator surfaces this in the SimulateResult so + the user can audit which baseline was applied.""" + + model_config = ConfigDict(frozen=True) + + name: SourceName + url: str | None = None + snapshot_date: date + raw_currency: str = "GBP" + gbp_per_unit: Decimal = Decimal("1") + + +class CategoryBreakdown(BaseModel): + """Per-category monthly costs in GBP for a single person.""" + + model_config = ConfigDict(frozen=True) + + rent_1bed_center: Decimal + rent_1bed_outside: Decimal | None = None + groceries: Decimal + restaurants: Decimal + transport: Decimal + utilities: Decimal + leisure: Decimal + + +class CityCostIndex(BaseModel): + """One city's headline cost-of-living snapshot.""" + + model_config = ConfigDict(frozen=True) + + city: str + city_slug: str = Field(min_length=1) + country: str + total_single_no_rent_gbp: Decimal + total_single_with_rent_gbp: Decimal + breakdown: CategoryBreakdown + source: ColSource + + @property + def total_monthly_gbp(self) -> Decimal: + """The number the simulator uses for ratios — `with rent` is the + right anchor because moving location changes rent too.""" + return self.total_single_with_rent_gbp diff --git a/fire_planner/col/numbeo.py b/fire_planner/col/numbeo.py new file mode 100644 index 0000000..5a1c565 --- /dev/null +++ b/fire_planner/col/numbeo.py @@ -0,0 +1,240 @@ +"""Numbeo HTML scraper — parses the public `cost-of-living/in/` +pages directly. + +No LLM interpretation — uses regex against the table structure. The +page format is stable enough across cities that a single parser works +for all of them. + +We extract: +- The headline ex-rent total (one number, EUR-prefixed) +- The 1-bed center / outside rent (two rows in the rent table) + +Per-category breakdown is intentionally NOT extracted by the live +scraper — the headline two numbers are what the simulator uses for +ratios, and the breakdown rows are noisy (averages of varying-sample +sizes). The hand-curated `baseline.py` carries the breakdowns where +they exist; the cache layer falls back to baseline.py if a breakdown +is needed for the UI. + +ToS posture: Numbeo's robots.txt allows /cost-of-living/* for major +crawlers. We send a polite UA, ≤1 req/sec, 30s timeout, exponential +backoff on 429/5xx, and never re-scrape within the cache TTL. +""" +from __future__ import annotations + +import asyncio +import logging +import re +from datetime import UTC, date, datetime, timedelta +from decimal import Decimal +from typing import Final + +import httpx + +from fire_planner.col.models import CategoryBreakdown, CityCostIndex, ColSource + +log = logging.getLogger(__name__) + +BASE_URL: Final = "https://www.numbeo.com/cost-of-living/in" +USER_AGENT: Final = ( + "fire-planner/0.1 (+https://forgejo.viktorbarzin.me/viktor/code; " + "non-commercial personal use; 1-year cache)" +) +DEFAULT_TIMEOUT: Final = httpx.Timeout(connect=10.0, read=30.0, write=10.0, pool=10.0) +MIN_REQUEST_INTERVAL: Final = 1.1 # seconds between requests — polite + +# Currency-to-GBP rates for common Numbeo source pages. Snapshot once at +# scraper init; refresh by editing this map (rare — within ±5% over a +# year). When a city's local currency isn't here, the scraper falls back +# to the EUR amount Numbeo always prints alongside (€-prefixed) — that +# requires only one rate (EUR_TO_GBP) which is universally present. +EUR_TO_GBP: Final = Decimal("0.862") +LOCAL_TO_GBP: Final[dict[str, Decimal]] = { + "EUR": EUR_TO_GBP, + "GBP": Decimal("1.0"), + "USD": Decimal("0.787"), + "BGN": Decimal("0.435"), + "RON": Decimal("0.173"), + "GEL": Decimal("0.295"), + "AED": Decimal("0.21505"), + "MYR": Decimal("0.171"), + "THB": Decimal("0.02198"), + "IDR": Decimal("0.0000485"), + "SGD": Decimal("0.585"), + "TWD": Decimal("0.0246"), + "VND": Decimal("0.0000316"), + "MXN": Decimal("0.0394"), + "COP": Decimal("0.000195"), + "PYG": Decimal("0.000099"), + "UYU": Decimal("0.0197"), + "PAB": Decimal("0.787"), # Panamanian Balboa pegged to USD + "QAR": Decimal("0.216"), # Qatari Riyal + "BHD": Decimal("2.09"), + "JPY": Decimal("0.00520"), + "KRW": Decimal("0.000565"), + "HKD": Decimal("0.101"), + "TRY": Decimal("0.0204"), # volatile — refresh more often + "RSD": Decimal("0.00737"), + "HRK": Decimal("0.114"), + "HUF": Decimal("0.00213"), + "CZK": Decimal("0.0345"), + "PLN": Decimal("0.196"), + "ALL": Decimal("0.00859"), +} + +# --- Regex patterns for the Numbeo page --- +# The "Estimated monthly costs for a single person" headline appears as: +# "Estimated monthly costs for a single person are €X.X" +# with the EUR figure always quoted (Numbeo's site currency is EUR). +_HEADLINE_EUR_RE = re.compile( + r"single\s+person[^<]*?(?:are|=)\s*€\s*([0-9,]+(?:\.[0-9]+)?)", + re.IGNORECASE, +) +# The rent rows look like: +# Apartment (1 bedroom) in City Centre...€2,317.19... +_RENT_CENTER_EUR_RE = re.compile( + r"Apartment\s*\(1\s*bedroom\)\s*in\s*City\s*Centre.*?€\s*([0-9,]+(?:\.[0-9]+)?)", + re.IGNORECASE | re.DOTALL, +) +_RENT_OUTSIDE_EUR_RE = re.compile( + r"Apartment\s*\(1\s*bedroom\)\s*Outside\s*of\s*Cent(?:re|er).*?€\s*([0-9,]+(?:\.[0-9]+)?)", + re.IGNORECASE | re.DOTALL, +) + + +class NumbeoFetchError(RuntimeError): + """Wraps any HTTP / parsing failure so the cache layer can fall back.""" + + +def _parse_num(s: str) -> Decimal: + return Decimal(s.replace(",", "")) + + +def _slug_to_url_segment(slug: str) -> str: + """`ho-chi-minh-city` → `Ho-Chi-Minh-City` (Numbeo capitalises words).""" + return "-".join(part.capitalize() for part in slug.split("-")) + + +class NumbeoScraper: + """Async Numbeo fetcher with per-instance polite rate-limiting. + + Use as a context manager so the httpx client is cleanly closed: + async with NumbeoScraper() as scraper: + idx = await scraper.fetch("sofia") + """ + + def __init__( + self, + *, + client: httpx.AsyncClient | None = None, + min_interval: float = MIN_REQUEST_INTERVAL, + ) -> None: + self._owns_client = client is None + self._client = client or httpx.AsyncClient( + headers={"User-Agent": USER_AGENT, "Accept-Language": "en-GB,en;q=0.9"}, + timeout=DEFAULT_TIMEOUT, + follow_redirects=True, + ) + self._min_interval = min_interval + self._last_request_at: float = 0.0 + self._lock = asyncio.Lock() + + async def __aenter__(self) -> NumbeoScraper: + return self + + async def __aexit__(self, *_: object) -> None: + if self._owns_client: + await self._client.aclose() + + async def _polite_wait(self) -> None: + async with self._lock: + now = asyncio.get_running_loop().time() + elapsed = now - self._last_request_at + if elapsed < self._min_interval: + await asyncio.sleep(self._min_interval - elapsed) + self._last_request_at = asyncio.get_running_loop().time() + + async def fetch( + self, + city_slug: str, + *, + country: str = "", + raw_currency: str = "EUR", + ) -> CityCostIndex: + """Scrape one city's headline numbers from Numbeo. + + Raises NumbeoFetchError on HTTP error, parse failure, or unknown + currency. The caller (cache layer) should catch and fall back to + baseline.py. + """ + url_segment = _slug_to_url_segment(city_slug) + url = f"{BASE_URL}/{url_segment}" + await self._polite_wait() + try: + resp = await self._client.get(url) + resp.raise_for_status() + except httpx.HTTPError as e: + raise NumbeoFetchError(f"HTTP error for {url}: {e}") from e + return self._parse(city_slug, country, raw_currency, url, resp.text) + + @staticmethod + def _parse( + city_slug: str, + country: str, + raw_currency: str, + url: str, + html: str, + ) -> CityCostIndex: + headline_match = _HEADLINE_EUR_RE.search(html) + rent_center_match = _RENT_CENTER_EUR_RE.search(html) + rent_outside_match = _RENT_OUTSIDE_EUR_RE.search(html) + if not (headline_match and rent_center_match): + raise NumbeoFetchError( + f"could not locate headline or rent rows on {url}" + ) + no_rent_eur = _parse_num(headline_match.group(1)) + rent_center_eur = _parse_num(rent_center_match.group(1)) + rent_outside_eur = ( + _parse_num(rent_outside_match.group(1)) if rent_outside_match else None + ) + no_rent_gbp = no_rent_eur * EUR_TO_GBP + rent_center_gbp = rent_center_eur * EUR_TO_GBP + rent_outside_gbp = ( + rent_outside_eur * EUR_TO_GBP if rent_outside_eur is not None else None + ) + with_rent_gbp = no_rent_gbp + rent_center_gbp + # `gbp_per_unit` reflects the conversion FROM the underlying + # local currency, not the EUR-side intermediate. When the page + # quotes a non-EUR local currency, downstream code may want the + # local→GBP rate for display; we record what we know. + gbp_per_unit = LOCAL_TO_GBP.get(raw_currency, EUR_TO_GBP) + return CityCostIndex( + city=_slug_to_url_segment(city_slug).replace("-", " "), + city_slug=city_slug, + country=country, + total_single_no_rent_gbp=no_rent_gbp.quantize(Decimal("0.01")), + total_single_with_rent_gbp=with_rent_gbp.quantize(Decimal("0.01")), + breakdown=CategoryBreakdown( + rent_1bed_center=rent_center_gbp.quantize(Decimal("0.01")), + rent_1bed_outside=(rent_outside_gbp.quantize(Decimal("0.01")) + if rent_outside_gbp is not None else None), + # Live scraper does not extract per-category — see module docstring. + groceries=Decimal("0"), + restaurants=Decimal("0"), + transport=Decimal("0"), + utilities=Decimal("0"), + leisure=Decimal("0"), + ), + source=ColSource( + name="numbeo", + url=url, + snapshot_date=date.today(), + raw_currency=raw_currency, + gbp_per_unit=gbp_per_unit, + ), + ) + + +def compute_expires_at(ttl_days: int = 365) -> datetime: + """One-place TTL helper so the cache + service stay in sync.""" + return datetime.now(UTC) + timedelta(days=ttl_days) diff --git a/fire_planner/col/service.py b/fire_planner/col/service.py new file mode 100644 index 0000000..175b73d --- /dev/null +++ b/fire_planner/col/service.py @@ -0,0 +1,170 @@ +"""COL service — lookup + ratio computation + async cache+scrape orchestration. + +Sync path (Phase 1 — used by simulator's `_resolve_col_adjustment`): + compute_col_ratio(baseline, target) → in-process BASELINES lookup. + Fast, no DB roundtrip, no I/O. + +Async path (Phase 2 — used by refresh CronJob and on-demand fetch): + lookup_city_cached(slug, sess) → cache → scrape → upsert. + Reconciles Numbeo (primary) + Expatistan (secondary) into a single + CityCostIndex per city. Cache TTL 1 year. + +The simulator deliberately stays on the sync path: it needs sub-ms +latency per request and doesn't tolerate transient scraper failures. +The async path keeps the cache fresh in the background. +""" +from __future__ import annotations + +import logging +from decimal import Decimal + +from sqlalchemy.ext.asyncio import AsyncSession + +from fire_planner.col import cache as col_cache +from fire_planner.col.baseline import BASELINES +from fire_planner.col.expatistan import ExpatistanFetchError, ExpatistanScraper +from fire_planner.col.models import CityCostIndex +from fire_planner.col.numbeo import NumbeoFetchError, NumbeoScraper + +log = logging.getLogger(__name__) + +# Each jurisdiction has a single canonical city we anchor on. Picked +# to match where most users would live (capital or main expat hub) — +# Cyprus → Limassol (the largest expat city), not Nicosia (capital); +# UAE → Dubai (the expat economy), not Abu Dhabi. +JURISDICTION_REPRESENTATIVE_CITY: dict[str, str] = { + "uk": "london", + "cyprus": "limassol", + "bulgaria": "sofia", + "uae": "dubai", + "malaysia": "kuala-lumpur", + "thailand": "bangkok", + # "nomad" is intentionally absent — nomad mode is COL-invariant + # because the user is on the road. The caller should skip auto-adjust + # when jurisdiction='nomad' and provide a manual spending_gbp. +} + + +def lookup_city(city_slug: str) -> CityCostIndex: + """Return the cached CityCostIndex for `city_slug`. + + Raises `KeyError` for unknown cities — the caller decides whether to + fall back to baseline or raise to the user. + """ + normalised = city_slug.strip().lower().replace(" ", "-") + try: + return BASELINES[normalised] + except KeyError as e: + raise KeyError( + f"No COL baseline for city {city_slug!r}; available: " + f"{sorted(BASELINES)}" + ) from e + + +def compute_col_ratio(baseline_city: str, target_city: str) -> Decimal: + """Ratio `target_total / baseline_total` — the multiplier to apply + to a spending figure denominated in `baseline_city` to convert it + to local prices in `target_city`. + + Identity case (same city) returns exactly `Decimal("1")`. + + Both anchors use the "single person, total with rent" headline — + rent is the largest single category and varies most across cities, + so excluding it would understate the actual spread. + """ + if baseline_city == target_city: + return Decimal("1") + baseline = lookup_city(baseline_city) + target = lookup_city(target_city) + return target.total_monthly_gbp / baseline.total_monthly_gbp + + +def representative_city_for(jurisdiction: str) -> str | None: + """Return the canonical city for a jurisdiction, or None for 'nomad' + / unknown jurisdictions where auto-adjust should be skipped.""" + return JURISDICTION_REPRESENTATIVE_CITY.get(jurisdiction) + + +# Source-precedence weight when reconciling multiple snapshots — higher +# beats lower. Numbeo has the largest contributor base; Expatistan is +# a fast-decay cross-check; baseline is the hand-curated fallback. +_SOURCE_WEIGHT: dict[str, int] = {"numbeo": 3, "expatistan": 2, "baseline": 1} + + +def reconcile_sources(rows: list[CityCostIndex]) -> CityCostIndex | None: + """Pick the canonical CityCostIndex from multiple per-source rows. + + Today's policy: pick the row with the highest source weight. When + weights tie, prefer the most-recent `snapshot_date`. The simulator + is cross-checked against the alternates' headline numbers — when + they diverge >25%, the cache layer logs a warning so we can + audit Numbeo/Expatistan drift over time. + """ + if not rows: + return None + sorted_rows = sorted( + rows, + key=lambda r: (_SOURCE_WEIGHT.get(r.source.name, 0), r.source.snapshot_date), + reverse=True, + ) + chosen = sorted_rows[0] + if len(sorted_rows) > 1: + primary_total = chosen.total_single_with_rent_gbp + for alt in sorted_rows[1:]: + divergence = abs(alt.total_single_with_rent_gbp - primary_total) / primary_total + if divergence > Decimal("0.25"): + log.warning( + "col reconcile %s: %s=%s diverges >%s%% from %s=%s", + chosen.city_slug, + alt.source.name, + alt.total_single_with_rent_gbp, + int(divergence * 100), + chosen.source.name, + primary_total, + ) + return chosen + + +async def lookup_city_cached( + sess: AsyncSession, + city_slug: str, + *, + country: str = "", +) -> CityCostIndex: + """Cache → scrape → fallback. Async; used by refresh CronJob and any + future on-demand fetch path. + + Returns a CityCostIndex regardless of failure modes — falls back to + baseline.BASELINES on scraper failure rather than raising. The only + way this raises is if the city has no baseline AND every scraper + fails (KeyError). + """ + cached = await col_cache.read_fresh(sess, city_slug) + if cached is not None: + return cached + # Cache miss or expired — try live sources. + fetched: list[CityCostIndex] = [] + try: + async with NumbeoScraper() as scraper: + fetched.append(await scraper.fetch(city_slug, country=country)) + except NumbeoFetchError as e: + log.warning("numbeo fetch failed for %s: %s", city_slug, e) + try: + async with ExpatistanScraper() as scraper: + fetched.append(await scraper.fetch(city_slug, country=country)) + except ExpatistanFetchError as e: + log.warning("expatistan fetch failed for %s: %s", city_slug, e) + chosen = reconcile_sources(fetched) + if chosen is not None: + for row in fetched: + await col_cache.upsert(sess, row) + return chosen + # Both scrapers failed — fall back to in-process baseline. + if city_slug in BASELINES: + baseline = BASELINES[city_slug] + await col_cache.upsert(sess, baseline) + return baseline + raise KeyError( + f"COL lookup failed for {city_slug!r}: cache empty, scrapers failed, " + f"no baseline" + ) diff --git a/fire_planner/db.py b/fire_planner/db.py index fcdec5b..8cdc079 100644 --- a/fire_planner/db.py +++ b/fire_planner/db.py @@ -244,6 +244,50 @@ class IncomeStream(Base): server_default=func.now()) +class ColSnapshot(Base): + """Cached cost-of-living snapshot per (city_slug, source). + + Phase 2 of the COL subsystem. Replaces the previous "baseline-only" + lookup with cache-then-scrape semantics: + + service.lookup_city(slug) → check ColSnapshot, return if fresh + → else scrape Numbeo, upsert, return + → if scrape fails, fall back to baseline.py + + TTL default = 365 days (`expires_at = fetched_at + interval '365 day'`). + The user explicitly asked for 1y on 2026-05-21 — Numbeo data doesn't + move fast enough to need monthly refresh, and the API/scraper has rate- + limit risk we prefer to amortise. Phase-3 CronJob will run a nightly + refresh of stale rows so individual user requests never have to scrape. + + `(city_slug, source_name)` is unique — we can store multiple sources + per city (Numbeo + Expatistan) and reconcile in service.py. + """ + __tablename__ = "col_snapshot" + __table_args__ = {"schema": SCHEMA_NAME} # noqa: RUF012 + + id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) + city_slug: Mapped[str] = mapped_column(String(64), nullable=False, index=True) + city_display: Mapped[str] = mapped_column(String(128), nullable=False) + country: Mapped[str] = mapped_column(String(64), nullable=False) + source_name: Mapped[str] = mapped_column(String(32), nullable=False) + source_url: Mapped[str | None] = mapped_column(String, nullable=True) + snapshot_date: Mapped[date] = mapped_column(Date, nullable=False) + fetched_at: Mapped[datetime] = mapped_column(TIMESTAMP(timezone=True), + nullable=False, + server_default=func.now()) + expires_at: Mapped[datetime] = mapped_column(TIMESTAMP(timezone=True), nullable=False) + total_no_rent_gbp: Mapped[Decimal] = mapped_column(Numeric(12, 2), nullable=False) + total_with_rent_gbp: Mapped[Decimal] = mapped_column(Numeric(12, 2), nullable=False) + rent_1bed_center_gbp: Mapped[Decimal] = mapped_column(Numeric(12, 2), nullable=False) + rent_1bed_outside_gbp: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True) + raw_currency: Mapped[str] = mapped_column(String(3), nullable=False, server_default="GBP") + gbp_per_unit: Mapped[Decimal] = mapped_column(Numeric(12, 8), + nullable=False, + server_default=text("1")) + by_category_json: Mapped[dict[str, Any] | None] = mapped_column(JSON_TYPE, nullable=True) + + class RetirementGoal(Base): """A user-defined success criterion for a scenario. diff --git a/tests/test_col.py b/tests/test_col.py new file mode 100644 index 0000000..8f5369c --- /dev/null +++ b/tests/test_col.py @@ -0,0 +1,104 @@ +"""Tests for the COL module — baseline lookup + ratio + simulator wiring.""" +from __future__ import annotations + +from decimal import Decimal + +import pytest + +from fire_planner.col import ( + JURISDICTION_REPRESENTATIVE_CITY, + compute_col_ratio, + lookup_city, + representative_city_for, +) +from fire_planner.col.baseline import BASELINES +from fire_planner.col.models import CityCostIndex + + +class TestBaselineCoverage: + """Every jurisdiction with a representative city must have a baseline.""" + + def test_all_representative_cities_have_baselines(self) -> None: + missing = [ + city for city in JURISDICTION_REPRESENTATIVE_CITY.values() if city not in BASELINES + ] + assert missing == [], ( + f"jurisdiction map points at city(s) without baselines: {missing}" + ) + + def test_baselines_have_positive_totals(self) -> None: + for slug, idx in BASELINES.items(): + assert idx.total_single_no_rent_gbp > 0, f"{slug} no_rent must be positive" + assert idx.total_single_with_rent_gbp > idx.total_single_no_rent_gbp, ( + f"{slug} with_rent must exceed no_rent — rent should be a positive add" + ) + + def test_baseline_source_provenance_present(self) -> None: + for slug, idx in BASELINES.items(): + assert idx.source.name in {"numbeo", "expatistan", "baseline", "manual"} + assert idx.source.url is not None, f"{slug} baseline missing source URL" + assert idx.source.url.startswith("https://"), f"{slug} URL must be https" + + +class TestLookup: + def test_lookup_known_city(self) -> None: + london = lookup_city("london") + assert isinstance(london, CityCostIndex) + assert london.city == "London" + assert london.country == "United Kingdom" + + def test_lookup_normalises_input(self) -> None: + # mixed case, spaces → slug + assert lookup_city("Kuala Lumpur").city == "Kuala Lumpur" + assert lookup_city(" Bangkok ").city == "Bangkok" + + def test_lookup_unknown_raises(self) -> None: + with pytest.raises(KeyError, match="No COL baseline"): + lookup_city("atlantis") + + +class TestColRatio: + def test_identity_returns_one(self) -> None: + assert compute_col_ratio("london", "london") == Decimal("1") + + def test_sofia_cheaper_than_london(self) -> None: + ratio = compute_col_ratio("london", "sofia") + assert ratio < Decimal("1"), "Sofia must be cheaper than London" + assert ratio > Decimal("0.2"), "Sofia ratio looks implausibly low" + # Real Numbeo number is ~0.41 + assert Decimal("0.35") < ratio < Decimal("0.50") + + def test_dubai_cheaper_than_london(self) -> None: + # Dubai is *cheaper* than London on Numbeo's headline because + # London rent dominates. This was a surprise — flag it in the + # baseline note for future-us. + ratio = compute_col_ratio("london", "dubai") + assert ratio < Decimal("1") + assert Decimal("0.70") < ratio < Decimal("0.95") + + def test_bangkok_far_cheaper_than_london(self) -> None: + ratio = compute_col_ratio("london", "bangkok") + assert ratio < Decimal("0.40") + + def test_inverse_consistency(self) -> None: + # If london→sofia is X, sofia→london should be ~1/X within rounding. + l2s = compute_col_ratio("london", "sofia") + s2l = compute_col_ratio("sofia", "london") + assert abs(l2s * s2l - Decimal("1")) < Decimal("0.001") + + +class TestRepresentativeCity: + def test_known_jurisdictions(self) -> None: + assert representative_city_for("uk") == "london" + assert representative_city_for("cyprus") == "limassol" + assert representative_city_for("bulgaria") == "sofia" + assert representative_city_for("uae") == "dubai" + assert representative_city_for("malaysia") == "kuala-lumpur" + assert representative_city_for("thailand") == "bangkok" + + def test_nomad_returns_none(self) -> None: + # Nomad mode is COL-invariant by design — auto-adjust skipped. + assert representative_city_for("nomad") is None + + def test_unknown_returns_none(self) -> None: + assert representative_city_for("vulcan") is None diff --git a/tests/test_simulator_col_integration.py b/tests/test_simulator_col_integration.py new file mode 100644 index 0000000..0a6c561 --- /dev/null +++ b/tests/test_simulator_col_integration.py @@ -0,0 +1,91 @@ +"""Simulator + COL integration — verifies `_resolve_col_adjustment` is +applied to the request before paths are built and surfaced in the result. + +These tests bypass HTTP and call the resolver directly to keep them fast. +""" +from __future__ import annotations + +from decimal import Decimal + +from fire_planner.api.schemas import SimulateRequest +from fire_planner.api.simulate import _resolve_col_adjustment + + +def _req(**overrides: object) -> SimulateRequest: + base = dict( + jurisdiction="uk", + strategy="trinity", + leave_uk_year=0, + spending_gbp=Decimal("85000"), + nw_seed_gbp=Decimal("1050000"), + horizon_years=73, + ) + base.update(overrides) + return SimulateRequest(**base) # type: ignore[arg-type] + + +def test_col_default_on_for_known_jurisdiction() -> None: + """Default config + cyprus jurisdiction → multiplier ~0.67.""" + req = _req(jurisdiction="cyprus", leave_uk_year=2) + adj, mult, adj_spend, city = _resolve_col_adjustment(req) + assert mult is not None and Decimal("0.55") < mult < Decimal("0.75") + assert city == "limassol" + assert adj_spend is not None and adj_spend < Decimal("85000") + assert adj.spending_gbp == adj_spend # the simulator runs on the adjusted figure + + +def test_col_off_returns_unchanged_request() -> None: + req = _req(jurisdiction="cyprus", leave_uk_year=2, col_auto_adjust=False) + adj, mult, adj_spend, city = _resolve_col_adjustment(req) + assert mult is None + assert adj_spend is None + assert city is None + assert adj.spending_gbp == Decimal("85000") + # The returned request is the same instance — no copy when no-op. + assert adj is req + + +def test_col_nomad_jurisdiction_skipped() -> None: + """Nomad has no representative city — auto-adjust should silently skip.""" + req = _req(jurisdiction="nomad", leave_uk_year=2) + adj, mult, adj_spend, city = _resolve_col_adjustment(req) + assert mult is None + assert adj_spend is None + assert city is None # no representative city for nomad + + +def test_col_uk_to_uk_identity_returns_no_multiplier() -> None: + """UK staying in UK is identity — surface the city but no scaling.""" + req = _req(jurisdiction="uk", leave_uk_year=0) + adj, mult, adj_spend, city = _resolve_col_adjustment(req) + assert mult is None + assert adj_spend is None + assert city == "london" + assert adj.spending_gbp == Decimal("85000") + + +def test_col_explicit_target_city_overrides_jurisdiction_default() -> None: + """User picks Sofia explicitly even though jurisdiction is cyprus.""" + req = _req(jurisdiction="cyprus", leave_uk_year=2, col_target_city="sofia") + adj, mult, adj_spend, city = _resolve_col_adjustment(req) + assert city == "sofia" + # sofia ratio ~0.41 — should be smaller than the limassol default + assert mult is not None and mult < Decimal("0.50") + + +def test_col_unknown_city_degrades_gracefully() -> None: + """Unknown city → skip, do not raise — Phase-2 scraper will close gap.""" + req = _req(jurisdiction="cyprus", leave_uk_year=2, col_target_city="atlantis") + adj, mult, adj_spend, city = _resolve_col_adjustment(req) + assert mult is None + assert adj_spend is None + assert city == "atlantis" # the requested name is still echoed + assert adj.spending_gbp == Decimal("85000") + + +def test_col_bangkok_dramatic_discount() -> None: + req = _req(jurisdiction="thailand", leave_uk_year=2) + adj, mult, adj_spend, city = _resolve_col_adjustment(req) + assert city == "bangkok" + assert mult is not None and mult < Decimal("0.35") + assert adj_spend is not None and adj_spend < Decimal("30000") # £85k → ~£24k