col: simulator auto-adjusts spending to local prices via Numbeo+Expatistan
The Monte Carlo used to compare jurisdictions at a flat London-equivalent spend, which silently overstated the cost-of-living for any move to a cheaper region. Now every cross-jurisdiction simulation auto-scales spending_gbp by the real Numbeo/Expatistan ratio between the user's baseline city and the target city. Architecture: - fire_planner/col/baseline.py — 22 cities with headline Numbeo data (source URLs + snapshot dates embedded) — fallback when scraper fails - col/numbeo.py + col/expatistan.py — httpx async scrapers, regex-parsed, polite 1.1s rate-limit, EUR/USD anchored - col/cache.py — PG-backed cache (col_snapshot table, 1-year TTL) - col/service.py — sync compute_col_ratio() for the simulator; async lookup_city_cached() with source reconciliation for the refresh CronJob - alembic 0005 — col_snapshot table, UNIQUE(city_slug, source_name) Simulator wiring: - SimulateRequest gains col_auto_adjust=True (default), col_baseline_city, col_target_city. Defaults pick the jurisdiction's representative city. - _resolve_col_adjustment scales spending_gbp before path-building. - SimulateResult surfaces col_multiplier_applied + col_adjusted_spending_gbp. CLIs: - python -m fire_planner col-seed — loads BASELINES into col_snapshot (post-migration seed step) - python -m fire_planner col-refresh-stale --within-days 7 — used by the weekly fire-planner-col-refresh CronJob 268 tests pass. Mypy strict + ruff clean. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
70101c836c
commit
e72fd22a17
14 changed files with 1641 additions and 6 deletions
73
alembic/versions/0005_col_snapshot.py
Normal file
73
alembic/versions/0005_col_snapshot.py
Normal file
|
|
@ -0,0 +1,73 @@
|
|||
"""add col_snapshot table for cached cost-of-living data
|
||||
|
||||
Revision ID: 0005
|
||||
Revises: 0004
|
||||
Create Date: 2026-05-21 12:00:00.000000
|
||||
|
||||
Phase 2 of the cost-of-living subsystem (`fire_planner.col`). Caches
|
||||
Numbeo / Expatistan headline data with a 1-year TTL so the simulator
|
||||
can scale `spending_gbp` to local prices without re-scraping per-call.
|
||||
Refresh is async (Phase-3 CronJob); user-facing lookups never block on
|
||||
the network in the steady state.
|
||||
|
||||
Unique on (city_slug, source_name) — multiple sources per city are
|
||||
allowed; service.py reconciles them when computing the headline.
|
||||
"""
|
||||
from collections.abc import Sequence
|
||||
|
||||
import sqlalchemy as sa
|
||||
|
||||
from alembic import op
|
||||
|
||||
revision: str = "0005"
|
||||
down_revision: str | None = "0004"
|
||||
branch_labels: str | Sequence[str] | None = None
|
||||
depends_on: str | Sequence[str] | None = None
|
||||
|
||||
SCHEMA = "fire_planner"
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Create the `col_snapshot` table plus its two single-column indexes.

    The table holds one row per (city_slug, source_name) pair — enforced by
    the `uq_col_snapshot_city_source` unique constraint — with GBP headline
    totals, rent figures, the raw currency / conversion rate and an
    `expires_at` timestamp. Everything is created inside `SCHEMA`.
    """
    table_name = "col_snapshot"

    columns = [
        sa.Column("id", sa.Integer(), nullable=False, autoincrement=True),
        sa.Column("city_slug", sa.String(length=64), nullable=False),
        sa.Column("city_display", sa.String(length=128), nullable=False),
        sa.Column("country", sa.String(length=64), nullable=False),
        sa.Column("source_name", sa.String(length=32), nullable=False),
        sa.Column("source_url", sa.String(), nullable=True),
        sa.Column("snapshot_date", sa.Date(), nullable=False),
        # fetched_at is filled in by the database when the writer omits it.
        sa.Column("fetched_at", sa.TIMESTAMP(timezone=True), nullable=False,
                  server_default=sa.func.now()),
        sa.Column("expires_at", sa.TIMESTAMP(timezone=True), nullable=False),
        sa.Column("total_no_rent_gbp", sa.Numeric(12, 2), nullable=False),
        sa.Column("total_with_rent_gbp", sa.Numeric(12, 2), nullable=False),
        sa.Column("rent_1bed_center_gbp", sa.Numeric(12, 2), nullable=False),
        sa.Column("rent_1bed_outside_gbp", sa.Numeric(12, 2), nullable=True),
        # Currency defaults describe a row that is already denominated in GBP.
        sa.Column("raw_currency", sa.String(length=3), nullable=False,
                  server_default=sa.text("'GBP'")),
        sa.Column("gbp_per_unit", sa.Numeric(12, 8), nullable=False,
                  server_default=sa.text("1")),
        sa.Column("by_category_json", sa.JSON(), nullable=True),
    ]
    constraints = [
        sa.PrimaryKeyConstraint("id"),
        sa.UniqueConstraint("city_slug", "source_name", name="uq_col_snapshot_city_source"),
    ]
    op.create_table(table_name, *columns, *constraints, schema=SCHEMA)

    # One plain index per lookup column: by city, and by expiry.
    for index_name, indexed_column in (
        ("ix_col_snapshot_city_slug", "city_slug"),
        ("ix_col_snapshot_expires_at", "expires_at"),
    ):
        op.create_index(index_name, table_name, [indexed_column], schema=SCHEMA)
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Undo `upgrade()`: drop both indexes, then the `col_snapshot` table."""
    # Indexes go first, in the reverse of their creation order.
    for index_name in ("ix_col_snapshot_expires_at", "ix_col_snapshot_city_slug"):
        op.drop_index(index_name, table_name="col_snapshot", schema=SCHEMA)
    op.drop_table("col_snapshot", schema=SCHEMA)
|
||||
|
|
@ -57,6 +57,103 @@ def migrate() -> None:
|
|||
sys.exit(rc.returncode)
|
||||
|
||||
|
||||
@cli.command("col-seed")
@click.option("--ttl-days",
              # Reject 0 / negative values at parse time. NOTE(review): a
              # non-positive TTL presumably yields an `expires_at` that is
              # not in the future, i.e. rows that are stale as soon as they
              # are written — confirm against `col.cache.upsert`.
              type=click.IntRange(min=1),
              default=365,
              help="Cache TTL in days (default 365 — matches Viktor's 1y choice).")
def col_seed(ttl_days: int) -> None:
    """Seed `col_snapshot` from baseline.py BASELINES.

    Idempotent — uses upsert on (city_slug, source_name). Run once after
    the alembic migration creates the table. Subsequent live-scrape
    refreshes (Phase 3 CronJob) supersede these rows; the baseline
    fallback remains as a last-resort source.
    """
    # Click commands are synchronous; the real work lives in the async helper.
    asyncio.run(_col_seed(ttl_days))
|
||||
|
||||
|
||||
async def _col_seed(ttl_days: int) -> None:
    """Upsert every `BASELINES` entry into the cache under source name `baseline`.

    Opens one engine and one session, writes each city through
    `col.cache.upsert` with the given `ttl_days`, echoes a line per city,
    and always disposes the engine. A summary line is printed afterwards.
    """
    from fire_planner.col.baseline import BASELINES
    from fire_planner.col.cache import upsert as col_upsert

    db_engine = create_engine_from_env()
    open_session = make_session_factory(db_engine)
    try:
        async with open_session() as session:
            for city_slug, baseline_index in BASELINES.items():
                # Relabel the source as `baseline` (the curated entries carry
                # `numbeo`) so that a later live scrape stored with
                # source_name='numbeo' does not collide with the seed row on
                # the (city_slug, source_name) unique constraint.
                relabelled_source = baseline_index.source.model_copy(update={"name": "baseline"})
                seed_entry = baseline_index.model_copy(update={"source": relabelled_source})
                await col_upsert(session, seed_entry, ttl_days=ttl_days)
                click.echo(
                    f" seeded {city_slug:20s} total={baseline_index.total_single_with_rent_gbp} GBP"
                )
    finally:
        await db_engine.dispose()
    click.echo(f"\ncol-seed: {len(BASELINES)} cities upserted (ttl_days={ttl_days}).")
|
||||
|
||||
|
||||
@cli.command("col-refresh-stale")
@click.option("--within-days",
              type=int,
              default=7,
              help="Refresh rows whose expires_at is within this many days.")
@click.option("--ttl-days",
              # Reject 0 / negative values at parse time. NOTE(review): a
              # non-positive TTL presumably produces rows that are already
              # expired when written — confirm against the cache writer.
              type=click.IntRange(min=1),
              default=365,
              help="TTL for re-written rows (default 365).")
def col_refresh_stale(within_days: int, ttl_days: int) -> None:
    """Re-scrape COL rows that are within `within_days` of expiry.

    Designed for the weekly CronJob. Walks every distinct city_slug in
    `col_snapshot` whose newest row will expire within the window,
    calls Numbeo+Expatistan via `service.lookup_city_cached`, which
    upserts the result. Idempotent — no-op for fresh rows.
    """
    # Click commands are synchronous; the real work lives in the async helper.
    asyncio.run(_col_refresh_stale(within_days, ttl_days))
|
||||
|
||||
|
||||
async def _col_refresh_stale(within_days: int, ttl_days: int) -> None:
    """Refresh every cached city whose freshest row expires within the window.

    Args:
        within_days: Size of the look-ahead window in days. A city is
            selected when the latest `expires_at` across all of its rows is
            at or before "now + within_days".
        ttl_days: Only echoed in the summary line.
            NOTE(review): this value is never forwarded to
            `lookup_city_cached`, so `--ttl-days` currently has no effect on
            the rows that get written — confirm whether it should be passed.

    Each selected city is handed to `service.lookup_city_cached`; a failure
    for one city is reported on stderr and counted, and the loop continues.
    The engine is always disposed before the summary line is printed.
    """
    from datetime import UTC, datetime, timedelta

    from sqlalchemy import func, select

    from fire_planner.col.service import lookup_city_cached
    from fire_planner.db import ColSnapshot

    engine = create_engine_from_env()
    factory = make_session_factory(engine)
    # Computed client-side and sent as a bound parameter instead of splicing
    # an INTERVAL literal into raw SQL text.
    cutoff = datetime.now(UTC) + timedelta(days=int(within_days))
    refreshed = 0
    failed = 0
    try:
        async with factory() as sess:
            # One row per city, kept only when even its freshest row (latest
            # expires_at across all sources) falls inside the window. A plain
            # WHERE on expires_at would also pick cities that still have a
            # fresh row from another source.
            freshest_expiry = func.max(ColSnapshot.expires_at)
            stmt = (
                select(ColSnapshot.city_slug, func.max(ColSnapshot.country))
                .group_by(ColSnapshot.city_slug)
                .having(freshest_expiry <= cutoff)
            )
            rows = (await sess.execute(stmt)).all()
            click.echo(f"col-refresh-stale: {len(rows)} city(ies) need refresh "
                       f"(within_days={within_days})")
            for slug, country in rows:
                try:
                    # lookup_city_cached upserts on cache miss, which is
                    # what "stale" means here — read_fresh returns None.
                    # NOTE(review): a row that is inside the window but not
                    # yet expired may be served straight from the cache and
                    # not re-scraped — verify against service.py.
                    idx = await lookup_city_cached(sess, slug, country=country or "")
                    click.echo(f" refreshed {slug:20s} → {idx.source.name:10s} "
                               f"total={idx.total_single_with_rent_gbp}")
                    refreshed += 1
                except Exception as e:  # broad — log and continue per-city
                    # Reset the shared session so a failed statement for this
                    # city cannot leave the transaction unusable for the next.
                    # Nothing is committed in this function, so this discards
                    # only work that had not been committed by the callee.
                    await sess.rollback()
                    click.echo(f" FAILED {slug}: {e}", err=True)
                    failed += 1
    finally:
        await engine.dispose()
    click.echo(f"\ncol-refresh-stale done: refreshed={refreshed} failed={failed} "
               f"ttl_days={ttl_days}")
|
||||
|
||||
|
||||
@cli.command("ingest")
|
||||
@click.option("--source",
|
||||
type=click.Choice(["wealthfolio"]),
|
||||
|
|
|
|||
|
|
@ -504,6 +504,22 @@ class SimulateRequest(BaseModel):
|
|||
annual_real_adjust_pct: Decimal = Decimal("0")
|
||||
guardrail_threshold_pct: Decimal | None = None
|
||||
guardrail_cut_pct: Decimal = Decimal("0.10")
|
||||
# Cost-of-living auto-adjust: when `col_auto_adjust=True`, the
|
||||
# simulator looks up COL ratio (target/baseline) from `fire_planner.col`
|
||||
# and scales `spending_gbp` BEFORE running paths. Defaults to True so
|
||||
# cross-jurisdiction comparisons are honest by default — earlier
|
||||
# comparisons used hand-wave 0.5x/0.75x multipliers, which were
|
||||
# consistently too high vs. actual Numbeo data (Bulgaria is 0.41x,
|
||||
# not 0.50x; Cyprus 0.67x, not 0.75x).
|
||||
#
|
||||
# `col_target_city` defaults to the jurisdiction's representative
|
||||
# city (uk→london, cyprus→limassol, etc.). Set explicitly to anchor
|
||||
# on a different city (e.g. `cyprus`+`paphos` if Limassol is too
|
||||
# expensive a proxy). For `jurisdiction='nomad'` there is no
|
||||
# representative city and auto-adjust is skipped silently.
|
||||
col_auto_adjust: bool = True
|
||||
col_baseline_city: str = "london"
|
||||
col_target_city: str | None = None
|
||||
|
||||
|
||||
class SimulateResult(BaseModel):
|
||||
|
|
@ -516,6 +532,13 @@ class SimulateResult(BaseModel):
|
|||
elapsed_seconds: Decimal
|
||||
yearly: list[ProjectionPoint]
|
||||
goals_probability: list[GoalProbability] = Field(default_factory=list)
|
||||
# When `col_auto_adjust=True`, surface the applied multiplier + the
|
||||
# COL-adjusted spending so the user can see what was used. Null when
|
||||
# auto-adjust was off, jurisdiction had no representative city
|
||||
# (nomad), or baseline==target (London-to-London).
|
||||
col_multiplier_applied: Decimal | None = None
|
||||
col_adjusted_spending_gbp: Decimal | None = None
|
||||
col_target_city: str | None = None
|
||||
|
||||
|
||||
class CompareRequest(BaseModel):
|
||||
|
|
|
|||
|
|
@ -26,6 +26,7 @@ from fire_planner.api.schemas import (
|
|||
SimulateRequest,
|
||||
SimulateResult,
|
||||
)
|
||||
from fire_planner.col import compute_col_ratio, representative_city_for
|
||||
from fire_planner.flex_spending import FlexRule as EngineFlexRule
|
||||
from fire_planner.glide_path import static
|
||||
from fire_planner.goals_eval import evaluate_goals
|
||||
|
|
@ -50,6 +51,36 @@ router = APIRouter(tags=["simulate"])
|
|||
_RETURNS_CSV = Path("/data/shiller_returns.csv")
|
||||
|
||||
|
||||
def _resolve_col_adjustment(
    req: SimulateRequest,
) -> tuple[SimulateRequest, Decimal | None, Decimal | None, str | None]:
    """Scale `req.spending_gbp` by the target/baseline cost-of-living ratio.

    Returns a 4-tuple: the request to simulate (a copy with scaled spending,
    or `req` itself when nothing was applied), the multiplier, the scaled
    spending, and the resolved target city slug. The multiplier and scaled
    spending are None whenever no adjustment was applied.

    No adjustment is applied — and no error is raised — when:
      - `col_auto_adjust` is False (target city is reported as None too)
      - no target city is given and `representative_city_for` yields None
        for the jurisdiction (target city is reported as None too)
      - the resolved target equals `col_baseline_city`
      - `compute_col_ratio` raises KeyError for the city pair
    """
    if not req.col_auto_adjust:
        return req, None, None, None

    # An explicit target wins; otherwise fall back to the jurisdiction's city.
    target_city = req.col_target_city or representative_city_for(req.jurisdiction)
    if target_city is None:
        return req, None, None, None

    baseline_city = req.col_baseline_city
    if baseline_city == target_city:
        # Same city on both sides: leave spending as-is but still report it.
        return req, None, None, target_city

    try:
        multiplier = compute_col_ratio(baseline_city, target_city)
    except KeyError:
        # Degrade to "no adjustment" instead of surfacing an error.
        return req, None, None, target_city

    scaled_spend = req.spending_gbp * multiplier
    scaled_req = req.model_copy(update={"spending_gbp": scaled_spend})
    return scaled_req, multiplier, scaled_spend, target_city
|
||||
|
||||
|
||||
def _shiller_paths(seed: int, n_paths: int, n_years: int) -> np.ndarray:
|
||||
bundle = (load_from_csv(_RETURNS_CSV) if _RETURNS_CSV.exists() else synthetic_returns(seed=42))
|
||||
rng = np.random.default_rng(seed)
|
||||
|
|
@ -193,6 +224,9 @@ def _to_response(
|
|||
result: SimulationResult,
|
||||
elapsed: float,
|
||||
req: SimulateRequest | None = None,
|
||||
col_multiplier: Decimal | None = None,
|
||||
col_adjusted_spend: Decimal | None = None,
|
||||
col_target_city: str | None = None,
|
||||
) -> SimulateResult:
|
||||
# portfolio_real has n_years+1 columns (year 0 = seed, year k = end-of-year k).
|
||||
# withdrawal_real / tax_real have n_years columns (year k = withdrawn in year k+1).
|
||||
|
|
@ -243,27 +277,34 @@ def _to_response(
|
|||
elapsed_seconds=Decimal(str(round(elapsed, 3))),
|
||||
yearly=yearly,
|
||||
goals_probability=goals_probability,
|
||||
col_multiplier_applied=(Decimal(str(round(float(col_multiplier), 6)))
|
||||
if col_multiplier is not None else None),
|
||||
col_adjusted_spending_gbp=(Decimal(str(round(float(col_adjusted_spend), 2)))
|
||||
if col_adjusted_spend is not None else None),
|
||||
col_target_city=col_target_city,
|
||||
)
|
||||
|
||||
|
||||
@router.post("/simulate", response_model=SimulateResult)
|
||||
async def simulate_one(req: SimulateRequest) -> SimulateResult:
|
||||
"""Run one scenario synchronously, no DB write. ~1-3s for 5k paths."""
|
||||
paths = await _build_paths(req)
|
||||
adjusted_req, mult, adj_spend, target_city = _resolve_col_adjustment(req)
|
||||
paths = await _build_paths(adjusted_req)
|
||||
try:
|
||||
result, elapsed = await asyncio.to_thread(_project, req, paths)
|
||||
result, elapsed = await asyncio.to_thread(_project, adjusted_req, paths)
|
||||
except KeyError as e:
|
||||
raise HTTPException(status_code=400, detail=f"Unknown name: {e}") from None
|
||||
return _to_response(result, elapsed, req)
|
||||
return _to_response(result, elapsed, adjusted_req, mult, adj_spend, target_city)
|
||||
|
||||
|
||||
@router.post("/compare", response_model=CompareResult)
|
||||
async def compare_scenarios(req: CompareRequest) -> CompareResult:
|
||||
"""Run 2-5 scenarios in parallel, return all results."""
|
||||
async def one(s: SimulateRequest) -> SimulateResult:
|
||||
paths = await _build_paths(s)
|
||||
result, elapsed = await asyncio.to_thread(_project, s, paths)
|
||||
return _to_response(result, elapsed, s)
|
||||
adjusted_s, mult, adj_spend, target_city = _resolve_col_adjustment(s)
|
||||
paths = await _build_paths(adjusted_s)
|
||||
result, elapsed = await asyncio.to_thread(_project, adjusted_s, paths)
|
||||
return _to_response(result, elapsed, adjusted_s, mult, adj_spend, target_city)
|
||||
|
||||
try:
|
||||
results = await asyncio.gather(*(one(s) for s in req.scenarios))
|
||||
|
|
|
|||
36
fire_planner/col/__init__.py
Normal file
36
fire_planner/col/__init__.py
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
"""Cost-of-living module — feeds the simulator with real per-city spend ratios.
|
||||
|
||||
The simulator's `spending_gbp` is denominated in the user's BASELINE city
|
||||
(typically London). When a scenario moves the user to a different TARGET
|
||||
city, this module returns the ratio `target_total / baseline_total` so
|
||||
the simulator can scale `spending_gbp` to local prices before running
|
||||
paths.
|
||||
|
||||
Phase 1 (current): hand-curated baselines from Numbeo public pages, with
|
||||
source URLs and fetch dates embedded so future-us can refresh by hand.
|
||||
Phase 2 (planned): live scrapers for Numbeo + Expatistan, DB cache with
|
||||
1-year TTL, weekly refresh CronJob.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from fire_planner.col.models import CategoryBreakdown, CityCostIndex, ColSource
|
||||
from fire_planner.col.service import (
|
||||
JURISDICTION_REPRESENTATIVE_CITY,
|
||||
compute_col_ratio,
|
||||
lookup_city,
|
||||
lookup_city_cached,
|
||||
reconcile_sources,
|
||||
representative_city_for,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"CategoryBreakdown",
|
||||
"CityCostIndex",
|
||||
"ColSource",
|
||||
"JURISDICTION_REPRESENTATIVE_CITY",
|
||||
"compute_col_ratio",
|
||||
"lookup_city",
|
||||
"lookup_city_cached",
|
||||
"reconcile_sources",
|
||||
"representative_city_for",
|
||||
]
|
||||
342
fire_planner/col/baseline.py
Normal file
342
fire_planner/col/baseline.py
Normal file
|
|
@ -0,0 +1,342 @@
|
|||
"""Hand-curated baselines from Numbeo public pages.
|
||||
|
||||
All figures are GBP/month for a single person. Source URLs and snapshot
|
||||
dates are embedded so we can re-validate. Refresh by re-running the
|
||||
WebFetch prompts that built this file (see `docs/col-baseline-refresh.md`
|
||||
or the conversation in 2026-05-21).
|
||||
|
||||
Adding a new city: pull the Numbeo page, find "Estimated monthly costs
|
||||
for a single person without rent" (the headline), then the rent + per-
|
||||
category breakdowns. Add an entry below — the simulator picks it up
|
||||
automatically via `lookup_city()`.
|
||||
|
||||
Currency conversion uses the rate visible on Numbeo at fetch time —
|
||||
re-fetch when sterling moves >5% against the local currency.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import date
|
||||
from decimal import Decimal
|
||||
|
||||
from fire_planner.col.models import CategoryBreakdown, CityCostIndex, ColSource
|
||||
|
||||
|
||||
def _src(url: str, snap: str, ccy: str, gbp_per_unit: Decimal | float) -> ColSource:
    """Build the `ColSource` provenance record for a hand-curated entry.

    `snap` is an ISO date string (parsed with `date.fromisoformat`), `ccy`
    is stored as the raw currency, and the source name is always `numbeo`.
    """
    snapshot_day = date.fromisoformat(snap)
    # Go through str() so a float literal such as 0.435 becomes
    # Decimal("0.435") instead of the float's full binary expansion.
    rate = Decimal(str(gbp_per_unit))
    return ColSource(
        name="numbeo",
        url=url,
        snapshot_date=snapshot_day,
        raw_currency=ccy,
        gbp_per_unit=rate,
    )
|
||||
|
||||
|
||||
BASELINES: dict[str, CityCostIndex] = {
|
||||
"london": CityCostIndex(
|
||||
city="London",
|
||||
city_slug="london",
|
||||
country="United Kingdom",
|
||||
total_single_no_rent_gbp=Decimal("1092.40"),
|
||||
total_single_with_rent_gbp=Decimal("3409.59"),
|
||||
breakdown=CategoryBreakdown(
|
||||
rent_1bed_center=Decimal("2317.19"),
|
||||
rent_1bed_outside=Decimal("1728.85"),
|
||||
groceries=Decimal("420.00"),
|
||||
restaurants=Decimal("285.00"),
|
||||
transport=Decimal("190.00"),
|
||||
utilities=Decimal("327.18"),
|
||||
leisure=Decimal("127.40"),
|
||||
),
|
||||
source=_src("https://www.numbeo.com/cost-of-living/in/London", "2026-05-20", "GBP", 1.0),
|
||||
),
|
||||
"sofia": CityCostIndex(
|
||||
city="Sofia",
|
||||
city_slug="sofia",
|
||||
country="Bulgaria",
|
||||
total_single_no_rent_gbp=Decimal("712.54"),
|
||||
total_single_with_rent_gbp=Decimal("1391.71"),
|
||||
breakdown=CategoryBreakdown(
|
||||
rent_1bed_center=Decimal("679.17"),
|
||||
rent_1bed_outside=Decimal("520.26"),
|
||||
groceries=Decimal("280.00"), # per-category figures sanity-checked
|
||||
restaurants=Decimal("199.27"), # vs Numbeo summary; LLM extraction
|
||||
transport=Decimal("28.50"), # of detail rows is noisy — headline
|
||||
utilities=Decimal("130.00"), # totals (no_rent + with_rent) are
|
||||
leisure=Decimal("75.00"), # the canonical anchors for ratios
|
||||
),
|
||||
source=_src("https://www.numbeo.com/cost-of-living/in/Sofia", "2026-05-20", "BGN", 0.435),
|
||||
),
|
||||
"limassol": CityCostIndex(
|
||||
city="Limassol",
|
||||
city_slug="limassol",
|
||||
country="Cyprus",
|
||||
total_single_no_rent_gbp=Decimal("932.30"),
|
||||
total_single_with_rent_gbp=Decimal("2282.30"),
|
||||
breakdown=CategoryBreakdown(
|
||||
rent_1bed_center=Decimal("1350.00"),
|
||||
rent_1bed_outside=Decimal("1162.94"),
|
||||
groceries=Decimal("350.00"),
|
||||
restaurants=Decimal("240.00"),
|
||||
transport=Decimal("40.00"),
|
||||
utilities=Decimal("233.43"),
|
||||
leisure=Decimal("104.44"),
|
||||
),
|
||||
source=_src("https://www.numbeo.com/cost-of-living/in/Limassol", "2026-05-18",
|
||||
"EUR", 0.862),
|
||||
),
|
||||
"dubai": CityCostIndex(
|
||||
city="Dubai",
|
||||
city_slug="dubai",
|
||||
country="United Arab Emirates",
|
||||
total_single_no_rent_gbp=Decimal("911.83"),
|
||||
total_single_with_rent_gbp=Decimal("2768.31"),
|
||||
breakdown=CategoryBreakdown(
|
||||
rent_1bed_center=Decimal("1856.48"),
|
||||
rent_1bed_outside=Decimal("1139.98"),
|
||||
groceries=Decimal("96.77"), # Dubai groceries unusually low —
|
||||
restaurants=Decimal("86.02"), # subsidised + lots of cheap labour
|
||||
transport=Decimal("21.51"), # Metro pass AED 100. Sanity check
|
||||
utilities=Decimal("188.24"), # in next refresh — could be Numbeo
|
||||
leisure=Decimal("64.52"), # contributor undercounting
|
||||
),
|
||||
source=_src("https://www.numbeo.com/cost-of-living/in/Dubai", "2026-05-19", "AED", 0.21505),
|
||||
),
|
||||
"kuala-lumpur": CityCostIndex(
|
||||
city="Kuala Lumpur",
|
||||
city_slug="kuala-lumpur",
|
||||
country="Malaysia",
|
||||
total_single_no_rent_gbp=Decimal("420.64"),
|
||||
total_single_with_rent_gbp=Decimal("865.08"),
|
||||
breakdown=CategoryBreakdown(
|
||||
rent_1bed_center=Decimal("444.44"),
|
||||
rent_1bed_outside=Decimal("263.89"),
|
||||
groceries=Decimal("76.95"),
|
||||
restaurants=Decimal("145.35"),
|
||||
transport=Decimal("17.10"),
|
||||
utilities=Decimal("45.18"),
|
||||
leisure=Decimal("42.75"),
|
||||
),
|
||||
source=_src("https://www.numbeo.com/cost-of-living/in/Kuala-Lumpur", "2026-05-17",
|
||||
"MYR", 0.171),
|
||||
),
|
||||
"bangkok": CityCostIndex(
|
||||
city="Bangkok",
|
||||
city_slug="bangkok",
|
||||
country="Thailand",
|
||||
total_single_no_rent_gbp=Decimal("491.21"),
|
||||
total_single_with_rent_gbp=Decimal("970.57"),
|
||||
breakdown=CategoryBreakdown(
|
||||
rent_1bed_center=Decimal("479.36"),
|
||||
rent_1bed_outside=Decimal("233.76"),
|
||||
groceries=Decimal("97.25"),
|
||||
restaurants=Decimal("119.34"),
|
||||
transport=Decimal("43.21"),
|
||||
utilities=Decimal("69.04"),
|
||||
leisure=Decimal("65.29"),
|
||||
),
|
||||
source=_src("https://www.numbeo.com/cost-of-living/in/Bangkok", "2026-05-20",
|
||||
"THB", 0.02198),
|
||||
),
|
||||
# ── Expansion batch — fetched 2026-05-21, headline totals only ──
|
||||
# Per-category breakdowns set to 0 where Numbeo LLM extraction was
|
||||
# unreliable. Only `total_single_no_rent_gbp` / `total_single_with_rent_gbp`
|
||||
# are used by the simulator's COL ratio; the breakdowns are for the
|
||||
# UI / playbook. Refresh in Phase 3 (live scraper with HTML parsing).
|
||||
"lisbon": CityCostIndex(
|
||||
city="Lisbon", city_slug="lisbon", country="Portugal",
|
||||
total_single_no_rent_gbp=Decimal("647.97"),
|
||||
total_single_with_rent_gbp=Decimal("1856.03"),
|
||||
breakdown=CategoryBreakdown(
|
||||
rent_1bed_center=Decimal("1208.06"), rent_1bed_outside=Decimal("923.14"),
|
||||
groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"),
|
||||
utilities=Decimal("0"), leisure=Decimal("0"),
|
||||
),
|
||||
source=_src("https://www.numbeo.com/cost-of-living/in/Lisbon", "2026-05-21",
|
||||
"EUR", 0.862),
|
||||
),
|
||||
"porto": CityCostIndex(
|
||||
city="Porto", city_slug="porto", country="Portugal",
|
||||
total_single_no_rent_gbp=Decimal("609.07"),
|
||||
total_single_with_rent_gbp=Decimal("1562.50"),
|
||||
breakdown=CategoryBreakdown(
|
||||
rent_1bed_center=Decimal("953.43"), rent_1bed_outside=Decimal("726.19"),
|
||||
groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"),
|
||||
utilities=Decimal("0"), leisure=Decimal("0"),
|
||||
),
|
||||
source=_src("https://www.numbeo.com/cost-of-living/in/Porto", "2026-05-16",
|
||||
"EUR", 0.862),
|
||||
),
|
||||
"madrid": CityCostIndex(
|
||||
city="Madrid", city_slug="madrid", country="Spain",
|
||||
total_single_no_rent_gbp=Decimal("706.87"),
|
||||
total_single_with_rent_gbp=Decimal("1825.72"),
|
||||
breakdown=CategoryBreakdown(
|
||||
rent_1bed_center=Decimal("1118.85"), rent_1bed_outside=Decimal("873.06"),
|
||||
groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"),
|
||||
utilities=Decimal("0"), leisure=Decimal("0"),
|
||||
),
|
||||
source=_src("https://www.numbeo.com/cost-of-living/in/Madrid", "2026-05-21",
|
||||
"EUR", 0.862),
|
||||
),
|
||||
"valencia": CityCostIndex(
|
||||
city="Valencia", city_slug="valencia", country="Spain",
|
||||
total_single_no_rent_gbp=Decimal("614.71"),
|
||||
total_single_with_rent_gbp=Decimal("1663.97"),
|
||||
breakdown=CategoryBreakdown(
|
||||
rent_1bed_center=Decimal("1049.26"), rent_1bed_outside=Decimal("779.35"),
|
||||
groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"),
|
||||
utilities=Decimal("0"), leisure=Decimal("0"),
|
||||
),
|
||||
source=_src("https://www.numbeo.com/cost-of-living/in/Valencia", "2026-05-15",
|
||||
"EUR", 0.862),
|
||||
),
|
||||
"athens": CityCostIndex(
|
||||
city="Athens", city_slug="athens", country="Greece",
|
||||
total_single_no_rent_gbp=Decimal("711.46"),
|
||||
total_single_with_rent_gbp=Decimal("1245.89"),
|
||||
breakdown=CategoryBreakdown(
|
||||
rent_1bed_center=Decimal("534.43"), rent_1bed_outside=Decimal("453.23"),
|
||||
groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"),
|
||||
utilities=Decimal("0"), leisure=Decimal("0"),
|
||||
),
|
||||
source=_src("https://www.numbeo.com/cost-of-living/in/Athens", "2026-05-21",
|
||||
"EUR", 0.862),
|
||||
),
|
||||
"bucharest": CityCostIndex(
|
||||
city="Bucharest", city_slug="bucharest", country="Romania",
|
||||
total_single_no_rent_gbp=Decimal("572.13"),
|
||||
total_single_with_rent_gbp=Decimal("1102.46"),
|
||||
breakdown=CategoryBreakdown(
|
||||
rent_1bed_center=Decimal("530.33"), rent_1bed_outside=Decimal("363.06"),
|
||||
groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"),
|
||||
utilities=Decimal("0"), leisure=Decimal("0"),
|
||||
),
|
||||
source=_src("https://www.numbeo.com/cost-of-living/in/Bucharest", "2026-05-21",
|
||||
"EUR", 0.862),
|
||||
),
|
||||
"tbilisi": CityCostIndex(
|
||||
city="Tbilisi", city_slug="tbilisi", country="Georgia",
|
||||
# LLM extraction unreliable; manual estimate of headline from
|
||||
# secondary sources puts ex-rent ~€420-500 → £400.
|
||||
total_single_no_rent_gbp=Decimal("400.00"),
|
||||
total_single_with_rent_gbp=Decimal("941.43"),
|
||||
breakdown=CategoryBreakdown(
|
||||
rent_1bed_center=Decimal("541.43"), rent_1bed_outside=Decimal("350.82"),
|
||||
groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"),
|
||||
utilities=Decimal("0"), leisure=Decimal("0"),
|
||||
),
|
||||
source=_src("https://www.numbeo.com/cost-of-living/in/Tbilisi", "2026-05-18",
|
||||
"GEL", 0.295),
|
||||
),
|
||||
"tallinn": CityCostIndex(
|
||||
city="Tallinn", city_slug="tallinn", country="Estonia",
|
||||
total_single_no_rent_gbp=Decimal("837.63"),
|
||||
total_single_with_rent_gbp=Decimal("1441.06"),
|
||||
breakdown=CategoryBreakdown(
|
||||
rent_1bed_center=Decimal("603.43"), rent_1bed_outside=Decimal("434.23"),
|
||||
groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"),
|
||||
utilities=Decimal("0"), leisure=Decimal("0"),
|
||||
),
|
||||
source=_src("https://www.numbeo.com/cost-of-living/in/Tallinn", "2026-05-21",
|
||||
"EUR", 0.862),
|
||||
),
|
||||
"penang": CityCostIndex(
|
||||
city="Penang", city_slug="penang", country="Malaysia",
|
||||
total_single_no_rent_gbp=Decimal("361.66"),
|
||||
total_single_with_rent_gbp=Decimal("643.39"),
|
||||
breakdown=CategoryBreakdown(
|
||||
rent_1bed_center=Decimal("281.73"), rent_1bed_outside=Decimal("160.61"),
|
||||
groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"),
|
||||
utilities=Decimal("0"), leisure=Decimal("0"),
|
||||
),
|
||||
source=_src("https://www.numbeo.com/cost-of-living/in/Penang", "2026-05-18",
|
||||
"MYR", 0.171),
|
||||
),
|
||||
"chiang-mai": CityCostIndex(
|
||||
city="Chiang Mai", city_slug="chiang-mai", country="Thailand",
|
||||
total_single_no_rent_gbp=Decimal("412.36"),
|
||||
total_single_with_rent_gbp=Decimal("775.43"),
|
||||
breakdown=CategoryBreakdown(
|
||||
rent_1bed_center=Decimal("363.07"), rent_1bed_outside=Decimal("205.95"),
|
||||
groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"),
|
||||
utilities=Decimal("0"), leisure=Decimal("0"),
|
||||
),
|
||||
source=_src("https://www.numbeo.com/cost-of-living/in/Chiang-Mai", "2026-05-06",
|
||||
"THB", 0.02198),
|
||||
),
|
||||
"bali": CityCostIndex(
|
||||
city="Bali", city_slug="bali", country="Indonesia",
|
||||
# Bali Numbeo conflates Ubud/Canggu/Denpasar; rent figures are
|
||||
# manual estimates (Numbeo's £915 was implausibly high).
|
||||
total_single_no_rent_gbp=Decimal("433.24"),
|
||||
total_single_with_rent_gbp=Decimal("883.24"),
|
||||
breakdown=CategoryBreakdown(
|
||||
rent_1bed_center=Decimal("450.00"), rent_1bed_outside=Decimal("350.00"),
|
||||
groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"),
|
||||
utilities=Decimal("0"), leisure=Decimal("0"),
|
||||
),
|
||||
source=_src("https://www.numbeo.com/cost-of-living/in/Bali", "2026-05-15",
|
||||
"IDR", 0.0000485),
|
||||
),
|
||||
"singapore": CityCostIndex(
|
||||
city="Singapore", city_slug="singapore", country="Singapore",
|
||||
total_single_no_rent_gbp=Decimal("579.63"),
|
||||
total_single_with_rent_gbp=Decimal("2661.63"),
|
||||
breakdown=CategoryBreakdown(
|
||||
rent_1bed_center=Decimal("2082.00"), rent_1bed_outside=Decimal("1556.00"),
|
||||
groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"),
|
||||
utilities=Decimal("0"), leisure=Decimal("0"),
|
||||
),
|
||||
source=_src("https://www.numbeo.com/cost-of-living/in/Singapore", "2026-05-21",
|
||||
"SGD", 0.585),
|
||||
),
|
||||
"taipei": CityCostIndex(
|
||||
city="Taipei", city_slug="taipei", country="Taiwan",
|
||||
total_single_no_rent_gbp=Decimal("646.50"),
|
||||
total_single_with_rent_gbp=Decimal("1223.06"),
|
||||
breakdown=CategoryBreakdown(
|
||||
rent_1bed_center=Decimal("576.56"), rent_1bed_outside=Decimal("373.77"),
|
||||
groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"),
|
||||
utilities=Decimal("0"), leisure=Decimal("0"),
|
||||
),
|
||||
source=_src("https://www.numbeo.com/cost-of-living/in/Taipei", "2026-05-18",
|
||||
"TWD", 0.0246),
|
||||
),
|
||||
"ho-chi-minh-city": CityCostIndex(
|
||||
city="Ho Chi Minh City", city_slug="ho-chi-minh-city", country="Vietnam",
|
||||
total_single_no_rent_gbp=Decimal("348.85"),
|
||||
total_single_with_rent_gbp=Decimal("828.77"),
|
||||
breakdown=CategoryBreakdown(
|
||||
rent_1bed_center=Decimal("479.92"), rent_1bed_outside=Decimal("223.06"),
|
||||
groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"),
|
||||
utilities=Decimal("0"), leisure=Decimal("0"),
|
||||
),
|
||||
source=_src("https://www.numbeo.com/cost-of-living/in/Ho-Chi-Minh-City",
|
||||
"2026-05-16", "VND", 0.0000316),
|
||||
),
|
||||
"mexico-city": CityCostIndex(
|
||||
city="Mexico City", city_slug="mexico-city", country="Mexico",
|
||||
total_single_no_rent_gbp=Decimal("600.47"),
|
||||
total_single_with_rent_gbp=Decimal("1390.42"),
|
||||
breakdown=CategoryBreakdown(
|
||||
rent_1bed_center=Decimal("789.95"), rent_1bed_outside=Decimal("513.96"),
|
||||
groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"),
|
||||
utilities=Decimal("0"), leisure=Decimal("0"),
|
||||
),
|
||||
source=_src("https://www.numbeo.com/cost-of-living/in/Mexico-City", "2026-05-19",
|
||||
"MXN", 0.0394),
|
||||
),
|
||||
"medellin": CityCostIndex(
|
||||
city="Medellin", city_slug="medellin", country="Colombia",
|
||||
# LLM extraction gave £105 — too low. Manual estimate ~£400.
|
||||
total_single_no_rent_gbp=Decimal("400.00"),
|
||||
total_single_with_rent_gbp=Decimal("902.13"),
|
||||
breakdown=CategoryBreakdown(
|
||||
rent_1bed_center=Decimal("502.13"), rent_1bed_outside=Decimal("373.02"),
|
||||
groceries=Decimal("0"), restaurants=Decimal("0"), transport=Decimal("0"),
|
||||
utilities=Decimal("0"), leisure=Decimal("0"),
|
||||
),
|
||||
source=_src("https://www.numbeo.com/cost-of-living/in/Medellin", "2026-05-21",
|
||||
"COP", 0.000195),
|
||||
),
|
||||
}
|
||||
145
fire_planner/col/cache.py
Normal file
145
fire_planner/col/cache.py
Normal file
|
|
@ -0,0 +1,145 @@
|
|||
"""DB-backed cache for cost-of-living snapshots.
|
||||
|
||||
Architecture (Phase 2):
|
||||
|
||||
service.lookup_city_cached(sess, slug) →
|
||||
1. SELECT FROM col_snapshot WHERE city_slug=slug ORDER BY fetched_at DESC LIMIT 1
|
||||
2. if row and row.expires_at > now → return row, "cache_hit"
|
||||
3. else fetch via NumbeoScraper, INSERT/UPDATE, return, "scraped"
|
||||
4. on scrape failure → fall back to baseline.BASELINES[slug], "baseline_fallback"
|
||||
|
||||
TTL = 1 year (Viktor's choice on 2026-05-21 — Numbeo headline numbers
|
||||
don't move fast enough to need monthly refresh, and the rate-limit risk
|
||||
is real). The Phase-3 CronJob refreshes stale rows nightly in batch so
|
||||
runtime lookups never have to scrape.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import UTC, datetime
|
||||
from decimal import Decimal
|
||||
from typing import Final
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.dialects.postgresql import insert as pg_insert
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from fire_planner.col.models import CategoryBreakdown, CityCostIndex, ColSource
|
||||
from fire_planner.db import ColSnapshot
|
||||
|
||||
DEFAULT_TTL_DAYS: Final = 365
|
||||
|
||||
|
||||
def _row_to_index(row: ColSnapshot) -> CityCostIndex:
    """Convert a `col_snapshot` ORM row into a `CityCostIndex`.

    Copies the headline totals, the two 1-bed rent figures and the
    provenance columns from the row. The remaining per-category figures
    are filled with zero.
    """
    # by_category_json optional — not loaded into the Pydantic model in
    # Phase 2; the simulator only needs the headlines.
    zero = Decimal("0")
    categories = CategoryBreakdown(
        rent_1bed_center=row.rent_1bed_center_gbp,
        rent_1bed_outside=row.rent_1bed_outside_gbp,
        groceries=zero,
        restaurants=zero,
        transport=zero,
        utilities=zero,
        leisure=zero,
    )
    provenance = ColSource(
        name=row.source_name,  # type: ignore[arg-type]
        url=row.source_url,
        snapshot_date=row.snapshot_date,
        raw_currency=row.raw_currency,
        gbp_per_unit=row.gbp_per_unit,
    )
    return CityCostIndex(
        city=row.city_display,
        city_slug=row.city_slug,
        country=row.country,
        total_single_no_rent_gbp=row.total_no_rent_gbp,
        total_single_with_rent_gbp=row.total_with_rent_gbp,
        breakdown=categories,
        source=provenance,
    )
|
||||
|
||||
|
||||
async def read_fresh(
    sess: AsyncSession,
    city_slug: str,
    *,
    now: datetime | None = None,
) -> CityCostIndex | None:
    """Return the newest unexpired snapshot for `city_slug`, or None.

    Considers every stored source for the city and keeps only rows whose
    `expires_at` is later than `now` (defaults to the current UTC time).
    Of those, the row with the latest `fetched_at` wins; no source
    precedence is applied here.
    """
    cutoff = now or datetime.now(UTC)
    unexpired_for_city = select(ColSnapshot).where(
        ColSnapshot.city_slug == city_slug,
        ColSnapshot.expires_at > cutoff,
    )
    newest_first = unexpired_for_city.order_by(ColSnapshot.fetched_at.desc()).limit(1)
    result = await sess.execute(newest_first)
    row = result.scalar_one_or_none()
    if not row:
        return None
    return _row_to_index(row)
|
||||
|
||||
|
||||
async def upsert(
    sess: AsyncSession,
    idx: CityCostIndex,
    *,
    ttl_days: int = DEFAULT_TTL_DAYS,
    now: datetime | None = None,
) -> None:
    """Insert or refresh the snapshot row for `(city_slug, source_name)`.

    `fetched_at` is set to `now` (defaults to the current UTC time) and
    `expires_at` to `now + ttl_days`. On PostgreSQL this is a single
    `INSERT ... ON CONFLICT DO UPDATE`; on any other dialect the upsert
    is emulated with a select followed by update-or-add. The session is
    committed before returning.
    """
    from datetime import timedelta

    written_at = now or datetime.now(UTC)
    src = idx.source
    rents = idx.breakdown
    payload = {
        "city_slug": idx.city_slug,
        "city_display": idx.city,
        "country": idx.country,
        "source_name": src.name,
        "source_url": src.url,
        "snapshot_date": src.snapshot_date,
        "fetched_at": written_at,
        "expires_at": written_at + timedelta(days=ttl_days),
        "total_no_rent_gbp": idx.total_single_no_rent_gbp,
        "total_with_rent_gbp": idx.total_single_with_rent_gbp,
        "rent_1bed_center_gbp": rents.rent_1bed_center,
        "rent_1bed_outside_gbp": rents.rent_1bed_outside,
        "raw_currency": src.raw_currency,
        "gbp_per_unit": src.gbp_per_unit,
    }
    # With no bind on the session, assume the production dialect.
    bind = sess.bind
    dialect_name = bind.dialect.name if bind else "postgresql"
    if dialect_name != "postgresql":
        # SQLite (tests): emulate upsert manually.
        lookup = select(ColSnapshot).where(
            ColSnapshot.city_slug == idx.city_slug,
            ColSnapshot.source_name == src.name,
        )
        existing_row = (await sess.execute(lookup)).scalar_one_or_none()
        if existing_row:
            for column, value in payload.items():
                setattr(existing_row, column, value)
        else:
            sess.add(ColSnapshot(**payload))
    else:
        key_columns = {"city_slug", "source_name"}
        insert_stmt = pg_insert(ColSnapshot).values(**payload)
        # On conflict, overwrite everything except the unique-key columns.
        refreshed = {
            column: insert_stmt.excluded[column]
            for column in payload
            if column not in key_columns
        }
        await sess.execute(
            insert_stmt.on_conflict_do_update(
                constraint="uq_col_snapshot_city_source",
                set_=refreshed,
            )
        )
    await sess.commit()
|
||||
|
||||
|
||||
def expires_at_for(ttl_days: int = DEFAULT_TTL_DAYS,
                   now: datetime | None = None) -> datetime:
    """Return the expiry timestamp for a row written at `now`.

    `now` defaults to the current UTC time; the result is `now` plus
    `ttl_days` days.
    """
    from datetime import timedelta

    written_at = now if now else datetime.now(UTC)
    lifetime = timedelta(days=ttl_days)
    return written_at + lifetime
|
||||
|
||||
|
||||
__all__ = ["DEFAULT_TTL_DAYS", "expires_at_for", "read_fresh", "upsert"]
|
||||
165
fire_planner/col/expatistan.py
Normal file
165
fire_planner/col/expatistan.py
Normal file
|
|
@ -0,0 +1,165 @@
|
|||
"""Expatistan HTML scraper — secondary COL source.
|
||||
|
||||
Used by the cache layer to cross-check Numbeo. Expatistan's page format
|
||||
is different (price-of-living-index based, not absolute monthly figures),
|
||||
so the headline we extract is their "single person, monthly cost"
|
||||
estimate from the "Cost of Living in <city>" landing page.
|
||||
|
||||
Lower fidelity than Numbeo but ToS-friendlier — Expatistan publishes
|
||||
their data under CC and explicitly allows non-commercial scraping.
|
||||
|
||||
Source-of-truth precedence (set in service.reconcile_sources):
|
||||
1. numbeo — primary, most data points
|
||||
2. expatistan — secondary, cross-check
|
||||
3. baseline — hand-curated fallback
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import re
|
||||
from datetime import date
|
||||
from decimal import Decimal
|
||||
from typing import Final
|
||||
|
||||
import httpx
|
||||
|
||||
from fire_planner.col.models import CategoryBreakdown, CityCostIndex, ColSource
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
BASE_URL: Final = "https://www.expatistan.com/cost-of-living"
|
||||
USER_AGENT: Final = (
|
||||
"fire-planner/0.1 (+https://forgejo.viktorbarzin.me/viktor/code; "
|
||||
"non-commercial personal use; 1-year cache)"
|
||||
)
|
||||
DEFAULT_TIMEOUT: Final = httpx.Timeout(connect=10.0, read=30.0, write=10.0, pool=10.0)
|
||||
MIN_REQUEST_INTERVAL: Final = 1.1
|
||||
|
||||
# Expatistan publishes prices in USD by default. Convert to GBP.
|
||||
USD_TO_GBP: Final = Decimal("0.787")
|
||||
|
||||
# Single-person monthly estimate appears in the page text as:
|
||||
# "Cost of living in <City>, <Country> for an expat is $X" or similar
|
||||
# Format varies; capture both "$X,XXX" and "$X" patterns.
|
||||
_SINGLE_PERSON_USD_RE = re.compile(
|
||||
r"(?:single\s+person|expat)[^$]*?\$\s*([0-9,]+(?:\.[0-9]+)?)",
|
||||
re.IGNORECASE | re.DOTALL,
|
||||
)
|
||||
# Apartment rent (1 bedroom) appears on the "Prices" table line:
|
||||
# "Rent for a furnished single room (1 bedroom) in city centre $X,XXX"
|
||||
_RENT_CENTER_USD_RE = re.compile(
|
||||
r"(?:1\s*bedroom|one[-\s]?bedroom)[^$<]*?(?:cent|expensive)[^$]*?"
|
||||
r"\$\s*([0-9,]+(?:\.[0-9]+)?)",
|
||||
re.IGNORECASE | re.DOTALL,
|
||||
)
|
||||
|
||||
|
||||
class ExpatistanFetchError(RuntimeError):
    """Raised when an Expatistan page cannot be fetched or parsed.

    Callers catch this single exception type to fall back to another
    source.
    """
|
||||
|
||||
|
||||
def _parse_num(s: str) -> Decimal:
|
||||
return Decimal(s.replace(",", ""))
|
||||
|
||||
|
||||
class ExpatistanScraper:
    """Async Expatistan fetcher with per-instance polite rate-limiting.

    Use as an async context manager. A client created by the scraper is
    closed on exit; a client passed in by the caller is left open.
    """

    def __init__(
        self,
        *,
        client: httpx.AsyncClient | None = None,
        min_interval: float = MIN_REQUEST_INTERVAL,
    ) -> None:
        """Create a scraper.

        Args:
            client: Optional pre-built httpx client. When omitted, one is
                created with the module's User-Agent and timeout.
            min_interval: Minimum number of seconds between two requests
                issued through this instance.
        """
        # Only close the client on exit if we created it ourselves.
        self._owns_client = client is None
        self._client = client or httpx.AsyncClient(
            headers={"User-Agent": USER_AGENT, "Accept-Language": "en-GB,en;q=0.9"},
            timeout=DEFAULT_TIMEOUT,
            follow_redirects=True,
        )
        self._min_interval = min_interval
        self._last_request_at: float = 0.0
        self._lock = asyncio.Lock()

    async def __aenter__(self) -> ExpatistanScraper:
        return self

    async def __aexit__(self, *_: object) -> None:
        if self._owns_client:
            await self._client.aclose()

    async def _polite_wait(self) -> None:
        """Sleep until `min_interval` has passed since the previous request.

        The lock is held while sleeping, so concurrent callers on the same
        instance are spaced out one after another.
        """
        async with self._lock:
            now = asyncio.get_running_loop().time()
            elapsed = now - self._last_request_at
            if elapsed < self._min_interval:
                await asyncio.sleep(self._min_interval - elapsed)
            self._last_request_at = asyncio.get_running_loop().time()

    async def fetch(
        self,
        city_slug: str,
        *,
        country: str = "",
    ) -> CityCostIndex:
        """Fetch and parse one city's Expatistan page.

        Args:
            city_slug: Lowercase hyphen-separated slug, used verbatim in
                the URL.
            country: Country name copied onto the result unchanged.

        Raises:
            ExpatistanFetchError: on any httpx error (including a non-2xx
                status) or when the page cannot be parsed.
        """
        # Expatistan uses lowercase city slugs separated by hyphens —
        # same convention as our internal slugs.
        url = f"{BASE_URL}/{city_slug}"
        await self._polite_wait()
        try:
            resp = await self._client.get(url)
            resp.raise_for_status()
        except httpx.HTTPError as e:
            raise ExpatistanFetchError(f"HTTP error for {url}: {e}") from e
        return self._parse(city_slug, country, url, resp.text)

    @staticmethod
    def _parse(
        city_slug: str,
        country: str,
        url: str,
        html: str,
    ) -> CityCostIndex:
        """Extract the single-person total and centre rent from `html`.

        Both figures are read as USD and converted with `USD_TO_GBP`.

        Raises:
            ExpatistanFetchError: when either figure is missing, is not a
                valid number, or the derived ex-rent total is not positive.
        """
        from decimal import InvalidOperation

        single_match = _SINGLE_PERSON_USD_RE.search(html)
        rent_match = _RENT_CENTER_USD_RE.search(html)
        if not (single_match and rent_match):
            raise ExpatistanFetchError(
                f"could not locate single-person or rent figure on {url}"
            )
        # Expatistan's "single person" headline is total with rent —
        # different convention from Numbeo. Use it as `total_with_rent`
        # directly; derive no_rent by subtracting their rent figure.
        try:
            with_rent_usd = _parse_num(single_match.group(1))
            rent_usd = _parse_num(rent_match.group(1))
        except InvalidOperation as e:
            # The capture group accepts digits and commas, so a stray
            # comma-only match is possible. Report it as a fetch error so
            # callers can still fall back instead of crashing.
            raise ExpatistanFetchError(
                f"could not parse single-person or rent figure on {url}"
            ) from e
        with_rent_gbp = with_rent_usd * USD_TO_GBP
        rent_gbp = rent_usd * USD_TO_GBP
        no_rent_gbp = with_rent_gbp - rent_gbp
        # Guard against malformed pages where rent > total (unusual but
        # possible if the regex grabs the wrong row).
        if no_rent_gbp <= 0:
            raise ExpatistanFetchError(
                f"derived no_rent <= 0 ({no_rent_gbp}) on {url}; "
                f"with_rent={with_rent_gbp}, rent={rent_gbp}"
            )
        return CityCostIndex(
            city=city_slug.replace("-", " ").title(),
            city_slug=city_slug,
            country=country,
            total_single_no_rent_gbp=no_rent_gbp.quantize(Decimal("0.01")),
            total_single_with_rent_gbp=with_rent_gbp.quantize(Decimal("0.01")),
            breakdown=CategoryBreakdown(
                rent_1bed_center=rent_gbp.quantize(Decimal("0.01")),
                rent_1bed_outside=None,
                # Per-category figures are not extracted from the page.
                groceries=Decimal("0"),
                restaurants=Decimal("0"),
                transport=Decimal("0"),
                utilities=Decimal("0"),
                leisure=Decimal("0"),
            ),
            source=ColSource(
                name="expatistan",
                url=url,
                snapshot_date=date.today(),
                raw_currency="USD",
                gbp_per_unit=USD_TO_GBP,
            ),
        )
|
||||
64
fire_planner/col/models.py
Normal file
64
fire_planner/col/models.py
Normal file
|
|
@ -0,0 +1,64 @@
|
|||
"""Pydantic models for per-city cost-of-living data.
|
||||
|
||||
Every category figure is monthly GBP for a single person — the
|
||||
denomination the simulator expects when scaling `spending_gbp`. The
|
||||
source object retains the original currency, FX rate, and snapshot
|
||||
date so we can re-validate or update a stale baseline.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import date
|
||||
from decimal import Decimal
|
||||
from typing import Literal
|
||||
|
||||
from pydantic import BaseModel, ConfigDict, Field
|
||||
|
||||
SourceName = Literal["numbeo", "expatistan", "baseline", "manual"]
|
||||
|
||||
|
||||
class ColSource(BaseModel):
    """Provenance of a CityCostIndex entry: which source the figures
    came from, when they were captured, and the currency and FX rate
    recorded for them."""

    # Instances are immutable once constructed.
    model_config = ConfigDict(frozen=True)

    # One of the SourceName literals.
    name: SourceName
    # Page the figures were read from, when there is one.
    url: str | None = None
    # Date the figures were captured.
    snapshot_date: date
    # Code of the currency the source quoted; GBP unless stated.
    raw_currency: str = "GBP"
    # GBP value of one unit of `raw_currency`.
    gbp_per_unit: Decimal = Decimal("1")
|
||||
|
||||
|
||||
class CategoryBreakdown(BaseModel):
    """Monthly single-person costs in GBP, split by category."""

    # Instances are immutable once constructed.
    model_config = ConfigDict(frozen=True)

    # Rent for a 1-bedroom flat in the city centre (required).
    rent_1bed_center: Decimal
    # Rent for a 1-bedroom flat outside the centre; optional.
    rent_1bed_outside: Decimal | None = None
    # Remaining categories are required.
    groceries: Decimal
    restaurants: Decimal
    transport: Decimal
    utilities: Decimal
    leisure: Decimal
|
||||
|
||||
|
||||
class CityCostIndex(BaseModel):
    """Headline cost-of-living snapshot for a single city."""

    # Instances are immutable once constructed.
    model_config = ConfigDict(frozen=True)

    # Display name of the city.
    city: str
    # Non-empty lookup key.
    city_slug: str = Field(min_length=1)
    country: str
    # Monthly single-person totals in GBP, without and with rent.
    total_single_no_rent_gbp: Decimal
    total_single_with_rent_gbp: Decimal
    breakdown: CategoryBreakdown
    source: ColSource

    @property
    def total_monthly_gbp(self) -> Decimal:
        """Monthly total used for ratios: the with-rent figure, because
        relocating changes rent as well as everything else."""
        return self.total_single_with_rent_gbp
|
||||
240
fire_planner/col/numbeo.py
Normal file
240
fire_planner/col/numbeo.py
Normal file
|
|
@ -0,0 +1,240 @@
|
|||
"""Numbeo HTML scraper — parses the public `cost-of-living/in/<city>`
|
||||
pages directly.
|
||||
|
||||
No LLM interpretation — uses regex against the table structure. The
|
||||
page format is stable enough across cities that a single parser works
|
||||
for all of them.
|
||||
|
||||
We extract:
|
||||
- The headline ex-rent total (one number, EUR-prefixed)
|
||||
- The 1-bed center / outside rent (two rows in the rent table)
|
||||
|
||||
Per-category breakdown is intentionally NOT extracted by the live
|
||||
scraper — the headline two numbers are what the simulator uses for
|
||||
ratios, and the breakdown rows are noisy (averages of varying-sample
|
||||
sizes). The hand-curated `baseline.py` carries the breakdowns where
|
||||
they exist; the cache layer falls back to baseline.py if a breakdown
|
||||
is needed for the UI.
|
||||
|
||||
ToS posture: Numbeo's robots.txt allows /cost-of-living/* for major
|
||||
crawlers. We send a polite UA, ≤1 req/sec, a 30s read timeout, and never
re-scrape within the cache TTL. NOTE: exponential backoff on 429/5xx is
not implemented in this module yet — any HTTP error is raised as
NumbeoFetchError.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import re
|
||||
from datetime import UTC, date, datetime, timedelta
|
||||
from decimal import Decimal
|
||||
from typing import Final
|
||||
|
||||
import httpx
|
||||
|
||||
from fire_planner.col.models import CategoryBreakdown, CityCostIndex, ColSource
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
BASE_URL: Final = "https://www.numbeo.com/cost-of-living/in"
|
||||
USER_AGENT: Final = (
|
||||
"fire-planner/0.1 (+https://forgejo.viktorbarzin.me/viktor/code; "
|
||||
"non-commercial personal use; 1-year cache)"
|
||||
)
|
||||
DEFAULT_TIMEOUT: Final = httpx.Timeout(connect=10.0, read=30.0, write=10.0, pool=10.0)
|
||||
MIN_REQUEST_INTERVAL: Final = 1.1 # seconds between requests — polite
|
||||
|
||||
# Currency-to-GBP rates for common Numbeo source pages. Hard-coded
# snapshot; refresh by editing this map (rare — within ±5% over a
# year). The parser always converts the €-prefixed EUR amounts Numbeo
# prints, so only EUR_TO_GBP affects the computed totals. This map is
# used to record the local→GBP rate (`gbp_per_unit`) on the source; a
# currency missing from it falls back to EUR_TO_GBP.
|
||||
EUR_TO_GBP: Final = Decimal("0.862")
|
||||
LOCAL_TO_GBP: Final[dict[str, Decimal]] = {
|
||||
"EUR": EUR_TO_GBP,
|
||||
"GBP": Decimal("1.0"),
|
||||
"USD": Decimal("0.787"),
|
||||
"BGN": Decimal("0.435"),
|
||||
"RON": Decimal("0.173"),
|
||||
"GEL": Decimal("0.295"),
|
||||
"AED": Decimal("0.21505"),
|
||||
"MYR": Decimal("0.171"),
|
||||
"THB": Decimal("0.02198"),
|
||||
"IDR": Decimal("0.0000485"),
|
||||
"SGD": Decimal("0.585"),
|
||||
"TWD": Decimal("0.0246"),
|
||||
"VND": Decimal("0.0000316"),
|
||||
"MXN": Decimal("0.0394"),
|
||||
"COP": Decimal("0.000195"),
|
||||
"PYG": Decimal("0.000099"),
|
||||
"UYU": Decimal("0.0197"),
|
||||
"PAB": Decimal("0.787"), # Panamanian Balboa pegged to USD
|
||||
"QAR": Decimal("0.216"), # Qatari Riyal
|
||||
"BHD": Decimal("2.09"),
|
||||
"JPY": Decimal("0.00520"),
|
||||
"KRW": Decimal("0.000565"),
|
||||
"HKD": Decimal("0.101"),
|
||||
"TRY": Decimal("0.0204"), # volatile — refresh more often
|
||||
"RSD": Decimal("0.00737"),
|
||||
"HRK": Decimal("0.114"),
|
||||
"HUF": Decimal("0.00213"),
|
||||
"CZK": Decimal("0.0345"),
|
||||
"PLN": Decimal("0.196"),
|
||||
"ALL": Decimal("0.00859"),
|
||||
}
|
||||
|
||||
# --- Regex patterns for the Numbeo page ---
|
||||
# The "Estimated monthly costs for a single person" headline appears as:
|
||||
# "<strong>Estimated monthly costs for a single person are €X.X</strong>"
|
||||
# with the EUR figure always quoted (Numbeo's site currency is EUR).
|
||||
_HEADLINE_EUR_RE = re.compile(
|
||||
r"single\s+person[^<]*?(?:are|=)\s*€\s*([0-9,]+(?:\.[0-9]+)?)",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
# The rent rows look like:
|
||||
# <td>Apartment (1 bedroom) in City Centre</td><td>...€2,317.19...</td>
|
||||
_RENT_CENTER_EUR_RE = re.compile(
|
||||
r"Apartment\s*\(1\s*bedroom\)\s*in\s*City\s*Centre.*?€\s*([0-9,]+(?:\.[0-9]+)?)",
|
||||
re.IGNORECASE | re.DOTALL,
|
||||
)
|
||||
_RENT_OUTSIDE_EUR_RE = re.compile(
|
||||
r"Apartment\s*\(1\s*bedroom\)\s*Outside\s*of\s*Cent(?:re|er).*?€\s*([0-9,]+(?:\.[0-9]+)?)",
|
||||
re.IGNORECASE | re.DOTALL,
|
||||
)
|
||||
|
||||
|
||||
class NumbeoFetchError(RuntimeError):
    """Raised when a Numbeo page cannot be fetched or parsed.

    Callers catch this single exception type to fall back to another
    source.
    """
|
||||
|
||||
|
||||
def _parse_num(s: str) -> Decimal:
|
||||
return Decimal(s.replace(",", ""))
|
||||
|
||||
|
||||
def _slug_to_url_segment(slug: str) -> str:
|
||||
"""`ho-chi-minh-city` → `Ho-Chi-Minh-City` (Numbeo capitalises words)."""
|
||||
return "-".join(part.capitalize() for part in slug.split("-"))
|
||||
|
||||
|
||||
class NumbeoScraper:
    """Async Numbeo fetcher with per-instance polite rate-limiting.

    Use as a context manager so the httpx client is cleanly closed:
        async with NumbeoScraper() as scraper:
            idx = await scraper.fetch("sofia")

    A client passed in by the caller is left open on exit.
    """

    def __init__(
        self,
        *,
        client: httpx.AsyncClient | None = None,
        min_interval: float = MIN_REQUEST_INTERVAL,
    ) -> None:
        """Create a scraper.

        Args:
            client: Optional pre-built httpx client. When omitted, one is
                created with the module's User-Agent and timeout.
            min_interval: Minimum number of seconds between two requests
                issued through this instance.
        """
        # Only close the client on exit if we created it ourselves.
        self._owns_client = client is None
        self._client = client or httpx.AsyncClient(
            headers={"User-Agent": USER_AGENT, "Accept-Language": "en-GB,en;q=0.9"},
            timeout=DEFAULT_TIMEOUT,
            follow_redirects=True,
        )
        self._min_interval = min_interval
        self._last_request_at: float = 0.0
        self._lock = asyncio.Lock()

    async def __aenter__(self) -> NumbeoScraper:
        return self

    async def __aexit__(self, *_: object) -> None:
        if self._owns_client:
            await self._client.aclose()

    async def _polite_wait(self) -> None:
        """Sleep until `min_interval` has passed since the previous request.

        The lock is held while sleeping, so concurrent callers on the same
        instance are spaced out one after another.
        """
        async with self._lock:
            now = asyncio.get_running_loop().time()
            elapsed = now - self._last_request_at
            if elapsed < self._min_interval:
                await asyncio.sleep(self._min_interval - elapsed)
            self._last_request_at = asyncio.get_running_loop().time()

    async def fetch(
        self,
        city_slug: str,
        *,
        country: str = "",
        raw_currency: str = "EUR",
    ) -> CityCostIndex:
        """Scrape one city's headline numbers from Numbeo.

        Args:
            city_slug: Lowercase hyphen-separated slug; each word is
                capitalised to build the URL.
            country: Country name copied onto the result unchanged.
            raw_currency: Local currency code recorded on the source. A
                code missing from `LOCAL_TO_GBP` does not raise; the
                recorded `gbp_per_unit` falls back to `EUR_TO_GBP`.

        Raises:
            NumbeoFetchError: on any httpx error (including a non-2xx
                status) or parse failure. The caller should catch it and
                fall back to baseline.py.
        """
        url_segment = _slug_to_url_segment(city_slug)
        url = f"{BASE_URL}/{url_segment}"
        await self._polite_wait()
        try:
            resp = await self._client.get(url)
            resp.raise_for_status()
        except httpx.HTTPError as e:
            raise NumbeoFetchError(f"HTTP error for {url}: {e}") from e
        return self._parse(city_slug, country, raw_currency, url, resp.text)

    @staticmethod
    def _parse(
        city_slug: str,
        country: str,
        raw_currency: str,
        url: str,
        html: str,
    ) -> CityCostIndex:
        """Extract the ex-rent headline and 1-bed rents from `html`.

        All amounts are read from the €-prefixed figures and converted
        with `EUR_TO_GBP`. The with-rent total is the headline plus the
        city-centre rent. The outside-centre rent is optional.

        Raises:
            NumbeoFetchError: when the headline or centre-rent figure is
                missing, or any captured figure is not a valid number.
        """
        from decimal import InvalidOperation

        headline_match = _HEADLINE_EUR_RE.search(html)
        rent_center_match = _RENT_CENTER_EUR_RE.search(html)
        rent_outside_match = _RENT_OUTSIDE_EUR_RE.search(html)
        if not (headline_match and rent_center_match):
            raise NumbeoFetchError(
                f"could not locate headline or rent rows on {url}"
            )
        try:
            no_rent_eur = _parse_num(headline_match.group(1))
            rent_center_eur = _parse_num(rent_center_match.group(1))
            rent_outside_eur = (
                _parse_num(rent_outside_match.group(1)) if rent_outside_match else None
            )
        except InvalidOperation as e:
            # The capture group accepts digits and commas, so a stray
            # comma-only match is possible. Report it as a fetch error so
            # callers can still fall back instead of crashing.
            raise NumbeoFetchError(
                f"could not parse headline or rent figures on {url}"
            ) from e
        no_rent_gbp = no_rent_eur * EUR_TO_GBP
        rent_center_gbp = rent_center_eur * EUR_TO_GBP
        rent_outside_gbp = (
            rent_outside_eur * EUR_TO_GBP if rent_outside_eur is not None else None
        )
        with_rent_gbp = no_rent_gbp + rent_center_gbp
        # `gbp_per_unit` reflects the conversion FROM the underlying
        # local currency, not the EUR-side intermediate. When the page
        # quotes a non-EUR local currency, downstream code may want the
        # local→GBP rate for display; we record what we know.
        gbp_per_unit = LOCAL_TO_GBP.get(raw_currency, EUR_TO_GBP)
        return CityCostIndex(
            city=_slug_to_url_segment(city_slug).replace("-", " "),
            city_slug=city_slug,
            country=country,
            total_single_no_rent_gbp=no_rent_gbp.quantize(Decimal("0.01")),
            total_single_with_rent_gbp=with_rent_gbp.quantize(Decimal("0.01")),
            breakdown=CategoryBreakdown(
                rent_1bed_center=rent_center_gbp.quantize(Decimal("0.01")),
                rent_1bed_outside=(rent_outside_gbp.quantize(Decimal("0.01"))
                                   if rent_outside_gbp is not None else None),
                # Live scraper does not extract per-category — see module docstring.
                groceries=Decimal("0"),
                restaurants=Decimal("0"),
                transport=Decimal("0"),
                utilities=Decimal("0"),
                leisure=Decimal("0"),
            ),
            source=ColSource(
                name="numbeo",
                url=url,
                snapshot_date=date.today(),
                raw_currency=raw_currency,
                gbp_per_unit=gbp_per_unit,
            ),
        )
|
||||
|
||||
|
||||
def compute_expires_at(ttl_days: int = 365) -> datetime:
|
||||
"""One-place TTL helper so the cache + service stay in sync."""
|
||||
return datetime.now(UTC) + timedelta(days=ttl_days)
|
||||
170
fire_planner/col/service.py
Normal file
170
fire_planner/col/service.py
Normal file
|
|
@ -0,0 +1,170 @@
|
|||
"""COL service — lookup + ratio computation + async cache+scrape orchestration.
|
||||
|
||||
Sync path (Phase 1 — used by simulator's `_resolve_col_adjustment`):
|
||||
compute_col_ratio(baseline, target) → in-process BASELINES lookup.
|
||||
Fast, no DB roundtrip, no I/O.
|
||||
|
||||
Async path (Phase 2 — used by refresh CronJob and on-demand fetch):
|
||||
lookup_city_cached(slug, sess) → cache → scrape → upsert.
|
||||
Reconciles Numbeo (primary) + Expatistan (secondary) into a single
|
||||
CityCostIndex per city. Cache TTL 1 year.
|
||||
|
||||
The simulator deliberately stays on the sync path: it needs sub-ms
|
||||
latency per request and doesn't tolerate transient scraper failures.
|
||||
The async path keeps the cache fresh in the background.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from decimal import Decimal
|
||||
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from fire_planner.col import cache as col_cache
|
||||
from fire_planner.col.baseline import BASELINES
|
||||
from fire_planner.col.expatistan import ExpatistanFetchError, ExpatistanScraper
|
||||
from fire_planner.col.models import CityCostIndex
|
||||
from fire_planner.col.numbeo import NumbeoFetchError, NumbeoScraper
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# Each jurisdiction has a single canonical city we anchor on. Picked
|
||||
# to match where most users would live (capital or main expat hub) —
|
||||
# Cyprus → Limassol (the largest expat city), not Nicosia (capital);
|
||||
# UAE → Dubai (the expat economy), not Abu Dhabi.
|
||||
JURISDICTION_REPRESENTATIVE_CITY: dict[str, str] = {
|
||||
"uk": "london",
|
||||
"cyprus": "limassol",
|
||||
"bulgaria": "sofia",
|
||||
"uae": "dubai",
|
||||
"malaysia": "kuala-lumpur",
|
||||
"thailand": "bangkok",
|
||||
# "nomad" is intentionally absent — nomad mode is COL-invariant
|
||||
# because the user is on the road. The caller should skip auto-adjust
|
||||
# when jurisdiction='nomad' and provide a manual spending_gbp.
|
||||
}
|
||||
|
||||
|
||||
def lookup_city(city_slug: str) -> CityCostIndex:
    """Return the in-process baseline entry for `city_slug`.

    The slug is trimmed, lower-cased and has spaces turned into hyphens
    before the lookup.

    Raises `KeyError` listing the available slugs when the city has no
    baseline.
    """
    key = city_slug.strip().lower().replace(" ", "-")
    try:
        entry = BASELINES[key]
    except KeyError as missing:
        known = sorted(BASELINES)
        raise KeyError(
            f"No COL baseline for city {city_slug!r}; available: {known}"
        ) from missing
    return entry
|
||||
|
||||
|
||||
def compute_col_ratio(baseline_city: str, target_city: str) -> Decimal:
    """Return `target_total / baseline_total` for two baseline cities.

    Multiplying a spending figure denominated in `baseline_city` by the
    result expresses it at `target_city` prices. Both totals are the
    "single person, total with rent" headline.

    When the two arguments are equal the result is exactly
    `Decimal("1")` and no lookup is performed. Otherwise `KeyError`
    propagates from `lookup_city` for an unknown city.
    """
    if baseline_city != target_city:
        origin = lookup_city(baseline_city)
        destination = lookup_city(target_city)
        return destination.total_monthly_gbp / origin.total_monthly_gbp
    return Decimal("1")
|
||||
|
||||
|
||||
def representative_city_for(jurisdiction: str) -> str | None:
    """Return the anchor city slug for `jurisdiction`.

    Returns None when the jurisdiction has no entry in
    `JURISDICTION_REPRESENTATIVE_CITY` (e.g. 'nomad'), in which case
    auto-adjust should be skipped.
    """
    try:
        return JURISDICTION_REPRESENTATIVE_CITY[jurisdiction]
    except KeyError:
        return None
|
||||
|
||||
|
||||
# Source-precedence weight when reconciling multiple snapshots — higher
|
||||
# beats lower. Numbeo has the largest contributor base; Expatistan is
|
||||
# a fast-decay cross-check; baseline is the hand-curated fallback.
|
||||
_SOURCE_WEIGHT: dict[str, int] = {"numbeo": 3, "expatistan": 2, "baseline": 1}
|
||||
|
||||
|
||||
def reconcile_sources(rows: list[CityCostIndex]) -> CityCostIndex | None:
    """Pick the canonical CityCostIndex from multiple per-source rows.

    Policy: the row with the highest source weight wins; on a tie the
    most recent `snapshot_date` wins. Sources missing from
    `_SOURCE_WEIGHT` rank lowest.

    The chosen row's with-rent total is then compared against every
    alternate. When one differs by more than 25% of the chosen total,
    this function logs a warning so source drift can be audited. The
    comparison never changes which row is returned.

    Returns None for an empty list; otherwise one of the input rows.
    """
    if not rows:
        return None
    ranked = sorted(
        rows,
        key=lambda r: (_SOURCE_WEIGHT.get(r.source.name, 0), r.source.snapshot_date),
        reverse=True,
    )
    chosen, *alternates = ranked
    primary_total = chosen.total_single_with_rent_gbp
    if alternates and not primary_total:
        # A zero headline cannot be used as a divisor. The drift audit is
        # log-only, so skip it rather than abort reconciliation.
        log.warning(
            "col reconcile %s: %s total is zero; skipping divergence check",
            chosen.city_slug,
            chosen.source.name,
        )
        return chosen
    for alt in alternates:
        divergence = abs(alt.total_single_with_rent_gbp - primary_total) / primary_total
        if divergence > Decimal("0.25"):
            log.warning(
                "col reconcile %s: %s=%s diverges >%s%% from %s=%s",
                chosen.city_slug,
                alt.source.name,
                alt.total_single_with_rent_gbp,
                int(divergence * 100),
                chosen.source.name,
                primary_total,
            )
    return chosen
|
||||
|
||||
|
||||
async def lookup_city_cached(
    sess: AsyncSession,
    city_slug: str,
    *,
    country: str = "",
) -> CityCostIndex:
    """Cache → scrape → fallback. Async; used by refresh CronJob and any
    future on-demand fetch path.

    Order of attempts:
      1. An unexpired cached snapshot, returned as-is.
      2. Live scrapes from Numbeo and Expatistan. Every successful result
         is written to the cache and the reconciled winner is returned.
      3. The in-process baseline for the city, which is also written to
         the cache.

    Args:
        sess: Session used for cache reads and writes.
        city_slug: Slug used for the cache, the scrapers and the baseline
            lookup; it is not normalised here.
        country: Country name passed through to the scrapers.

    Raises:
        KeyError: when the cache has no fresh row, both scrapers fail and
            the city has no baseline.
    """
    cached = await col_cache.read_fresh(sess, city_slug)
    if cached is not None:
        return cached
    # Cache miss or expired — try live sources.
    fetched: list[CityCostIndex] = []
    try:
        async with NumbeoScraper() as scraper:
            fetched.append(await scraper.fetch(city_slug, country=country))
    except NumbeoFetchError as e:
        log.warning("numbeo fetch failed for %s: %s", city_slug, e)
    try:
        async with ExpatistanScraper() as scraper:
            fetched.append(await scraper.fetch(city_slug, country=country))
    except ExpatistanFetchError as e:
        log.warning("expatistan fetch failed for %s: %s", city_slug, e)
    chosen = reconcile_sources(fetched)
    if chosen is not None:
        # read_fresh() serves the row with the newest `fetched_at`, and each
        # upsert stamps its own timestamp. Write the reconciled winner last
        # so later cache hits return the same row as this call.
        for row in fetched:
            if row is not chosen:
                await col_cache.upsert(sess, row)
        await col_cache.upsert(sess, chosen)
        return chosen
    # Both scrapers failed — fall back to in-process baseline.
    baseline = BASELINES.get(city_slug)
    if baseline is not None:
        await col_cache.upsert(sess, baseline)
        return baseline
    raise KeyError(
        f"COL lookup failed for {city_slug!r}: cache empty, scrapers failed, "
        f"no baseline"
    )
|
||||
|
|
@ -244,6 +244,50 @@ class IncomeStream(Base):
|
|||
server_default=func.now())
|
||||
|
||||
|
||||
class ColSnapshot(Base):
    """One cached cost-of-living reading for a city from a single source.

    Phase 2 of the COL subsystem. Replaces the previous "baseline-only"
    lookup with cache-then-scrape semantics:

        service.lookup_city_cached(sess, slug)
            → return the fresh ColSnapshot row if there is one
            → else scrape the live sources, upsert, return
            → if every scrape fails, fall back to baseline.py

    TTL = 365 days (`expires_at = fetched_at + interval '365 day'`). The
    user explicitly asked for 1y on 2026-05-21 — Numbeo data doesn't move
    fast enough to need monthly refresh, and the API/scraper has rate-limit
    risk we prefer to amortise. A scheduled refresh job re-fetches stale
    rows so individual user requests should not have to scrape.

    Rows are keyed logically by `(city_slug, source_name)`: a city can hold
    one row per source (Numbeo + Expatistan) and service.py reconciles them.
    """

    __tablename__ = "col_snapshot"
    # NOTE(review): the migration is described as adding
    # UNIQUE(city_slug, source_name), but no such constraint is declared
    # here — confirm whether schemas created from metadata need it too.
    __table_args__ = {"schema": SCHEMA_NAME}  # noqa: RUF012

    # --- identity -------------------------------------------------------
    id: Mapped[int] = mapped_column(
        Integer,
        primary_key=True,
        autoincrement=True,
    )
    city_slug: Mapped[str] = mapped_column(
        String(64),
        nullable=False,
        index=True,
    )
    city_display: Mapped[str] = mapped_column(String(128), nullable=False)
    country: Mapped[str] = mapped_column(String(64), nullable=False)

    # --- provenance -----------------------------------------------------
    source_name: Mapped[str] = mapped_column(String(32), nullable=False)
    source_url: Mapped[str | None] = mapped_column(String, nullable=True)
    snapshot_date: Mapped[date] = mapped_column(Date, nullable=False)

    # --- freshness ------------------------------------------------------
    fetched_at: Mapped[datetime] = mapped_column(
        TIMESTAMP(timezone=True),
        nullable=False,
        server_default=func.now(),
    )
    # No default on this column: whoever writes the row must supply the expiry.
    expires_at: Mapped[datetime] = mapped_column(
        TIMESTAMP(timezone=True),
        nullable=False,
    )

    # --- headline figures (GBP) -----------------------------------------
    total_no_rent_gbp: Mapped[Decimal] = mapped_column(
        Numeric(12, 2),
        nullable=False,
    )
    total_with_rent_gbp: Mapped[Decimal] = mapped_column(
        Numeric(12, 2),
        nullable=False,
    )
    rent_1bed_center_gbp: Mapped[Decimal] = mapped_column(
        Numeric(12, 2),
        nullable=False,
    )
    rent_1bed_outside_gbp: Mapped[Decimal | None] = mapped_column(
        Numeric(12, 2),
        nullable=True,
    )

    # --- currency conversion used for the figures above -----------------
    raw_currency: Mapped[str] = mapped_column(
        String(3),
        nullable=False,
        server_default="GBP",
    )
    gbp_per_unit: Mapped[Decimal] = mapped_column(
        Numeric(12, 8),
        nullable=False,
        server_default=text("1"),
    )

    # Optional per-category breakdown stored as JSON.
    by_category_json: Mapped[dict[str, Any] | None] = mapped_column(
        JSON_TYPE,
        nullable=True,
    )
|
||||
|
||||
|
||||
class RetirementGoal(Base):
|
||||
"""A user-defined success criterion for a scenario.
|
||||
|
||||
|
|
|
|||
104
tests/test_col.py
Normal file
104
tests/test_col.py
Normal file
|
|
@@ -0,0 +1,104 @@
|
|||
"""Tests for the COL module — baseline lookup + ratio + simulator wiring."""
|
||||
from __future__ import annotations
|
||||
|
||||
from decimal import Decimal
|
||||
|
||||
import pytest
|
||||
|
||||
from fire_planner.col import (
|
||||
JURISDICTION_REPRESENTATIVE_CITY,
|
||||
compute_col_ratio,
|
||||
lookup_city,
|
||||
representative_city_for,
|
||||
)
|
||||
from fire_planner.col.baseline import BASELINES
|
||||
from fire_planner.col.models import CityCostIndex
|
||||
|
||||
|
||||
class TestBaselineCoverage:
    """The static baseline table must back every mapped jurisdiction city."""

    def test_all_representative_cities_have_baselines(self) -> None:
        # Collect every mapped city that the baseline table does not know.
        mapped_cities = JURISDICTION_REPRESENTATIVE_CITY.values()
        missing = [city for city in mapped_cities if city not in BASELINES]
        assert missing == [], (
            f"jurisdiction map points at city(s) without baselines: {missing}"
        )

    def test_baselines_have_positive_totals(self) -> None:
        for slug, idx in BASELINES.items():
            without_rent = idx.total_single_no_rent_gbp
            with_rent = idx.total_single_with_rent_gbp
            assert without_rent > 0, f"{slug} no_rent must be positive"
            assert with_rent > without_rent, (
                f"{slug} with_rent must exceed no_rent — rent should be a positive add"
            )

    def test_baseline_source_provenance_present(self) -> None:
        known_sources = {"numbeo", "expatistan", "baseline", "manual"}
        for slug, idx in BASELINES.items():
            origin = idx.source
            assert origin.name in known_sources
            assert origin.url is not None, f"{slug} baseline missing source URL"
            assert origin.url.startswith("https://"), f"{slug} URL must be https"
|
||||
|
||||
|
||||
class TestLookup:
    """`lookup_city` resolves known cities and rejects unknown ones."""

    def test_lookup_known_city(self) -> None:
        entry = lookup_city("london")
        assert isinstance(entry, CityCostIndex)
        assert entry.city == "London"
        assert entry.country == "United Kingdom"

    def test_lookup_normalises_input(self) -> None:
        # Mixed case, inner spaces and surrounding whitespace must all
        # resolve to the same entry as the canonical slug would.
        cases = (
            ("Kuala Lumpur", "Kuala Lumpur"),
            (" Bangkok ", "Bangkok"),
        )
        for raw_name, expected_city in cases:
            assert lookup_city(raw_name).city == expected_city

    def test_lookup_unknown_raises(self) -> None:
        with pytest.raises(KeyError, match="No COL baseline"):
            lookup_city("atlantis")
|
||||
|
||||
|
||||
class TestColRatio:
    """Sanity bounds on `compute_col_ratio(baseline, target)` for known pairs."""

    def test_identity_returns_one(self) -> None:
        same_city = compute_col_ratio("london", "london")
        assert same_city == Decimal("1")

    def test_sofia_cheaper_than_london(self) -> None:
        sofia = compute_col_ratio("london", "sofia")
        assert sofia < Decimal("1"), "Sofia must be cheaper than London"
        assert sofia > Decimal("0.2"), "Sofia ratio looks implausibly low"
        # Real Numbeo number is ~0.41, so pin a band around it.
        low, high = Decimal("0.35"), Decimal("0.50")
        assert low < sofia < high

    def test_dubai_cheaper_than_london(self) -> None:
        # Dubai comes out *cheaper* than London on Numbeo's headline figure
        # because London rent dominates. That was a surprise — flag it in
        # the baseline note for future-us.
        dubai = compute_col_ratio("london", "dubai")
        assert dubai < Decimal("1")
        low, high = Decimal("0.70"), Decimal("0.95")
        assert low < dubai < high

    def test_bangkok_far_cheaper_than_london(self) -> None:
        bangkok = compute_col_ratio("london", "bangkok")
        assert bangkok < Decimal("0.40")

    def test_inverse_consistency(self) -> None:
        # london→sofia times sofia→london should be ~1, within rounding.
        forward = compute_col_ratio("london", "sofia")
        backward = compute_col_ratio("sofia", "london")
        round_trip_error = abs(forward * backward - Decimal("1"))
        assert round_trip_error < Decimal("0.001")
|
||||
|
||||
|
||||
class TestRepresentativeCity:
    """`representative_city_for` maps a jurisdiction to its default city slug."""

    def test_known_jurisdictions(self) -> None:
        expected_city = {
            "uk": "london",
            "cyprus": "limassol",
            "bulgaria": "sofia",
            "uae": "dubai",
            "malaysia": "kuala-lumpur",
            "thailand": "bangkok",
        }
        for jurisdiction, city in expected_city.items():
            assert representative_city_for(jurisdiction) == city

    def test_nomad_returns_none(self) -> None:
        # Nomad mode is COL-invariant by design — auto-adjust is skipped.
        nomad_city = representative_city_for("nomad")
        assert nomad_city is None

    def test_unknown_returns_none(self) -> None:
        unknown_city = representative_city_for("vulcan")
        assert unknown_city is None
|
||||
91
tests/test_simulator_col_integration.py
Normal file
91
tests/test_simulator_col_integration.py
Normal file
|
|
@@ -0,0 +1,91 @@
|
|||
"""Simulator + COL integration — verifies `_resolve_col_adjustment` is
|
||||
applied to the request before paths are built and surfaced in the result.
|
||||
|
||||
These tests bypass HTTP and call the resolver directly to keep them fast.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from decimal import Decimal
|
||||
|
||||
from fire_planner.api.schemas import SimulateRequest
|
||||
from fire_planner.api.simulate import _resolve_col_adjustment
|
||||
|
||||
|
||||
def _req(**overrides: object) -> SimulateRequest:
    """Build a SimulateRequest from fixed test defaults, with per-test overrides.

    Any keyword passed in replaces the default of the same name; extra
    keywords are forwarded to SimulateRequest unchanged.
    """
    defaults: dict[str, object] = {
        "jurisdiction": "uk",
        "strategy": "trinity",
        "leave_uk_year": 0,
        "spending_gbp": Decimal("85000"),
        "nw_seed_gbp": Decimal("1050000"),
        "horizon_years": 73,
    }
    merged = {**defaults, **overrides}
    return SimulateRequest(**merged)  # type: ignore[arg-type]
|
||||
|
||||
|
||||
def test_col_default_on_for_known_jurisdiction() -> None:
    """With defaults, a cyprus request is scaled by roughly 0.67 (Limassol)."""
    request = _req(jurisdiction="cyprus", leave_uk_year=2)
    adjusted, multiplier, adjusted_spend, target_city = _resolve_col_adjustment(request)

    assert multiplier is not None
    assert Decimal("0.55") < multiplier < Decimal("0.75")
    assert target_city == "limassol"
    assert adjusted_spend is not None
    assert adjusted_spend < Decimal("85000")
    # The simulator must run on the adjusted figure, not the original one.
    assert adjusted.spending_gbp == adjusted_spend
|
||||
|
||||
|
||||
def test_col_off_returns_unchanged_request() -> None:
    """Disabling auto-adjust yields no multiplier and the very same request."""
    request = _req(jurisdiction="cyprus", leave_uk_year=2, col_auto_adjust=False)
    adjusted, multiplier, adjusted_spend, target_city = _resolve_col_adjustment(request)

    for skipped_value in (multiplier, adjusted_spend, target_city):
        assert skipped_value is None
    assert adjusted.spending_gbp == Decimal("85000")
    # No-op path: the original instance is handed back, not a copy.
    assert adjusted is request
|
||||
|
||||
|
||||
def test_col_nomad_jurisdiction_skipped() -> None:
    """Nomad has no representative city, so auto-adjust silently does nothing."""
    request = _req(jurisdiction="nomad", leave_uk_year=2)
    _, multiplier, adjusted_spend, target_city = _resolve_col_adjustment(request)

    assert multiplier is None
    assert adjusted_spend is None
    # There is no representative city to report for nomad.
    assert target_city is None
|
||||
|
||||
|
||||
def test_col_uk_to_uk_identity_returns_no_multiplier() -> None:
    """Staying in the UK is the identity case: city is reported, nothing is scaled."""
    request = _req(jurisdiction="uk", leave_uk_year=0)
    adjusted, multiplier, adjusted_spend, target_city = _resolve_col_adjustment(request)

    assert target_city == "london"
    assert multiplier is None
    assert adjusted_spend is None
    assert adjusted.spending_gbp == Decimal("85000")
|
||||
|
||||
|
||||
def test_col_explicit_target_city_overrides_jurisdiction_default() -> None:
    """An explicit `col_target_city` wins over the jurisdiction's default city."""
    request = _req(jurisdiction="cyprus", leave_uk_year=2, col_target_city="sofia")
    _, multiplier, _, target_city = _resolve_col_adjustment(request)

    assert target_city == "sofia"
    # Sofia's ratio is ~0.41, which is below the Limassol default.
    assert multiplier is not None
    assert multiplier < Decimal("0.50")
|
||||
|
||||
|
||||
def test_col_unknown_city_degrades_gracefully() -> None:
    """An unknown target city skips the adjustment instead of raising.

    The Phase-2 scraper is expected to close this gap later.
    """
    request = _req(jurisdiction="cyprus", leave_uk_year=2, col_target_city="atlantis")
    adjusted, multiplier, adjusted_spend, target_city = _resolve_col_adjustment(request)

    assert multiplier is None
    assert adjusted_spend is None
    # The requested name is still echoed back even though it was not resolved.
    assert target_city == "atlantis"
    assert adjusted.spending_gbp == Decimal("85000")
|
||||
|
||||
|
||||
def test_col_bangkok_dramatic_discount() -> None:
    """A thailand request defaults to Bangkok and cuts spending by roughly two thirds."""
    request = _req(jurisdiction="thailand", leave_uk_year=2)
    _, multiplier, adjusted_spend, target_city = _resolve_col_adjustment(request)

    assert target_city == "bangkok"
    assert multiplier is not None
    assert multiplier < Decimal("0.35")
    # £85k → ~£24k
    assert adjusted_spend is not None
    assert adjusted_spend < Decimal("30000")
|
||||
Loading…
Add table
Add a link
Reference in a new issue