The Monte Carlo used to compare jurisdictions at a flat London-equivalent spend, which silently overstated the cost-of-living for any move to a cheaper region. Now every cross-jurisdiction simulation auto-scales spending_gbp by the real Numbeo/Expatistan ratio between the user's baseline city and the target city. Architecture: - fire_planner/col/baseline.py — 22 cities with headline Numbeo data (source URLs + snapshot dates embedded) — fallback when scraper fails - col/numbeo.py + col/expatistan.py — httpx async scrapers, regex-parsed, polite 1.1s rate-limit, EUR/USD anchored - col/cache.py — PG-backed cache (col_snapshot table, 1-year TTL) - col/service.py — sync compute_col_ratio() for the simulator; async lookup_city_cached() with source reconciliation for the refresh CronJob - alembic 0005 — col_snapshot table, UNIQUE(city_slug, source_name) Simulator wiring: - SimulateRequest gains col_auto_adjust=True (default), col_baseline_city, col_target_city. Defaults pick the jurisdiction's representative city. - _resolve_col_adjustment scales spending_gbp before path-building. - SimulateResult surfaces col_multiplier_applied + col_adjusted_spending_gbp. CLIs: - python -m fire_planner col-seed — loads BASELINES into col_snapshot (post-migration seed step) - python -m fire_planner col-refresh-stale --within-days 7 — used by the weekly fire-planner-col-refresh CronJob 268 tests pass. Mypy strict + ruff clean. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
165 lines
5.9 KiB
Python
165 lines
5.9 KiB
Python
"""Expatistan HTML scraper — secondary COL source.
|
|
|
|
Used by the cache layer to cross-check Numbeo. Expatistan's page format
is different (price-of-living-index based, not absolute monthly figures),
so the headline we extract is their "single person, monthly cost"
estimate from the "Cost of Living in <city>" landing page.

Lower fidelity than Numbeo but ToS-friendlier — Expatistan publishes
its data under CC and explicitly allows non-commercial scraping.

Source-of-truth precedence (set in service.reconcile):
  1. numbeo — primary, most data points
  2. expatistan — secondary, cross-check
  3. baseline — hand-curated fallback
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import logging
|
|
import re
|
|
from datetime import date
|
|
from decimal import Decimal
|
|
from typing import Final
|
|
|
|
import httpx
|
|
|
|
from fire_planner.col.models import CategoryBreakdown, CityCostIndex, ColSource
|
|
|
|
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Root of the per-city pages; the scraper appends "/<city_slug>" to it.
BASE_URL: Final = "https://www.expatistan.com/cost-of-living"

# Sent on every request made with a client the scraper creates itself.
USER_AGENT: Final = (
    "fire-planner/0.1 "
    "(+https://forgejo.viktorbarzin.me/viktor/code; "
    "non-commercial personal use; 1-year cache)"
)

# Per-phase timeouts in seconds for the scraper-owned HTTP client.
DEFAULT_TIMEOUT: Final = httpx.Timeout(
    connect=10.0,
    read=30.0,
    write=10.0,
    pool=10.0,
)

# Default minimum gap, in seconds, between two consecutive requests.
MIN_REQUEST_INTERVAL: Final = 1.1

# Expatistan publishes prices in USD by default. Convert to GBP.
# NOTE(review): this is a fixed rate baked into the module, not a live quote.
USD_TO_GBP: Final = Decimal("0.787")
|
|
|
|
# Single-person monthly estimate appears in the page text as:
# "Cost of living in <City>, <Country> for an expat is $X" or similar
# Format varies; capture both "$X,XXX" and "$X" patterns.
#
# Two guards keep the capture honest:
#   * ``expat(?!istan)`` — the bare word "expat" is also the prefix of the
#     site's own name, so without the lookahead the keyword can match the
#     branding and bind to whatever dollar figure follows it first.
#   * the captured number must start with a digit, so a stray "$," can never
#     be captured as a comma-only "number" that later fails to convert.
_SINGLE_PERSON_USD_RE = re.compile(
    r"(?:single\s+person|expat(?!istan))[^$]*?"
    r"\$\s*([0-9][0-9,]*(?:\.[0-9]+)?)",
    re.IGNORECASE | re.DOTALL,
)

# Apartment rent (1 bedroom) appears on the "Prices" table line:
# "Rent for a furnished single room (1 bedroom) in city centre $X,XXX"
# The first gap excludes "<" so the "1 bedroom" label and the centre/expensive
# qualifier must sit in the same text run; the amount may follow in later markup.
_RENT_CENTER_USD_RE = re.compile(
    r"(?:1\s*bedroom|one[-\s]?bedroom)[^$<]*?(?:cent|expensive)[^$]*?"
    r"\$\s*([0-9][0-9,]*(?:\.[0-9]+)?)",
    re.IGNORECASE | re.DOTALL,
)
|
|
|
|
|
|
class ExpatistanFetchError(RuntimeError):
    """Raised when an Expatistan page cannot be fetched or parsed.

    HTTP and parse failures share this one type so the cache layer can
    catch it and fall back.
    """
|
|
|
|
|
|
def _parse_num(s: str) -> Decimal:
    """Convert a captured price string such as ``"1,234.50"`` to a ``Decimal``.

    Commas (thousands separators) are removed before the conversion.
    """
    without_commas = "".join(s.split(","))
    return Decimal(without_commas)
|
|
|
|
|
|
class ExpatistanScraper:
    """Async scraper for Expatistan's per-city cost-of-living pages.

    ``fetch`` downloads ``{BASE_URL}/{city_slug}``, extracts the single-person
    monthly headline and the 1-bedroom city-centre rent (both in USD) and
    returns them converted to GBP as a ``CityCostIndex``.

    Requests made through one instance are serialised and spaced at least
    ``min_interval`` seconds apart.  Use the instance as an async context
    manager so that a client created by the scraper is closed on exit; a
    client supplied by the caller is never closed here.
    """

    def __init__(
        self,
        *,
        client: httpx.AsyncClient | None = None,
        min_interval: float = MIN_REQUEST_INTERVAL,
    ) -> None:
        """Set up the HTTP client and the rate-limit state.

        Args:
            client: Existing client to reuse.  When ``None`` the scraper
                builds its own (with ``USER_AGENT``, ``DEFAULT_TIMEOUT`` and
                redirect following) and takes responsibility for closing it.
            min_interval: Minimum number of seconds between two requests.
        """
        # Ownership and client selection both hinge on the same `is None`
        # test, so they can never disagree about who closes the client.
        self._owns_client = client is None
        if client is None:
            client = httpx.AsyncClient(
                headers={"User-Agent": USER_AGENT, "Accept-Language": "en-GB,en;q=0.9"},
                timeout=DEFAULT_TIMEOUT,
                follow_redirects=True,
            )
        self._client = client
        self._min_interval = min_interval
        self._last_request_at: float = 0.0
        self._lock = asyncio.Lock()

    async def __aenter__(self) -> ExpatistanScraper:
        """Return the scraper itself; nothing is opened on entry."""
        return self

    async def __aexit__(self, *_: object) -> None:
        """Close the HTTP client, but only if this scraper created it."""
        if self._owns_client:
            await self._client.aclose()

    async def _polite_wait(self) -> None:
        """Sleep until ``min_interval`` has elapsed since the last request.

        The lock is held while sleeping so concurrent callers queue up and
        are released one interval apart rather than all at once.
        """
        async with self._lock:
            loop = asyncio.get_running_loop()
            elapsed = loop.time() - self._last_request_at
            if elapsed < self._min_interval:
                await asyncio.sleep(self._min_interval - elapsed)
            # Re-read the clock: the sleep above may have moved it forward.
            self._last_request_at = loop.time()

    async def fetch(
        self,
        city_slug: str,
        *,
        country: str = "",
    ) -> CityCostIndex:
        """Download and parse the Expatistan page for ``city_slug``.

        Args:
            city_slug: Lowercase, hyphen-separated city identifier; appended
                verbatim to ``BASE_URL``.
            country: Copied unchanged into the returned ``CityCostIndex``.

        Returns:
            The parsed cost index, with amounts converted to GBP.

        Raises:
            ExpatistanFetchError: On a transport error, a non-success HTTP
                status, a URL httpx rejects, or a page that cannot be parsed.
        """
        # Expatistan uses lowercase city slugs separated by hyphens —
        # same convention as our internal slugs.
        url = f"{BASE_URL}/{city_slug}"
        await self._polite_wait()
        try:
            resp = await self._client.get(url)
            resp.raise_for_status()
        except (httpx.HTTPError, httpx.InvalidURL) as e:
            # InvalidURL is not an HTTPError subclass; without it a malformed
            # slug would bypass the caller's ExpatistanFetchError fallback.
            raise ExpatistanFetchError(f"HTTP error for {url}: {e}") from e
        return self._parse(city_slug, country, url, resp.text)

    @staticmethod
    def _parse(
        city_slug: str,
        country: str,
        url: str,
        html: str,
    ) -> CityCostIndex:
        """Extract the headline figures from ``html`` and convert them to GBP.

        Args:
            city_slug: Slug of the city; also used to derive the display name.
            country: Copied unchanged into the result.
            url: Page address, recorded as the source and used in error text.
            html: Raw page body to search.

        Returns:
            A ``CityCostIndex`` whose breakdown carries only the city-centre
            rent; the remaining categories are set to zero.

        Raises:
            ExpatistanFetchError: If either figure is missing or unparseable,
                or if the rent is not smaller than the with-rent total.
        """
        # Imported here so the fix needs no change to the module's imports.
        from decimal import InvalidOperation

        single_match = _SINGLE_PERSON_USD_RE.search(html)
        rent_match = _RENT_CENTER_USD_RE.search(html)
        if single_match is None or rent_match is None:
            raise ExpatistanFetchError(
                f"could not locate single-person or rent figure on {url}"
            )
        # Expatistan's "single person" headline is total with rent —
        # different convention from Numbeo. Use it as `total_with_rent`
        # directly; derive no_rent by subtracting their rent figure.
        try:
            with_rent_usd = _parse_num(single_match.group(1))
            rent_usd = _parse_num(rent_match.group(1))
        except InvalidOperation as e:
            # A capture that is not a valid number must surface as the
            # scraper's own error type, not as a raw decimal exception.
            raise ExpatistanFetchError(
                f"unparseable price figure on {url}: {e}"
            ) from e
        with_rent_gbp = with_rent_usd * USD_TO_GBP
        rent_gbp = rent_usd * USD_TO_GBP
        no_rent_gbp = with_rent_gbp - rent_gbp
        # Guard against malformed pages where rent > total (unusual but
        # possible if the regex grabs the wrong row).
        if no_rent_gbp <= 0:
            raise ExpatistanFetchError(
                f"derived no_rent <= 0 ({no_rent_gbp}) on {url}; "
                f"with_rent={with_rent_gbp}, rent={rent_gbp}"
            )
        penny = Decimal("0.01")
        zero = Decimal("0")
        return CityCostIndex(
            city=city_slug.replace("-", " ").title(),
            city_slug=city_slug,
            country=country,
            total_single_no_rent_gbp=no_rent_gbp.quantize(penny),
            total_single_with_rent_gbp=with_rent_gbp.quantize(penny),
            breakdown=CategoryBreakdown(
                rent_1bed_center=rent_gbp.quantize(penny),
                rent_1bed_outside=None,
                # Only the rent figure is scraped; other categories stay zero.
                groceries=zero,
                restaurants=zero,
                transport=zero,
                utilities=zero,
                leisure=zero,
            ),
            source=ColSource(
                name="expatistan",
                url=url,
                snapshot_date=date.today(),
                raw_currency="USD",
                gbp_per_unit=USD_TO_GBP,
            ),
        )
|