# fire_planner/examples/filters.py
"""Cheap regex pre-filter — keep posts that look like FIRE examples.

A post survives if BOTH:
  - it mentions money in a FIRE-relevant way (£/$/€ followed by a digit,
    an ISO currency code such as GBP/USD/EUR, "net worth", "portfolio",
    "million", "saved"), AND
  - it mentions a location (country or major city).

This prunes ~70–90 % of subreddit traffic before any LLM call. We
deliberately err on the side of false-positives — the LLM is the
expensive but reliable filter; this is the cheap pre-pass.
"""
from __future__ import annotations

import re
from functools import lru_cache
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only referenced in annotations, which are lazy strings under
    # `from __future__ import annotations`; no runtime import is needed.
    from fire_planner.examples.models import RawPost

# Matches any single money signal, case-insensitively. The pattern is
# intentionally loose: a hit only means "possibly about money".
MONEY_RE = re.compile(
    r"(?:[£$€]\s?\d|"                      # currency symbol + digit
    r"\b(?:GBP|USD|EUR|JPY|AUD|CAD)\b|"    # ISO currency codes
    r"\bmillions?\b|"                      # "million" / "millions"
    r"\bnet[\s-]*worth\b|"                 # "net worth", "networth", "net-worth"
    r"\bportfolio\b|\bsaved\b)",
    re.IGNORECASE,
)

# Location keywords: curated to cover the 12 target subs' typical
# countries/cities. Extend as needed. The intended order is longer,
# less-ambiguous tokens first. NOTE(review): the literal below is not
# strictly longest-first (e.g. "mexico" precedes "mexico city"); that is
# harmless while the pattern is only used as a presence test.
_LOCATION_KEYWORDS: list[str] = [
    # countries
    "philippines", "indonesia", "thailand", "vietnam", "malaysia",
    "singapore", "japan", "korea", "taiwan", "india", "australia",
    "new zealand", "canada", "united states", "usa", "uk", "ireland",
    "scotland", "wales", "england", "spain", "portugal", "france",
    "germany", "netherlands", "belgium", "italy", "greece", "cyprus",
    "bulgaria", "romania", "poland", "czech", "hungary", "switzerland",
    "austria", "denmark", "sweden", "norway", "finland", "estonia",
    "uae", "dubai", "abu dhabi", "saudi", "qatar", "kuwait", "bahrain",
    "mexico", "brazil", "argentina", "chile", "colombia", "peru",
    "panama", "costa rica", "ecuador",
    # cities common in expat-FIRE posts
    "manila", "cebu", "bangkok", "chiang mai", "phuket", "ho chi minh",
    "kuala lumpur", "penang", "bali", "jakarta", "tokyo", "osaka",
    "lisbon", "porto", "madeira", "madrid", "barcelona", "valencia",
    "limassol", "nicosia", "sofia", "athens", "berlin", "munich",
    "amsterdam", "london", "edinburgh", "manchester", "dublin",
    "sydney", "melbourne", "auckland", "vancouver", "toronto",
    "mexico city", "buenos aires", "santiago",
]

# Whole-word, case-insensitive match on any keyword. The alternation is
# built longest-first so that group 1 yields the most specific keyword
# ("mexico city" rather than "mexico"). Whether a match exists at all does
# not depend on this order.
LOCATION_RE = re.compile(
    r"\b("
    + "|".join(
        re.escape(keyword)
        for keyword in sorted(_LOCATION_KEYWORDS, key=len, reverse=True)
    )
    + r")\b",
    re.IGNORECASE,
)


def _haystack(reddit_id: str, title: str, body: str) -> str:
    """Return the text searched by the filter: title, newline, body.

    Args:
        reddit_id: Unused; kept so existing positional callers keep working.
        title: Post title.
        body: Post body.

    Returns:
        ``title`` and ``body`` joined by a single newline.
    """
    # Deliberately not memoised: hashing three strings for a cache lookup
    # costs more than this concatenation, and a cache would keep whole
    # post bodies alive for no benefit.
    return f"{title}\n{body}"


def is_candidate(post: RawPost) -> bool:
    """Return True when `post` is worth sending to the LLM.

    A post qualifies only if its combined title and body contain at least
    one money signal (``MONEY_RE``) and at least one location keyword
    (``LOCATION_RE``).
    """
    text = _haystack(post.reddit_id, post.title, post.body)
    return bool(MONEY_RE.search(text) and LOCATION_RE.search(text))


# ---------------------------------------------------------------------------
# Patch context: the code after this point belongs to
# tests/test_examples_filters.py, whose existing imports include:
#   import pytest
#   from pydantic import ValidationError
#   from fire_planner.examples import ExtractedExample, FiStatus, RawPost, SummaryStats
# ---------------------------------------------------------------------------
from fire_planner.examples.filters import is_candidate

# NOTE: the patch shows only the headers of test_raw_post_minimal and
# test_extracted_example_fi_status_enum; their bodies are outside this view
# and are left untouched.


def test_summary_stats_optional_fields() -> None:
    """SummaryStats accepts None for every statistic."""
    stats = SummaryStats(median=None, p25=None, p75=None)
    assert stats.median is None


def _post(title: str, body: str = "") -> RawPost:
    """Build a RawPost with fixed placeholder metadata for filter tests."""
    fields = {
        "reddit_id": "x",
        "source_sub": "s",
        "url": "u",
        "title": title,
        "body": body,
        "created_at": date(2026, 1, 1),
    }
    return RawPost(**fields)


def test_filter_keeps_money_plus_location() -> None:
    """Money signal and location together pass the filter."""
    post = _post("Hit £1m living in Lisbon, Portugal")
    assert is_candidate(post)


def test_filter_drops_money_without_location() -> None:
    """A money signal alone is rejected."""
    post = _post("Hit £1m, feels great!")
    assert not is_candidate(post)


def test_filter_drops_location_without_money() -> None:
    """A location alone is rejected."""
    post = _post("Moving to Lisbon next year")
    assert not is_candidate(post)


def test_filter_dollar_signs_count() -> None:
    """A dollar amount counts as a money signal."""
    post = _post("$1.2M net worth, retired in Chiang Mai")
    assert is_candidate(post)


def test_filter_recognises_net_worth_keyword() -> None:
    """The phrase "net worth" counts even without a currency symbol."""
    post = _post("Net worth update — now in Bali, Indonesia")
    assert is_candidate(post)


def test_filter_keyword_match_is_case_insensitive() -> None:
    """Upper-case keywords match just like lower-case ones."""
    post = _post("PORTFOLIO milestone reached, settled in PHILIPPINES")
    assert is_candidate(post)