examples: regex pre-filter (MONEY_RE + LOCATION_RE)
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful

This commit is contained in:
Viktor Barzin 2026-05-28 22:14:59 +00:00
parent c9bdf537ac
commit a378c7256e
2 changed files with 100 additions and 0 deletions

View file

@ -0,0 +1,64 @@
"""Cheap regex pre-filter — keep posts that look like FIRE examples.
A post survives if BOTH:
- it mentions money in a FIRE-relevant way (£/$/€, "net worth",
"portfolio", "million"), AND
- it mentions a location (country or major city).
This prunes ~70–90 % of subreddit traffic before any LLM call. We
deliberately err on the side of false positives — the LLM is the
expensive but reliable filter; this is the cheap pre-pass.
"""
from __future__ import annotations
import re
from functools import lru_cache
from fire_planner.examples.models import RawPost
# Matches any money signal, case-insensitively. A hit is one of:
#   * a currency symbol (£, $, €) followed, after at most one whitespace
#     character, by a digit;
#   * a standalone currency code (GBP, USD, EUR, JPY, AUD, CAD);
#   * one of the keywords "million", "portfolio", "saved";
#   * "net worth" in any common spelling: "net worth", "networth" or
#     "net-worth" (the hyphenated form is accepted so such posts are not
#     silently dropped before the LLM ever sees them).
MONEY_RE = re.compile(
    r"(?:[£$€]\s?\d|"  # currency symbol + digit
    r"\b(?:GBP|USD|EUR|JPY|AUD|CAD)\b|"  # currency codes
    r"\bmillion\b|\bnet[\s-]*worth\b|\bportfolio\b|\bsaved\b)",
    re.IGNORECASE,
)
# Curated to cover the 12 target subs' typical countries/cities. Extend as
# needed. Entries may be listed in any order: LOCATION_RE sorts them
# longest-first when it builds its alternation.
_LOCATION_KEYWORDS: list[str] = [
    # countries
    "philippines", "indonesia", "thailand", "vietnam", "malaysia",
    "singapore", "japan", "korea", "taiwan", "india", "australia",
    "new zealand", "canada", "united states", "usa", "uk", "ireland",
    "scotland", "wales", "england", "spain", "portugal", "france",
    "germany", "netherlands", "belgium", "italy", "greece", "cyprus",
    "bulgaria", "romania", "poland", "czech", "hungary", "switzerland",
    "austria", "denmark", "sweden", "norway", "finland", "estonia",
    "uae", "dubai", "abu dhabi", "saudi", "qatar", "kuwait", "bahrain",
    "mexico", "brazil", "argentina", "chile", "colombia", "peru",
    "panama", "costa rica", "ecuador",
    # cities common in expat-FIRE posts
    "manila", "cebu", "bangkok", "chiang mai", "phuket", "ho chi minh",
    "kuala lumpur", "penang", "bali", "jakarta", "tokyo", "osaka",
    "lisbon", "porto", "madeira", "madrid", "barcelona", "valencia",
    "limassol", "nicosia", "sofia", "athens", "berlin", "munich",
    "amsterdam", "london", "edinburgh", "manchester", "dublin",
    "sydney", "melbourne", "auckland", "vancouver", "toronto",
    "mexico city", "buenos aires", "santiago",
]
# Case-insensitive, whole-word match for any keyword above; group 1 holds
# the matched location. Regex alternation picks the first alternative that
# matches, not the longest, so the keywords are sorted longest-first.
# Otherwise "mexico" would win over "mexico city" and group 1 would report
# only the country. The sort is stable, so same-length keywords keep their
# list order.
LOCATION_RE = re.compile(
    r"\b("
    + "|".join(
        re.escape(k)
        for k in sorted(_LOCATION_KEYWORDS, key=len, reverse=True)
    )
    + r")\b",
    re.IGNORECASE,
)
@lru_cache(maxsize=1024)
def _haystack(reddit_id: str, title: str, body: str) -> str:
return f"{title}\n{body}"
def is_candidate(post: RawPost) -> bool:
    """Decide whether `post` should be forwarded to the LLM.

    The post's title and body are combined via `_haystack`; the post
    qualifies only when that text matches both `MONEY_RE` and
    `LOCATION_RE`.
    """
    text = _haystack(post.reddit_id, post.title, post.body)
    # Cheap rejection first: no money mention means no location check.
    if MONEY_RE.search(text) is None:
        return False
    return LOCATION_RE.search(text) is not None

View file

@ -8,6 +8,7 @@ import pytest
from pydantic import ValidationError
from fire_planner.examples import ExtractedExample, FiStatus, RawPost, SummaryStats
from fire_planner.examples.filters import is_candidate
def test_raw_post_minimal() -> None:
@ -44,3 +45,38 @@ def test_extracted_example_fi_status_enum() -> None:
def test_summary_stats_optional_fields() -> None:
    """SummaryStats accepts None for median/p25/p75 and keeps the median as None."""
    stats = SummaryStats(median=None, p25=None, p75=None)
    assert stats.median is None
def _post(title: str, body: str = "") -> RawPost:
    """Build a RawPost for filter tests; only title and body vary.

    Every other field is a fixed placeholder value.
    """
    return RawPost(
        title=title,
        body=body,
        # Placeholder values for the remaining fields.
        reddit_id="x",
        source_sub="s",
        url="u",
        created_at=date(2026, 1, 1),
    )
def test_filter_keeps_money_plus_location() -> None:
    """A title with a currency amount and a known location is kept."""
    post = _post("Hit £1m living in Lisbon, Portugal")
    assert is_candidate(post)
def test_filter_drops_money_without_location() -> None:
    """A money mention alone, with no location keyword, is rejected."""
    post = _post("Hit £1m, feels great!")
    assert not is_candidate(post)
def test_filter_drops_location_without_money() -> None:
    """A location mention alone, with no money signal, is rejected."""
    post = _post("Moving to Lisbon next year")
    assert not is_candidate(post)
def test_filter_dollar_signs_count() -> None:
    """A dollar-denominated amount plus a multi-word city name is kept."""
    post = _post("$1.2M net worth, retired in Chiang Mai")
    assert is_candidate(post)
def test_filter_recognises_net_worth_keyword() -> None:
    """The phrase "net worth" is a money signal even without an amount."""
    post = _post("Net worth update — now in Bali, Indonesia")
    assert is_candidate(post)
def test_filter_keyword_match_is_case_insensitive() -> None:
    """Upper-case money and location keywords still match."""
    post = _post("PORTFOLIO milestone reached, settled in PHILIPPINES")
    assert is_candidate(post)