examples: regex pre-filter (MONEY_RE + LOCATION_RE)
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
This commit is contained in:
parent
c9bdf537ac
commit
a378c7256e
2 changed files with 100 additions and 0 deletions
64
fire_planner/examples/filters.py
Normal file
64
fire_planner/examples/filters.py
Normal file
|
|
@ -0,0 +1,64 @@
|
|||
"""Cheap regex pre-filter — keep posts that look like FIRE examples.
|
||||
|
||||
A post survives if BOTH:
- it mentions money in a FIRE-relevant way (£/$/€ followed by a digit,
  a currency code such as GBP/USD/EUR, "net worth", "portfolio",
  "million", "saved"), AND
- it mentions a location (country or major city).
|
||||
|
||||
This prunes ~70–90 % of subreddit traffic before any LLM call. We
|
||||
deliberately err on the side of false-positives — the LLM is the
|
||||
expensive but reliable filter; this is the cheap pre-pass.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from functools import lru_cache
|
||||
|
||||
from fire_planner.examples.models import RawPost
|
||||
|
||||
# Each entry is one way a post can "mention money". They are OR-ed together
# inside a single non-capturing group when MONEY_RE is compiled.
_MONEY_ALTERNATIVES: tuple[str, ...] = (
    r"[£$€]\s?\d",  # currency symbol + digit
    r"\b(?:GBP|USD|EUR|JPY|AUD|CAD)\b",  # currency code as a whole word
    r"\bmillion\b",
    r"\bnet\s*worth\b",  # "net worth", "networth", or split across whitespace
    r"\bportfolio\b",
    r"\bsaved\b",
)

# Case-insensitive: matches "Net Worth", "PORTFOLIO", "usd", ...
MONEY_RE = re.compile("(?:" + "|".join(_MONEY_ALTERNATIVES) + ")", re.IGNORECASE)
|
||||
|
||||
# Curated to cover the 12 target subs' typical countries/cities. Extend as
# needed. Entries are lowercase and may be listed in any readable order:
# LOCATION_RE sorts them longest-first when it builds its alternation, so a
# short name ("mexico") never shadows a longer one ("mexico city").
_LOCATION_KEYWORDS: list[str] = [
    # countries
    "philippines", "indonesia", "thailand", "vietnam", "malaysia",
    "singapore", "japan", "korea", "taiwan", "india", "australia",
    "new zealand", "canada", "united states", "usa", "uk", "ireland",
    "scotland", "wales", "england", "spain", "portugal", "france",
    "germany", "netherlands", "belgium", "italy", "greece", "cyprus",
    "bulgaria", "romania", "poland", "czech", "hungary", "switzerland",
    "austria", "denmark", "sweden", "norway", "finland", "estonia",
    "uae", "dubai", "abu dhabi", "saudi", "qatar", "kuwait", "bahrain",
    "mexico", "brazil", "argentina", "chile", "colombia", "peru",
    "panama", "costa rica", "ecuador",
    # cities common in expat-FIRE posts
    "manila", "cebu", "bangkok", "chiang mai", "phuket", "ho chi minh",
    "kuala lumpur", "penang", "bali", "jakarta", "tokyo", "osaka",
    "lisbon", "porto", "madeira", "madrid", "barcelona", "valencia",
    "limassol", "nicosia", "sofia", "athens", "berlin", "munich",
    "amsterdam", "london", "edinburgh", "manchester", "dublin",
    "sydney", "melbourne", "auckland", "vancouver", "toronto",
    "mexico city", "buenos aires", "santiago",
]


def _keyword_pattern(keyword: str) -> str:
    """Return a regex fragment for `keyword`.

    Each word is escaped literally; words are joined with ``\\s+`` so that a
    multi-word name still matches when the post wraps it across a line break
    or uses more than one space ("New\\nZealand", "chiang  mai").
    """
    return r"\s+".join(re.escape(word) for word in keyword.split())


# Whole-word, case-insensitive match on any known location. Group 1 is the
# matched name. Alternatives are ordered longest-first because the regex
# engine takes the first alternative that succeeds at a given position.
LOCATION_RE = re.compile(
    r"\b("
    + "|".join(
        _keyword_pattern(keyword)
        for keyword in sorted(_LOCATION_KEYWORDS, key=len, reverse=True)
    )
    + r")\b",
    re.IGNORECASE,
)
|
||||
|
||||
|
||||
@lru_cache(maxsize=1024)
|
||||
def _haystack(reddit_id: str, title: str, body: str) -> str:
|
||||
return f"{title}\n{body}"
|
||||
|
||||
|
||||
def is_candidate(post: RawPost) -> bool:
    """Decide whether `post` should be forwarded to the LLM.

    The post's title and body are searched together; it qualifies only when
    both MONEY_RE and LOCATION_RE find a match.
    """
    haystack = _haystack(post.reddit_id, post.title, post.body)
    # Check money first; without it the location check is irrelevant.
    if MONEY_RE.search(haystack) is None:
        return False
    return LOCATION_RE.search(haystack) is not None
|
||||
|
|
@ -8,6 +8,7 @@ import pytest
|
|||
from pydantic import ValidationError
|
||||
|
||||
from fire_planner.examples import ExtractedExample, FiStatus, RawPost, SummaryStats
|
||||
from fire_planner.examples.filters import is_candidate
|
||||
|
||||
|
||||
def test_raw_post_minimal() -> None:
|
||||
|
|
@ -44,3 +45,38 @@ def test_extracted_example_fi_status_enum() -> None:
|
|||
def test_summary_stats_optional_fields() -> None:
    """SummaryStats can be built with median, p25 and p75 all set to None."""
    stats = SummaryStats(median=None, p25=None, p75=None)
    assert stats.median is None
|
||||
|
||||
|
||||
def _post(title: str, body: str = "") -> RawPost:
    """Build a RawPost for filter tests.

    Only `title` and `body` vary between tests; every other field is filled
    with a fixed placeholder value.
    """
    return RawPost(
        title=title,
        body=body,
        # Placeholder metadata, identical for every test post.
        reddit_id="x",
        source_sub="s",
        url="u",
        created_at=date(2026, 1, 1),
    )
|
||||
|
||||
|
||||
def test_filter_keeps_money_plus_location() -> None:
    """A title with both a currency amount and a known place is kept."""
    post = _post("Hit £1m living in Lisbon, Portugal")
    assert is_candidate(post)
|
||||
|
||||
|
||||
def test_filter_drops_money_without_location() -> None:
    """A money mention alone, with no location, is rejected."""
    post = _post("Hit £1m, feels great!")
    assert not is_candidate(post)
|
||||
|
||||
|
||||
def test_filter_drops_location_without_money() -> None:
    """A location mention alone, with no money signal, is rejected."""
    post = _post("Moving to Lisbon next year")
    assert not is_candidate(post)
|
||||
|
||||
|
||||
def test_filter_dollar_signs_count() -> None:
    """A dollar amount counts as a money mention, same as pounds."""
    post = _post("$1.2M net worth, retired in Chiang Mai")
    assert is_candidate(post)
|
||||
|
||||
|
||||
def test_filter_recognises_net_worth_keyword() -> None:
    """The phrase "net worth" is a money signal even without a currency amount."""
    post = _post("Net worth update — now in Bali, Indonesia")
    assert is_candidate(post)
|
||||
|
||||
|
||||
def test_filter_keyword_match_is_case_insensitive() -> None:
    """Upper-case money and location keywords still match."""
    post = _post("PORTFOLIO milestone reached, settled in PHILIPPINES")
    assert is_candidate(post)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue