- Point Ollama to local instance via host.docker.internal, use gemma3 model - Remove Docker Ollama service (using host's Ollama instead) - Add company-name-to-ticker mapping (Apple→AAPL, Tesla→TSLA, etc.) for RSS articles - Lower signal thresholds for faster feedback with paper trading: - FinBERT confidence: 0.6→0.4, signal strength: 0.3→0.15 - News strategy: article_count 2→1, confidence 0.5→0.3, score ±0.3→±0.15 - Fix market data BarSet access bug (BarSet.__contains__ returns False incorrectly) - Fix market data SIP feed error by switching to IEX feed for free Alpaca accounts - Fix nginx proxy routing for /api/auth/* to api-gateway /auth/* - Add seed_sample_data script - Update tests for new thresholds and alpaca mock modules
219 lines
5.1 KiB
Python
219 lines
5.1 KiB
Python
"""Extract stock ticker symbols from free-form text.

Handles common formats:
- Dollar-prefixed: ``$AAPL``
- Exchange-prefixed: ``NASDAQ:AAPL``, ``NYSE:TSLA``
- Standalone uppercase words that look like tickers (2-5 uppercase letters)
- Company name mentions: ``Apple``, ``Tesla``, ``Nvidia``, etc.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
|
|
# Uppercase tokens that satisfy the standalone-ticker pattern but are rarely
# tickers in running text: short English words, time-zone abbreviations,
# business/regulatory acronyms, and the exchange names that appear as
# ``EXCHANGE:`` prefixes. Only bare (unprefixed) uppercase words are checked
# against this set; explicitly prefixed symbols bypass it.
_FALSE_POSITIVES: frozenset[str] = frozenset(
    {
        # Common English words / pronouns
        "A", "I", "AM", "AN", "AS", "AT", "BE", "BY", "DO", "GO", "IF", "IN",
        "IS", "IT", "ME", "MY", "NO", "OF", "OK", "ON", "OR", "SO", "TO", "UP",
        "US", "WE", "PM", "THE", "AND", "FOR", "NOT", "BUT", "ARE", "WAS",
        "HAS", "HAD", "ALL", "CAN", "HER", "HIS", "HOW", "ITS", "MAY", "NEW",
        "NOW", "OLD", "OUR", "OUT", "OWN", "SAY", "SHE", "TOO", "USE",
        # Time-related
        "EST", "PST", "CST", "MST", "UTC", "GMT",
        # Financial jargon, acronyms, and regions
        "CEO", "CFO", "COO", "CTO", "IPO", "ETF", "SEC", "DJIA", "GDP", "CPI",
        "FED", "FOMC", "FDA", "EPS", "ROI", "YTD", "QOQ", "YOY", "ATH", "ATL",
        "RSI", "SMA", "EMA", "IOT", "API", "AI", "ML", "USA", "UK", "EU",
        "IMF", "FTC", "DOJ", "IRS", "DOT", "SPAC",
        # Retained from the original list; the letter-only patterns below can
        # never yield a token containing "&", so this entry is inert.
        "P&L",
        # Exchange names accepted as prefixes by _EXCHANGE_PATTERN. Without
        # these, "OTC:ABCD" would also report "OTC" as a ticker. ("NASDAQ" is
        # six letters and therefore cannot match the standalone pattern.)
        "NYSE", "AMEX", "OTC", "BATS", "ARCA",
    }
)

# Mapping of company names (lowercase) to their ticker symbols. Some companies
# have both a long and a short alias (e.g. "meta platforms" / "meta"); the
# regex built below tries longer names first so the long alias wins.
_COMPANY_TO_TICKER: dict[str, str] = {
    "alphabet": "GOOGL",
    "google": "GOOGL",
    "amazon": "AMZN",
    "apple": "AAPL",
    "microsoft": "MSFT",
    "tesla": "TSLA",
    "nvidia": "NVDA",
    "meta platforms": "META",
    "meta": "META",
    "netflix": "NFLX",
    "advanced micro devices": "AMD",
    "amd": "AMD",
    "intel": "INTC",
    "broadcom": "AVGO",
    "salesforce": "CRM",
    "adobe": "ADBE",
    "paypal": "PYPL",
    "uber": "UBER",
    "airbnb": "ABNB",
    "spotify": "SPOT",
    "shopify": "SHOP",
    "snowflake": "SNOW",
    "palantir": "PLTR",
    "coinbase": "COIN",
    "robinhood": "HOOD",
    "walmart": "WMT",
    "costco": "COST",
    "jpmorgan": "JPM",
    "goldman sachs": "GS",
    "bank of america": "BAC",
    "berkshire hathaway": "BRK.B",
    "johnson & johnson": "JNJ",
    "procter & gamble": "PG",
    "coca-cola": "KO",
    "disney": "DIS",
    "boeing": "BA",
}

# Regex matching any known company name as a whole word (case-insensitive).
# Alternatives are sorted by length, longest first, so multi-word names are
# tried before the single-word names they contain.
_COMPANY_PATTERN = re.compile(
    r"\b("
    + "|".join(
        re.escape(name)
        for name in sorted(_COMPANY_TO_TICKER, key=len, reverse=True)
    )
    + r")\b",
    re.IGNORECASE,
)

# Pattern 1: $AAPL (dollar-sign prefix)
_DOLLAR_PATTERN = re.compile(r"\$([A-Z]{1,5})\b")

# Pattern 2: NASDAQ:AAPL, NYSE:TSLA (exchange prefix)
_EXCHANGE_PATTERN = re.compile(r"\b(?:NASDAQ|NYSE|AMEX|OTC|BATS|ARCA):([A-Z]{1,5})\b")

# Pattern 3: standalone uppercase words at word boundaries (1-5 chars)
_STANDALONE_PATTERN = re.compile(r"\b([A-Z]{1,5})\b")


def extract_tickers(text: str) -> list[str]:
    """Extract deduplicated stock ticker symbols from *text*.

    The text is scanned in four passes: ``$``-prefixed symbols,
    ``EXCHANGE:``-prefixed symbols, known company names, and finally bare
    uppercase words of 2-5 letters.

    Args:
        text: Free-form text such as a headline or article body. An empty
            (or otherwise falsy) value yields an empty list.

    Returns:
        Unique ticker strings, ordered by pass and, within a pass, by
        position in *text*. Explicitly prefixed symbols and company-name
        matches are always kept; bare uppercase words are dropped when they
        appear in ``_FALSE_POSITIVES``.
    """
    if not text:
        return []

    seen: set[str] = set()
    result: list[str] = []

    def _add(ticker: str, *, trusted: bool = False) -> None:
        # Record *ticker* once. Only untrusted (bare-word) candidates are
        # screened against the false-positive list, so that real symbols such
        # as "$AI" or "NYSE:IT" are not discarded for looking like words.
        if ticker in seen:
            return
        if not trusted and ticker in _FALSE_POSITIVES:
            return
        seen.add(ticker)
        result.append(ticker)

    # Dollar-sign tickers have the highest signal — always include.
    for match in _DOLLAR_PATTERN.finditer(text):
        _add(match.group(1), trusted=True)

    # Exchange-prefixed tickers are also high confidence.
    for match in _EXCHANGE_PATTERN.finditer(text):
        _add(match.group(1), trusted=True)

    # Company name mentions (case-insensitive). ``.get`` is kept on purpose:
    # case-insensitive matching can accept characters whose ``lower()`` form
    # is not the ASCII key, in which case the mention is skipped.
    for match in _COMPANY_PATTERN.finditer(text):
        ticker = _COMPANY_TO_TICKER.get(match.group(1).lower())
        if ticker:
            _add(ticker, trusted=True)

    # Bare uppercase words are the noisiest source: require at least two
    # letters and screen them against the false-positive list.
    for match in _STANDALONE_PATTERN.finditer(text):
        candidate = match.group(1)
        if len(candidate) >= 2:
            _add(candidate)

    return result
|