trading/services/sentiment_analyzer/ticker_extractor.py
Viktor Barzin d36ae40df1
feat: productionize local service — fix signal pipeline, lower thresholds, add company-name ticker extraction
- Point Ollama to local instance via host.docker.internal, use gemma3 model
- Remove Docker Ollama service (using host's Ollama instead)
- Add company-name-to-ticker mapping (Apple→AAPL, Tesla→TSLA, etc.) for RSS articles
- Lower signal thresholds for faster feedback with paper trading:
  - FinBERT confidence: 0.6→0.4, signal strength: 0.3→0.15
  - News strategy: article_count 2→1, confidence 0.5→0.3, score ±0.3→±0.15
- Fix market data BarSet access bug (BarSet.__contains__ returns False incorrectly)
- Fix market data SIP feed error by switching to IEX feed for free Alpaca accounts
- Fix nginx proxy routing for /api/auth/* to api-gateway /auth/*
- Add seed_sample_data script
- Update tests for new thresholds and alpaca mock modules
2026-02-22 22:17:26 +00:00

219 lines
5.1 KiB
Python

"""Extract stock ticker symbols from free-form text.
Handles common formats:
- Dollar-prefixed: ``$AAPL``
- Exchange-prefixed: ``NASDAQ:AAPL``, ``NYSE:TSLA``
- Standalone uppercase words that look like tickers (2-5 uppercase letters;
  single-letter symbols are only recognized with a ``$`` or exchange prefix)
- Company name mentions: ``Apple``, ``Tesla``, ``Nvidia``, etc.
"""
from __future__ import annotations
import re
# Common false positives: short English words, financial abbreviations, and
# exchange names that match the 1-5 uppercase letter pattern.
_FALSE_POSITIVES: frozenset[str] = frozenset(
    {
        # Common English words / pronouns
        "A",
        "I",
        "AM",
        "AN",
        "AS",
        "AT",
        "BE",
        "BY",
        "DO",
        "GO",
        "IF",
        "IN",
        "IS",
        "IT",
        "ME",
        "MY",
        "NO",
        "OF",
        "OK",
        "ON",
        "OR",
        "SO",
        "TO",
        "UP",
        "US",
        "WE",
        "PM",
        "THE",
        "AND",
        "FOR",
        "NOT",
        "BUT",
        "ARE",
        "WAS",
        "HAS",
        "HAD",
        "ALL",
        "CAN",
        "HER",
        "HIS",
        "HOW",
        "ITS",
        "MAY",
        "NEW",
        "NOW",
        "OLD",
        "OUR",
        "OUT",
        "OWN",
        "SAY",
        "SHE",
        "TOO",
        "USE",
        # Time-related
        "EST",
        "PST",
        "CST",
        "MST",
        "UTC",
        "GMT",
        # Financial jargon
        "CEO",
        "CFO",
        "COO",
        "CTO",
        "IPO",
        "ETF",
        "SEC",
        # Exchange names. Every prefix accepted by the exchange-prefixed
        # ticker pattern must be listed here, otherwise the prefix itself
        # ("OTC" in "OTC:ABCD") is reported as a standalone ticker.
        # NASDAQ is six letters, so it can never match the 1-5 letter pattern.
        "NYSE",
        "AMEX",
        "OTC",
        "BATS",
        "ARCA",
        "DJIA",
        "GDP",
        "CPI",
        "FED",
        "FOMC",
        "FDA",
        "EPS",
        # NOTE: contains "&", so it never equals a letters-only candidate;
        # harmless, kept for completeness.
        "P&L",
        "ROI",
        "YTD",
        "QOQ",
        "YOY",
        "ATH",
        "ATL",
        "RSI",
        "SMA",
        "EMA",
        "IOT",
        "API",
        "AI",
        "ML",
        # "US" is already listed with the common English words above.
        "USA",
        "UK",
        "EU",
        "IMF",
        "FTC",
        "DOJ",
        "IRS",
        "DOT",
        "SPAC",
    }
)
# Lookup table: lowercase company name -> ticker symbol. Several names may
# point at the same ticker (e.g. "alphabet" and "google").
_COMPANY_TO_TICKER: dict[str, str] = {
    "alphabet": "GOOGL",
    "google": "GOOGL",
    "amazon": "AMZN",
    "apple": "AAPL",
    "microsoft": "MSFT",
    "tesla": "TSLA",
    "nvidia": "NVDA",
    "meta platforms": "META",
    "meta": "META",
    "netflix": "NFLX",
    "advanced micro devices": "AMD",
    "amd": "AMD",
    "intel": "INTC",
    "broadcom": "AVGO",
    "salesforce": "CRM",
    "adobe": "ADBE",
    "paypal": "PYPL",
    "uber": "UBER",
    "airbnb": "ABNB",
    "spotify": "SPOT",
    "shopify": "SHOP",
    "snowflake": "SNOW",
    "palantir": "PLTR",
    "coinbase": "COIN",
    "robinhood": "HOOD",
    "walmart": "WMT",
    "costco": "COST",
    "jpmorgan": "JPM",
    "goldman sachs": "GS",
    "bank of america": "BAC",
    "berkshire hathaway": "BRK.B",
    "johnson & johnson": "JNJ",
    "procter & gamble": "PG",
    "coca-cola": "KO",
    "disney": "DIS",
    "boeing": "BA",
}

# Case-insensitive, whole-word matcher for every name in the table above.
# The alternation lists the longest names first, so "meta platforms" is tried
# before "meta" and a multi-word name is never cut short by a shorter entry.
_COMPANY_PATTERN = re.compile(
    r"\b({})\b".format(
        "|".join(map(re.escape, sorted(_COMPANY_TO_TICKER, key=len, reverse=True)))
    ),
    flags=re.IGNORECASE,
)
# All three matchers capture the same thing: 1-5 uppercase ASCII letters.
_TICKER_GROUP = r"([A-Z]{1,5})"
# Exchange names accepted in front of a colon-separated symbol.
_EXCHANGE_PREFIXES = ("NASDAQ", "NYSE", "AMEX", "OTC", "BATS", "ARCA")

# Pattern 1: dollar-sign prefix, e.g. $AAPL
_DOLLAR_PATTERN = re.compile(r"\$" + _TICKER_GROUP + r"\b")
# Pattern 2: exchange prefix, e.g. NASDAQ:AAPL or NYSE:TSLA
_EXCHANGE_PATTERN = re.compile(
    r"\b(?:" + "|".join(_EXCHANGE_PREFIXES) + r"):" + _TICKER_GROUP + r"\b"
)
# Pattern 3: a bare uppercase word delimited by word boundaries
_STANDALONE_PATTERN = re.compile(r"\b" + _TICKER_GROUP + r"\b")
def extract_tickers(text: str) -> list[str]:
    """Extract deduplicated stock ticker symbols from *text*.

    The text is scanned in four passes, from most to least reliable:

    1. dollar-prefixed symbols (``$AAPL``),
    2. exchange-prefixed symbols (``NYSE:TSLA``),
    3. company-name mentions, mapped through ``_COMPANY_TO_TICKER``
       (case-insensitive),
    4. standalone uppercase words of 2-5 letters.

    Symbols from passes 1 and 2 carry an explicit marker and are always kept,
    even when they collide with an entry in ``_FALSE_POSITIVES`` (``$NOW``,
    ``$ALL``, ``NYSE:IT``). Results of passes 3 and 4 are dropped when they
    appear in ``_FALSE_POSITIVES``.

    Args:
        text: Free-form text such as a headline or article body.

    Returns:
        Unique ticker strings, ordered by pass and then by position within
        the text. Empty when nothing is recognized.
    """
    seen: set[str] = set()
    result: list[str] = []

    def _add(ticker: str, *, trusted: bool = False) -> None:
        # ``trusted`` symbols came with a ``$`` or exchange prefix, so the
        # false-positive list (meant for bare words) must not veto them.
        if ticker in seen:
            return
        if not trusted and ticker in _FALSE_POSITIVES:
            return
        seen.add(ticker)
        result.append(ticker)

    # Dollar-sign tickers have the highest signal — always include.
    for match in _DOLLAR_PATTERN.finditer(text):
        _add(match.group(1), trusted=True)

    # Exchange-prefixed tickers are also high confidence.
    for match in _EXCHANGE_PATTERN.finditer(text):
        _add(match.group(1), trusted=True)

    # Company name mentions (case-insensitive).
    for match in _COMPANY_PATTERN.finditer(text):
        ticker = _COMPANY_TO_TICKER.get(match.group(1).lower())
        if ticker:
            _add(ticker)

    # Standalone uppercase words: restricted to 2-5 chars to reduce noise.
    # Single-letter symbols are only accepted via the dollar/exchange passes.
    for match in _STANDALONE_PATTERN.finditer(text):
        candidate = match.group(1)
        if len(candidate) < 2:
            continue
        # An all-caps company name ("TESLA", "APPLE") was already resolved to
        # its real ticker by the company pass; reporting the name itself as a
        # symbol would add a bogus ticker next to the correct one.
        if candidate.lower() in _COMPANY_TO_TICKER:
            continue
        _add(candidate)

    return result