163 lines
3.5 KiB
Python
163 lines
3.5 KiB
Python
"""Extract stock ticker symbols from free-form text.
|
|
|
|
Handles common formats:
|
|
- Dollar-prefixed: ``$AAPL``
|
|
- Exchange-prefixed: ``NASDAQ:AAPL``, ``NYSE:TSLA``
|
|
- Standalone uppercase words that look like tickers (2-5 uppercase letters;
  single letters are accepted only with a ``$`` or exchange prefix)
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
|
|
# Uppercase tokens that fit the 1-5 letter ticker shape but are almost never
# meant as tickers in running text: short English words, time zones, market
# jargon, exchange names, and assorted well-known acronyms.
_FALSE_POSITIVES: frozenset[str] = frozenset(
    {
        # Common English words / pronouns
        "A",
        "I",
        "AM",
        "AN",
        "AS",
        "AT",
        "BE",
        "BY",
        "DO",
        "GO",
        "IF",
        "IN",
        "IS",
        "IT",
        "ME",
        "MY",
        "NO",
        "OF",
        "OK",
        "ON",
        "OR",
        "SO",
        "TO",
        "UP",
        "US",
        "WE",
        "PM",
        "THE",
        "AND",
        "FOR",
        "NOT",
        "BUT",
        "ARE",
        "WAS",
        "HAS",
        "HAD",
        "ALL",
        "CAN",
        "HER",
        "HIS",
        "HOW",
        "ITS",
        "MAY",
        "NEW",
        "NOW",
        "OLD",
        "OUR",
        "OUT",
        "OWN",
        "SAY",
        "SHE",
        "TOO",
        "USE",
        # Time zones
        "EST",
        "PST",
        "CST",
        "MST",
        "UTC",
        "GMT",
        # Corporate / market jargon
        "CEO",
        "CFO",
        "COO",
        "CTO",
        "IPO",
        "ETF",
        "SPAC",
        "DJIA",
        "GDP",
        "CPI",
        "FED",
        "FOMC",
        "EPS",
        # Contains "&", so it can never equal an [A-Z]-only candidate; kept
        # so the set's historical contents are not reduced.
        "P&L",
        "ROI",
        "YTD",
        "QOQ",
        "YOY",
        "ATH",
        "ATL",
        "RSI",
        "SMA",
        "EMA",
        # Exchange names short enough to look like a ticker. OTC, BATS and
        # ARCA are accepted as exchange prefixes (e.g. ``OTC:ABCD``), so
        # without these entries the prefix itself would be reported.
        "NYSE",
        "AMEX",
        "OTC",
        "BATS",
        "ARCA",
        # Agencies and institutions
        "SEC",
        "FDA",
        "IMF",
        "FTC",
        "DOJ",
        "IRS",
        "DOT",
        # Technology acronyms
        "IOT",
        "API",
        "AI",
        "ML",
        # Countries / regions ("US" is listed with the English words above)
        "USA",
        "UK",
        "EU",
    }
)
|
|
|
|
# Ticker recognisers. Each one captures the bare symbol (1-5 uppercase ASCII
# letters) as group 1 and requires a word boundary right after it.

# Dollar-prefixed form: a literal "$" directly followed by the symbol,
# e.g. ``$AAPL``.
_DOLLAR_PATTERN = re.compile(
    r"\$([A-Z]{1,5})\b",
)

# Exchange-prefixed form: one of the listed exchange names, a colon, then the
# symbol, e.g. ``NASDAQ:AAPL`` or ``NYSE:TSLA``.
_EXCHANGE_PATTERN = re.compile(
    r"\b(?:NASDAQ|NYSE|AMEX|OTC|BATS|ARCA)"
    r":([A-Z]{1,5})\b",
)

# Bare form: any run of 1-5 uppercase letters delimited by word boundaries.
_STANDALONE_PATTERN = re.compile(
    r"\b([A-Z]{1,5})\b",
)
|
|
|
|
|
|
def extract_tickers(text: str) -> list[str]:
    """Extract deduplicated stock ticker symbols from *text*.

    Candidates come from the three module-level patterns:

    - ``_DOLLAR_PATTERN`` (``$AAPL``) and ``_EXCHANGE_PATTERN``
      (``NASDAQ:AAPL``) are explicit markers. Their symbols are always
      accepted, including single letters and symbols that also appear in
      ``_FALSE_POSITIVES`` (for example ``$AI`` or ``$NOW``).
    - ``_STANDALONE_PATTERN`` matches bare uppercase words. These are
      accepted only when they are 2-5 letters long, are not listed in
      ``_FALSE_POSITIVES``, and are not the exchange name of an
      exchange-prefixed match (so ``OTC:ABCD`` yields ``ABCD``, not ``OTC``).

    Args:
        text: Free-form text to scan.

    Returns:
        Unique ticker strings, ordered by the position in *text* at which
        each one was first accepted.
    """
    # Sort ranks: at the same text position an explicit match must be
    # considered before the bare-word match of the same letters, otherwise
    # "$AI" would be rejected by its own bare-word twin.
    explicit_rank, bare_rank = 0, 1

    # (start offset of the symbol, rank, symbol)
    candidates: list[tuple[int, int, str]] = []
    # Offsets where an exchange name such as "OTC" in "OTC:ABCD" begins.
    exchange_name_starts: set[int] = set()

    for match in _DOLLAR_PATTERN.finditer(text):
        candidates.append((match.start(1), explicit_rank, match.group(1)))

    for match in _EXCHANGE_PATTERN.finditer(text):
        exchange_name_starts.add(match.start())
        candidates.append((match.start(1), explicit_rank, match.group(1)))

    for match in _STANDALONE_PATTERN.finditer(text):
        symbol = match.group(1)
        # Single letters are too noisy without an explicit prefix, and the
        # exchange name of a prefixed ticker is not itself a ticker.
        if len(symbol) < 2 or match.start(1) in exchange_name_starts:
            continue
        candidates.append((match.start(1), bare_rank, symbol))

    seen: set[str] = set()
    result: list[str] = []
    # Sorting by offset restores the order of appearance in the text, which
    # the three separate scans above do not preserve.
    for _offset, rank, symbol in sorted(candidates):
        if symbol in seen:
            continue
        if rank == bare_rank and symbol in _FALSE_POSITIVES:
            continue
        seen.add(symbol)
        result.append(symbol)

    return result
|