# trading/services/sentiment_analyzer/ticker_extractor.py

"""Extract stock ticker symbols from free-form text.

Handles common formats:
  - Dollar-prefixed: ``$AAPL``
  - Exchange-prefixed: ``NASDAQ:AAPL``, ``NYSE:TSLA``
  - Standalone uppercase words that look like tickers (2-5 uppercase letters;
    single-letter symbols need a ``$`` or exchange prefix)
"""

from __future__ import annotations

import re

# Common false positives: short English words, financial abbreviations, and
# exchange names that match the 1-5 uppercase letter pattern.  Every exchange
# prefix accepted by ``_EXCHANGE_PATTERN`` that is short enough to be matched
# as a standalone word is listed here, so that text such as ``OTC:ABCD`` does
# not report the prefix itself as a ticker.
_FALSE_POSITIVES: frozenset[str] = frozenset(
    {
        # Common English words / pronouns
        "A",
        "I",
        "AM",
        "AN",
        "AS",
        "AT",
        "BE",
        "BY",
        "DO",
        "GO",
        "IF",
        "IN",
        "IS",
        "IT",
        "ME",
        "MY",
        "NO",
        "OF",
        "OK",
        "ON",
        "OR",
        "SO",
        "TO",
        "UP",
        "US",  # both the pronoun and the country abbreviation
        "WE",
        "PM",
        "THE",
        "AND",
        "FOR",
        "NOT",
        "BUT",
        "ARE",
        "WAS",
        "HAS",
        "HAD",
        "ALL",
        "CAN",
        "HER",
        "HIS",
        "HOW",
        "ITS",
        "MAY",
        "NEW",
        "NOW",
        "OLD",
        "OUR",
        "OUT",
        "OWN",
        "SAY",
        "SHE",
        "TOO",
        "USE",
        # Time-related
        "EST",
        "PST",
        "CST",
        "MST",
        "UTC",
        "GMT",
        # Exchange names (NASDAQ is six letters, so the 1-5 letter patterns
        # can never produce it and it does not need an entry)
        "NYSE",
        "AMEX",
        "OTC",
        "BATS",
        "ARCA",
        # Financial jargon
        "CEO",
        "CFO",
        "COO",
        "CTO",
        "IPO",
        "ETF",
        "SEC",
        "DJIA",
        "GDP",
        "CPI",
        "FED",
        "FOMC",
        "FDA",
        "EPS",
        # NOTE: contains "&", so the letter-only patterns never yield it;
        # retained so direct membership checks keep working.
        "P&L",
        "ROI",
        "YTD",
        "QOQ",
        "YOY",
        "ATH",
        "ATL",
        "RSI",
        "SMA",
        "EMA",
        "IOT",
        "API",
        "AI",
        "ML",
        # Countries, regions and agencies
        "USA",
        "UK",
        "EU",
        "IMF",
        "FTC",
        "DOJ",
        "IRS",
        "DOT",
        "SPAC",
    }
)

# Dollar-prefixed symbols such as ``$AAPL``.
_DOLLAR_PATTERN = re.compile(
    r"\$"  # literal dollar sign
    r"([A-Z]{1,5})\b"  # captured symbol: 1-5 capitals ending on a word boundary
)

# Exchange-qualified symbols such as ``NASDAQ:AAPL`` or ``NYSE:TSLA``.
_EXCHANGE_PATTERN = re.compile(
    r"\b(?:NASDAQ|NYSE|AMEX|OTC|BATS|ARCA)"  # recognised exchange name
    r":"  # separator between exchange and symbol
    r"([A-Z]{1,5})\b"  # captured symbol
)

# Bare all-caps words of 1-5 letters delimited by word boundaries.
_STANDALONE_PATTERN = re.compile(
    r"\b"
    r"([A-Z]{1,5})"  # captured candidate symbol
    r"\b"
)


def extract_tickers(text: str) -> list[str]:
    """Extract deduplicated stock ticker symbols from *text*.

    Three passes run over the whole text, in this order:

    1. dollar-prefixed symbols (``$AAPL``),
    2. exchange-prefixed symbols (``NASDAQ:AAPL``),
    3. standalone all-caps words of 2-5 letters.

    Dollar- and exchange-prefixed symbols are explicit ticker notation and
    are always kept, even when the symbol collides with an entry in
    ``_FALSE_POSITIVES`` (e.g. ``$NOW`` or ``NYSE:ALL``).  Only the
    standalone pass is filtered against ``_FALSE_POSITIVES``, and it also
    ignores the exchange name of an ``EXCHANGE:SYMBOL`` match so the prefix
    is never reported as a ticker.

    Args:
        text: Free-form text to scan.

    Returns:
        Unique ticker strings.  Symbols are ordered by pass first and by
        position in *text* within each pass; a symbol found by an earlier
        pass is not repeated by a later one.
    """
    seen: set[str] = set()
    result: list[str] = []

    def _add(ticker: str) -> None:
        # Deduplicate while preserving insertion order.
        if ticker not in seen:
            seen.add(ticker)
            result.append(ticker)

    # Dollar-sign tickers have the highest signal — always include, without
    # consulting the false-positive list.
    for match in _DOLLAR_PATTERN.finditer(text):
        _add(match.group(1))

    # Exchange-prefixed tickers are also high confidence.  Remember where
    # each exchange name starts so the standalone pass can skip the prefix.
    exchange_prefix_starts: set[int] = set()
    for match in _EXCHANGE_PATTERN.finditer(text):
        exchange_prefix_starts.add(match.start())
        _add(match.group(1))

    # Standalone uppercase words are noisy: require at least 2 characters,
    # drop known false positives, and skip exchange names that were consumed
    # as the prefix of an exchange-qualified ticker above.
    for match in _STANDALONE_PATTERN.finditer(text):
        candidate = match.group(1)
        if len(candidate) < 2 or candidate in _FALSE_POSITIVES:
            continue
        if match.start() in exchange_prefix_starts:
            continue
        _add(candidate)

    return result