feat: sentiment analyzer — FinBERT + Ollama tiered analysis

2026-02-22 15:27:06 +00:00 · 2026-02-22 15:27:06 +00:00 · 6952a829ae
commit 6952a829ae
parent 9f46071502
11 changed files with 976 additions and 1 deletions
--- a/services/sentiment_analyzer/analyzers/finbert.py
+++ b/services/sentiment_analyzer/analyzers/finbert.py
@@ -0,0 +1,113 @@
+"""FinBERT-based financial sentiment analyzer.
+
+Uses the ProsusAI/finbert model via the HuggingFace transformers library
+to classify article text as positive, negative, or neutral.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+from typing import Any
+
+logger = logging.getLogger(__name__)  # module-level logger, named after this module
+
+
+class FinBERTAnalyzer:
+    """Lazy-loading wrapper around a transformers sentiment-analysis pipeline.
+
+    The heavy ``transformers`` + ``torch`` imports and model download happen
+    only once, on the first call to :meth:`analyze`.  Both the load and the
+    inference run in the event loop's default executor.
+    """
+
+    def __init__(self, model_name: str = "ProsusAI/finbert", max_content_length: int = 512) -> None:
+        self.model_name = model_name
+        self.max_content_length = max_content_length
+        self._pipeline: Any | None = None
+        # Guards the first model load; created lazily inside a running loop.
+        self._load_lock: asyncio.Lock | None = None
+
+    def _load_pipeline(self) -> Any:
+        """Build the transformers pipeline on first use and return it (blocking)."""
+        if self._pipeline is None:
+            from transformers import pipeline  # type: ignore[import-untyped]
+
+            logger.info("Loading FinBERT model: %s", self.model_name)
+            # ``top_k=None`` (all labels) replaces deprecated ``return_all_scores``.
+            self._pipeline = pipeline("sentiment-analysis", model=self.model_name, top_k=None)
+            logger.info("FinBERT model loaded successfully")
+        return self._pipeline
+
+    async def analyze(self, title: str, content: str) -> tuple[float, float]:
+        """Score the sentiment of an article.
+
+        Parameters
+        ----------
+        title:
+            Article headline.
+        content:
+            Article body text.
+
+        Returns
+        -------
+        tuple[float, float]
+            ``(sentiment_score, confidence)`` where *sentiment_score* is in
+            ``[-1.0, 1.0]`` and *confidence* is in ``[0.0, 1.0]``.
+
+        The text is cut to ``max_content_length * 4`` characters up front and
+        the pipeline tokenizer then truncates to ``max_content_length`` tokens.
+        """
+        loop = asyncio.get_running_loop()
+        if self._pipeline is None:
+            # Loading is far slower than inference, so it must not run on the
+            # loop thread.  The lock keeps concurrent first calls from each
+            # loading their own copy; ``_load_pipeline`` re-checks the cache.
+            if self._load_lock is None:
+                self._load_lock = asyncio.Lock()
+            async with self._load_lock:
+                await loop.run_in_executor(None, self._load_pipeline)
+        pipe = self._pipeline
+
+        # Rough estimate: 1 token ~ 4 chars for English.  The tokenizer does
+        # the precise truncation; this only avoids sending enormous strings.
+        text = f"{title}. {content}"[: self.max_content_length * 4]
+        # Run the blocking model inference in a thread pool as well.
+        results = await loop.run_in_executor(
+            None,
+            lambda: pipe(text, truncation=True, max_length=self.max_content_length),
+        )
+        return self._parse_scores(results)
+
+    @staticmethod
+    def _parse_scores(results: list[Any]) -> tuple[float, float]:
+        """Map pipeline output to ``(score, confidence)``.
+
+        Accepts the nested ``[[{"label": "positive", "score": 0.85}, ...]]``
+        shape as well as a flat ``[{"label": ..., "score": ...}, ...]`` list.
+        Empty output yields ``(0.0, 0.0)``.
+
+        Mapping:
+          - ``"positive"`` -> +1
+          - ``"negative"`` -> -1
+          - ``"neutral"``  ->  0
+
+        The sentiment score is the weighted sum of label polarities scaled by
+        their softmax probabilities.  Confidence is the maximum softmax
+        probability.
+        """
+        label_map = {"positive": 1.0, "negative": -1.0, "neutral": 0.0}
+
+        if not results:
+            return 0.0, 0.0
+        # A flat list holds the label dicts directly; otherwise unwrap the
+        # per-input list for the single text that was submitted.
+        scores = results if isinstance(results[0], dict) else results[0]
+
+        sentiment_score = 0.0
+        confidence = 0.0
+        for entry in scores:
+            prob = entry["score"]
+            sentiment_score += label_map.get(entry["label"].lower(), 0.0) * prob
+            confidence = max(confidence, prob)
+        return sentiment_score, confidence