feat: sentiment analyzer — FinBERT + Ollama tiered analysis
This commit is contained in:
parent
9f46071502
commit
6952a829ae
11 changed files with 976 additions and 1 deletions
113
services/sentiment_analyzer/analyzers/finbert.py
Normal file
113
services/sentiment_analyzer/analyzers/finbert.py
Normal file
|
|
@ -0,0 +1,113 @@
|
|||
"""FinBERT-based financial sentiment analyzer.
|
||||
|
||||
Uses the ProsusAI/finbert model via the HuggingFace transformers library
|
||||
to classify article text as positive, negative, or neutral.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
import logging
import threading
from typing import Any
|
||||
|
||||
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class FinBERTAnalyzer:
    """Lazy-loading wrapper around a transformers sentiment-analysis pipeline.

    The heavy ``transformers`` import and pipeline construction happen only
    once, on the first call to :meth:`analyze`, and are executed in a worker
    thread so the asyncio event loop is never blocked by model loading.
    """

    def __init__(self, model_name: str = "ProsusAI/finbert", max_content_length: int = 512) -> None:
        self.model_name = model_name
        self.max_content_length = max_content_length
        self._pipeline: Any | None = None
        # Serializes the first load: analyze() runs the loader in the default
        # thread pool, so concurrent first calls may race to build the model.
        self._load_lock = threading.Lock()

    def _load_pipeline(self) -> Any:
        """Return the transformers pipeline, building it on first use.

        Safe to call from several threads at once: the pipeline is built at
        most once and then cached on the instance.
        """
        if self._pipeline is None:
            with self._load_lock:
                # Re-check under the lock; another thread may have finished
                # loading while this one was waiting.
                if self._pipeline is None:
                    from transformers import pipeline  # type: ignore[import-untyped]

                    logger.info("Loading FinBERT model: %s", self.model_name)
                    self._pipeline = pipeline(
                        "sentiment-analysis",
                        model=self.model_name,
                        # ``top_k=None`` returns the score of every label; it
                        # replaces the deprecated ``return_all_scores=True``.
                        top_k=None,
                    )
                    logger.info("FinBERT model loaded successfully")
        return self._pipeline

    async def analyze(self, title: str, content: str) -> tuple[float, float]:
        """Score the sentiment of an article.

        Parameters
        ----------
        title:
            Article headline.
        content:
            Article body text.

        Returns
        -------
        tuple[float, float]
            ``(sentiment_score, confidence)`` where *sentiment_score* is in
            ``[-1.0, 1.0]`` and *confidence* is in ``[0.0, 1.0]``.

        Notes
        -----
        The combined text is first cut to ``max_content_length * 4``
        characters (rough estimate: 1 token ~ 4 chars for English) to avoid
        sending enormous strings to the tokenizer. The pipeline is then called
        with ``truncation=True`` and ``max_length=max_content_length`` so the
        tokenizer performs the precise token-level truncation.

        Both the lazy model load and the inference run in the event loop's
        default executor, so neither blocks the loop.
        """
        text = f"{title}. {content}"
        text = text[: self.max_content_length * 4]

        def _infer() -> Any:
            # Resolve the pipeline inside the worker thread: the first call
            # imports transformers and loads the model, which is far too slow
            # to run on the event loop thread.
            pipe = self._load_pipeline()
            return pipe(text, truncation=True, max_length=self.max_content_length)

        loop = asyncio.get_running_loop()
        results = await loop.run_in_executor(None, _infer)

        return self._parse_scores(results)

    @staticmethod
    def _parse_scores(results: list[Any]) -> tuple[float, float]:
        """Map pipeline output to ``(score, confidence)``.

        Accepts either the nested per-input shape
        ``[[{"label": "positive", "score": 0.85}, ...]]`` or the flat shape
        ``[{"label": "positive", "score": 0.85}, ...]``.

        Mapping (labels are compared case-insensitively; unknown labels
        contribute ``0``):

        - ``"positive"`` -> +1
        - ``"negative"`` -> -1
        - ``"neutral"``  -> 0

        The sentiment score is the sum of label polarities weighted by their
        reported scores. Confidence is the largest reported score. Empty
        output yields ``(0.0, 0.0)``.
        """
        label_map = {"positive": 1.0, "negative": -1.0, "neutral": 0.0}

        # Unwrap the outer per-input list when present.
        if results and isinstance(results[0], list):
            entries = results[0]
        else:
            entries = results

        if not entries:
            return 0.0, 0.0

        sentiment_score = sum(
            (label_map.get(entry["label"].lower(), 0.0) * entry["score"] for entry in entries),
            0.0,
        )
        confidence = max((entry["score"] for entry in entries), default=0.0)

        return sentiment_score, confidence
|
||||
Loading…
Add table
Add a link
Reference in a new issue