feat(meet-kevin): Claude Sonnet 4.6 LLM analyzer (tool-use forcing + prompt cache)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
145f7dbec5
commit
8309556c00
2 changed files with 887 additions and 0 deletions
427
services/meet_kevin_watcher/llm_analyzer.py
Normal file
427
services/meet_kevin_watcher/llm_analyzer.py
Normal file
|
|
@ -0,0 +1,427 @@
|
|||
"""Claude LLM analyzer for Meet Kevin video transcripts.
|
||||
|
||||
Calls Claude Sonnet 4.6 with tool-use forcing to extract structured
|
||||
MeetKevinAnalysis from a video transcript. Uses prompt caching on the
|
||||
system block to reduce cost across videos processed within the same
|
||||
5-minute window.
|
||||
|
||||
Public API:
|
||||
SYSTEM_PROMPT — module-level analyst instructions
|
||||
compute_cost_usd() — Decimal-precise cost from token counts
|
||||
LlmCallResult — frozen dataclass returned by analyze()
|
||||
LlmAnalyzer — async class; .analyze() does the API call
|
||||
"""
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from decimal import Decimal
|
||||
from typing import Any
|
||||
|
||||
from anthropic import AsyncAnthropic
|
||||
|
||||
from shared.schemas.meet_kevin import MeetKevinAnalysis
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Pricing table (USD per 1 000 000 tokens: input, output)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_PRICING: dict[str, tuple[Decimal, Decimal]] = {
|
||||
"claude-sonnet-4-6": (Decimal("3"), Decimal("15")),
|
||||
"claude-opus-4-7": (Decimal("15"), Decimal("75")),
|
||||
"claude-haiku-4-5-20251001": (Decimal("1"), Decimal("5")),
|
||||
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# System prompt
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
SYSTEM_PROMPT = """
|
||||
You are a professional financial analyst specialising in retail investor sentiment.
|
||||
Your task is to read the full transcript of a Meet Kevin (Kevin Paffrath) YouTube
|
||||
video and extract a structured investment analysis from it.
|
||||
|
||||
## Your mission
|
||||
|
||||
Read the transcript carefully and produce a single, precise call to the
|
||||
`submit_analysis` tool. Do **not** respond with prose — your entire output must be
|
||||
that one tool call with all required fields filled in correctly.
|
||||
|
||||
## What to extract
|
||||
|
||||
### Market outlook
|
||||
Identify the overall market direction Kevin is expressing: bullish, bearish, neutral,
|
||||
or mixed. Write a concise `market_outlook_reasoning` (2–4 sentences) that explains
|
||||
*why* you assigned that direction, grounded in specific statements from the video.
|
||||
|
||||
### Macro themes
|
||||
List the 2–6 highest-level economic or policy themes Kevin discusses (e.g.
|
||||
"Federal Reserve rate path", "AI capex cycle", "commercial real estate stress",
|
||||
"dollar strength", "energy transition"). These should be phrase-length labels, not
|
||||
full sentences.
|
||||
|
||||
### Key risks
|
||||
List the 2–5 principal downside risks Kevin flags. Again, short phrase labels, not
|
||||
paragraphs. Only include risks Kevin explicitly names or clearly implies — do not
|
||||
invent risks he did not discuss.
|
||||
|
||||
### Summary
|
||||
Write a ~200-word plain-English summary of the video's investment thesis. Focus on
|
||||
actionable takeaways and any specific catalysts Kevin mentions. Avoid filler phrases
|
||||
like "In this video Kevin discusses…" — start directly with the insight.
|
||||
|
||||
### Per-ticker mentions (tickers field)
|
||||
Extract every stock, ETF, or crypto ticker that Kevin makes a substantive statement
|
||||
about. For each one, fill in the following:
|
||||
|
||||
- **symbol** — The uppercase ticker symbol (e.g. "NVDA", "SPY", "BTC"). If Kevin
|
||||
mentions the company name but not the ticker, infer the ticker from the name (e.g.
|
||||
"Nvidia" → "NVDA"). Max 6 characters. Only include tickers you are confident about.
|
||||
|
||||
- **action** — The clearest action signal you can infer from what Kevin says. Use
|
||||
exactly one of: `buy`, `sell`, `hold`, `watch`, `avoid`. If Kevin expresses
|
||||
interest but no clear directional view, use `watch`. If he says he is exiting or
|
||||
would not touch it, use `sell` or `avoid` respectively. Do not default to `hold`
|
||||
just because you are unsure — skip the ticker instead.
|
||||
|
||||
- **conviction** — A float between 0.0 and 1.0 representing how confident Kevin
|
||||
sounds. Use 0.8–1.0 for "I'm buying this aggressively / this is my top pick",
|
||||
0.5–0.7 for a clear directional view with some hedging, 0.2–0.4 for a tentative
|
||||
or heavily-caveated take. A ticker Kevin mentions only in passing (< 20 words of
|
||||
commentary) should be **skipped entirely** rather than assigned low conviction.
|
||||
|
||||
- **time_horizon** — Pick the closest match from: `intraday`, `days`, `weeks`,
|
||||
`months`, `long_term`, `unspecified`. If Kevin does not say, use `unspecified`.
|
||||
|
||||
- **rationale_quote** — A short verbatim or lightly paraphrased quote (20–80 words)
|
||||
from the transcript that best justifies the action you assigned. Include enough
|
||||
context to be meaningful on its own.
|
||||
|
||||
- **video_timestamp_seconds** — If the transcript includes segment timestamps (lines
|
||||
formatted as `[<N>s] <text>`), set this to the integer second where Kevin first
|
||||
makes the substantive statement about this ticker. If no timestamps are available,
|
||||
set to null.
|
||||
|
||||
## Rules for ticker inclusion
|
||||
|
||||
1. **Skip tickers mentioned only in passing.** Kevin often references tickers as
|
||||
examples or comparisons without making any recommendation. If he says fewer than
|
||||
~20 words about a ticker with no clear directional signal, omit it from `tickers`.
|
||||
|
||||
2. **Do not duplicate tickers.** If Kevin mentions the same ticker multiple times,
|
||||
merge the signals into a single entry that represents his overall view from the
|
||||
video. Use the timestamp of the *first* substantive mention.
|
||||
|
||||
3. **Symbols only, no company names.** The `symbol` field must be a ticker, not a
|
||||
company name. "Nvidia" is wrong; "NVDA" is correct.
|
||||
|
||||
4. **Conviction scores are comparative.** Calibrate them relative to each other
|
||||
within the video — Kevin's "top conviction" pick in a video might be 0.85, while
|
||||
a hedged mention is 0.45.
|
||||
|
||||
## Quality checklist (review before calling submit_analysis)
|
||||
|
||||
- [ ] `market_outlook_direction` is one of: bullish, neutral, bearish, mixed
|
||||
- [ ] `macro_themes` has 2–6 items, each a concise phrase
|
||||
- [ ] `key_risks` has 2–5 items, each a concise phrase
|
||||
- [ ] `summary` is approximately 200 words
|
||||
- [ ] Every ticker in `tickers` has a clear actionable signal (no "I'm not sure")
|
||||
- [ ] Tickers mentioned only in passing are omitted
|
||||
- [ ] `conviction` values are floats in [0.0, 1.0]
|
||||
- [ ] `time_horizon` is one of the six allowed values
|
||||
- [ ] `rationale_quote` is grounded in something Kevin actually said
|
||||
- [ ] You are calling `submit_analysis` exactly once with all required fields
|
||||
|
||||
Now read the transcript provided in the user message and call `submit_analysis`.
|
||||
""".strip()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tool definition (JSON Schema mirroring MeetKevinAnalysis)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_ANALYSIS_TOOL: dict[str, Any] = {
|
||||
"name": "submit_analysis",
|
||||
"description": (
|
||||
"Submit a structured analysis of a Meet Kevin video transcript. "
|
||||
"Call this exactly once with all fields filled in."
|
||||
),
|
||||
"input_schema": {
|
||||
"type": "object",
|
||||
"required": [
|
||||
"market_outlook_direction",
|
||||
"market_outlook_reasoning",
|
||||
"macro_themes",
|
||||
"key_risks",
|
||||
"summary",
|
||||
"tickers",
|
||||
],
|
||||
"properties": {
|
||||
"market_outlook_direction": {
|
||||
"type": "string",
|
||||
"enum": ["bullish", "neutral", "bearish", "mixed"],
|
||||
"description": "Overall market sentiment direction",
|
||||
},
|
||||
"market_outlook_reasoning": {
|
||||
"type": "string",
|
||||
"description": "2-4 sentence explanation of the market outlook direction",
|
||||
},
|
||||
"macro_themes": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"description": "2-6 high-level macro economic themes discussed",
|
||||
},
|
||||
"key_risks": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"description": "2-5 principal downside risks Kevin mentions",
|
||||
},
|
||||
"summary": {
|
||||
"type": "string",
|
||||
"description": "~200-word plain-English investment thesis summary",
|
||||
},
|
||||
"tickers": {
|
||||
"type": "array",
|
||||
"description": "Per-ticker mentions with action and conviction",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"required": [
|
||||
"symbol",
|
||||
"action",
|
||||
"conviction",
|
||||
"time_horizon",
|
||||
"rationale_quote",
|
||||
"video_timestamp_seconds",
|
||||
],
|
||||
"properties": {
|
||||
"symbol": {
|
||||
"type": "string",
|
||||
"description": "Uppercase ticker symbol (1-6 chars)",
|
||||
},
|
||||
"action": {
|
||||
"type": "string",
|
||||
"enum": ["buy", "sell", "hold", "watch", "avoid"],
|
||||
"description": "Recommendation action",
|
||||
},
|
||||
"conviction": {
|
||||
"type": "number",
|
||||
"minimum": 0.0,
|
||||
"maximum": 1.0,
|
||||
"description": "Confidence in recommendation (0.0-1.0)",
|
||||
},
|
||||
"time_horizon": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"intraday",
|
||||
"days",
|
||||
"weeks",
|
||||
"months",
|
||||
"long_term",
|
||||
"unspecified",
|
||||
],
|
||||
"description": "Time horizon for the recommendation",
|
||||
},
|
||||
"rationale_quote": {
|
||||
"type": "string",
|
||||
"description": "Short verbatim or paraphrased quote from video",
|
||||
},
|
||||
"video_timestamp_seconds": {
|
||||
"type": ["integer", "null"],
|
||||
"description": "Timestamp in seconds for deep-link target",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def compute_cost_usd(model: str, input_tokens: int, output_tokens: int) -> Decimal:
    """Price one LLM call in USD from its token counts.

    Rates come from the module's pinned ``_PRICING`` table, which stores
    (input, output) USD prices per one million tokens for each model.

    Args:
        model: Model identifier; expected to be a key of ``_PRICING``.
        input_tokens: Count of input/prompt tokens.
        output_tokens: Count of output/completion tokens.

    Returns:
        The combined input + output cost, quantized to four decimal places.
        For a model missing from the table a warning is logged and
        ``Decimal("0")`` is returned instead.
    """
    try:
        input_rate, output_rate = _PRICING[model]
    except KeyError:
        logger.warning("compute_cost_usd: unknown model %r — returning zero cost", model)
        return Decimal("0")

    # Rates are quoted per million tokens, so scale each count down first.
    tokens_per_unit = Decimal("1000000")
    input_cost = Decimal(input_tokens) / tokens_per_unit * input_rate
    output_cost = Decimal(output_tokens) / tokens_per_unit * output_rate
    return (input_cost + output_cost).quantize(Decimal("0.0001"))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Result dataclass
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class LlmCallResult:
    """Read-only record describing the outcome of a single analyze() call.

    The dataclass is frozen, so assigning to a field after construction raises.
    """

    # Structured analysis validated from the tool call's input.
    analysis: MeetKevinAnalysis
    # Plain-dict snapshot of the response kept alongside the parsed analysis.
    raw_response: dict
    # Input token count taken from the response usage.
    prompt_tokens: int
    # Output token count taken from the response usage.
    completion_tokens: int
    # USD cost derived from the two token counts.
    cost_usd: Decimal
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Analyzer class
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_MAX_SEGMENTS = 1000
|
||||
|
||||
|
||||
class LlmAnalyzer:
    """Calls Claude to extract structured analysis from a video transcript.

    Every request forces the ``submit_analysis`` tool and marks the system
    prompt block with ephemeral ``cache_control``.

    Args:
        client: Configured AsyncAnthropic client.
        model: Model identifier (e.g. "claude-sonnet-4-6").
        prompt_version: Prompt version string. It is only stored on the
            instance here; presumably callers persist it with the analysis
            (kevin_analyses) — verify against the caller.
        max_tokens: Cap on generated tokens per request. Defaults to 4096,
            the value that used to be hard-coded in analyze().
    """

    def __init__(
        self,
        client: AsyncAnthropic,
        model: str,
        prompt_version: str,
        max_tokens: int = 4096,
    ) -> None:
        self._client = client
        self._model = model
        self._prompt_version = prompt_version
        self._max_tokens = max_tokens

    async def analyze(
        self,
        *,
        title: str,
        description: str,
        published_at: datetime,
        transcript_text: str,
        transcript_segments: list[dict],
    ) -> LlmCallResult:
        """Run Claude analysis on a transcript and return a structured result.

        Args:
            title: Video title.
            description: Video description (may be empty).
            published_at: Publication timestamp. Aware values are converted
                to UTC for display; naive values are assumed to be UTC.
            transcript_text: Full concatenated transcript text. Only used
                when ``transcript_segments`` is empty.
            transcript_segments: List of {start, end, text} dicts.

        Returns:
            LlmCallResult with parsed MeetKevinAnalysis and token accounting.

        Raises:
            ValueError: If the response contains no tool_use block.
            pydantic.ValidationError: If tool_use input fails schema validation.
        """
        user_msg = self._build_user_message(
            title=title,
            description=description,
            published_at=published_at,
            transcript_text=transcript_text,
            transcript_segments=transcript_segments,
        )

        response = await self._client.messages.create(
            model=self._model,
            max_tokens=self._max_tokens,
            system=[
                {
                    "type": "text",
                    "text": SYSTEM_PROMPT,
                    "cache_control": {"type": "ephemeral"},
                }
            ],
            tools=[_ANALYSIS_TOOL],
            tool_choice={"type": "tool", "name": "submit_analysis"},
            messages=[{"role": "user", "content": user_msg}],
        )

        # A response cut off at the token cap may carry an incomplete tool
        # input; say so before validation fails with a less obvious error.
        if response.stop_reason == "max_tokens":
            logger.warning(
                "analyze: response stopped at max_tokens=%d; tool input may be truncated",
                self._max_tokens,
            )

        # Find the first tool_use block
        tool_block = next(
            (b for b in response.content if b.type == "tool_use"),
            None,
        )
        if tool_block is None:
            raise ValueError(
                f"Claude response contained no tool_use block "
                f"(stop_reason={response.stop_reason!r})"
            )

        analysis = MeetKevinAnalysis.model_validate(tool_block.input)

        # NOTE(review): with prompt caching active, usage.input_tokens may not
        # include cache-read / cache-write tokens, in which case prompt_tokens
        # and cost_usd are understated — confirm against the Anthropic usage docs.
        prompt_tokens: int = response.usage.input_tokens
        completion_tokens: int = response.usage.output_tokens
        cost_usd = compute_cost_usd(self._model, prompt_tokens, completion_tokens)

        raw_response: dict = {
            "stop_reason": response.stop_reason,
            "tool_name": tool_block.name,
            "tool_input": tool_block.input,
            "usage": {
                "input_tokens": prompt_tokens,
                "output_tokens": completion_tokens,
            },
        }

        return LlmCallResult(
            analysis=analysis,
            raw_response=raw_response,
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            cost_usd=cost_usd,
        )

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _build_user_message(
        self,
        *,
        title: str,
        description: str,
        published_at: datetime,
        transcript_text: str,
        transcript_segments: list[dict],
    ) -> str:
        """Build the user-turn message for the API call.

        The message is a newline-joined block: title, publication time, an
        optional description, a blank line, then the transcript. Timestamped
        segments (``[<N>s] <text>``) are preferred over the plain text; when
        neither is available a placeholder line is emitted.
        """
        # Local import keeps this method self-contained; the module only
        # imports ``datetime`` itself.
        from datetime import timezone

        # The line below is labelled "UTC", so aware timestamps in another
        # zone must be converted first. Naive timestamps are taken as UTC.
        if published_at.tzinfo is not None and published_at.utcoffset() is not None:
            published_at = published_at.astimezone(timezone.utc)

        parts: list[str] = [
            f"Title: {title}",
            f"Published: {published_at.strftime('%Y-%m-%d %H:%M UTC')}",
        ]
        if description:
            parts.append(f"Description: {description}")

        parts.append("")  # blank line before transcript

        if transcript_segments:
            total_segments = len(transcript_segments)
            if total_segments > _MAX_SEGMENTS:
                # Truncation drops the tail of the video; make that visible.
                logger.warning(
                    "_build_user_message: transcript has %d segments; "
                    "only the first %d are sent",
                    total_segments,
                    _MAX_SEGMENTS,
                )
            parts.append("Transcript (with timestamps):")
            # ``or`` fallbacks tolerate segments whose start/text is None.
            parts.extend(
                f"[{int(seg.get('start') or 0)}s] {(seg.get('text') or '').strip()}"
                for seg in transcript_segments[:_MAX_SEGMENTS]
            )
        elif transcript_text:
            parts.append("Transcript:")
            parts.append(transcript_text)
        else:
            parts.append("Transcript: (no transcript available)")

        return "\n".join(parts)
|
||||
460
tests/services/meet_kevin_watcher/test_llm_analyzer.py
Normal file
460
tests/services/meet_kevin_watcher/test_llm_analyzer.py
Normal file
|
|
@ -0,0 +1,460 @@
|
|||
"""Tests for the Claude LLM analyzer (Task 7).
|
||||
|
||||
Tests use MagicMock/AsyncMock to avoid real API calls.
|
||||
"""
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from decimal import Decimal
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from services.meet_kevin_watcher.llm_analyzer import (
|
||||
SYSTEM_PROMPT,
|
||||
LlmAnalyzer,
|
||||
LlmCallResult,
|
||||
compute_cost_usd,
|
||||
)
|
||||
from shared.schemas.meet_kevin import (
|
||||
MarketOutlook,
|
||||
MeetKevinAnalysis,
|
||||
TickerAction,
|
||||
TimeHorizon,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _make_anthropic_response(tool_input, in_tokens=5000, out_tokens=800):
|
||||
"""Build a minimal mock of an Anthropic messages.create response."""
|
||||
block = MagicMock()
|
||||
block.type = "tool_use"
|
||||
block.name = "submit_analysis"
|
||||
block.input = tool_input
|
||||
|
||||
resp = MagicMock()
|
||||
resp.content = [block]
|
||||
resp.usage = MagicMock(input_tokens=in_tokens, output_tokens=out_tokens)
|
||||
resp.stop_reason = "tool_use"
|
||||
return resp
|
||||
|
||||
|
||||
def _valid_analysis_input() -> dict:
|
||||
"""Return a dict that Pydantic can validate into MeetKevinAnalysis."""
|
||||
return {
|
||||
"market_outlook_direction": "bullish",
|
||||
"market_outlook_reasoning": "Strong earnings and low unemployment.",
|
||||
"macro_themes": ["Fed pivot", "AI boom"],
|
||||
"key_risks": ["Inflation rebound", "Credit crunch"],
|
||||
"summary": "Kevin discussed the current bull market and highlighted several tech stocks.",
|
||||
"tickers": [
|
||||
{
|
||||
"symbol": "NVDA",
|
||||
"action": "buy",
|
||||
"conviction": 0.85,
|
||||
"time_horizon": "months",
|
||||
"rationale_quote": "AI infrastructure buildout has years to run",
|
||||
"video_timestamp_seconds": 320,
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def _make_client(response=None):
|
||||
"""Return a mocked AsyncAnthropic client with messages.create wired up."""
|
||||
mock_create = AsyncMock(return_value=response)
|
||||
mock_messages = MagicMock()
|
||||
mock_messages.create = mock_create
|
||||
|
||||
client = MagicMock()
|
||||
client.messages = mock_messages
|
||||
return client, mock_create
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# compute_cost_usd
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestComputeCostUsd:
    """Pin the Decimal cost arithmetic for each priced model and the fallbacks."""

    def test_sonnet_46_pricing(self):
        """claude-sonnet-4-6 is priced at $3/M input and $15/M output."""
        # One million tokens each way: 3 + 15 = 18 dollars.
        expected = Decimal("18.0000")
        assert compute_cost_usd("claude-sonnet-4-6", 1_000_000, 1_000_000) == expected

    def test_opus_47_pricing(self):
        """claude-opus-4-7 is priced at $15/M input and $75/M output."""
        expected = Decimal("90.0000")
        assert compute_cost_usd("claude-opus-4-7", 1_000_000, 1_000_000) == expected

    def test_haiku_45_pricing(self):
        """claude-haiku-4-5-20251001 is priced at $1/M input and $5/M output."""
        expected = Decimal("6.0000")
        assert compute_cost_usd("claude-haiku-4-5-20251001", 1_000_000, 1_000_000) == expected

    def test_unknown_model_returns_zero(self):
        """A model missing from the pricing table costs Decimal('0')."""
        cost = compute_cost_usd("unknown-model", 1000, 1000)
        assert cost == Decimal("0")

    def test_zero_tokens(self):
        """No tokens in either direction means no cost."""
        cost = compute_cost_usd("claude-sonnet-4-6", 0, 0)
        assert cost == Decimal("0")

    def test_result_is_decimal(self):
        """The cost comes back as a Decimal rather than a float."""
        cost = compute_cost_usd("claude-sonnet-4-6", 5000, 800)
        assert isinstance(cost, Decimal)

    def test_small_realistic_call(self):
        """A 10K-input / 1K-output Sonnet 4.6 call costs 4.5 cents."""
        # input:  10_000 / 1_000_000 * 3  = 0.0300
        # output:  1_000 / 1_000_000 * 15 = 0.0150
        expected = Decimal("0.0450")
        assert compute_cost_usd("claude-sonnet-4-6", 10_000, 1_000) == expected
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# SYSTEM_PROMPT
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestSystemPrompt:
    """Smoke-check that SYSTEM_PROMPT still carries its key guidance markers."""

    def test_contains_submit_analysis(self):
        """The prompt names the 'submit_analysis' tool."""
        assert "submit_analysis" in SYSTEM_PROMPT

    def test_contains_ticker(self):
        """The prompt talks about tickers, in any letter case."""
        lowered = SYSTEM_PROMPT.lower()
        assert "ticker" in lowered

    def test_is_substantial(self):
        """The prompt holds at least 300 whitespace-separated words."""
        word_count = len(SYSTEM_PROMPT.split())
        assert word_count >= 300, f"SYSTEM_PROMPT is only {word_count} words"

    def test_mentions_conviction(self):
        """The prompt covers conviction scoring."""
        lowered = SYSTEM_PROMPT.lower()
        assert "conviction" in lowered

    def test_mentions_time_horizon(self):
        """The prompt names the time_horizon field or the phrase 'time horizon'."""
        has_field_name = "time_horizon" in SYSTEM_PROMPT
        assert has_field_name or "time horizon" in SYSTEM_PROMPT.lower()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# LlmCallResult dataclass
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestLlmCallResult:
    """Check that LlmCallResult exposes its fields and rejects mutation."""

    @staticmethod
    def _sample(cost):
        """Build an (analysis, result) pair from the shared valid payload."""
        analysis = MeetKevinAnalysis(**_valid_analysis_input())
        result = LlmCallResult(
            analysis=analysis,
            raw_response={"stop_reason": "tool_use"},
            prompt_tokens=5000,
            completion_tokens=800,
            cost_usd=cost,
        )
        return analysis, result

    def test_is_frozen(self):
        """Assigning to a field of a constructed result raises."""
        _, result = self._sample(Decimal("0.027"))
        with pytest.raises((AttributeError, TypeError)):
            result.prompt_tokens = 9999  # type: ignore

    def test_fields_accessible(self):
        """Each of the five fields returns what the constructor was given."""
        cost = Decimal("0.027")
        analysis, result = self._sample(cost)

        assert result.analysis is analysis
        assert result.raw_response == {"stop_reason": "tool_use"}
        assert result.prompt_tokens == 5000
        assert result.completion_tokens == 800
        assert result.cost_usd == cost
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# LlmAnalyzer.analyze — happy path
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestLlmAnalyzerHappyPath:
    """Happy-path tests for the analyzer."""

    @staticmethod
    async def _run_analysis(
        model="claude-sonnet-4-6", in_tokens=5000, out_tokens=800, **overrides
    ):
        """Run analyze() against a mocked client returning a valid tool call.

        ``overrides`` replace individual analyze() keyword arguments; the
        defaults describe a video with no description and no transcript.

        Returns:
            ``(result, mock_create)`` — the LlmCallResult and the AsyncMock
            standing in for ``messages.create``.
        """
        resp = _make_anthropic_response(
            _valid_analysis_input(), in_tokens=in_tokens, out_tokens=out_tokens
        )
        client, mock_create = _make_client(resp)
        analyzer = LlmAnalyzer(client=client, model=model, prompt_version="v1")

        call_kwargs = {
            "title": "Test",
            "description": "",
            "published_at": datetime(2026, 5, 21, tzinfo=timezone.utc),
            "transcript_text": "",
            "transcript_segments": [],
        }
        call_kwargs.update(overrides)
        result = await analyzer.analyze(**call_kwargs)
        return result, mock_create

    @pytest.mark.asyncio
    async def test_returns_llm_call_result(self):
        """analyze() returns an LlmCallResult with parsed MeetKevinAnalysis."""
        result, _ = await self._run_analysis(
            in_tokens=5000,
            out_tokens=800,
            title="Market Update",
            description="Kevin covers the latest market trends.",
            published_at=datetime(2026, 5, 21, 12, 0, 0, tzinfo=timezone.utc),
            transcript_text="Welcome to today's update. NVDA is looking strong.",
            transcript_segments=[
                {"start": 0.0, "end": 5.0, "text": "Welcome to today's update."},
                {"start": 5.0, "end": 10.0, "text": "NVDA is looking strong."},
            ],
        )

        assert isinstance(result, LlmCallResult)
        assert isinstance(result.analysis, MeetKevinAnalysis)
        assert result.prompt_tokens == 5000
        assert result.completion_tokens == 800

    @pytest.mark.asyncio
    async def test_analysis_fields_parsed_correctly(self):
        """Parsed MeetKevinAnalysis has correct field values from tool input."""
        result, _ = await self._run_analysis(
            title="Test Video",
            description="Description",
            transcript_text="Some transcript.",
        )

        analysis = result.analysis
        assert analysis.market_outlook_direction == MarketOutlook.BULLISH
        assert analysis.market_outlook_reasoning == "Strong earnings and low unemployment."
        assert "Fed pivot" in analysis.macro_themes
        assert len(analysis.tickers) == 1
        assert analysis.tickers[0].symbol == "NVDA"
        assert analysis.tickers[0].action == TickerAction.BUY
        assert analysis.tickers[0].conviction == pytest.approx(0.85)
        assert analysis.tickers[0].time_horizon == TimeHorizon.MONTHS

    @pytest.mark.asyncio
    async def test_cost_usd_is_positive(self):
        """cost_usd is calculated and positive for a valid token count."""
        result, _ = await self._run_analysis(in_tokens=10_000, out_tokens=1_000)

        assert result.cost_usd > Decimal("0")
        # 10K input at $3/M plus 1K output at $15/M for Sonnet 4.6.
        assert result.cost_usd == Decimal("0.0450")

    @pytest.mark.asyncio
    async def test_api_called_with_tool_choice_forcing(self):
        """messages.create is called with tool_choice forcing submit_analysis."""
        _, mock_create = await self._run_analysis()

        mock_create.assert_called_once()
        kwargs = mock_create.call_args.kwargs
        assert kwargs["tool_choice"] == {"type": "tool", "name": "submit_analysis"}

    @pytest.mark.asyncio
    async def test_api_called_with_cache_control_on_system(self):
        """System prompt is passed with cache_control: {type: ephemeral}."""
        _, mock_create = await self._run_analysis()

        system = mock_create.call_args.kwargs["system"]
        assert isinstance(system, list)
        assert len(system) >= 1
        assert system[0]["type"] == "text"
        assert system[0]["cache_control"] == {"type": "ephemeral"}

    @pytest.mark.asyncio
    async def test_api_called_with_correct_model(self):
        """messages.create is called with the model passed to LlmAnalyzer."""
        _, mock_create = await self._run_analysis(model="claude-opus-4-7")

        assert mock_create.call_args.kwargs["model"] == "claude-opus-4-7"

    @pytest.mark.asyncio
    async def test_api_called_with_submit_analysis_tool(self):
        """Tool definition includes name='submit_analysis'."""
        _, mock_create = await self._run_analysis()

        tools = mock_create.call_args.kwargs["tools"]
        assert any(t.get("name") == "submit_analysis" for t in tools)

    @pytest.mark.asyncio
    async def test_raw_response_is_captured(self):
        """raw_response in LlmCallResult holds serializable dict."""
        import json  # local: only this test needs it

        result, _ = await self._run_analysis()

        assert isinstance(result.raw_response, dict)
        # Back the "serializable" claim: dumping must not raise.
        json.dumps(result.raw_response)

    @pytest.mark.asyncio
    async def test_transcript_segments_included_in_user_message(self):
        """User message contains timestamped segment lines from transcript_segments."""
        segments = [
            {"start": 0.0, "end": 5.0, "text": "Hello world."},
            {"start": 5.0, "end": 10.0, "text": "Let's talk stocks."},
        ]

        _, mock_create = await self._run_analysis(
            transcript_text="Hello world. Let's talk stocks.",
            transcript_segments=segments,
        )

        user_content = mock_create.call_args.kwargs["messages"][0]["content"]
        # Each segment appears with its integer-second "[<N>s]" prefix.
        assert "[0s] Hello world." in user_content
        assert "[5s] Let's talk stocks." in user_content
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# LlmAnalyzer.analyze — failure paths
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestLlmAnalyzerFailurePaths:
    """Failure path tests."""

    @staticmethod
    async def _analyze(resp):
        """Call analyze() with minimal inputs against a client returning ``resp``."""
        client, _ = _make_client(resp)
        analyzer = LlmAnalyzer(client=client, model="claude-sonnet-4-6", prompt_version="v1")
        return await analyzer.analyze(
            title="Test",
            description="",
            published_at=datetime(2026, 5, 21, tzinfo=timezone.utc),
            transcript_text="",
            transcript_segments=[],
        )

    @pytest.mark.asyncio
    async def test_no_tool_use_block_raises_value_error(self):
        """If response has no tool_use block, raises ValueError mentioning tool_use."""
        # Response with a text block instead of tool_use
        text_block = MagicMock()
        text_block.type = "text"
        text_block.text = "Here is my analysis..."

        resp = MagicMock()
        resp.content = [text_block]
        resp.usage = MagicMock(input_tokens=5000, output_tokens=800)
        resp.stop_reason = "end_turn"

        with pytest.raises(ValueError, match="tool_use"):
            await self._analyze(resp)

    @pytest.mark.asyncio
    async def test_empty_content_raises_value_error(self):
        """If response content is empty, raises ValueError mentioning tool_use."""
        resp = MagicMock()
        resp.content = []
        resp.usage = MagicMock(input_tokens=5000, output_tokens=800)
        resp.stop_reason = "tool_use"

        with pytest.raises(ValueError, match="tool_use"):
            await self._analyze(resp)

    @pytest.mark.asyncio
    async def test_invalid_tool_input_raises_validation_error(self):
        """Malformed tool input (invalid enum) raises a validation error."""
        bad_input = _valid_analysis_input()
        bad_input["market_outlook_direction"] = "extremely_bullish"  # not a valid enum

        resp = _make_anthropic_response(bad_input)

        # pydantic's ValidationError derives from ValueError, so this stays
        # narrow without importing pydantic here.
        with pytest.raises(ValueError) as excinfo:
            await self._analyze(resp)
        # Make sure this is the schema failure, not the missing-block error.
        assert "tool_use" not in str(excinfo.value)
|
||||
Loading…
Add table
Add a link
Reference in a new issue