Previous refactor (89f01ad) moved to OpenRouter because no sk-ant-api-* key
was found in Vault. Turns out claude-agent-service-spare-{1,2} hold
sk-ant-oat01-* OAuth tokens (108 chars, scope user:inference, 1-year TTL,
minted via 'claude setup-token' — see memory id=832).
These tokens work with the Anthropic SDK via the auth_token= constructor
argument (routes to Authorization: Bearer ... instead of x-api-key: ...).
They consume the Enterprise Claude subscription quota rather than
per-call billing, so the OpenRouter zero-credit problem goes away.
- llm_analyzer.py: revert OpenAI client to AsyncAnthropic; tool-use API
  + cache_control restored
- config.py: openrouter_api_key -> anthropic_oauth_token; model slug
  reverted from anthropic/claude-sonnet-4.5 to claude-sonnet-4-5
- main.py: AsyncOpenAI -> AsyncAnthropic(auth_token=...), drop OpenRouter
  attribution headers
- pyproject: openai>=1.50 -> anthropic>=0.40 in meet_kevin extras
- tests: mocks ported back to messages.create + tool_use blocks
463 lines
18 KiB
Python
463 lines
18 KiB
Python
"""Tests for the Anthropic SDK LLM analyzer (Task 7).

Tests use MagicMock/AsyncMock to avoid real API calls.
"""

from datetime import datetime, timezone
from decimal import Decimal
from unittest.mock import AsyncMock, MagicMock

import pytest

from services.meet_kevin_watcher.llm_analyzer import (
    SYSTEM_PROMPT,
    LlmAnalyzer,
    LlmCallResult,
    compute_cost_usd,
)
from shared.schemas.meet_kevin import (
    MarketOutlook,
    MeetKevinAnalysis,
    TickerAction,
    TimeHorizon,
)
# ---------------------------------------------------------------------------
# Test helpers
# ---------------------------------------------------------------------------

def _make_anthropic_response(
    tool_input: dict, in_tokens: int = 5000, out_tokens: int = 800
) -> MagicMock:
    """Mock an Anthropic Messages response with one tool_use block.

    Args:
        tool_input: Payload exposed as ``.input`` on the single content block.
        in_tokens: Value reported as ``usage.input_tokens``.
        out_tokens: Value reported as ``usage.output_tokens``.

    Returns:
        A MagicMock with ``content`` (one ``submit_analysis`` tool_use block),
        ``usage`` and ``stop_reason == "tool_use"`` populated.
    """
    block = MagicMock()
    block.type = "tool_use"
    # ``name`` is assigned after construction on purpose: MagicMock(name=...)
    # would name the mock itself instead of setting a ``.name`` attribute.
    block.name = "submit_analysis"
    block.input = tool_input

    resp = MagicMock()
    resp.content = [block]
    resp.usage = MagicMock(input_tokens=in_tokens, output_tokens=out_tokens)
    resp.stop_reason = "tool_use"
    return resp
def _valid_analysis_input() -> dict:
    """Return a dict that Pydantic can validate into MeetKevinAnalysis.

    A fresh dict is built on every call, so a test may mutate the result
    (e.g. to inject an invalid enum value) without affecting other tests.
    """
    return {
        "market_outlook_direction": "bullish",
        "market_outlook_reasoning": "Strong earnings and low unemployment.",
        "macro_themes": ["Fed pivot", "AI boom"],
        "key_risks": ["Inflation rebound", "Credit crunch"],
        "summary": "Kevin discussed the current bull market and highlighted several tech stocks.",
        "tickers": [
            {
                "symbol": "NVDA",
                "action": "buy",
                "conviction": 0.85,
                "time_horizon": "months",
                "rationale_quote": "AI infrastructure buildout has years to run",
                "video_timestamp_seconds": 320,
            }
        ],
    }
def _make_client(response=None) -> tuple:
    """Return a mocked AsyncAnthropic client with messages.create wired up.

    Args:
        response: Value the awaited ``client.messages.create(...)`` call
            resolves to.

    Returns:
        A ``(client, mock_create)`` pair; ``mock_create`` is the AsyncMock
        installed as ``client.messages.create`` so tests can inspect its calls.
    """
    mock_create = AsyncMock(return_value=response)
    mock_messages = MagicMock()
    mock_messages.create = mock_create

    client = MagicMock()
    client.messages = mock_messages
    return client, mock_create
# ---------------------------------------------------------------------------
# compute_cost_usd
# ---------------------------------------------------------------------------

class TestComputeCostUsd:
    """Verify monetary cost calculations using Decimal arithmetic."""

    def test_sonnet_45_native_pricing(self):
        """claude-sonnet-4-5: $3/M input + $15/M output = $18/M total."""
        # 1M input + 1M output = $3 + $15 = $18
        result = compute_cost_usd("claude-sonnet-4-5", 1_000_000, 1_000_000)
        assert result == Decimal("18.0000")

    def test_sonnet_46_native_pricing(self):
        """claude-sonnet-4-6: same pricing as 4-5 ($3/$15)."""
        result = compute_cost_usd("claude-sonnet-4-6", 1_000_000, 1_000_000)
        assert result == Decimal("18.0000")

    def test_opus_47_pricing(self):
        """claude-opus-4-7: $15/M input + $75/M output."""
        result = compute_cost_usd("claude-opus-4-7", 1_000_000, 1_000_000)
        assert result == Decimal("90.0000")

    def test_haiku_45_pricing(self):
        """claude-haiku-4-5-20251001: $1/M input + $5/M output."""
        result = compute_cost_usd("claude-haiku-4-5-20251001", 1_000_000, 1_000_000)
        assert result == Decimal("6.0000")

    def test_unknown_model_returns_zero(self):
        """Unknown model logs warning and returns Decimal('0')."""
        result = compute_cost_usd("unknown-model", 1000, 1000)
        assert result == Decimal("0")

    def test_zero_tokens(self):
        """Zero tokens produce zero cost."""
        result = compute_cost_usd("claude-sonnet-4-5", 0, 0)
        assert result == Decimal("0")

    def test_result_is_decimal(self):
        """Return type is always Decimal, not float."""
        result = compute_cost_usd("claude-sonnet-4-5", 5000, 800)
        assert isinstance(result, Decimal)

    def test_small_realistic_call(self):
        """Realistic 10K input + 1K output token call (Sonnet 4.5 native)."""
        # input:  10000/1_000_000 * 3  = 0.03000
        # output: 1000/1_000_000 * 15 = 0.01500
        # total:  0.04500
        result = compute_cost_usd("claude-sonnet-4-5", 10_000, 1_000)
        assert result == Decimal("0.0450")
# ---------------------------------------------------------------------------
# SYSTEM_PROMPT
# ---------------------------------------------------------------------------

class TestSystemPrompt:
    """Verify the system prompt has the required content markers."""

    def test_contains_submit_analysis(self):
        """SYSTEM_PROMPT must reference the tool name 'submit_analysis'."""
        assert "submit_analysis" in SYSTEM_PROMPT

    def test_contains_ticker(self):
        """SYSTEM_PROMPT must mention 'ticker' (case-insensitive)."""
        assert "ticker" in SYSTEM_PROMPT.lower()

    def test_is_substantial(self):
        """SYSTEM_PROMPT should be at least 300 words (analyst guidance)."""
        word_count = len(SYSTEM_PROMPT.split())
        assert word_count >= 300, f"SYSTEM_PROMPT is only {word_count} words"

    def test_mentions_conviction(self):
        """SYSTEM_PROMPT should mention conviction scoring."""
        assert "conviction" in SYSTEM_PROMPT.lower()

    def test_mentions_time_horizon(self):
        """SYSTEM_PROMPT should describe time_horizon field."""
        assert "time_horizon" in SYSTEM_PROMPT or "time horizon" in SYSTEM_PROMPT.lower()
# ---------------------------------------------------------------------------
# LlmCallResult dataclass
# ---------------------------------------------------------------------------

class TestLlmCallResult:
    """Verify LlmCallResult shape and immutability."""

    def test_is_frozen(self):
        """LlmCallResult must be a frozen dataclass."""
        analysis = MeetKevinAnalysis(**_valid_analysis_input())
        result = LlmCallResult(
            analysis=analysis,
            raw_response={"stop_reason": "tool_use"},
            prompt_tokens=5000,
            completion_tokens=800,
            cost_usd=Decimal("0.027"),
        )
        # Assigning to any field of the result must be rejected.
        with pytest.raises((AttributeError, TypeError)):
            result.prompt_tokens = 9999  # type: ignore

    def test_fields_accessible(self):
        """All five fields are accessible on LlmCallResult."""
        analysis = MeetKevinAnalysis(**_valid_analysis_input())
        cost = Decimal("0.027")
        result = LlmCallResult(
            analysis=analysis,
            raw_response={"stop_reason": "tool_use"},
            prompt_tokens=5000,
            completion_tokens=800,
            cost_usd=cost,
        )
        assert result.analysis is analysis
        assert result.raw_response == {"stop_reason": "tool_use"}
        assert result.prompt_tokens == 5000
        assert result.completion_tokens == 800
        assert result.cost_usd == cost
# ---------------------------------------------------------------------------
# LlmAnalyzer.analyze — happy path
# ---------------------------------------------------------------------------

class TestLlmAnalyzerHappyPath:
    """Happy-path tests for LlmAnalyzer.analyze against a mocked client."""

    @pytest.mark.asyncio
    async def test_returns_llm_call_result(self):
        """analyze() returns an LlmCallResult with parsed MeetKevinAnalysis."""
        tool_input = _valid_analysis_input()
        resp = _make_anthropic_response(tool_input, in_tokens=5000, out_tokens=800)
        client, _ = _make_client(resp)

        analyzer = LlmAnalyzer(client=client, model="claude-sonnet-4-5", prompt_version="v1")
        result = await analyzer.analyze(
            title="Market Update",
            description="Kevin covers the latest market trends.",
            published_at=datetime(2026, 5, 21, 12, 0, 0, tzinfo=timezone.utc),
            transcript_text="Welcome to today's update. NVDA is looking strong.",
            transcript_segments=[
                {"start": 0.0, "end": 5.0, "text": "Welcome to today's update."},
                {"start": 5.0, "end": 10.0, "text": "NVDA is looking strong."},
            ],
        )

        assert isinstance(result, LlmCallResult)
        assert isinstance(result.analysis, MeetKevinAnalysis)
        assert result.prompt_tokens == 5000
        assert result.completion_tokens == 800

    @pytest.mark.asyncio
    async def test_analysis_fields_parsed_correctly(self):
        """Parsed MeetKevinAnalysis has correct field values from tool input."""
        tool_input = _valid_analysis_input()
        resp = _make_anthropic_response(tool_input)
        client, _ = _make_client(resp)

        analyzer = LlmAnalyzer(client=client, model="claude-sonnet-4-5", prompt_version="v1")
        result = await analyzer.analyze(
            title="Test Video",
            description="Description",
            published_at=datetime(2026, 5, 21, tzinfo=timezone.utc),
            transcript_text="Some transcript.",
            transcript_segments=[],
        )

        analysis = result.analysis
        assert analysis.market_outlook_direction == MarketOutlook.BULLISH
        assert analysis.market_outlook_reasoning == "Strong earnings and low unemployment."
        assert "Fed pivot" in analysis.macro_themes
        assert len(analysis.tickers) == 1
        assert analysis.tickers[0].symbol == "NVDA"
        assert analysis.tickers[0].action == TickerAction.BUY
        assert analysis.tickers[0].conviction == pytest.approx(0.85)
        assert analysis.tickers[0].time_horizon == TimeHorizon.MONTHS

    @pytest.mark.asyncio
    async def test_cost_usd_is_positive(self):
        """cost_usd is calculated and positive for a valid token count."""
        resp = _make_anthropic_response(_valid_analysis_input(), in_tokens=10_000, out_tokens=1_000)
        client, _ = _make_client(resp)

        analyzer = LlmAnalyzer(client=client, model="claude-sonnet-4-5", prompt_version="v1")
        result = await analyzer.analyze(
            title="Test",
            description="",
            published_at=datetime(2026, 5, 21, tzinfo=timezone.utc),
            transcript_text="",
            transcript_segments=[],
        )

        assert result.cost_usd > Decimal("0")

    @pytest.mark.asyncio
    async def test_api_called_with_tool_choice_forcing(self):
        """messages.create is called with tool_choice forcing submit_analysis."""
        resp = _make_anthropic_response(_valid_analysis_input())
        client, mock_create = _make_client(resp)

        analyzer = LlmAnalyzer(client=client, model="claude-sonnet-4-5", prompt_version="v1")
        await analyzer.analyze(
            title="Test",
            description="",
            published_at=datetime(2026, 5, 21, tzinfo=timezone.utc),
            transcript_text="",
            transcript_segments=[],
        )

        mock_create.assert_called_once()
        kwargs = mock_create.call_args.kwargs
        assert kwargs["tool_choice"] == {"type": "tool", "name": "submit_analysis"}

    @pytest.mark.asyncio
    async def test_api_called_with_system_prompt_in_system_param(self):
        """System prompt is passed as the system parameter (list with cache_control)."""
        resp = _make_anthropic_response(_valid_analysis_input())
        client, mock_create = _make_client(resp)

        analyzer = LlmAnalyzer(client=client, model="claude-sonnet-4-5", prompt_version="v1")
        await analyzer.analyze(
            title="Test",
            description="",
            published_at=datetime(2026, 5, 21, tzinfo=timezone.utc),
            transcript_text="",
            transcript_segments=[],
        )

        kwargs = mock_create.call_args.kwargs
        system = kwargs["system"]
        assert isinstance(system, list)
        assert system[0]["type"] == "text"
        assert SYSTEM_PROMPT in system[0]["text"]
        assert system[0]["cache_control"] == {"type": "ephemeral"}

    @pytest.mark.asyncio
    async def test_api_called_with_correct_model(self):
        """messages.create is called with the model passed to LlmAnalyzer."""
        resp = _make_anthropic_response(_valid_analysis_input())
        client, mock_create = _make_client(resp)

        analyzer = LlmAnalyzer(client=client, model="claude-sonnet-4-5", prompt_version="v1")
        await analyzer.analyze(
            title="Test",
            description="",
            published_at=datetime(2026, 5, 21, tzinfo=timezone.utc),
            transcript_text="",
            transcript_segments=[],
        )

        kwargs = mock_create.call_args.kwargs
        assert kwargs["model"] == "claude-sonnet-4-5"

    @pytest.mark.asyncio
    async def test_api_called_with_submit_analysis_tool(self):
        """Tool definition includes name 'submit_analysis' with input_schema."""
        resp = _make_anthropic_response(_valid_analysis_input())
        client, mock_create = _make_client(resp)

        analyzer = LlmAnalyzer(client=client, model="claude-sonnet-4-5", prompt_version="v1")
        await analyzer.analyze(
            title="Test",
            description="",
            published_at=datetime(2026, 5, 21, tzinfo=timezone.utc),
            transcript_text="",
            transcript_segments=[],
        )

        kwargs = mock_create.call_args.kwargs
        tools = kwargs["tools"]
        assert any(
            t.get("name") == "submit_analysis" and "input_schema" in t
            for t in tools
        )

    @pytest.mark.asyncio
    async def test_raw_response_is_captured(self):
        """raw_response in LlmCallResult holds serializable dict."""
        resp = _make_anthropic_response(_valid_analysis_input())
        client, _ = _make_client(resp)

        analyzer = LlmAnalyzer(client=client, model="claude-sonnet-4-5", prompt_version="v1")
        result = await analyzer.analyze(
            title="Test",
            description="",
            published_at=datetime(2026, 5, 21, tzinfo=timezone.utc),
            transcript_text="",
            transcript_segments=[],
        )

        assert isinstance(result.raw_response, dict)

    @pytest.mark.asyncio
    async def test_transcript_segments_included_in_user_message(self):
        """User message contains timestamped segment lines from transcript_segments."""
        resp = _make_anthropic_response(_valid_analysis_input())
        client, mock_create = _make_client(resp)

        segments = [
            {"start": 0.0, "end": 5.0, "text": "Hello world."},
            {"start": 5.0, "end": 10.0, "text": "Let's talk stocks."},
        ]

        analyzer = LlmAnalyzer(client=client, model="claude-sonnet-4-5", prompt_version="v1")
        await analyzer.analyze(
            title="Test",
            description="",
            published_at=datetime(2026, 5, 21, tzinfo=timezone.utc),
            transcript_text="Hello world. Let's talk stocks.",
            transcript_segments=segments,
        )

        kwargs = mock_create.call_args.kwargs
        # user message is in the messages list
        user_content = kwargs["messages"][0]["content"]
        assert "Hello world." in user_content
        assert "Let's talk stocks." in user_content
# ---------------------------------------------------------------------------
# LlmAnalyzer.analyze — failure paths
# ---------------------------------------------------------------------------

class TestLlmAnalyzerFailurePaths:
    """Failure-path tests for LlmAnalyzer.analyze against a mocked client."""

    @pytest.mark.asyncio
    async def test_no_tool_use_block_raises_value_error(self):
        """If response has no tool_use block, raises ValueError containing 'tool_use'."""
        resp = MagicMock()
        resp.content = [MagicMock(type="text")]
        resp.usage = MagicMock(input_tokens=5000, output_tokens=800)
        resp.stop_reason = "end_turn"

        client, _ = _make_client(resp)

        analyzer = LlmAnalyzer(client=client, model="claude-sonnet-4-5", prompt_version="v1")
        with pytest.raises(ValueError, match="tool_use"):
            await analyzer.analyze(
                title="Test",
                description="",
                published_at=datetime(2026, 5, 21, tzinfo=timezone.utc),
                transcript_text="",
                transcript_segments=[],
            )

    @pytest.mark.asyncio
    async def test_empty_content_raises_value_error(self):
        """If response.content is empty, raises ValueError."""
        resp = MagicMock()
        resp.content = []
        resp.usage = MagicMock(input_tokens=5000, output_tokens=800)
        resp.stop_reason = "end_turn"

        client, _ = _make_client(resp)

        analyzer = LlmAnalyzer(client=client, model="claude-sonnet-4-5", prompt_version="v1")
        with pytest.raises(ValueError):
            await analyzer.analyze(
                title="Test",
                description="",
                published_at=datetime(2026, 5, 21, tzinfo=timezone.utc),
                transcript_text="",
                transcript_segments=[],
            )

    @pytest.mark.asyncio
    async def test_invalid_tool_input_raises_validation_error(self):
        """Malformed tool input (invalid enum) raises a validation error."""
        bad_input = _valid_analysis_input()
        bad_input["market_outlook_direction"] = "extremely_bullish"  # not a valid enum

        resp = _make_anthropic_response(bad_input)
        client, _ = _make_client(resp)

        analyzer = LlmAnalyzer(client=client, model="claude-sonnet-4-5", prompt_version="v1")
        # pydantic's ValidationError subclasses ValueError, so this accepts either
        # a propagated ValidationError or an explicit ValueError, while no longer
        # passing on unrelated failures (e.g. an AttributeError from a bad mock).
        with pytest.raises(ValueError):
            await analyzer.analyze(
                title="Test",
                description="",
                published_at=datetime(2026, 5, 21, tzinfo=timezone.utc),
                transcript_text="",
                transcript_segments=[],
            )