refactor(meet-kevin): switch LLM back to native Anthropic SDK with OAuth bearer

The previous refactor (89f01ad) moved to OpenRouter because no
sk-ant-api-* key was found in Vault. It turns out that
claude-agent-service-spare-{1,2} hold sk-ant-oat01-* OAuth tokens
(108 chars, scope user:inference, 1-year TTL, minted via
'claude setup-token' — see memory id=832).

These tokens work with the Anthropic SDK via the auth_token= constructor
argument, which sends them as an Authorization: Bearer ... header instead
of x-api-key: .... They draw on the Enterprise Claude subscription quota
instead of being billed per call, so the OpenRouter zero-credit problem
goes away.

- llm_analyzer.py: revert OpenAI client to AsyncAnthropic; tool-use API
  + cache_control restored
- config.py: openrouter_api_key -> anthropic_oauth_token; model slug
  reverted from anthropic/claude-sonnet-4.5 -> claude-sonnet-4-5
- main.py: AsyncOpenAI -> AsyncAnthropic(auth_token=...), drop OpenRouter
  attribution headers
- pyproject: openai>=1.50 -> anthropic>=0.40 in meet_kevin extras
- tests: mocks ported back to messages.create + tool_use blocks
This commit is contained in:
Viktor Barzin 2026-05-22 19:24:40 +00:00
parent 4f4d365652
commit 8a1d03a967
5 changed files with 211 additions and 235 deletions

View file

@ -1,9 +1,8 @@
"""Tests for the OpenRouter LLM analyzer (Task 7).
"""Tests for the Anthropic SDK LLM analyzer (Task 7).
Tests use MagicMock/AsyncMock to avoid real API calls.
"""
import json
from datetime import datetime, timezone
from decimal import Decimal
from unittest.mock import AsyncMock, MagicMock
@ -28,23 +27,17 @@ from shared.schemas.meet_kevin import (
# Test helpers
# ---------------------------------------------------------------------------
def _make_openai_response(tool_args: dict, in_tokens: int = 5000, out_tokens: int = 800):
"""Mock an OpenAI ChatCompletion response with one tool_call."""
tool_call = MagicMock()
tool_call.function = MagicMock()
tool_call.function.name = "submit_analysis"
tool_call.function.arguments = json.dumps(tool_args)
msg = MagicMock()
msg.tool_calls = [tool_call]
choice = MagicMock()
choice.message = msg
choice.finish_reason = "tool_calls"
def _make_anthropic_response(tool_input: dict, in_tokens: int = 5000, out_tokens: int = 800):
"""Mock an Anthropic Messages response with one tool_use block."""
block = MagicMock()
block.type = "tool_use"
block.name = "submit_analysis"
block.input = tool_input
resp = MagicMock()
resp.choices = [choice]
resp.usage = MagicMock(prompt_tokens=in_tokens, completion_tokens=out_tokens)
resp.content = [block]
resp.usage = MagicMock(input_tokens=in_tokens, output_tokens=out_tokens)
resp.stop_reason = "tool_use"
return resp
@ -70,15 +63,13 @@ def _valid_analysis_input() -> dict:
def _make_client(response=None):
"""Return a mocked AsyncOpenAI client with chat.completions.create wired up."""
"""Return a mocked AsyncAnthropic client with messages.create wired up."""
mock_create = AsyncMock(return_value=response)
mock_completions = MagicMock()
mock_completions.create = mock_create
mock_chat = MagicMock()
mock_chat.completions = mock_completions
mock_messages = MagicMock()
mock_messages.create = mock_create
client = MagicMock()
client.chat = mock_chat
client.messages = mock_messages
return client, mock_create
@ -90,16 +81,16 @@ def _make_client(response=None):
class TestComputeCostUsd:
"""Verify monetary cost calculations using Decimal arithmetic."""
def test_sonnet_45_openrouter_pricing(self):
"""anthropic/claude-sonnet-4.5: $3.10/M input + $15.50/M output."""
# 1M input + 1M output = $3.10 + $15.50 = $18.60
result = compute_cost_usd("anthropic/claude-sonnet-4.5", 1_000_000, 1_000_000)
assert result == Decimal("18.6000")
def test_sonnet_45_native_pricing(self):
"""claude-sonnet-4-5: $3/M input + $15/M output = $18/M total."""
# 1M input + 1M output = $3 + $15 = $18
result = compute_cost_usd("claude-sonnet-4-5", 1_000_000, 1_000_000)
assert result == Decimal("18.0000")
def test_sonnet_46_legacy_slug(self):
"""claude-sonnet-4-6 (legacy slug) is also priced at $3.10/$15.50."""
def test_sonnet_46_native_pricing(self):
"""claude-sonnet-4-6: same pricing as 4-5 ($3/$15)."""
result = compute_cost_usd("claude-sonnet-4-6", 1_000_000, 1_000_000)
assert result == Decimal("18.6000")
assert result == Decimal("18.0000")
def test_opus_47_pricing(self):
"""claude-opus-4-7: $15/M input + $75/M output."""
@ -118,21 +109,21 @@ class TestComputeCostUsd:
def test_zero_tokens(self):
"""Zero tokens produce zero cost."""
result = compute_cost_usd("anthropic/claude-sonnet-4.5", 0, 0)
result = compute_cost_usd("claude-sonnet-4-5", 0, 0)
assert result == Decimal("0")
def test_result_is_decimal(self):
"""Return type is always Decimal, not float."""
result = compute_cost_usd("anthropic/claude-sonnet-4.5", 5000, 800)
result = compute_cost_usd("claude-sonnet-4-5", 5000, 800)
assert isinstance(result, Decimal)
def test_small_realistic_call(self):
"""Realistic 10K input + 1K output token call (Sonnet 4.5 via OpenRouter)."""
# input: 10000/1_000_000 * 3.10 = 0.03100
# output: 1000/1_000_000 * 15.50 = 0.01550
# total: 0.04650
result = compute_cost_usd("anthropic/claude-sonnet-4.5", 10_000, 1_000)
assert result == Decimal("0.0465")
"""Realistic 10K input + 1K output token call (Sonnet 4.5 native)."""
# input: 10000/1_000_000 * 3 = 0.03000
# output: 1000/1_000_000 * 15 = 0.01500
# total: 0.04500
result = compute_cost_usd("claude-sonnet-4-5", 10_000, 1_000)
assert result == Decimal("0.0450")
# ---------------------------------------------------------------------------
@ -178,7 +169,7 @@ class TestLlmCallResult:
analysis = MeetKevinAnalysis(**_valid_analysis_input())
result = LlmCallResult(
analysis=analysis,
raw_response={"finish_reason": "tool_calls"},
raw_response={"stop_reason": "tool_use"},
prompt_tokens=5000,
completion_tokens=800,
cost_usd=Decimal("0.027"),
@ -192,13 +183,13 @@ class TestLlmCallResult:
cost = Decimal("0.027")
result = LlmCallResult(
analysis=analysis,
raw_response={"finish_reason": "tool_calls"},
raw_response={"stop_reason": "tool_use"},
prompt_tokens=5000,
completion_tokens=800,
cost_usd=cost,
)
assert result.analysis is analysis
assert result.raw_response == {"finish_reason": "tool_calls"}
assert result.raw_response == {"stop_reason": "tool_use"}
assert result.prompt_tokens == 5000
assert result.completion_tokens == 800
assert result.cost_usd == cost
@ -216,10 +207,10 @@ class TestLlmAnalyzerHappyPath:
async def test_returns_llm_call_result(self):
"""analyze() returns an LlmCallResult with parsed MeetKevinAnalysis."""
tool_input = _valid_analysis_input()
resp = _make_openai_response(tool_input, in_tokens=5000, out_tokens=800)
resp = _make_anthropic_response(tool_input, in_tokens=5000, out_tokens=800)
client, mock_create = _make_client(resp)
analyzer = LlmAnalyzer(client=client, model="anthropic/claude-sonnet-4.5", prompt_version="v1")
analyzer = LlmAnalyzer(client=client, model="claude-sonnet-4-5", prompt_version="v1")
result = await analyzer.analyze(
title="Market Update",
description="Kevin covers the latest market trends.",
@ -240,10 +231,10 @@ class TestLlmAnalyzerHappyPath:
async def test_analysis_fields_parsed_correctly(self):
"""Parsed MeetKevinAnalysis has correct field values from tool input."""
tool_input = _valid_analysis_input()
resp = _make_openai_response(tool_input)
resp = _make_anthropic_response(tool_input)
client, _ = _make_client(resp)
analyzer = LlmAnalyzer(client=client, model="anthropic/claude-sonnet-4.5", prompt_version="v1")
analyzer = LlmAnalyzer(client=client, model="claude-sonnet-4-5", prompt_version="v1")
result = await analyzer.analyze(
title="Test Video",
description="Description",
@ -265,10 +256,10 @@ class TestLlmAnalyzerHappyPath:
@pytest.mark.asyncio
async def test_cost_usd_is_positive(self):
"""cost_usd is calculated and positive for a valid token count."""
resp = _make_openai_response(_valid_analysis_input(), in_tokens=10_000, out_tokens=1_000)
resp = _make_anthropic_response(_valid_analysis_input(), in_tokens=10_000, out_tokens=1_000)
client, _ = _make_client(resp)
analyzer = LlmAnalyzer(client=client, model="anthropic/claude-sonnet-4.5", prompt_version="v1")
analyzer = LlmAnalyzer(client=client, model="claude-sonnet-4-5", prompt_version="v1")
result = await analyzer.analyze(
title="Test",
description="",
@ -281,11 +272,11 @@ class TestLlmAnalyzerHappyPath:
@pytest.mark.asyncio
async def test_api_called_with_tool_choice_forcing(self):
"""chat.completions.create is called with tool_choice forcing submit_analysis."""
resp = _make_openai_response(_valid_analysis_input())
"""messages.create is called with tool_choice forcing submit_analysis."""
resp = _make_anthropic_response(_valid_analysis_input())
client, mock_create = _make_client(resp)
analyzer = LlmAnalyzer(client=client, model="anthropic/claude-sonnet-4.5", prompt_version="v1")
analyzer = LlmAnalyzer(client=client, model="claude-sonnet-4-5", prompt_version="v1")
await analyzer.analyze(
title="Test",
description="",
@ -296,15 +287,15 @@ class TestLlmAnalyzerHappyPath:
mock_create.assert_called_once()
kwargs = mock_create.call_args.kwargs
assert kwargs["tool_choice"] == {"type": "function", "function": {"name": "submit_analysis"}}
assert kwargs["tool_choice"] == {"type": "tool", "name": "submit_analysis"}
@pytest.mark.asyncio
async def test_api_called_with_system_prompt_in_messages(self):
"""System prompt is passed as a system role message in the messages list."""
resp = _make_openai_response(_valid_analysis_input())
async def test_api_called_with_system_prompt_in_system_param(self):
"""System prompt is passed as the system parameter (list with cache_control)."""
resp = _make_anthropic_response(_valid_analysis_input())
client, mock_create = _make_client(resp)
analyzer = LlmAnalyzer(client=client, model="anthropic/claude-sonnet-4.5", prompt_version="v1")
analyzer = LlmAnalyzer(client=client, model="claude-sonnet-4-5", prompt_version="v1")
await analyzer.analyze(
title="Test",
description="",
@ -314,18 +305,19 @@ class TestLlmAnalyzerHappyPath:
)
kwargs = mock_create.call_args.kwargs
messages = kwargs["messages"]
assert isinstance(messages, list)
assert messages[0]["role"] == "system"
assert SYSTEM_PROMPT in messages[0]["content"]
system = kwargs["system"]
assert isinstance(system, list)
assert system[0]["type"] == "text"
assert SYSTEM_PROMPT in system[0]["text"]
assert system[0]["cache_control"] == {"type": "ephemeral"}
@pytest.mark.asyncio
async def test_api_called_with_correct_model(self):
"""chat.completions.create is called with the model passed to LlmAnalyzer."""
resp = _make_openai_response(_valid_analysis_input())
"""messages.create is called with the model passed to LlmAnalyzer."""
resp = _make_anthropic_response(_valid_analysis_input())
client, mock_create = _make_client(resp)
analyzer = LlmAnalyzer(client=client, model="anthropic/claude-sonnet-4.5", prompt_version="v1")
analyzer = LlmAnalyzer(client=client, model="claude-sonnet-4-5", prompt_version="v1")
await analyzer.analyze(
title="Test",
description="",
@ -335,15 +327,15 @@ class TestLlmAnalyzerHappyPath:
)
kwargs = mock_create.call_args.kwargs
assert kwargs["model"] == "anthropic/claude-sonnet-4.5"
assert kwargs["model"] == "claude-sonnet-4-5"
@pytest.mark.asyncio
async def test_api_called_with_submit_analysis_tool(self):
"""Tool definition includes function name 'submit_analysis'."""
resp = _make_openai_response(_valid_analysis_input())
"""Tool definition includes name 'submit_analysis' with input_schema."""
resp = _make_anthropic_response(_valid_analysis_input())
client, mock_create = _make_client(resp)
analyzer = LlmAnalyzer(client=client, model="anthropic/claude-sonnet-4.5", prompt_version="v1")
analyzer = LlmAnalyzer(client=client, model="claude-sonnet-4-5", prompt_version="v1")
await analyzer.analyze(
title="Test",
description="",
@ -355,17 +347,17 @@ class TestLlmAnalyzerHappyPath:
kwargs = mock_create.call_args.kwargs
tools = kwargs["tools"]
assert any(
t.get("type") == "function" and t.get("function", {}).get("name") == "submit_analysis"
t.get("name") == "submit_analysis" and "input_schema" in t
for t in tools
)
@pytest.mark.asyncio
async def test_raw_response_is_captured(self):
"""raw_response in LlmCallResult holds serializable dict."""
resp = _make_openai_response(_valid_analysis_input())
resp = _make_anthropic_response(_valid_analysis_input())
client, _ = _make_client(resp)
analyzer = LlmAnalyzer(client=client, model="anthropic/claude-sonnet-4.5", prompt_version="v1")
analyzer = LlmAnalyzer(client=client, model="claude-sonnet-4-5", prompt_version="v1")
result = await analyzer.analyze(
title="Test",
description="",
@ -379,7 +371,7 @@ class TestLlmAnalyzerHappyPath:
@pytest.mark.asyncio
async def test_transcript_segments_included_in_user_message(self):
"""User message contains timestamped segment lines from transcript_segments."""
resp = _make_openai_response(_valid_analysis_input())
resp = _make_anthropic_response(_valid_analysis_input())
client, mock_create = _make_client(resp)
segments = [
@ -387,7 +379,7 @@ class TestLlmAnalyzerHappyPath:
{"start": 5.0, "end": 10.0, "text": "Let's talk stocks."},
]
analyzer = LlmAnalyzer(client=client, model="anthropic/claude-sonnet-4.5", prompt_version="v1")
analyzer = LlmAnalyzer(client=client, model="claude-sonnet-4-5", prompt_version="v1")
await analyzer.analyze(
title="Test",
description="",
@ -397,8 +389,8 @@ class TestLlmAnalyzerHappyPath:
)
kwargs = mock_create.call_args.kwargs
# user message is the second entry in messages list
user_content = kwargs["messages"][1]["content"]
# user message is the first entry in messages (the system prompt is passed via the separate system= parameter)
user_content = kwargs["messages"][0]["content"]
assert "Hello world." in user_content
assert "Let's talk stocks." in user_content
@ -412,23 +404,17 @@ class TestLlmAnalyzerFailurePaths:
"""Failure path tests."""
@pytest.mark.asyncio
async def test_no_tool_calls_raises_value_error(self):
"""If response message has no tool_calls, raises ValueError."""
msg = MagicMock()
msg.tool_calls = None
choice = MagicMock()
choice.message = msg
choice.finish_reason = "stop"
async def test_no_tool_use_block_raises_value_error(self):
"""If response has no tool_use block, raises ValueError containing 'tool_use'."""
resp = MagicMock()
resp.choices = [choice]
resp.usage = MagicMock(prompt_tokens=5000, completion_tokens=800)
resp.content = [MagicMock(type="text")]
resp.usage = MagicMock(input_tokens=5000, output_tokens=800)
resp.stop_reason = "end_turn"
client, _ = _make_client(resp)
analyzer = LlmAnalyzer(client=client, model="anthropic/claude-sonnet-4.5", prompt_version="v1")
with pytest.raises(ValueError):
analyzer = LlmAnalyzer(client=client, model="claude-sonnet-4-5", prompt_version="v1")
with pytest.raises(ValueError, match="tool_use"):
await analyzer.analyze(
title="Test",
description="",
@ -438,22 +424,16 @@ class TestLlmAnalyzerFailurePaths:
)
@pytest.mark.asyncio
async def test_empty_tool_calls_raises_value_error(self):
"""If response message has empty tool_calls list, raises ValueError."""
msg = MagicMock()
msg.tool_calls = []
choice = MagicMock()
choice.message = msg
choice.finish_reason = "stop"
async def test_empty_content_raises_value_error(self):
"""If response.content is empty, raises ValueError."""
resp = MagicMock()
resp.choices = [choice]
resp.usage = MagicMock(prompt_tokens=5000, completion_tokens=800)
resp.content = []
resp.usage = MagicMock(input_tokens=5000, output_tokens=800)
resp.stop_reason = "end_turn"
client, _ = _make_client(resp)
analyzer = LlmAnalyzer(client=client, model="anthropic/claude-sonnet-4.5", prompt_version="v1")
analyzer = LlmAnalyzer(client=client, model="claude-sonnet-4-5", prompt_version="v1")
with pytest.raises(ValueError):
await analyzer.analyze(
title="Test",
@ -469,10 +449,10 @@ class TestLlmAnalyzerFailurePaths:
bad_input = _valid_analysis_input()
bad_input["market_outlook_direction"] = "extremely_bullish" # not a valid enum
resp = _make_openai_response(bad_input)
resp = _make_anthropic_response(bad_input)
client, _ = _make_client(resp)
analyzer = LlmAnalyzer(client=client, model="anthropic/claude-sonnet-4.5", prompt_version="v1")
analyzer = LlmAnalyzer(client=client, model="claude-sonnet-4-5", prompt_version="v1")
with pytest.raises(Exception): # pydantic ValidationError or ValueError
await analyzer.analyze(
title="Test",