feat(meet-kevin): pipeline orchestrator + service main loop
Implements Task 8 of the Meet Kevin revival plan.

- pipeline.py: PipelineDeps dataclass (frozen, DI-friendly), process_one_video state machine (discovered→captioned→analyzed with retry/cost-cap logic), and daily_cost_used() SQL helper.
- main.py: async run() entry point with RSS poll loop, per-video pipeline processing, OTEL counters, SIGTERM/SIGINT shutdown, httpx client lifecycle, and clean Anthropic/DB teardown.
- tests: 5 pipeline unit tests (happy path, no captions, cost cap, retry increment, failed-after-3-retries) all passing; full watcher suite 56/56.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
8309556c00
commit
8f5ee8f1c3
3 changed files with 644 additions and 0 deletions
263
services/meet_kevin_watcher/pipeline.py
Normal file
263
services/meet_kevin_watcher/pipeline.py
Normal file
|
|
@ -0,0 +1,263 @@
|
|||
"""Meet Kevin pipeline orchestrator.
|
||||
|
||||
Contains the per-video state-machine (process_one_video) and the daily
|
||||
cost accounting helper (daily_cost_used). Both are designed for
|
||||
dependency injection so they are fully unit-testable without a real DB
|
||||
or LLM backend.
|
||||
|
||||
Public exports:
|
||||
PipelineDeps — frozen dataclass carrying all injected callables + config
|
||||
process_one_video — advance one KevinVideo through its remaining pipeline stages
|
||||
daily_cost_used — sum today's LLM spend from kevin_analyses
|
||||
"""
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
from decimal import Decimal
|
||||
from typing import Any, Callable, Coroutine
|
||||
|
||||
from sqlalchemy import func, select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from shared.models.meet_kevin import (
|
||||
KevinAnalysis,
|
||||
KevinStockMention,
|
||||
KevinTranscript,
|
||||
KevinVideo,
|
||||
)
|
||||
from services.meet_kevin_watcher.caption_extractor import CaptionResult
|
||||
from services.meet_kevin_watcher.llm_analyzer import LlmCallResult
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Dependency-injection container
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class PipelineDeps:
    """Immutable bundle of the callables and settings the pipeline needs.

    Everything process_one_video touches outside the ORM is reached through
    this object, so tests can substitute stand-ins (e.g. AsyncMock) for the
    real caption extractor, LLM analyzer, and cost query.
    """

    # --- injected async callables ------------------------------------

    # Called as extract_captions(video_id, workdir); returns a CaptionResult,
    # or None when the video has no captions.
    extract_captions: Callable[..., Coroutine[Any, Any, CaptionResult | None]]

    # Called with keyword arguments only; returns an LlmCallResult.
    analyze: Callable[..., Coroutine[Any, Any, LlmCallResult]]

    # Called as daily_cost_used(session); returns the spend so far as Decimal.
    daily_cost_used: Callable[..., Coroutine[Any, Any, Decimal]]

    # --- configuration -----------------------------------------------

    # LLM model identifier stored in kevin_analyses.model.
    model: str

    # Prompt version string stored in kevin_analyses.prompt_version.
    prompt_version: str

    # Spend ceiling: the LLM is skipped once daily_cost_used() reaches it.
    daily_cost_cap_usd: Decimal

    # Filesystem directory handed to extract_captions for caption downloads.
    workdir: str
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Daily cost accounting
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def daily_cost_used(session: AsyncSession) -> Decimal:
    """Sum the LLM spend recorded in kevin_analyses since the start of today.

    Issues one aggregate query: SUM(cost_usd) over rows whose created_at is
    at or after ``date_trunc('day', now())``, with COALESCE supplying 0 when
    no rows match.

    NOTE(review): the day boundary is computed by the database, so it is
    midnight UTC only if the DB session time zone is UTC — confirm.

    Args:
        session: Async SQLAlchemy session used to run the query.

    Returns:
        The total as a Decimal; Decimal("0") when nothing has been spent.
    """
    day_start = func.date_trunc("day", func.now())
    total_cost = func.coalesce(func.sum(KevinAnalysis.cost_usd), 0)
    query = select(total_cost).where(KevinAnalysis.created_at >= day_start)

    spent = (await session.execute(query)).scalar_one()
    # Round-trip through str so a float/int driver value converts exactly.
    return Decimal(str(spent or 0))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Per-video pipeline stage runner
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def process_one_video(
    video: KevinVideo,
    session: AsyncSession,
    deps: PipelineDeps,
) -> str:
    """Advance *video* through the pipeline and return the resulting status.

    A single call runs every stage the video is eligible for: a
    ``discovered`` video whose captions are extracted successfully continues
    straight into the analysis stage within the same call.

    Stage transitions:
        discovered → extract captions
            • None result   → status='failed', failure_reason='no_captions'
            • CaptionResult → insert KevinTranscript, advance to 'captioned'
        captioned  → check daily cost cap
            • cap reached → leave as 'captioned' (picked up by a later call)
            • under cap   → call analyze()
                - success   → insert KevinAnalysis + KevinStockMention rows,
                              advance to 'analyzed', set processed_at
                - exception → increment retry_count;
                              if retry_count >= 3: status='failed'
                              otherwise leave as 'captioned'

    Any other status is logged and returned unchanged.

    Args:
        video: ORM instance (mutated in-place; caller is responsible for commit).
        session: Async SQLAlchemy session (add/flush, NOT commit — caller commits).
        deps: Injected callables and config.

    Returns:
        The new status string (e.g. "analyzed", "captioned", "failed").

    Raises:
        Only exceptions raised by ``deps.analyze`` are caught. Errors from
        caption extraction, the cost lookup, the transcript query
        (``scalar_one`` requires exactly one row) and ``flush`` propagate.
    """
    status = video.status
    # status may be an Enum member or a plain string depending on the caller.
    current_status: str = str(status.value) if hasattr(status, "value") else str(status)

    # Set by stage 1 so that stage 2 can reuse the freshly inserted row
    # instead of querying it back from the database.
    transcript: KevinTranscript | None = None

    # ------------------------------------------------------------------
    # Stage 1: discovered → extract captions
    # ------------------------------------------------------------------
    if current_status == "discovered":
        caption_result: CaptionResult | None = await deps.extract_captions(
            video.youtube_video_id, deps.workdir
        )

        if caption_result is None:
            logger.warning("No captions for video %s — marking failed", video.youtube_video_id)
            video.status = "failed"
            video.failure_reason = "no_captions"
            return "failed"

        # Only an explicit "manual" marker maps to manual captions; every
        # other source value (e.g. "youtube", "captions_auto") is stored as auto.
        transcript_source = (
            "captions_manual" if "manual" in caption_result.source else "captions_auto"
        )

        transcript = KevinTranscript(
            video_id=video.id,
            source=transcript_source,
            language=caption_result.language,
            raw_text=caption_result.raw_text,
            segments_json=list(caption_result.segments),
            word_count=caption_result.word_count,
        )
        session.add(transcript)
        await session.flush()

        video.status = "captioned"
        logger.info(
            "Captions extracted for video %s (%d words)",
            video.youtube_video_id, caption_result.word_count,
        )
        # Fall through: the analysis stage below runs in this same call.
        current_status = "captioned"

    # ------------------------------------------------------------------
    # Stage 2: captioned → LLM analysis
    # ------------------------------------------------------------------
    if current_status == "captioned":
        # Check daily cost cap before calling the LLM
        cost_so_far: Decimal = await deps.daily_cost_used(session)
        if cost_so_far >= deps.daily_cost_cap_usd:
            logger.info(
                "Daily cost cap $%.4f reached ($%.4f used) — skipping LLM for %s",
                deps.daily_cost_cap_usd, cost_so_far, video.youtube_video_id,
            )
            return "captioned"

        # A video that entered this call already 'captioned' needs its
        # transcript loaded; one captioned above already has it in hand.
        if transcript is None:
            stmt = select(KevinTranscript).where(KevinTranscript.video_id == video.id)
            result = await session.execute(stmt)
            transcript = result.scalar_one()

        segments: list[dict] = transcript.segments_json or []

        try:
            llm_result: LlmCallResult = await deps.analyze(
                title=getattr(video, "title", "") or "",
                description=getattr(video, "description", "") or "",
                published_at=getattr(video, "published_at", None) or datetime.now(tz=timezone.utc),
                transcript_text=transcript.raw_text,
                transcript_segments=segments,
            )
        except Exception as exc:
            # Deliberately broad: any analyzer failure counts as one retry.
            video.retry_count = (video.retry_count or 0) + 1
            if video.retry_count >= 3:
                video.status = "failed"
                video.failure_reason = f"llm_error: {type(exc).__name__}"
                logger.error(
                    "Video %s failed after %d retries: %s",
                    video.youtube_video_id, video.retry_count, exc,
                    exc_info=True,  # keep the traceback for the terminal failure
                )
                return "failed"

            logger.warning(
                "LLM error for video %s (retry %d/3): %s",
                video.youtube_video_id, video.retry_count, exc,
            )
            return "captioned"

        analysis = llm_result.analysis

        # Persist KevinAnalysis row
        db_analysis = KevinAnalysis(
            video_id=video.id,
            model=deps.model,
            prompt_version=deps.prompt_version,
            market_outlook_direction=analysis.market_outlook_direction.value,
            market_outlook_reasoning=analysis.market_outlook_reasoning,
            macro_themes_json=analysis.macro_themes,
            key_risks_json=analysis.key_risks,
            summary=analysis.summary,
            raw_response_json=llm_result.raw_response,
            prompt_tokens=llm_result.prompt_tokens,
            completion_tokens=llm_result.completion_tokens,
            cost_usd=llm_result.cost_usd,
        )
        session.add(db_analysis)
        await session.flush()  # get db_analysis.id

        # Persist KevinStockMention rows
        for ticker in analysis.tickers:
            mention = KevinStockMention(
                video_id=video.id,
                analysis_id=db_analysis.id,
                symbol=ticker.symbol,
                action=ticker.action.value,
                conviction=Decimal(str(ticker.conviction)),
                time_horizon=ticker.time_horizon.value,
                rationale_quote=ticker.rationale_quote,
                video_timestamp_seconds=ticker.video_timestamp_seconds,
            )
            session.add(mention)

        video.status = "analyzed"
        video.processed_at = datetime.now(tz=timezone.utc)
        logger.info(
            "Analysis complete for video %s: %s, %d tickers, cost=$%.4f",
            video.youtube_video_id,
            analysis.market_outlook_direction.value,
            len(analysis.tickers),
            llm_result.cost_usd,
        )
        return "analyzed"

    # Unknown status — log and return unchanged
    logger.warning(
        "process_one_video: unexpected status %r for video %s",
        current_status, video.youtube_video_id,
    )
    return current_status
|
||||
Loading…
Add table
Add a link
Reference in a new issue