feat(meet-kevin): pipeline orchestrator + service main loop

Implements Task 8 of the Meet Kevin revival plan.

- pipeline.py: PipelineDeps dataclass (frozen, DI-friendly), process_one_video state machine (discovered → captioned → analyzed, with retry and cost-cap logic), and daily_cost_used() SQL helper.
- main.py: async run() entry point with RSS poll loop, per-video pipeline processing, OTEL counters, SIGTERM/SIGINT shutdown, httpx client lifecycle, and clean Anthropic/DB teardown.
- tests: 5 pipeline unit tests (happy path, no captions, cost cap, retry increment, failed after 3 retries), all passing; full watcher suite 56/56.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 19:48:43 +00:00 · 2026-05-21 19:48:43 +00:00 · 8f5ee8f1c3
commit 8f5ee8f1c3
parent 8309556c00
3 changed files with 644 additions and 0 deletions
--- a/services/meet_kevin_watcher/pipeline.py
+++ b/services/meet_kevin_watcher/pipeline.py
@ -0,0 +1,263 @@
+"""Meet Kevin pipeline orchestrator.
+
+Contains the per-video state-machine (process_one_video) and the daily
+cost accounting helper (daily_cost_used). Both are designed for
+dependency injection so they are fully unit-testable without a real DB
+or LLM backend.
+
+Public exports:
+  PipelineDeps      — frozen dataclass carrying all injected callables + config
+  process_one_video — advance one KevinVideo through its remaining pipeline stages
+  daily_cost_used   — sum today's LLM spend from kevin_analyses
+"""
+
+import logging
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from decimal import Decimal
+from typing import Any, Callable, Coroutine
+
+from sqlalchemy import func, select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from shared.models.meet_kevin import (
+    KevinAnalysis,
+    KevinStockMention,
+    KevinTranscript,
+    KevinVideo,
+)
+from services.meet_kevin_watcher.caption_extractor import CaptionResult
+from services.meet_kevin_watcher.llm_analyzer import LlmCallResult
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Dependency-injection container
+# ---------------------------------------------------------------------------
+
+
+@dataclass(frozen=True)
+class PipelineDeps:
+    """Immutable bundle of the callables and settings the pipeline consumes.
+
+    The pipeline never imports its collaborators directly; it receives them
+    through this container, so tests can substitute AsyncMock objects for
+    the async callables.
+
+    Attributes:
+        extract_captions: Async callable
+            ``(video_id: str, workdir: str) -> CaptionResult | None``.
+        analyze: Async callable ``(**kwargs) -> LlmCallResult``.
+        daily_cost_used: Async callable
+            ``(session: AsyncSession) -> Decimal``.
+        model: LLM model identifier stored in kevin_analyses.model.
+        prompt_version: Prompt version string stored in
+            kevin_analyses.prompt_version.
+        daily_cost_cap_usd: Hard ceiling for total LLM spend per calendar
+            day (UTC).
+        workdir: Filesystem directory for yt-dlp caption downloads.
+    """
+
+    # --- injected async callables -------------------------------------
+    extract_captions: Callable[..., Coroutine[Any, Any, CaptionResult | None]]
+    analyze: Callable[..., Coroutine[Any, Any, LlmCallResult]]
+    daily_cost_used: Callable[..., Coroutine[Any, Any, Decimal]]
+
+    # --- static configuration -----------------------------------------
+    model: str
+    prompt_version: str
+    daily_cost_cap_usd: Decimal
+    workdir: str
+
+
+# ---------------------------------------------------------------------------
+# Daily cost accounting
+# ---------------------------------------------------------------------------
+
+
+async def daily_cost_used(session: AsyncSession) -> Decimal:
+    """Return the summed cost_usd of today's rows in kevin_analyses.
+
+    Issues one aggregate query: SUM(cost_usd) over every KevinAnalysis whose
+    created_at is at or after ``date_trunc('day', now())``, with COALESCE
+    supplying 0 when no rows match.
+
+    Args:
+        session: Async SQLAlchemy session used to execute the query.
+
+    Returns:
+        The total as a Decimal; Decimal("0") when there are no matching rows.
+    """
+    # NOTE(review): the day boundary is evaluated by the database. It is
+    # presumably midnight UTC only when the session time zone is UTC —
+    # confirm against the deployment's DB settings.
+    start_of_day = func.date_trunc("day", func.now())
+    total_cost = func.coalesce(func.sum(KevinAnalysis.cost_usd), 0)
+    query = select(total_cost).where(KevinAnalysis.created_at >= start_of_day)
+
+    total = (await session.execute(query)).scalar_one()
+    # Round-trip through str so the driver's numeric value converts to
+    # Decimal without going through a float constructor.
+    return Decimal(str(total or 0))
+
+
+# ---------------------------------------------------------------------------
+# Per-video pipeline stage runner
+# ---------------------------------------------------------------------------
+
+
+# Number of failed LLM attempts after which a video is marked 'failed'.
+_MAX_LLM_RETRIES = 3
+
+
+def _status_text(video: KevinVideo) -> str:
+    """Return video.status as a plain string, unwrapping ``.value`` if present."""
+    status = video.status
+    return str(status.value) if hasattr(status, "value") else str(status)
+
+
+def _transcript_source_for(caption_source: str) -> str:
+    """Map CaptionResult.source onto the value stored in KevinTranscript.source.
+
+    Any source containing "manual" is stored as "captions_manual"; every
+    other value (including "youtube" and anything containing "auto") is
+    stored as "captions_auto".
+    """
+    return "captions_manual" if "manual" in caption_source else "captions_auto"
+
+
+async def _extract_captions_stage(
+    video: KevinVideo,
+    session: AsyncSession,
+    deps: PipelineDeps,
+) -> bool:
+    """Run the discovered → captioned transition.
+
+    Returns:
+        True when a KevinTranscript was added and the video is now
+        'captioned'; False when no captions were available and the video
+        was marked 'failed' with failure_reason='no_captions'.
+    """
+    captions: CaptionResult | None = await deps.extract_captions(
+        video.youtube_video_id, deps.workdir
+    )
+    if captions is None:
+        logger.warning("No captions for video %s — marking failed", video.youtube_video_id)
+        video.status = "failed"
+        video.failure_reason = "no_captions"
+        return False
+
+    session.add(
+        KevinTranscript(
+            video_id=video.id,
+            source=_transcript_source_for(captions.source),
+            language=captions.language,
+            raw_text=captions.raw_text,
+            segments_json=list(captions.segments),
+            word_count=captions.word_count,
+        )
+    )
+    await session.flush()
+
+    video.status = "captioned"
+    logger.info("Captions extracted for video %s (%d words)", video.youtube_video_id, captions.word_count)
+    return True
+
+
+def _record_llm_failure(video: KevinVideo, exc: Exception) -> str:
+    """Increment retry_count after a failed analyze() call and return the status.
+
+    The video is marked 'failed' once retry_count reaches _MAX_LLM_RETRIES;
+    below that it stays 'captioned' so a later call can try again.
+    """
+    video.retry_count = (video.retry_count or 0) + 1
+    if video.retry_count >= _MAX_LLM_RETRIES:
+        video.status = "failed"
+        video.failure_reason = f"llm_error: {type(exc).__name__}"
+        # exc_info keeps the traceback, which the message alone would lose.
+        logger.error(
+            "Video %s failed after %d retries: %s",
+            video.youtube_video_id, video.retry_count, exc,
+            exc_info=exc,
+        )
+        return "failed"
+
+    logger.warning(
+        "LLM error for video %s (retry %d/%d): %s",
+        video.youtube_video_id, video.retry_count, _MAX_LLM_RETRIES, exc,
+        exc_info=exc,
+    )
+    return "captioned"
+
+
+async def _persist_analysis(
+    video: KevinVideo,
+    session: AsyncSession,
+    deps: PipelineDeps,
+    llm_result: LlmCallResult,
+) -> None:
+    """Add one KevinAnalysis row plus one KevinStockMention per ticker."""
+    analysis = llm_result.analysis
+
+    db_analysis = KevinAnalysis(
+        video_id=video.id,
+        model=deps.model,
+        prompt_version=deps.prompt_version,
+        market_outlook_direction=analysis.market_outlook_direction.value,
+        market_outlook_reasoning=analysis.market_outlook_reasoning,
+        macro_themes_json=analysis.macro_themes,
+        key_risks_json=analysis.key_risks,
+        summary=analysis.summary,
+        raw_response_json=llm_result.raw_response,
+        prompt_tokens=llm_result.prompt_tokens,
+        completion_tokens=llm_result.completion_tokens,
+        cost_usd=llm_result.cost_usd,
+    )
+    session.add(db_analysis)
+    # Flush before building the mentions so db_analysis.id is available
+    # for their analysis_id reference.
+    await session.flush()
+
+    for ticker in analysis.tickers:
+        session.add(
+            KevinStockMention(
+                video_id=video.id,
+                analysis_id=db_analysis.id,
+                symbol=ticker.symbol,
+                action=ticker.action.value,
+                conviction=Decimal(str(ticker.conviction)),
+                time_horizon=ticker.time_horizon.value,
+                rationale_quote=ticker.rationale_quote,
+                video_timestamp_seconds=ticker.video_timestamp_seconds,
+            )
+        )
+
+
+async def _analyze_stage(
+    video: KevinVideo,
+    session: AsyncSession,
+    deps: PipelineDeps,
+) -> str:
+    """Run the captioned → analyzed transition and return the resulting status."""
+    # Check the daily cost cap before spending anything on the LLM.
+    cost_so_far: Decimal = await deps.daily_cost_used(session)
+    if cost_so_far >= deps.daily_cost_cap_usd:
+        logger.info(
+            "Daily cost cap $%.4f reached ($%.4f used) — skipping LLM for %s",
+            deps.daily_cost_cap_usd, cost_so_far, video.youtube_video_id,
+        )
+        return "captioned"
+
+    # scalar_one() raises unless exactly one transcript row exists for the
+    # video; that error is deliberately not caught here and reaches the caller.
+    query = select(KevinTranscript).where(KevinTranscript.video_id == video.id)
+    transcript = (await session.execute(query)).scalar_one()
+    segments: list[dict] = transcript.segments_json or []
+
+    try:
+        llm_result: LlmCallResult = await deps.analyze(
+            title=getattr(video, "title", "") or "",
+            description=getattr(video, "description", "") or "",
+            published_at=getattr(video, "published_at", None) or datetime.now(tz=timezone.utc),
+            transcript_text=transcript.raw_text,
+            transcript_segments=segments,
+        )
+    except Exception as exc:  # any analyze() failure counts as one retry
+        return _record_llm_failure(video, exc)
+
+    await _persist_analysis(video, session, deps, llm_result)
+
+    video.status = "analyzed"
+    video.processed_at = datetime.now(tz=timezone.utc)
+    logger.info(
+        "Analysis complete for video %s: %s, %d tickers, cost=$%.4f",
+        video.youtube_video_id,
+        llm_result.analysis.market_outlook_direction.value,
+        len(llm_result.analysis.tickers),
+        llm_result.cost_usd,
+    )
+    return "analyzed"
+
+
+async def process_one_video(
+    video: KevinVideo,
+    session: AsyncSession,
+    deps: PipelineDeps,
+) -> str:
+    """Advance *video* through the pipeline and return its new status string.
+
+    A single call runs every stage the video is eligible for: a 'discovered'
+    video whose captions are extracted successfully continues straight into
+    the LLM stage within the same call.
+
+    Stage transitions:
+      discovered  → extract captions
+                     • None result → status='failed', failure_reason='no_captions'
+                     • CaptionResult → insert KevinTranscript, advance to
+                       'captioned', then continue with the stage below
+      captioned   → check daily cost cap
+                     • at or over cap → leave as 'captioned'
+                     • under cap → load the transcript and call analyze()
+                       - success → insert KevinAnalysis + KevinStockMention rows,
+                                   advance to 'analyzed', set processed_at
+                       - exception → increment retry_count;
+                                     if retry_count >= 3: status='failed',
+                                     failure_reason='llm_error: <ExcType>'
+                                     otherwise leave as 'captioned'
+      any other   → log a warning and return the status unchanged
+
+    Args:
+        video: ORM instance (mutated in-place; caller is responsible for commit).
+        session: Async SQLAlchemy session (add/flush, NOT commit — caller commits).
+        deps:   Injected callables and config.
+
+    Returns:
+        The new status string (e.g. "analyzed", "captioned", "failed").
+
+    Raises:
+        Exceptions from the transcript lookup (scalar_one) and from
+        session.flush() are not caught and propagate to the caller. Only
+        exceptions raised by deps.analyze() are converted into retries.
+    """
+    current_status = _status_text(video)
+
+    if current_status == "discovered":
+        if not await _extract_captions_stage(video, session, deps):
+            return "failed"
+        current_status = "captioned"
+
+    if current_status == "captioned":
+        return await _analyze_stage(video, session, deps)
+
+    # Unknown status — log and return unchanged
+    logger.warning("process_one_video: unexpected status %r for video %s", current_status, video.youtube_video_id)
+    return current_status