# trading/services/meet_kevin_watcher/pipeline.py

"""Meet Kevin pipeline orchestrator.

Contains the per-video state-machine (process_one_video) and the daily
cost accounting helper (daily_cost_used). Both are designed for
dependency injection so they are fully unit-testable without a real DB
or LLM backend.

Public exports:
  PipelineDeps      — frozen dataclass carrying all injected callables + config
  process_one_video — advance one KevinVideo through every pipeline stage it is ready for
  daily_cost_used   — sum today's LLM spend from kevin_analyses
"""

import logging
from dataclasses import dataclass
from datetime import datetime, timezone
from decimal import Decimal
from typing import Any, Callable, Coroutine

from sqlalchemy import func, select
from sqlalchemy.ext.asyncio import AsyncSession

from shared.models.meet_kevin import (
    KevinAnalysis,
    KevinStockMention,
    KevinTranscript,
    KevinVideo,
)
from services.meet_kevin_watcher.caption_extractor import CaptionResult
from services.meet_kevin_watcher.llm_analyzer import LlmCallResult

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Dependency-injection container
# ---------------------------------------------------------------------------


@dataclass(frozen=True)
class PipelineDeps:
    """Immutable bundle of the callables and settings the pipeline consumes.

    Collecting the collaborators in one frozen dataclass lets tests replace
    the real implementations with stand-ins such as AsyncMock.

    Attributes:
        extract_captions: Async callable
            ``(video_id: str, workdir: str) -> CaptionResult | None``.
        analyze: Async callable taking keyword arguments and returning an
            LlmCallResult.
        daily_cost_used: Async callable
            ``(session: AsyncSession) -> Decimal``.
        model: LLM model identifier stored in kevin_analyses.model.
        prompt_version: Prompt version string stored in
            kevin_analyses.prompt_version.
        daily_cost_cap_usd: Spend ceiling in USD; process_one_video skips
            the LLM call once daily_cost_used() reports at least this much.
        workdir: Filesystem directory handed to extract_captions for
            caption downloads.
        inter_video_sleep_seconds: Seconds to sleep between consecutive
            videos to stay under the LLM provider RPM limit (default 30).
            Not read by process_one_video — presumably applied by the
            caller's loop; confirm.
    """

    # Injected async callables.
    extract_captions: Callable[..., Coroutine[Any, Any, CaptionResult | None]]
    analyze: Callable[..., Coroutine[Any, Any, LlmCallResult]]
    daily_cost_used: Callable[..., Coroutine[Any, Any, Decimal]]

    # Static configuration.
    model: str
    prompt_version: str
    daily_cost_cap_usd: Decimal
    workdir: str
    inter_video_sleep_seconds: int = 30


# ---------------------------------------------------------------------------
# Daily cost accounting
# ---------------------------------------------------------------------------


async def daily_cost_used(session: AsyncSession) -> Decimal:
    """Sum the LLM spend recorded in kevin_analyses for the current day.

    Issues one aggregate query: SUM(cost_usd) over the rows whose created_at
    is at or after ``date_trunc('day', now())``, with COALESCE turning an
    empty match into 0.

    NOTE(review): the day boundary is evaluated by the database, so it is
    midnight UTC only when the DB session time zone is UTC — confirm.

    Args:
        session: Async SQLAlchemy session used to execute the query.

    Returns:
        The summed cost as a Decimal; Decimal("0") when no rows match.
    """
    total_cost = func.coalesce(func.sum(KevinAnalysis.cost_usd), 0)
    day_start = func.date_trunc("day", func.now())
    query = select(total_cost).where(KevinAnalysis.created_at >= day_start)

    rows = await session.execute(query)
    spent = rows.scalar_one()
    if not spent:
        # Covers both a NULL aggregate and a zero total.
        spent = 0
    # Round-trip through str so a float/int from the driver converts exactly.
    return Decimal(str(spent))


# ---------------------------------------------------------------------------
# Per-video pipeline stage runner
# ---------------------------------------------------------------------------


async def process_one_video(
    video: KevinVideo,
    session: AsyncSession,
    deps: PipelineDeps,
) -> str:
    """Advance *video* through the pipeline and return its resulting status.

    A single call runs every stage the video is currently ready for: a
    'discovered' video whose captions are extracted successfully continues
    straight into the analysis stage within the same call.

    Stage transitions:
      discovered  → extract captions
                     • None result → status='failed', failure_reason='no_captions'
                     • CaptionResult → insert KevinTranscript, advance to
                       'captioned' and fall through to the next stage
      captioned   → check daily cost cap
                     • at/over cap → leave as 'captioned' (picked up on a later run)
                     • under cap → call analyze()
                       - success → insert KevinAnalysis + KevinStockMention rows,
                                   advance to 'analyzed', set processed_at
                       - exception → increment retry_count;
                                     if retry_count >= 3: status='failed'
                                     otherwise leave as 'captioned'
      anything else → log a warning and return the status unchanged

    Args:
        video: ORM instance (mutated in-place; caller is responsible for commit).
        session: Async SQLAlchemy session (add/flush, NOT commit — caller commits).
        deps:   Injected callables and config.

    Returns:
        The new status string (e.g. "analyzed", "captioned", "failed").

    Raises:
        sqlalchemy.exc.NoResultFound / MultipleResultsFound: from
            ``scalar_one()`` when a 'captioned' video does not have exactly
            one KevinTranscript row.
        Exception: anything raised by deps.extract_captions,
            deps.daily_cost_used or the session propagates unchanged; only
            errors from deps.analyze are caught and counted as retries.
    """
    # video.status may be an Enum member or a plain string; normalise to str.
    current_status: str = str(getattr(video.status, "value", video.status))

    # ------------------------------------------------------------------
    # Stage 1: discovered → extract captions
    # ------------------------------------------------------------------
    if current_status == "discovered":
        caption_result: CaptionResult | None = await deps.extract_captions(
            video.youtube_video_id, deps.workdir
        )

        if caption_result is None:
            logger.warning("No captions for video %s — marking failed", video.youtube_video_id)
            video.status = "failed"
            video.failure_reason = "no_captions"
            return "failed"

        # Only sources mentioning "manual" are stored as manual captions;
        # every other source value (e.g. "youtube", "captions_auto") is
        # recorded as auto-generated.
        transcript_source = (
            "captions_manual" if "manual" in caption_result.source else "captions_auto"
        )

        transcript = KevinTranscript(
            video_id=video.id,
            source=transcript_source,
            language=caption_result.language,
            raw_text=caption_result.raw_text,
            segments_json=list(caption_result.segments),
            word_count=caption_result.word_count,
        )
        session.add(transcript)
        await session.flush()

        video.status = "captioned"
        logger.info("Captions extracted for video %s (%d words)", video.youtube_video_id, caption_result.word_count)
        # Fall through: the analysis stage below runs in this same call.
        current_status = "captioned"

    # ------------------------------------------------------------------
    # Stage 2: captioned → LLM analysis
    # ------------------------------------------------------------------
    if current_status == "captioned":
        # Check daily cost cap before calling the LLM
        cost_so_far: Decimal = await deps.daily_cost_used(session)
        if cost_so_far >= deps.daily_cost_cap_usd:
            logger.info(
                "Daily cost cap $%.4f reached ($%.4f used) — skipping LLM for %s",
                deps.daily_cost_cap_usd, cost_so_far, video.youtube_video_id,
            )
            return "captioned"

        # Fetch the transcript for this video to pass to the LLM.
        # NOTE(review): this function-scope import duplicates the module-level
        # `select` import; it is kept so the block stays self-contained.
        from sqlalchemy import select as _select
        stmt = _select(KevinTranscript).where(KevinTranscript.video_id == video.id)
        result = await session.execute(stmt)
        # scalar_one() raises unless exactly one transcript row exists.
        transcript = result.scalar_one()

        segments: list[dict] = transcript.segments_json or []

        try:
            llm_result: LlmCallResult = await deps.analyze(
                # A None title is normalised to "" exactly like description.
                title=getattr(video, "title", "") or "",
                description=getattr(video, "description", "") or "",
                published_at=getattr(video, "published_at", None) or datetime.now(tz=timezone.utc),
                transcript_text=transcript.raw_text,
                transcript_segments=segments,
            )
        except Exception as exc:
            # Any analyze() failure counts as one retry; the third failure
            # marks the video as permanently failed.
            video.retry_count = (video.retry_count or 0) + 1
            if video.retry_count >= 3:
                video.status = "failed"
                video.failure_reason = f"llm_error: {type(exc).__name__}"
                logger.error(
                    "Video %s failed after %d retries: %s",
                    video.youtube_video_id, video.retry_count, exc,
                )
                return "failed"

            logger.warning(
                "LLM error for video %s (retry %d/3): %s",
                video.youtube_video_id, video.retry_count, exc,
            )
            return "captioned"

        analysis = llm_result.analysis

        # Persist KevinAnalysis row
        db_analysis = KevinAnalysis(
            video_id=video.id,
            model=deps.model,
            prompt_version=deps.prompt_version,
            market_outlook_direction=analysis.market_outlook_direction.value,
            market_outlook_reasoning=analysis.market_outlook_reasoning,
            macro_themes_json=analysis.macro_themes,
            key_risks_json=analysis.key_risks,
            summary=analysis.summary,
            raw_response_json=llm_result.raw_response,
            prompt_tokens=llm_result.prompt_tokens,
            completion_tokens=llm_result.completion_tokens,
            cost_usd=llm_result.cost_usd,
        )
        session.add(db_analysis)
        await session.flush()  # get db_analysis.id

        # Persist one KevinStockMention row per ticker in the analysis
        for ticker in analysis.tickers:
            session.add(
                KevinStockMention(
                    video_id=video.id,
                    analysis_id=db_analysis.id,
                    symbol=ticker.symbol,
                    action=ticker.action.value,
                    # str() round-trip converts the conviction value to
                    # Decimal without float representation error.
                    conviction=Decimal(str(ticker.conviction)),
                    time_horizon=ticker.time_horizon.value,
                    rationale_quote=ticker.rationale_quote,
                    video_timestamp_seconds=ticker.video_timestamp_seconds,
                )
            )

        video.status = "analyzed"
        video.processed_at = datetime.now(tz=timezone.utc)
        logger.info(
            "Analysis complete for video %s: %s, %d tickers, cost=$%.4f",
            video.youtube_video_id,
            analysis.market_outlook_direction.value,
            len(analysis.tickers),
            llm_result.cost_usd,
        )
        return "analyzed"

    # Unknown status — log and return unchanged
    logger.warning("process_one_video: unexpected status %r for video %s", current_status, video.youtube_video_id)
    return current_status