The first production run hit Anthropic's per-account rate_limit_error (429) while trying to burn through 16 backfill videos in seconds. The SDK's built-in retry can't recover because the rate-limit window resets more slowly than its 3 retry attempts are spent. Added meet_kevin_inter_video_sleep_seconds (default 30s) to PipelineDeps and to main's _process_pending_videos loop. 16 backfill videos now take ~8 min (16 * 30s sleeps + ~30s per LLM call) instead of bursting into the rate limit.
266 lines
10 KiB
Python
266 lines
10 KiB
Python
"""Meet Kevin pipeline orchestrator.
|
|
|
|
Contains the per-video state-machine (process_one_video) and the daily
|
|
cost accounting helper (daily_cost_used). Both are designed for
|
|
dependency injection so they are fully unit-testable without a real DB
|
|
or LLM backend.
|
|
|
|
Public exports:
|
|
PipelineDeps — frozen dataclass carrying all injected callables + config
|
|
process_one_video — advance one KevinVideo by one pipeline stage
|
|
daily_cost_used — sum today's LLM spend from kevin_analyses
|
|
"""
|
|
|
|
import logging
|
|
from dataclasses import dataclass
|
|
from datetime import datetime, timezone
|
|
from decimal import Decimal
|
|
from typing import Any, Callable, Coroutine
|
|
|
|
from sqlalchemy import func, select
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from shared.models.meet_kevin import (
|
|
KevinAnalysis,
|
|
KevinStockMention,
|
|
KevinTranscript,
|
|
KevinVideo,
|
|
)
|
|
from services.meet_kevin_watcher.caption_extractor import CaptionResult
|
|
from services.meet_kevin_watcher.llm_analyzer import LlmCallResult
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Dependency-injection container
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@dataclass(frozen=True)
class PipelineDeps:
    """Immutable bundle of the callables and settings the pipeline relies on.

    Collaborators are passed in through this container so the pipeline can be
    unit-tested: every async callable mirrors the signature of its real
    implementation and may be swapped for an ``AsyncMock`` in tests.
    """

    # Async callable: (video_id: str, workdir: str) -> CaptionResult | None
    extract_captions: Callable[..., Coroutine[Any, Any, CaptionResult | None]]

    # Async callable: (**kwargs) -> LlmCallResult
    analyze: Callable[..., Coroutine[Any, Any, LlmCallResult]]

    # Async callable: (session: AsyncSession) -> Decimal
    daily_cost_used: Callable[..., Coroutine[Any, Any, Decimal]]

    # LLM model identifier stored in kevin_analyses.model.
    model: str

    # Prompt version string stored in kevin_analyses.prompt_version.
    prompt_version: str

    # Hard ceiling for total LLM spend per calendar day (UTC).
    daily_cost_cap_usd: Decimal

    # Filesystem directory for yt-dlp caption downloads.
    workdir: str

    # Sleep between consecutive videos to stay under the LLM provider RPM limit.
    inter_video_sleep_seconds: int = 30
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Daily cost accounting
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
async def daily_cost_used(session: AsyncSession) -> Decimal:
    """Sum the LLM spend recorded in kevin_analyses since the start of today.

    Issues one aggregate query: ``SUM(cost_usd)`` over rows whose
    ``created_at`` is at or after ``date_trunc('day', now())``.

    NOTE(review): the day boundary is evaluated by the database, so it is
    midnight UTC only when the DB session time zone is UTC — confirm.

    Args:
        session: Async SQLAlchemy session used to run the query.

    Returns:
        The summed cost_usd as a Decimal; Decimal("0") when no rows match.
    """
    day_start = func.date_trunc("day", func.now())
    # COALESCE turns the NULL that SUM yields over zero rows into 0.
    spend_total = func.coalesce(func.sum(KevinAnalysis.cost_usd), 0)
    query = select(spend_total).where(KevinAnalysis.created_at >= day_start)

    total = (await session.execute(query)).scalar_one()
    # Go through str() so the Decimal is built from the driver's textual value.
    return Decimal(str(total or 0))
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Per-video pipeline stage runner
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# Number of failed LLM attempts after which a video is marked 'failed'.
_MAX_LLM_ATTEMPTS = 3


async def _extract_and_store_captions(
    video: KevinVideo,
    session: AsyncSession,
    deps: PipelineDeps,
) -> KevinTranscript | None:
    """Run caption extraction for *video* and stage its transcript row.

    On success a KevinTranscript is added and flushed, the video status is set
    to 'captioned', and the transcript is returned. When the extractor returns
    None the video is marked 'failed' with failure_reason='no_captions' and
    None is returned.
    """
    caption_result: CaptionResult | None = await deps.extract_captions(
        video.youtube_video_id, deps.workdir
    )
    if caption_result is None:
        logger.warning("No captions for video %s — marking failed", video.youtube_video_id)
        video.status = "failed"
        video.failure_reason = "no_captions"
        return None

    # Only a source that mentions "manual" is stored as manual captions; every
    # other value (e.g. "youtube", "captions_auto") is stored as auto captions.
    transcript_source = (
        "captions_manual" if "manual" in caption_result.source else "captions_auto"
    )

    transcript = KevinTranscript(
        video_id=video.id,
        source=transcript_source,
        language=caption_result.language,
        raw_text=caption_result.raw_text,
        segments_json=list(caption_result.segments),
        word_count=caption_result.word_count,
    )
    session.add(transcript)
    await session.flush()

    video.status = "captioned"
    logger.info(
        "Captions extracted for video %s (%d words)",
        video.youtube_video_id,
        caption_result.word_count,
    )
    return transcript


async def _analyze_captioned_video(
    video: KevinVideo,
    session: AsyncSession,
    deps: PipelineDeps,
    transcript: KevinTranscript | None = None,
) -> str:
    """Run the LLM stage for a 'captioned' video and return the new status.

    *transcript* may be supplied by the caller when it was created earlier in
    the same call; otherwise it is loaded from the session.
    """
    # Check the daily cost cap before spending anything on the LLM.
    cost_so_far: Decimal = await deps.daily_cost_used(session)
    if cost_so_far >= deps.daily_cost_cap_usd:
        logger.info(
            "Daily cost cap $%.4f reached ($%.4f used) — skipping LLM for %s",
            deps.daily_cost_cap_usd, cost_so_far, video.youtube_video_id,
        )
        return "captioned"

    if transcript is None:
        # Video was captioned by an earlier call: load its transcript.
        # scalar_one() raises if there is not exactly one matching row.
        stmt = select(KevinTranscript).where(KevinTranscript.video_id == video.id)
        transcript = (await session.execute(stmt)).scalar_one()

    segments: list[dict] = transcript.segments_json or []

    try:
        llm_result: LlmCallResult = await deps.analyze(
            # Normalise None to "" the same way as description.
            title=getattr(video, "title", "") or "",
            description=getattr(video, "description", "") or "",
            published_at=getattr(video, "published_at", None) or datetime.now(tz=timezone.utc),
            transcript_text=transcript.raw_text,
            transcript_segments=segments,
        )
    except Exception as exc:
        # Deliberately broad: any analyzer failure counts as one attempt so a
        # single bad video cannot abort the caller's processing loop.
        video.retry_count = (video.retry_count or 0) + 1
        if video.retry_count >= _MAX_LLM_ATTEMPTS:
            video.status = "failed"
            video.failure_reason = f"llm_error: {type(exc).__name__}"
            logger.error(
                "Video %s failed after %d retries: %s",
                video.youtube_video_id, video.retry_count, exc,
            )
            return "failed"

        logger.warning(
            "LLM error for video %s (retry %d/%d): %s",
            video.youtube_video_id, video.retry_count, _MAX_LLM_ATTEMPTS, exc,
        )
        return "captioned"

    analysis = llm_result.analysis

    # Persist the KevinAnalysis row.
    db_analysis = KevinAnalysis(
        video_id=video.id,
        model=deps.model,
        prompt_version=deps.prompt_version,
        market_outlook_direction=analysis.market_outlook_direction.value,
        market_outlook_reasoning=analysis.market_outlook_reasoning,
        macro_themes_json=analysis.macro_themes,
        key_risks_json=analysis.key_risks,
        summary=analysis.summary,
        raw_response_json=llm_result.raw_response,
        prompt_tokens=llm_result.prompt_tokens,
        completion_tokens=llm_result.completion_tokens,
        cost_usd=llm_result.cost_usd,
    )
    session.add(db_analysis)
    await session.flush()  # populates db_analysis.id for the mention rows

    # Persist one KevinStockMention row per ticker in the analysis.
    for ticker in analysis.tickers:
        session.add(
            KevinStockMention(
                video_id=video.id,
                analysis_id=db_analysis.id,
                symbol=ticker.symbol,
                action=ticker.action.value,
                conviction=Decimal(str(ticker.conviction)),
                time_horizon=ticker.time_horizon.value,
                rationale_quote=ticker.rationale_quote,
                video_timestamp_seconds=ticker.video_timestamp_seconds,
            )
        )

    video.status = "analyzed"
    video.processed_at = datetime.now(tz=timezone.utc)
    logger.info(
        "Analysis complete for video %s: %s, %d tickers, cost=$%.4f",
        video.youtube_video_id,
        analysis.market_outlook_direction.value,
        len(analysis.tickers),
        llm_result.cost_usd,
    )
    return "analyzed"


async def process_one_video(
    video: KevinVideo,
    session: AsyncSession,
    deps: PipelineDeps,
) -> str:
    """Advance *video* through the pipeline and return the new status string.

    A 'discovered' video whose captions are extracted successfully continues
    straight into the analysis stage within the same call.

    Stage transitions:
      discovered → extract captions
        • None result   → status='failed', failure_reason='no_captions'
        • CaptionResult → insert KevinTranscript, advance to 'captioned',
                          then continue with the 'captioned' stage below
      captioned → check daily cost cap
        • at/over cap → leave as 'captioned' (no LLM call)
        • under cap   → call analyze()
            - success   → insert KevinAnalysis + KevinStockMention rows,
                          advance to 'analyzed', set processed_at
            - exception → increment retry_count;
                          if retry_count >= 3: status='failed'
                          otherwise leave as 'captioned'
      any other status → warning logged, status returned unchanged

    Args:
        video: ORM instance (mutated in-place; caller is responsible for commit).
        session: Async SQLAlchemy session (add/flush only — caller commits).
        deps: Injected callables and config.

    Returns:
        The new status string (e.g. "analyzed", "captioned", "failed").
    """
    # video.status may be an enum member or a plain string; compare as text.
    current_status: str = str(getattr(video.status, "value", video.status))

    transcript: KevinTranscript | None = None

    # Stage 1: discovered → extract captions
    if current_status == "discovered":
        transcript = await _extract_and_store_captions(video, session, deps)
        if transcript is None:
            return "failed"
        current_status = "captioned"

    # Stage 2: captioned → LLM analysis. A transcript created by stage 1 is
    # reused rather than re-queried.
    if current_status == "captioned":
        return await _analyze_captioned_video(video, session, deps, transcript)

    # Unknown status — log and return unchanged
    logger.warning(
        "process_one_video: unexpected status %r for video %s",
        current_status,
        video.youtube_video_id,
    )
    return current_status
|