First production run hit Anthropic's per-account rate_limit_error (HTTP 429) while trying to burn through 16 backfill videos in seconds. The SDK's built-in retry can't recover because the rate-limit window resets more slowly than its 3 retry attempts complete. Added the meet_kevin_inter_video_sleep_seconds config setting (default 30s), which is passed to PipelineDeps as inter_video_sleep_seconds and applied between videos in main's _process_pending_videos loop. 16 backfill videos now take ~8 min of throttling (15 sleeps * 30s, one between each pair of videos) plus ~30s per LLM call, instead of bursting into the rate limit.
256 lines
9.5 KiB
Python
256 lines
9.5 KiB
Python
"""Meet Kevin watcher service entry point.
|
|
|
|
Polls YouTube RSS feeds for new Meet Kevin videos, deduplicates via
|
|
``INSERT … ON CONFLICT DO NOTHING``, and processes each video through
|
|
the caption-extraction + LLM-analysis pipeline.
|
|
|
|
Usage:
|
|
python -m services.meet_kevin_watcher.main
|
|
# or via Docker ENTRYPOINT
|
|
"""
|
|
|
|
import asyncio
|
|
import logging
|
|
import signal
|
|
from datetime import timezone
|
|
from decimal import Decimal
|
|
|
|
import httpx
|
|
from anthropic import AsyncAnthropic
|
|
from sqlalchemy import select
|
|
from sqlalchemy.dialects.postgresql import insert as pg_insert
|
|
|
|
from shared.db import create_db
|
|
from shared.telemetry import setup_telemetry
|
|
from services.meet_kevin_watcher.config import MeetKevinWatcherConfig
|
|
from services.meet_kevin_watcher.caption_extractor import extract_captions
|
|
from services.meet_kevin_watcher.llm_analyzer import LlmAnalyzer
|
|
from services.meet_kevin_watcher.pipeline import PipelineDeps, daily_cost_used, process_one_video
|
|
from services.meet_kevin_watcher.rss_poller import fetch_feed, parse_feed
|
|
from shared.models.meet_kevin import KevinChannel, KevinVideo
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# RSS polling helper
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
async def _poll_channels(
    session_factory,
    http_client: httpx.AsyncClient,
    videos_discovered_counter,
) -> None:
    """Poll the RSS feed of each enabled channel and record unseen videos.

    Channels with ``poll_enabled`` set are loaded in one short-lived session.
    Each channel's feed is then fetched and parsed, and every parsed entry is
    inserted with ``ON CONFLICT DO NOTHING`` on ``youtube_video_id`` so that
    already-known videos are ignored. All inserts for one channel are
    committed together.

    Args:
        session_factory: Callable returning an async session context manager.
        http_client: Shared HTTP client handed to ``fetch_feed``.
        videos_discovered_counter: Counter incremented once per row that the
            insert actually returned (i.e. a newly stored video).

    Any exception raised while handling a channel is logged and the loop moves
    on to the next channel.
    """
    enabled_channels_query = select(KevinChannel).where(
        KevinChannel.poll_enabled.is_(True)
    )
    async with session_factory() as session:
        channels = (await session.execute(enabled_channels_query)).scalars().all()

    for channel in channels:
        try:
            feed_xml = await fetch_feed(channel.youtube_channel_id, http_client)
            entries = parse_feed(feed_xml)
            if not entries:
                continue

            async with session_factory() as session:
                for entry in entries:
                    new_row = {
                        "channel_id": channel.id,
                        "youtube_video_id": entry.youtube_video_id,
                        "title": entry.title,
                        "description": entry.description,
                        "published_at": entry.published_at,
                        "thumbnail_url": entry.thumbnail_url,
                        "status": "discovered",
                        "retry_count": 0,
                    }
                    insert_stmt = pg_insert(KevinVideo).values(**new_row)
                    insert_stmt = insert_stmt.on_conflict_do_nothing(
                        index_elements=["youtube_video_id"]
                    )
                    insert_stmt = insert_stmt.returning(KevinVideo.id)

                    # RETURNING yields a row only when the insert took effect.
                    inserted_id = (await session.execute(insert_stmt)).scalar()
                    if inserted_id:
                        videos_discovered_counter.add(1)
                        logger.info(
                            "Discovered new video: %s", entry.youtube_video_id
                        )

                await session.commit()
        except Exception:
            logger.exception(
                "RSS poll failed for channel %s", channel.youtube_channel_id
            )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Pipeline processing helper
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _status_text(status) -> str:
    """Return *status* as a plain string, unwrapping enum-like objects via ``.value``."""
    return str(status.value) if hasattr(status, "value") else str(status)


async def _record_llm_cost(session_factory, video_id, llm_cost_counter) -> None:
    """Add the cost of the newest analysis row for *video_id* to the cost counter.

    Does nothing when no analysis row exists or its ``cost_usd`` is NULL.
    """
    # Imported here (as the original code did) because this module's top-level
    # imports do not include KevinAnalysis.
    from shared.models.meet_kevin import KevinAnalysis

    async with session_factory() as cost_session:
        cost_row = await cost_session.execute(
            select(KevinAnalysis.cost_usd)
            .where(KevinAnalysis.video_id == video_id)
            .order_by(KevinAnalysis.id.desc())
            .limit(1)
        )
        cost = cost_row.scalar_one_or_none()

    if cost is not None:
        llm_cost_counter.add(float(cost))


async def _process_pending_videos(
    session_factory,
    deps: PipelineDeps,
    captions_extracted_counter,
    llm_calls_counter,
    llm_cost_counter,
) -> None:
    """Walk all videos with status discovered/captioned and advance each one.

    Videos are handled oldest-first (by ``published_at``). Each video is
    re-fetched and processed in its own session so that it can be committed
    or rolled back independently of the others.

    Args:
        session_factory: Callable returning an async session context manager.
        deps: Pipeline dependencies passed through to ``process_one_video``;
            ``deps.inter_video_sleep_seconds`` is the pause between videos.
        captions_extracted_counter: Incremented on a discovered -> captioned
            transition.
        llm_calls_counter: Incremented on a captioned -> analyzed transition.
        llm_cost_counter: Receives the ``cost_usd`` of the newest analysis row
            after a captioned -> analyzed transition.

    A failure while processing one video is logged and rolled back, and the
    loop continues with the next video. Errors from the listing query or the
    per-video re-fetch propagate to the caller.
    """
    async with session_factory() as session:
        result = await session.execute(
            select(KevinVideo)
            .where(KevinVideo.status.in_(["discovered", "captioned"]))
            .order_by(KevinVideo.published_at.asc())
        )
        # Keep plain values rather than ORM instances: they stay readable after
        # later commits/rollbacks, which may expire ORM attributes.
        # NOTE(review): whether session_factory disables expire_on_commit is
        # not visible here - confirm in shared.db.create_db.
        pending = [(v.id, v.youtube_video_id) for v in result.scalars().all()]

    for i, (video_id, youtube_video_id) in enumerate(pending):
        # Throttle between videos to avoid bursting into Anthropic's per-account RPM limit.
        # NOTE: this sleep is unconditional; it is not cut short by a shutdown request.
        if i > 0:
            await asyncio.sleep(deps.inter_video_sleep_seconds)

        async with session_factory() as session:
            # Re-fetch inside its own session so we can commit per-video
            result = await session.execute(
                select(KevinVideo).where(KevinVideo.id == video_id)
            )
            db_video = result.scalar_one_or_none()
            if db_video is None:
                # The row vanished between the listing query and now; skip it
                # instead of aborting the rest of the batch.
                logger.warning(
                    "Video %s no longer exists; skipping", youtube_video_id
                )
                continue

            prev_status = _status_text(db_video.status)

            try:
                # The return type of process_one_video is not visible here, so
                # normalise it the same way as prev_status before comparing.
                new_status = _status_text(
                    await process_one_video(db_video, session, deps)
                )
                await session.commit()
            except Exception:
                logger.exception("Error processing video %s", youtube_video_id)
                try:
                    await session.rollback()
                except Exception:
                    # Best effort: the session is closed on exit regardless.
                    logger.warning(
                        "Rollback failed for video %s",
                        youtube_video_id,
                        exc_info=True,
                    )
                continue

        # Update OTEL counters based on the transition. The work is already
        # committed, so a telemetry failure is reported separately rather than
        # as a processing error.
        try:
            if prev_status == "discovered" and new_status == "captioned":
                captions_extracted_counter.add(1)
            elif prev_status == "captioned" and new_status == "analyzed":
                llm_calls_counter.add(1)
                await _record_llm_cost(session_factory, video_id, llm_cost_counter)
        except Exception:
            logger.exception(
                "Failed to record metrics for video %s", youtube_video_id
            )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Service entry point
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
async def _poll_until_shutdown(
    config,
    session_factory,
    deps,
    shutdown_event,
    videos_discovered_counter,
    captions_extracted_counter,
    llm_calls_counter,
    llm_cost_counter,
) -> None:
    """Run poll + process iterations until *shutdown_event* is set.

    Each iteration polls the RSS feeds and then processes pending videos; any
    exception from an iteration is logged and the loop keeps going. Between
    iterations it waits ``config.meet_kevin_poll_interval_seconds`` or until
    shutdown is signalled, whichever comes first.
    """
    async with httpx.AsyncClient() as http_client:
        while not shutdown_event.is_set():
            try:
                await _poll_channels(
                    session_factory, http_client, videos_discovered_counter
                )
                await _process_pending_videos(
                    session_factory,
                    deps,
                    captions_extracted_counter,
                    llm_calls_counter,
                    llm_cost_counter,
                )
            except Exception:
                logger.exception("Main loop iteration failed")

            # Wait for next poll interval (or shutdown signal)
            try:
                await asyncio.wait_for(
                    shutdown_event.wait(),
                    timeout=config.meet_kevin_poll_interval_seconds,
                )
                break  # Shutdown signaled
            except asyncio.TimeoutError:
                pass  # Normal timeout - loop again


async def run() -> None:
    """Boot the Meet Kevin watcher and enter the main polling loop.

    Sets up logging, telemetry counters, the database engine, the Anthropic
    client and the pipeline dependencies, installs SIGTERM/SIGINT handlers
    that set a shutdown event, and then polls until that event is set.

    Cleanup is nested so that the database engine is disposed even if setup
    after ``create_db`` fails or closing the Anthropic client raises.
    """
    config = MeetKevinWatcherConfig()

    logging.basicConfig(level=config.log_level)
    logger.info("Starting meet-kevin-watcher service")

    # Telemetry
    meter = setup_telemetry(config.otel_service_name, config.otel_metrics_port)
    videos_discovered_counter = meter.create_counter(
        "meet_kevin.videos_discovered",
        description="Total new Meet Kevin videos discovered via RSS",
    )
    captions_extracted_counter = meter.create_counter(
        "meet_kevin.captions_extracted",
        description="Total videos with captions successfully extracted",
    )
    llm_calls_counter = meter.create_counter(
        "meet_kevin.llm_calls",
        description="Total LLM analyze() calls made",
    )
    llm_cost_counter = meter.create_counter(
        "meet_kevin.llm_cost_usd",
        description="Cumulative LLM spend in USD",
    )

    # Database
    engine, session_factory = create_db(config)
    try:
        # Anthropic client + LLM analyzer (OAuth bearer token)
        client = AsyncAnthropic(
            auth_token=config.anthropic_oauth_token,
        )
        try:
            analyzer = LlmAnalyzer(
                client=client,
                model=config.meet_kevin_llm_model,
                prompt_version=config.meet_kevin_prompt_version,
            )

            # Pipeline deps - wire real callables
            async def _extract(video_id: str, workdir: str):
                return await extract_captions(video_id, workdir)

            async def _analyze(**kwargs):
                return await analyzer.analyze(**kwargs)

            async def _daily_cost(session):
                return await daily_cost_used(session)

            deps = PipelineDeps(
                extract_captions=_extract,
                analyze=_analyze,
                daily_cost_used=_daily_cost,
                model=config.meet_kevin_llm_model,
                prompt_version=config.meet_kevin_prompt_version,
                # Go through str() so a float setting does not carry binary
                # floating-point noise into the Decimal.
                daily_cost_cap_usd=Decimal(str(config.meet_kevin_daily_cost_cap_usd)),
                workdir=config.meet_kevin_workdir,
                inter_video_sleep_seconds=config.meet_kevin_inter_video_sleep_seconds,
            )

            # Graceful shutdown
            shutdown_event = asyncio.Event()
            loop = asyncio.get_running_loop()
            for sig in (signal.SIGTERM, signal.SIGINT):
                loop.add_signal_handler(sig, shutdown_event.set)

            # Main loop
            await _poll_until_shutdown(
                config,
                session_factory,
                deps,
                shutdown_event,
                videos_discovered_counter,
                captions_extracted_counter,
                llm_calls_counter,
                llm_cost_counter,
            )
        finally:
            await client.close()
    finally:
        await engine.dispose()
        logger.info("meet-kevin-watcher stopped gracefully")
|
|
|
|
|
|
# Script entry point: start the watcher when this module is executed directly
# (e.g. ``python -m services.meet_kevin_watcher.main``), not when imported.
if __name__ == "__main__":
    asyncio.run(run())
|