# trading/services/meet_kevin_watcher/main.py

"""Meet Kevin watcher service entry point.

Polls YouTube RSS feeds for new Meet Kevin videos, deduplicates via
``INSERT … ON CONFLICT DO NOTHING``, and processes each video through
the caption-extraction + LLM-analysis pipeline.

Usage:
    python -m services.meet_kevin_watcher.main
    # or via Docker ENTRYPOINT
"""

import asyncio
import logging
import signal
from datetime import timezone
from decimal import Decimal

import httpx
from anthropic import AsyncAnthropic
from sqlalchemy import select
from sqlalchemy.dialects.postgresql import insert as pg_insert

from shared.db import create_db
from shared.telemetry import setup_telemetry
from services.meet_kevin_watcher.config import MeetKevinWatcherConfig
from services.meet_kevin_watcher.caption_extractor import extract_captions
from services.meet_kevin_watcher.llm_analyzer import LlmAnalyzer
from services.meet_kevin_watcher.pipeline import PipelineDeps, daily_cost_used, process_one_video
from services.meet_kevin_watcher.rss_poller import fetch_feed, parse_feed
from shared.models.meet_kevin import KevinChannel, KevinVideo

# Module-level logger named after this module; the log level is configured
# in run() via logging.basicConfig().
logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# RSS polling helper
# ---------------------------------------------------------------------------


async def _poll_channels(
    session_factory,
    http_client: httpx.AsyncClient,
    videos_discovered_counter,
) -> None:
    """Fetch RSS for every poll-enabled channel and insert unseen videos.

    Channels are handled independently: any exception raised while fetching,
    parsing, or persisting one channel's feed is logged and the loop moves on
    to the next channel.

    All inserts for one channel share a single transaction. The discovery
    counter and the "Discovered new video" log line are emitted only after
    that transaction has committed, so a failed commit cannot leave metrics
    or logs describing rows that were rolled back.

    Args:
        session_factory: Callable returning an async context manager that
            yields a database session.
        http_client: HTTP client passed through to ``fetch_feed``.
        videos_discovered_counter: Counter-like object; ``add(1)`` is called
            once per newly inserted video.
    """
    async with session_factory() as session:
        result = await session.execute(
            select(KevinChannel).where(KevinChannel.poll_enabled.is_(True))
        )
        channels = result.scalars().all()

    for channel in channels:
        try:
            xml_bytes = await fetch_feed(channel.youtube_channel_id, http_client)
            discovered = parse_feed(xml_bytes)
            if not discovered:
                continue

            new_video_ids = []
            async with session_factory() as session:
                for video in discovered:
                    stmt = (
                        pg_insert(KevinVideo)
                        .values(
                            channel_id=channel.id,
                            youtube_video_id=video.youtube_video_id,
                            title=video.title,
                            description=video.description,
                            published_at=video.published_at,
                            thumbnail_url=video.thumbnail_url,
                            status="discovered",
                            retry_count=0,
                        )
                        # Dedup: an already-known video id inserts nothing
                        # and therefore returns no row.
                        .on_conflict_do_nothing(index_elements=["youtube_video_id"])
                        .returning(KevinVideo.id)
                    )
                    row = await session.execute(stmt)
                    # Explicit None check: a returned id means a new row,
                    # regardless of the id's truthiness.
                    if row.scalar() is not None:
                        new_video_ids.append(video.youtube_video_id)

                await session.commit()

            # Report only after the commit succeeded.
            for youtube_video_id in new_video_ids:
                videos_discovered_counter.add(1)
                logger.info("Discovered new video: %s", youtube_video_id)
        except Exception:
            logger.exception("RSS poll failed for channel %s", channel.youtube_channel_id)


# ---------------------------------------------------------------------------
# Pipeline processing helper
# ---------------------------------------------------------------------------


async def _process_pending_videos(
    session_factory,
    deps: PipelineDeps,
    captions_extracted_counter,
    llm_calls_counter,
    llm_cost_counter,
) -> None:
    """Advance every video whose status is ``discovered`` or ``captioned``.

    Videos are processed oldest-first (by ``published_at``), each in its own
    session and transaction, with ``deps.inter_video_sleep_seconds`` of sleep
    between consecutive videos. A failure on one video is logged and rolled
    back without stopping the rest of the batch.

    Telemetry is recorded only after the video's transaction has committed,
    and a telemetry failure is logged separately rather than being reported
    as a processing error.

    Args:
        session_factory: Callable returning an async context manager that
            yields a database session.
        deps: Pipeline dependencies passed to ``process_one_video``; also
            supplies ``inter_video_sleep_seconds``.
        captions_extracted_counter: Counter incremented on a
            ``discovered`` -> ``captioned`` transition.
        llm_calls_counter: Counter incremented on a
            ``captioned`` -> ``analyzed`` transition.
        llm_cost_counter: Counter that receives the ``cost_usd`` of the
            newest analysis row after a ``captioned`` -> ``analyzed``
            transition, when such a row exists.
    """
    # Function-scope import: only the cost lookup below needs this model.
    from shared.models.meet_kevin import KevinAnalysis

    async with session_factory() as session:
        result = await session.execute(
            select(KevinVideo.id)
            .where(KevinVideo.status.in_(["discovered", "captioned"]))
            .order_by(KevinVideo.published_at.asc())
        )
        video_ids = result.scalars().all()

    for index, video_id in enumerate(video_ids):
        # Throttle between videos to avoid bursting into Anthropic's per-account RPM limit
        if index > 0:
            await asyncio.sleep(deps.inter_video_sleep_seconds)

        # Plain-value label for log messages; replaced by the YouTube id once
        # the row is loaded. Using captured values avoids touching ORM
        # attributes after a commit or failed flush, when they may be expired
        # and cannot be lazily reloaded under asyncio.
        video_label = video_id

        async with session_factory() as session:
            try:
                # Re-fetch inside its own session so we can commit per-video
                result = await session.execute(
                    select(KevinVideo).where(KevinVideo.id == video_id)
                )
                db_video = result.scalar_one_or_none()
                if db_video is None:
                    # Row vanished between the listing query and now.
                    logger.warning(
                        "Video %s no longer exists; skipping", video_id
                    )
                    continue

                video_label = db_video.youtube_video_id
                # Status may be an enum member or a plain string.
                prev_status = str(getattr(db_video.status, "value", db_video.status))

                new_status = await process_one_video(db_video, session, deps)
                await session.commit()
            except Exception:
                logger.exception("Error processing video %s", video_label)
                try:
                    await session.rollback()
                except Exception:
                    # Best-effort rollback; record the failure instead of
                    # hiding it.
                    logger.exception("Rollback failed for video %s", video_label)
                continue

        # Update OTEL counters based on the transition. This runs after the
        # per-video session is closed, so the cost lookup does not hold a
        # second connection alongside it.
        try:
            if prev_status == "discovered" and new_status == "captioned":
                captions_extracted_counter.add(1)
            elif prev_status == "captioned" and new_status == "analyzed":
                llm_calls_counter.add(1)
                # Retrieve cost from the most-recently-inserted analysis
                async with session_factory() as cost_session:
                    cost_row = await cost_session.execute(
                        select(KevinAnalysis.cost_usd)
                        .where(KevinAnalysis.video_id == video_id)
                        .order_by(KevinAnalysis.id.desc())
                        .limit(1)
                    )
                    cost = cost_row.scalar_one_or_none()
                if cost is not None:
                    llm_cost_counter.add(float(cost))
        except Exception:
            logger.exception("Failed to record metrics for video %s", video_label)


# ---------------------------------------------------------------------------
# Service entry point
# ---------------------------------------------------------------------------


async def run() -> None:
    """Boot the Meet Kevin watcher and enter the main polling loop.

    Startup builds, in order: configuration, logging, the telemetry meter and
    its four counters, the database engine and session factory, the Anthropic
    client with its ``LlmAnalyzer``, and the ``PipelineDeps`` bundle.

    Each loop iteration polls the RSS feeds and then processes pending
    videos; an exception in an iteration is logged and the loop continues.
    Between iterations the coroutine waits up to
    ``config.meet_kevin_poll_interval_seconds`` for a shutdown signal.
    SIGTERM and SIGINT set the shutdown event, which ends the loop.

    On exit the Anthropic client is closed and the database engine is
    disposed. The engine is disposed even if closing the client raises.
    """
    config = MeetKevinWatcherConfig()

    logging.basicConfig(level=config.log_level)
    logger.info("Starting meet-kevin-watcher service")

    # Telemetry
    meter = setup_telemetry(config.otel_service_name, config.otel_metrics_port)
    videos_discovered_counter = meter.create_counter(
        "meet_kevin.videos_discovered",
        description="Total new Meet Kevin videos discovered via RSS",
    )
    captions_extracted_counter = meter.create_counter(
        "meet_kevin.captions_extracted",
        description="Total videos with captions successfully extracted",
    )
    llm_calls_counter = meter.create_counter(
        "meet_kevin.llm_calls",
        description="Total LLM analyze() calls made",
    )
    llm_cost_counter = meter.create_counter(
        "meet_kevin.llm_cost_usd",
        description="Cumulative LLM spend in USD",
    )

    # Database
    engine, session_factory = create_db(config)

    # Anthropic client + LLM analyzer (OAuth bearer token)
    client = AsyncAnthropic(
        auth_token=config.anthropic_oauth_token,
    )
    analyzer = LlmAnalyzer(
        client=client,
        model=config.meet_kevin_llm_model,
        prompt_version=config.meet_kevin_prompt_version,
    )

    # Pipeline deps — wire real callables
    async def _extract(video_id: str, workdir: str):
        return await extract_captions(video_id, workdir)

    async def _analyze(**kwargs):
        return await analyzer.analyze(**kwargs)

    async def _daily_cost(session):
        return await daily_cost_used(session)

    deps = PipelineDeps(
        extract_captions=_extract,
        analyze=_analyze,
        daily_cost_used=_daily_cost,
        model=config.meet_kevin_llm_model,
        prompt_version=config.meet_kevin_prompt_version,
        # Go through str() so a float config value is not converted to
        # Decimal with its binary representation error.
        daily_cost_cap_usd=Decimal(str(config.meet_kevin_daily_cost_cap_usd)),
        workdir=config.meet_kevin_workdir,
        inter_video_sleep_seconds=config.meet_kevin_inter_video_sleep_seconds,
    )

    # Graceful shutdown
    shutdown_event = asyncio.Event()
    loop = asyncio.get_running_loop()
    for sig in (signal.SIGTERM, signal.SIGINT):
        loop.add_signal_handler(sig, shutdown_event.set)

    # Main loop
    try:
        async with httpx.AsyncClient() as http_client:
            while not shutdown_event.is_set():
                try:
                    await _poll_channels(
                        session_factory, http_client, videos_discovered_counter
                    )
                    await _process_pending_videos(
                        session_factory,
                        deps,
                        captions_extracted_counter,
                        llm_calls_counter,
                        llm_cost_counter,
                    )
                except Exception:
                    logger.exception("Main loop iteration failed")

                # Wait for next poll interval (or shutdown signal)
                try:
                    await asyncio.wait_for(
                        shutdown_event.wait(),
                        timeout=config.meet_kevin_poll_interval_seconds,
                    )
                    break  # Shutdown signaled
                except asyncio.TimeoutError:
                    pass  # Normal timeout — loop again
    finally:
        # Nested so that a failure while closing the Anthropic client cannot
        # skip disposal of the database engine.
        try:
            await client.close()
        finally:
            await engine.dispose()
        logger.info("meet-kevin-watcher stopped gracefully")


# Script entry point: run the service coroutine on a fresh event loop when
# this module is executed directly (e.g. ``python -m ...``).
if __name__ == "__main__":
    asyncio.run(run())