[ci skip] f1-stream: add extractor framework with demo streams (Phase 3)

- BaseExtractor ABC with health_check method
- ExtractorRegistry with concurrent fan-out extraction
- ExtractionService with in-memory cache and background polling
- DemoExtractor with 3 public HLS test streams
- Adaptive polling: 5min during live sessions, 30min otherwise
- GET /streams, GET /extractors, POST /extract endpoints
2026-02-23 23:02:56 +00:00 · 2026-02-23 23:02:56 +00:00 · d15337e838
commit d15337e838
parent 461e355a5d
8 changed files with 608 additions and 5 deletions
--- a/stacks/f1-stream/files/backend/extractors/__init__.py
+++ b/stacks/f1-stream/files/backend/extractors/__init__.py
@ -0,0 +1,49 @@
+"""Stream extraction framework.
+
+To add a new extractor:
+1. Create a new file in this package (e.g., my_site.py)
+2. Subclass BaseExtractor from backend.extractors.base
+3. Implement site_key, site_name, and extract()
+4. Import and register it in this file's create_registry() function
+
+Example:
+    from backend.extractors.my_site import MySiteExtractor
+    registry.register(MySiteExtractor())
+"""
+
+from backend.extractors.demo import DemoExtractor
+from backend.extractors.models import ExtractedStream
+from backend.extractors.registry import ExtractorRegistry
+from backend.extractors.service import ExtractionService
+
+# Public API of the extractors package: the names exported by
+# ``from backend.extractors import *``. The two factory functions are
+# defined further down in this module.
+__all__ = [
+    "ExtractedStream",
+    "ExtractorRegistry",
+    "ExtractionService",
+    "create_registry",
+    "create_extraction_service",
+]
+
+
+def create_registry() -> ExtractorRegistry:
+    """Build an ExtractorRegistry holding one instance of every known extractor.
+
+    To add an extractor, import its class at the top of this module and
+    append an instance to the ``extractors`` tuple below.
+
+    Returns:
+        A registry with each extractor registered, in tuple order.
+    """
+    registry = ExtractorRegistry()
+
+    # --- Known extractors: add new ones to this tuple ---
+    extractors = (
+        DemoExtractor(),
+        # MySiteExtractor(),  # Add new extractors here
+    )
+    for extractor in extractors:
+        registry.register(extractor)
+
+    return registry
+
+
+def create_extraction_service() -> ExtractionService:
+    """Build the application's ExtractionService.
+
+    Main entry point of the extraction framework: wires a freshly
+    populated registry (from create_registry) into a new service.
+    Intended to be called once during app startup.
+
+    Returns:
+        An ExtractionService backed by a newly created registry.
+    """
+    return ExtractionService(create_registry())
--- a/stacks/f1-stream/files/backend/extractors/base.py
+++ b/stacks/f1-stream/files/backend/extractors/base.py
@ -0,0 +1,118 @@
+"""Base class for all site-specific stream extractors."""
+
+import logging
+from abc import ABC, abstractmethod
+
+import httpx
+
+from backend.extractors.models import ExtractedStream
+
+logger = logging.getLogger(__name__)
+
+
+class BaseExtractor(ABC):
+    """Contract that every site-specific stream extractor implements.
+
+    Steps for adding an extractor:
+    1. Add a module to backend/extractors/
+    2. Derive a class from BaseExtractor
+    3. Provide site_key, site_name, and extract()
+    4. Register an instance in backend/extractors/__init__.py
+    """
+
+    @property
+    @abstractmethod
+    def site_key(self) -> str:
+        """Stable, unique id of the site (e.g., 'sportsurge').
+
+        By convention lowercase and limited to alphanumerics, hyphens and
+        underscores (not validated here). Serves as the cache key and
+        appears in API responses.
+        """
+
+    @property
+    @abstractmethod
+    def site_name(self) -> str:
+        """Display name of the site (e.g., 'SportSurge').
+
+        Shown in the UI and in API responses.
+        """
+
+    @abstractmethod
+    async def extract(self) -> list[ExtractedStream]:
+        """Discover stream URLs on this site.
+
+        Returns one ExtractedStream per discovered URL. Implementations
+        fill in url, quality, and title; site_key, site_name, and
+        extracted_at are auto-populated if left empty.
+
+        Guidance for implementations:
+        - Make HTTP requests with httpx
+        - Deal with failures locally (log, then return an empty list)
+        - Fill in quality whenever the source reveals it
+        - Give each stream a descriptive title
+        """
+
+    async def health_check(self, url: str) -> bool:
+        """Probe a URL with a HEAD request and judge whether it looks live.
+
+        The URL passes when the response status is exactly 200 and the
+        Content-Type header is missing, media-like, or at least not
+        text/HTML. Servers that omit or genericise Content-Type on HEAD
+        are therefore given the benefit of the doubt, while text or HTML
+        responses are rejected.
+
+        Args:
+            url: The stream URL to probe.
+
+        Returns:
+            True if the URL appears to be a live stream; False on a
+            non-200 status, a text/HTML content type, a timeout, an
+            httpx error, or any other unexpected exception (all logged).
+        """
+        # Substrings that mark a Content-Type as HLS/media content.
+        media_markers = (
+            "application/vnd.apple.mpegurl",
+            "application/x-mpegurl",
+            "video/",
+            "audio/",
+            "octet-stream",
+        )
+        client_options = {
+            "timeout": 10.0,
+            "follow_redirects": True,
+            "headers": {"User-Agent": "Mozilla/5.0"},
+        }
+
+        try:
+            async with httpx.AsyncClient(**client_options) as client:
+                response = await client.head(url)
+                status = response.status_code
+
+                if status != 200:
+                    logger.debug(
+                        "[%s] Health check failed for %s: HTTP %d",
+                        self.site_key,
+                        url,
+                        status,
+                    )
+                    return False
+
+                content_type = response.headers.get("content-type", "").lower()
+                looks_like_media = any(
+                    marker in content_type for marker in media_markers
+                )
+                looks_like_text = "text/" in content_type or "html" in content_type
+
+                # Reject only when the header positively says text/HTML and
+                # carries no media marker; an empty or unrecognised
+                # content type still passes.
+                if looks_like_text and not looks_like_media:
+                    logger.debug(
+                        "[%s] Health check suspect for %s: content-type=%s",
+                        self.site_key,
+                        url,
+                        content_type,
+                    )
+                    return False
+
+                return True
+
+        except httpx.TimeoutException:
+            logger.debug("[%s] Health check timed out for %s", self.site_key, url)
+            return False
+        except httpx.HTTPError as exc:
+            logger.debug("[%s] Health check error for %s: %s", self.site_key, url, exc)
+            return False
+        except Exception:
+            logger.exception("[%s] Unexpected error during health check for %s", self.site_key, url)
+            return False
--- a/stacks/f1-stream/files/backend/extractors/demo.py
+++ b/stacks/f1-stream/files/backend/extractors/demo.py
@ -0,0 +1,75 @@
+"""Demo extractor - returns hardcoded test streams for framework testing.
+
+This extractor exists purely for testing the extraction pipeline end-to-end.
+It does NOT connect to any real streaming site. Disable it in production by
+removing its registration from __init__.py or setting DEMO_EXTRACTOR_ENABLED=false.
+"""
+
+import logging
+import os
+
+from backend.extractors.base import BaseExtractor
+from backend.extractors.models import ExtractedStream
+
+logger = logging.getLogger(__name__)
+
+# Feature flag for DemoExtractor, read from the environment once at import time.
+# Enabled by default; only "true", "1", or "yes" (case-insensitive) keep it on,
+# so any other value (e.g. DEMO_EXTRACTOR_ENABLED=false) disables it.
+DEMO_ENABLED = os.getenv("DEMO_EXTRACTOR_ENABLED", "true").lower() in ("true", "1", "yes")
+
+
+class DemoExtractor(BaseExtractor):
+    """Demo extractor that returns hardcoded test streams.
+
+    Use this to verify the extraction framework works end-to-end without
+    needing a real streaming site. The streams are publicly available HLS
+    test streams from Apple and others.
+    """
+
+    @property
+    def site_key(self) -> str:
+        """Return the fixed registry/cache key of this extractor."""
+        return "demo"
+
+    @property
+    def site_name(self) -> str:
+        """Return the human-readable name of this extractor."""
+        return "Demo (Test Streams)"
+
+    async def extract(self) -> list[ExtractedStream]:
+        """Return the hardcoded test streams, each probed by health_check.
+
+        Returns:
+            An empty list when DEMO_ENABLED is false. Otherwise the three
+            demo streams, in a fixed order, with is_live set to the result
+            of health_check for that stream's URL.
+        """
+        # Function-scope import: this module does not import asyncio at
+        # file level.
+        import asyncio
+
+        if not DEMO_ENABLED:
+            logger.info("[demo] Demo extractor is disabled via DEMO_EXTRACTOR_ENABLED")
+            return []
+
+        logger.info("[demo] Returning demo test streams")
+
+        streams = [
+            ExtractedStream(
+                url="https://test-streams.mux.dev/x36xhzz/x36xhzz.m3u8",
+                site_key=self.site_key,
+                site_name=self.site_name,
+                quality="720p",
+                title="Big Buck Bunny (Test Stream)",
+                is_live=False,
+            ),
+            ExtractedStream(
+                url="https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/bipbop_16x9_variant.m3u8",
+                site_key=self.site_key,
+                site_name=self.site_name,
+                quality="1080p",
+                title="Apple Bipbop (Test Stream)",
+                is_live=False,
+            ),
+            ExtractedStream(
+                url="https://cph-p2p-msl.akamaized.net/hls/live/2000341/test/master.m3u8",
+                site_key=self.site_key,
+                site_name=self.site_name,
+                quality="",
+                title="Akamai Live Test Stream",
+                is_live=False,
+            ),
+        ]
+
+        # Probe every stream concurrently: the checks are independent
+        # network round-trips, so awaiting them one by one would add their
+        # latencies (and timeouts) together. gather() returns results in
+        # the same order as its arguments, so zip() pairs them correctly.
+        results = await asyncio.gather(
+            *(self.health_check(stream.url) for stream in streams)
+        )
+        for stream, alive in zip(streams, results):
+            stream.is_live = alive
+
+        return streams
--- a/stacks/f1-stream/files/backend/extractors/models.py
+++ b/stacks/f1-stream/files/backend/extractors/models.py
@ -0,0 +1,29 @@
+"""Data models for the stream extraction framework."""
+
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+
+
+@dataclass
+class ExtractedStream:
+    """One stream URL produced by an extractor, plus descriptive metadata."""
+
+    url: str  # HLS/m3u8 address of the stream
+    site_key: str  # Key of the extractor that produced it
+    site_name: str  # Display name of that extractor
+    quality: str = ""  # Resolution label such as "720p" or "1080p"; may be empty
+    title: str = ""  # Descriptive label, e.g., "F1 Race Live"
+    # ISO-8601 UTC timestamp, generated at construction unless supplied.
+    extracted_at: str = field(
+        default_factory=lambda: datetime.now(timezone.utc).isoformat()
+    )
+    is_live: bool = False  # True once the stream passed a health check
+
+    def to_dict(self) -> dict:
+        """Return the stream as a plain dict suitable for JSON responses.
+
+        Keys appear in field-declaration order: url, site_key, site_name,
+        quality, title, extracted_at, is_live.
+        """
+        exported = (
+            "url",
+            "site_key",
+            "site_name",
+            "quality",
+            "title",
+            "extracted_at",
+            "is_live",
+        )
+        return {name: getattr(self, name) for name in exported}
--- a/stacks/f1-stream/files/backend/extractors/registry.py
+++ b/stacks/f1-stream/files/backend/extractors/registry.py
@ -0,0 +1,116 @@
+"""Central registry for stream extractors."""
+
+import asyncio
+import logging
+from datetime import datetime, timezone
+
+from backend.extractors.base import BaseExtractor
+from backend.extractors.models import ExtractedStream
+
+logger = logging.getLogger(__name__)
+
+
+class ExtractorRegistry:
+    """Holds every registered site extractor, keyed by site_key.
+
+    Besides lookup and listing, it can run all extractors concurrently
+    while keeping a failure in one from affecting the others.
+    """
+
+    def __init__(self) -> None:
+        # site_key -> extractor instance, in registration order.
+        self._extractors: dict[str, BaseExtractor] = {}
+
+    def register(self, extractor: BaseExtractor) -> None:
+        """Add an extractor instance under its site_key.
+
+        Args:
+            extractor: A BaseExtractor subclass instance.
+
+        Raises:
+            ValueError: If an extractor with the same site_key is already registered.
+        """
+        key = extractor.site_key
+        existing = self._extractors.get(key)
+        if existing is not None:
+            raise ValueError(
+                f"Extractor with site_key '{key}' is already registered "
+                f"(existing: {existing.site_name}, "
+                f"new: {extractor.site_name})"
+            )
+        self._extractors[key] = extractor
+        logger.info("Registered extractor: %s (%s)", extractor.site_name, key)
+
+    def get(self, site_key: str) -> BaseExtractor | None:
+        """Look up an extractor by site_key.
+
+        Args:
+            site_key: The unique identifier of the extractor.
+
+        Returns:
+            The matching extractor instance, or None when nothing is
+            registered under that key.
+        """
+        return self._extractors.get(site_key)
+
+    def list_extractors(self) -> list[dict]:
+        """Describe every registered extractor.
+
+        Returns:
+            One dict per extractor, in registration order, with the keys
+            "site_key" and "site_name".
+        """
+        described: list[dict] = []
+        for ext in self._extractors.values():
+            described.append({"site_key": ext.site_key, "site_name": ext.site_name})
+        return described
+
+    async def extract_all(self) -> list[ExtractedStream]:
+        """Run every registered extractor concurrently and merge the results.
+
+        Extractors are isolated from each other: an exception raised by
+        one is logged and counted as "no streams" for that extractor only.
+
+        Returns:
+            All ExtractedStream objects from all extractors, concatenated
+            in registration order; empty when nothing is registered.
+        """
+        if not self._extractors:
+            logger.warning("No extractors registered, nothing to extract")
+            return []
+
+        logger.info(
+            "Running extraction across %d extractor(s): %s",
+            len(self._extractors),
+            ", ".join(self._extractors),
+        )
+
+        async def _run_isolated(extractor: BaseExtractor) -> list[ExtractedStream]:
+            """Run one extractor, turning any exception into an empty result."""
+            try:
+                found = await extractor.extract()
+                # Backfill identity/timestamp fields the extractor left empty.
+                stamp = datetime.now(timezone.utc).isoformat()
+                for item in found:
+                    item.site_key = item.site_key or extractor.site_key
+                    item.site_name = item.site_name or extractor.site_name
+                    item.extracted_at = item.extracted_at or stamp
+                logger.info(
+                    "[%s] Extracted %d stream(s)", extractor.site_key, len(found)
+                )
+                return found
+            except Exception:
+                logger.exception(
+                    "[%s] Extractor failed during extraction", extractor.site_key
+                )
+                return []
+
+        # gather() preserves argument order, so batches line up with the
+        # registration order of the extractors.
+        batches = await asyncio.gather(
+            *[_run_isolated(ext) for ext in self._extractors.values()]
+        )
+
+        combined: list[ExtractedStream] = [
+            stream for batch in batches for stream in batch
+        ]
+
+        logger.info("Extraction complete: %d total stream(s) found", len(combined))
+        return combined
--- a/stacks/f1-stream/files/backend/extractors/service.py
+++ b/stacks/f1-stream/files/backend/extractors/service.py
@ -0,0 +1,121 @@
+"""Extraction service - manages extraction lifecycle: polling, caching, serving."""
+
+import logging
+from datetime import datetime, timezone
+
+from backend.extractors.models import ExtractedStream
+from backend.extractors.registry import ExtractorRegistry
+
+logger = logging.getLogger(__name__)
+
+
+class ExtractionService:
+    """Runs extractions, caches the results in memory, and serves them.
+
+    run_extraction() is designed to be triggered by a background scheduler
+    rather than on the client request path; the read methods only consult
+    the in-memory cache, which is keyed by site_key.
+    """
+
+    def __init__(self, registry: ExtractorRegistry) -> None:
+        self._registry = registry
+        # site_key -> streams from that extractor's most recent run
+        self._cache: dict[str, list[ExtractedStream]] = {}
+        # ISO timestamp of the start of the last run; None before the first run
+        self._last_run: str | None = None
+        self._last_run_stream_count: int = 0
+
+    async def run_extraction(self) -> None:
+        """Execute all extractors and refresh the cache from their output.
+
+        For every registered extractor the cache entry is replaced by the
+        new streams, or removed when the extractor produced none. Also
+        records the run's start time and total stream count.
+        """
+        logger.info("Starting extraction run...")
+        started_at = datetime.now(timezone.utc)
+
+        streams = await self._registry.extract_all()
+
+        # Bucket the flat result list by the extractor that produced it.
+        by_site: dict[str, list[ExtractedStream]] = {}
+        for stream in streams:
+            bucket = by_site.get(stream.site_key)
+            if bucket is None:
+                by_site[stream.site_key] = [stream]
+            else:
+                bucket.append(stream)
+
+        # Only registered extractors are synced into the cache: fresh
+        # results overwrite, and an empty result (site went down, etc.)
+        # evicts the old entry.
+        for info in self._registry.list_extractors():
+            site_key = info["site_key"]
+            fresh = by_site.get(site_key)
+            if fresh:
+                self._cache[site_key] = fresh
+            else:
+                self._cache.pop(site_key, None)
+
+        self._last_run = started_at.isoformat()
+        self._last_run_stream_count = len(streams)
+
+        duration = (datetime.now(timezone.utc) - started_at).total_seconds()
+        logger.info(
+            "Extraction run complete: %d stream(s) from %d extractor(s) in %.1fs",
+            len(streams),
+            len(by_site),
+            duration,
+        )
+
+    def get_streams(self) -> list[dict]:
+        """Return every cached stream, serialized, as one flat list.
+
+        Returns:
+            The to_dict() form of each cached ExtractedStream, grouped by
+            extractor in cache order.
+        """
+        return [
+            stream.to_dict()
+            for site_streams in self._cache.values()
+            for stream in site_streams
+        ]
+
+    def get_streams_for_session(self, session_type: str) -> list[dict]:
+        """Return the cached streams relevant to a given session type.
+
+        No filtering happens yet: extractors do not tag streams with a
+        session type, so every cached stream is returned and session_type
+        is only used in the debug log. The method is a hook for future
+        per-session filtering.
+
+        Args:
+            session_type: The F1 session type (e.g., "race", "qualifying", "fp1").
+
+        Returns:
+            List of serialized ExtractedStream dicts.
+        """
+        serialized = self.get_streams()
+        logger.debug(
+            "Returning %d stream(s) for session type '%s'",
+            len(serialized),
+            session_type,
+        )
+        return serialized
+
+    def get_status(self) -> dict:
+        """Summarize the service state for the /extractors endpoint.
+
+        Returns:
+            A dict with per-extractor cached stream counts ("extractors"),
+            the overall cached total, and the timestamp and stream count
+            of the last extraction run.
+        """
+        per_extractor = [
+            {
+                "site_key": info["site_key"],
+                "site_name": info["site_name"],
+                "cached_streams": len(self._cache.get(info["site_key"], [])),
+            }
+            for info in self._registry.list_extractors()
+        ]
+        total_cached = sum(map(len, self._cache.values()))
+
+        return {
+            "extractors": per_extractor,
+            "total_cached_streams": total_cached,
+            "last_run": self._last_run,
+            "last_run_stream_count": self._last_run_stream_count,
+        }