[ci skip] f1-stream: add stream health checker and HLS proxy (Phases 4-5)

Phase 4 - Stream Health and Fallback:
- StreamHealthChecker with partial GET validation of m3u8 content
- Bitrate extraction from BANDWIDTH tags
- Response time measurement for quality ranking
- Fallback ordering: live first, fastest response time first
- GET /streams now only returns health-verified streams

Phase 5 - HLS Proxy Core:
- GET /proxy?url= - m3u8 playlist fetch with full URI rewriting
- GET /relay?url= - chunked segment relay (never buffers full segment)
- m3u8 rewriter handles master, variant, and segment URIs
- Base64url encoding for URL parameters
- CORS middleware for browser playback
- Range header forwarding for seeking support
2026-02-23 23:41:16 +00:00 · 2026-02-23 23:41:16 +00:00 · 6867036087
commit 6867036087
parent a9a4ac37a2
6 changed files with 926 additions and 20 deletions
--- a/stacks/f1-stream/files/backend/extractors/models.py
+++ b/stacks/f1-stream/files/backend/extractors/models.py
@ -15,6 +15,9 @@ class ExtractedStream:
    title: str = ""  # e.g., "F1 Race Live"
    extracted_at: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
    is_live: bool = False  # Whether it passed health check
+    response_time_ms: int = 0  # Health check response time (lower = better)
+    checked_at: str = ""  # ISO timestamp of last health check
+    bitrate: int = 0  # Bitrate in bps if detectable from m3u8 playlist

    def to_dict(self) -> dict:
        """Serialize to a plain dictionary for JSON responses."""
@ -26,4 +29,7 @@ class ExtractedStream:
            "title": self.title,
            "extracted_at": self.extracted_at,
            "is_live": self.is_live,
+            "response_time_ms": self.response_time_ms,
+            "checked_at": self.checked_at,
+            "bitrate": self.bitrate,
        }
--- a/stacks/f1-stream/files/backend/extractors/service.py
+++ b/stacks/f1-stream/files/backend/extractors/service.py
@ -1,19 +1,25 @@
-"""Extraction service - manages extraction lifecycle: polling, caching, serving."""
+"""Extraction service - manages extraction lifecycle: polling, caching, health checking, serving."""

 import logging
 from datetime import datetime, timezone

 from backend.extractors.models import ExtractedStream
 from backend.extractors.registry import ExtractorRegistry
+from backend.health import StreamHealthChecker

 logger = logging.getLogger(__name__)


 class ExtractionService:
-    """Manages the extraction lifecycle: polling, caching, and serving results.
+    """Manages the extraction lifecycle: polling, caching, health checking, and serving.

    Extraction runs on a background schedule (via APScheduler), never on
-    client request path. Results are cached in memory, keyed by site_key.
+    client request path. After extraction, health checks verify each stream
+    is live. Results are cached in memory, keyed by site_key.
+
+    GET /streams only returns streams that passed health checks, sorted by:
+    1. is_live (live streams first)
+    2. response_time_ms (fastest first)
    """

    def __init__(self, registry: ExtractorRegistry) -> None:
@ -22,18 +28,36 @@ class ExtractionService:
        self._cache: dict[str, list[ExtractedStream]] = {}
        self._last_run: str | None = None
        self._last_run_stream_count: int = 0
+        self._health_checker = StreamHealthChecker()

    async def run_extraction(self) -> None:
-        """Run all extractors and cache their results.
+        """Run all extractors, health-check results, and cache them.

        This is called by the background scheduler. Each extractor's
-        results replace its previous cache entry entirely.
+        results replace its previous cache entry entirely. After extraction,
+        health checks are run to verify streams are live and measure
+        response times.
        """
        logger.info("Starting extraction run...")
        start = datetime.now(timezone.utc)

        streams = await self._registry.extract_all()

+        # Run health checks on all extracted streams
+        if streams:
+            stream_dicts = [s.to_dict() for s in streams]
+            health_map = await self._health_checker.check_all(stream_dicts)
+
+            # Update stream objects with health check results
+            for stream in streams:
+                health = health_map.get(stream.url)
+                if health:
+                    stream.is_live = health.is_live
+                    stream.response_time_ms = health.response_time_ms
+                    stream.checked_at = health.checked_at
+                    if health.bitrate > 0:
+                        stream.bitrate = health.bitrate
+
        # Group streams by site_key and update cache
        new_cache: dict[str, list[ExtractedStream]] = {}
        for stream in streams:
@ -52,29 +76,68 @@ class ExtractionService:
        self._last_run = start.isoformat()
        self._last_run_stream_count = len(streams)

+        live_count = sum(
+            1 for streams_list in self._cache.values()
+            for s in streams_list if s.is_live
+        )
        elapsed = (datetime.now(timezone.utc) - start).total_seconds()
        logger.info(
-            "Extraction run complete: %d stream(s) from %d extractor(s) in %.1fs",
+            "Extraction run complete: %d stream(s) from %d extractor(s) in %.1fs (%d live)",
            len(streams),
            len(new_cache),
            elapsed,
+            live_count,
        )

    def get_streams(self) -> list[dict]:
-        """Return all cached streams as a flat list of dicts.
+        """Return all cached streams as a sorted list of dicts.
+
+        Only returns streams that passed health checks (is_live=True).
+        Sorted by fallback priority:
+        1. is_live (live streams first) - filters to live only
+        2. response_time_ms (fastest first)

        Returns:
-            List of serialized ExtractedStream dicts from all extractors.
+            List of serialized ExtractedStream dicts from all extractors,
+            filtered to live-only and sorted by response time.
        """
-        all_streams: list[dict] = []
+        all_streams: list[ExtractedStream] = []
        for streams in self._cache.values():
-            all_streams.extend(s.to_dict() for s in streams)
-        return all_streams
+            all_streams.extend(streams)
+
+        # Sort by fallback priority: live first, then fastest response
+        all_streams.sort(
+            key=lambda s: (not s.is_live, s.response_time_ms)
+        )
+
+        # Only return live streams to clients
+        live_streams = [s for s in all_streams if s.is_live]
+        return [s.to_dict() for s in live_streams]
+
+    def get_all_streams_unfiltered(self) -> list[dict]:
+        """Return ALL cached streams including unhealthy ones.
+
+        Used for debugging and status endpoints. Sorted by fallback priority
+        but includes streams that failed health checks.
+
+        Returns:
+            List of all serialized ExtractedStream dicts.
+        """
+        all_streams: list[ExtractedStream] = []
+        for streams in self._cache.values():
+            all_streams.extend(streams)
+
+        # Sort by fallback priority: live first, then fastest response
+        all_streams.sort(
+            key=lambda s: (not s.is_live, s.response_time_ms)
+        )
+
+        return [s.to_dict() for s in all_streams]

    def get_streams_for_session(self, session_type: str) -> list[dict]:
        """Return cached streams filtered/annotated for a specific session type.

-        Currently returns all streams (extractors don't yet differentiate by
+        Currently returns all live streams (extractors don't yet differentiate by
        session type). This method exists as a hook for future filtering,
        e.g., some extractors might only have race streams but not FP streams.

@ -82,7 +145,7 @@ class ExtractionService:
            session_type: The F1 session type (e.g., "race", "qualifying", "fp1").

        Returns:
-            List of serialized ExtractedStream dicts.
+            List of serialized ExtractedStream dicts (live only, sorted).
        """
        # For now, all streams are potentially relevant to any session.
        # Future extractors may tag streams with session types, at which
@ -103,19 +166,26 @@ class ExtractionService:
        for info in extractor_list:
            key = info["site_key"]
            cached = self._cache.get(key, [])
+            live_count = sum(1 for s in cached if s.is_live)
            extractor_statuses.append(
                {
                    "site_key": key,
                    "site_name": info["site_name"],
                    "cached_streams": len(cached),
+                    "live_streams": live_count,
                }
            )

+        total_cached = sum(len(streams) for streams in self._cache.values())
+        total_live = sum(
+            1 for streams in self._cache.values()
+            for s in streams if s.is_live
+        )
+
        return {
            "extractors": extractor_statuses,
-            "total_cached_streams": sum(
-                len(streams) for streams in self._cache.values()
-            ),
+            "total_cached_streams": total_cached,
+            "total_live_streams": total_live,
            "last_run": self._last_run,
            "last_run_stream_count": self._last_run_stream_count,
        }