[ci skip] f1-stream: add stream health checker and HLS proxy (Phases 4-5)

Phase 4 - Stream Health and Fallback:
- StreamHealthChecker with partial GET validation of m3u8 content
- Bitrate extraction from BANDWIDTH tags
- Response time measurement for quality ranking
- Fallback ordering: live streams first, then fastest response time
- GET /streams now only returns health-verified streams

Phase 5 - HLS Proxy Core:
- GET /proxy?url= - m3u8 playlist fetch with full URI rewriting
- GET /relay?url= - chunked segment relay (never buffers full segment)
- m3u8 rewriter handles master, variant, and segment URIs
- Base64url encoding for URL parameters
- CORS middleware for browser playback
- Range header forwarding for seeking support
This commit is contained in:
Viktor Barzin 2026-02-23 23:41:16 +00:00
parent a9a4ac37a2
commit 6867036087
6 changed files with 926 additions and 20 deletions

View file

@ -15,6 +15,9 @@ class ExtractedStream:
title: str = "" # e.g., "F1 Race Live"
extracted_at: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
is_live: bool = False # Whether it passed health check
response_time_ms: int = 0 # Health check response time (lower = better)
checked_at: str = "" # ISO timestamp of last health check
bitrate: int = 0 # Bitrate in bps if detectable from m3u8 playlist
def to_dict(self) -> dict:
"""Serialize to a plain dictionary for JSON responses."""
@ -26,4 +29,7 @@ class ExtractedStream:
"title": self.title,
"extracted_at": self.extracted_at,
"is_live": self.is_live,
"response_time_ms": self.response_time_ms,
"checked_at": self.checked_at,
"bitrate": self.bitrate,
}

View file

@ -1,19 +1,25 @@
"""Extraction service - manages extraction lifecycle: polling, caching, serving."""
"""Extraction service - manages extraction lifecycle: polling, caching, health checking, serving."""
import logging
from datetime import datetime, timezone
from backend.extractors.models import ExtractedStream
from backend.extractors.registry import ExtractorRegistry
from backend.health import StreamHealthChecker
logger = logging.getLogger(__name__)
class ExtractionService:
"""Manages the extraction lifecycle: polling, caching, and serving results.
"""Manages the extraction lifecycle: polling, caching, health checking, and serving.
Extraction runs on a background schedule (via APScheduler), never on
client request path. Results are cached in memory, keyed by site_key.
client request path. After extraction, health checks verify each stream
is live. Results are cached in memory, keyed by site_key.
GET /streams only returns streams that passed health checks, sorted by:
1. is_live (live streams first)
2. response_time_ms (fastest first)
"""
def __init__(self, registry: ExtractorRegistry) -> None:
@ -22,18 +28,36 @@ class ExtractionService:
self._cache: dict[str, list[ExtractedStream]] = {}
self._last_run: str | None = None
self._last_run_stream_count: int = 0
self._health_checker = StreamHealthChecker()
async def run_extraction(self) -> None:
"""Run all extractors and cache their results.
"""Run all extractors, health-check results, and cache them.
This is called by the background scheduler. Each extractor's
results replace its previous cache entry entirely.
results replace its previous cache entry entirely. After extraction,
health checks are run to verify streams are live and measure
response times.
"""
logger.info("Starting extraction run...")
start = datetime.now(timezone.utc)
streams = await self._registry.extract_all()
# Run health checks on all extracted streams
if streams:
stream_dicts = [s.to_dict() for s in streams]
health_map = await self._health_checker.check_all(stream_dicts)
# Update stream objects with health check results
for stream in streams:
health = health_map.get(stream.url)
if health:
stream.is_live = health.is_live
stream.response_time_ms = health.response_time_ms
stream.checked_at = health.checked_at
if health.bitrate > 0:
stream.bitrate = health.bitrate
# Group streams by site_key and update cache
new_cache: dict[str, list[ExtractedStream]] = {}
for stream in streams:
@ -52,29 +76,68 @@ class ExtractionService:
self._last_run = start.isoformat()
self._last_run_stream_count = len(streams)
live_count = sum(
1 for streams_list in self._cache.values()
for s in streams_list if s.is_live
)
elapsed = (datetime.now(timezone.utc) - start).total_seconds()
logger.info(
"Extraction run complete: %d stream(s) from %d extractor(s) in %.1fs",
"Extraction run complete: %d stream(s) from %d extractor(s) in %.1fs (%d live)",
len(streams),
len(new_cache),
elapsed,
live_count,
)
def get_streams(self) -> list[dict]:
"""Return all cached streams as a flat list of dicts.
"""Return all cached streams as a sorted list of dicts.
Only returns streams that passed health checks (is_live=True).
Sorted by fallback priority:
1. is_live (live streams first) - filters to live only
2. response_time_ms (fastest first)
Returns:
List of serialized ExtractedStream dicts from all extractors.
List of serialized ExtractedStream dicts from all extractors,
filtered to live-only and sorted by response time.
"""
all_streams: list[dict] = []
all_streams: list[ExtractedStream] = []
for streams in self._cache.values():
all_streams.extend(s.to_dict() for s in streams)
return all_streams
all_streams.extend(streams)
# Sort by fallback priority: live first, then fastest response
all_streams.sort(
key=lambda s: (not s.is_live, s.response_time_ms)
)
# Only return live streams to clients
live_streams = [s for s in all_streams if s.is_live]
return [s.to_dict() for s in live_streams]
def get_all_streams_unfiltered(self) -> list[dict]:
    """Serialize every cached stream, whether or not it is marked live.

    Used for debugging and status endpoints. Streams from all cache
    entries are merged into one list and ordered by fallback priority:
    live streams ahead of non-live ones, and within each group the
    lowest ``response_time_ms`` first. Streams with equal priority keep
    the order in which they were found in the cache.

    Returns:
        One ``to_dict()`` result per cached stream, in priority order.
    """
    def fallback_rank(stream):
        # False sorts before True, so live streams (not is_live == False) lead.
        return (not stream.is_live, stream.response_time_ms)

    merged = [stream for group in self._cache.values() for stream in group]
    ordered = sorted(merged, key=fallback_rank)
    return [stream.to_dict() for stream in ordered]
def get_streams_for_session(self, session_type: str) -> list[dict]:
"""Return cached streams filtered/annotated for a specific session type.
Currently returns all streams (extractors don't yet differentiate by
Currently returns all live streams (extractors don't yet differentiate by
session type). This method exists as a hook for future filtering,
e.g., some extractors might only have race streams but not FP streams.
@ -82,7 +145,7 @@ class ExtractionService:
session_type: The F1 session type (e.g., "race", "qualifying", "fp1").
Returns:
List of serialized ExtractedStream dicts.
List of serialized ExtractedStream dicts (live only, sorted).
"""
# For now, all streams are potentially relevant to any session.
# Future extractors may tag streams with session types, at which
@ -103,19 +166,26 @@ class ExtractionService:
for info in extractor_list:
key = info["site_key"]
cached = self._cache.get(key, [])
live_count = sum(1 for s in cached if s.is_live)
extractor_statuses.append(
{
"site_key": key,
"site_name": info["site_name"],
"cached_streams": len(cached),
"live_streams": live_count,
}
)
total_cached = sum(len(streams) for streams in self._cache.values())
total_live = sum(
1 for streams in self._cache.values()
for s in streams if s.is_live
)
return {
"extractors": extractor_statuses,
"total_cached_streams": sum(
len(streams) for streams in self._cache.values()
),
"total_cached_streams": total_cached,
"total_live_streams": total_live,
"last_run": self._last_run,
"last_run_stream_count": self._last_run_stream_count,
}