[ci skip] f1-stream: add extractor framework with demo streams (Phase 3)

- BaseExtractor ABC with health_check method
- ExtractorRegistry with concurrent fan-out extraction
- ExtractionService with in-memory cache and background polling
- DemoExtractor with 3 public HLS test streams
- Adaptive polling: 5min during live sessions, 30min otherwise
- GET /streams, GET /extractors, POST /extract endpoints
This commit is contained in:
Viktor Barzin 2026-02-23 23:02:56 +00:00
parent 461e355a5d
commit d15337e838
8 changed files with 608 additions and 5 deletions

View file

@ -0,0 +1,49 @@
"""Stream extraction framework.
To add a new extractor:
1. Create a new file in this package (e.g., my_site.py)
2. Subclass BaseExtractor from backend.extractors.base
3. Implement site_key, site_name, and extract()
4. Import and register it in this file's create_registry() function
Example:
from backend.extractors.my_site import MySiteExtractor
registry.register(MySiteExtractor())
"""
from backend.extractors.demo import DemoExtractor
from backend.extractors.models import ExtractedStream
from backend.extractors.registry import ExtractorRegistry
from backend.extractors.service import ExtractionService
# Names exported by `from backend.extractors import *`: the stream model,
# the registry and service classes, and the two factory functions below.
__all__ = [
    "ExtractedStream",
    "ExtractorRegistry",
    "ExtractionService",
    "create_registry",
    "create_extraction_service",
]
def create_registry() -> ExtractorRegistry:
    """Build an ExtractorRegistry containing every known extractor.

    To add a new extractor, import it at the top of this module and add an
    instance to the ``known_extractors`` tuple below.

    Returns:
        A registry with all known extractors registered.
    """
    registry = ExtractorRegistry()

    # --- List extractors below ---
    known_extractors = (
        DemoExtractor(),
        # MySiteExtractor(),  # Add new extractors here
    )
    for extractor in known_extractors:
        registry.register(extractor)

    return registry
def create_extraction_service() -> ExtractionService:
    """Build an ExtractionService backed by a fully populated registry.

    Main entry point of the extraction framework; intended to be called
    once during app startup.

    Returns:
        An ExtractionService wrapping the registry from create_registry().
    """
    return ExtractionService(create_registry())

View file

@ -0,0 +1,118 @@
"""Base class for all site-specific stream extractors."""
import logging
from abc import ABC, abstractmethod
import httpx
from backend.extractors.models import ExtractedStream
logger = logging.getLogger(__name__)
class BaseExtractor(ABC):
    """Common interface implemented by every site-specific stream extractor.

    Adding a new extractor:
    1. Add a module under backend/extractors/
    2. Derive a class from BaseExtractor
    3. Provide site_key, site_name, and extract()
    4. Register an instance in backend/extractors/__init__.py
    """

    @property
    @abstractmethod
    def site_key(self) -> str:
        """Unique identifier of the site (e.g., 'sportsurge').

        Expected to be lowercase and alphanumeric, with hyphens or
        underscores only. Serves as the cache key and appears in API
        responses.
        """

    @property
    @abstractmethod
    def site_name(self) -> str:
        """Human-readable site name (e.g., 'SportSurge').

        Shown in the UI and in API responses.
        """

    @abstractmethod
    async def extract(self) -> list[ExtractedStream]:
        """Discover stream URLs on this site.

        Returns one ExtractedStream per discovered URL. Implementations
        should fill in url, quality, and title; site_key, site_name, and
        extracted_at are back-filled by the registry when left empty.

        Guidelines for implementations:
        - Use httpx for HTTP requests
        - Handle errors internally (log them and return an empty list)
        - Set quality whenever the source reveals it
        - Give each stream a descriptive title
        """

    async def health_check(self, url: str) -> bool:
        """Probe a URL with a HEAD request to judge whether it is a stream.

        Args:
            url: The stream URL to probe.

        Returns:
            False when the response status is not 200, when the
            content-type is present, matches none of the media markers and
            contains "text/" or "html", or when the request times out or
            raises. True otherwise, including when the content-type header
            is missing.
        """
        # Substrings of content-type values that m3u8/media responses
        # typically carry.
        media_markers = (
            "application/vnd.apple.mpegurl",
            "application/x-mpegurl",
            "video/",
            "audio/",
            "octet-stream",
        )
        try:
            async with httpx.AsyncClient(
                timeout=10.0,
                follow_redirects=True,
                headers={"User-Agent": "Mozilla/5.0"},
            ) as client:
                response = await client.head(url)
                status = response.status_code
                if status != 200:
                    logger.debug(
                        "[%s] Health check failed for %s: HTTP %d",
                        self.site_key,
                        url,
                        status,
                    )
                    return False

                content_type = response.headers.get("content-type", "").lower()
                looks_like_media = any(
                    marker in content_type for marker in media_markers
                )
                looks_like_text = "text/" in content_type or "html" in content_type

                # Some servers omit or genericize content-type on HEAD, so
                # only an explicit text/HTML type (with no media marker)
                # counts as a failure; anything else is accepted.
                if content_type and not looks_like_media and looks_like_text:
                    logger.debug(
                        "[%s] Health check suspect for %s: content-type=%s",
                        self.site_key,
                        url,
                        content_type,
                    )
                    return False

                return True
        except httpx.TimeoutException:
            # Must precede HTTPError: timeouts are a subclass of it.
            logger.debug("[%s] Health check timed out for %s", self.site_key, url)
            return False
        except httpx.HTTPError as exc:
            logger.debug("[%s] Health check error for %s: %s", self.site_key, url, exc)
            return False
        except Exception:
            logger.exception("[%s] Unexpected error during health check for %s", self.site_key, url)
            return False

View file

@ -0,0 +1,75 @@
"""Demo extractor - returns hardcoded test streams for framework testing.
This extractor exists purely for testing the extraction pipeline end-to-end.
It does NOT connect to any real streaming site. Disable it in production by
removing its registration from __init__.py or setting DEMO_EXTRACTOR_ENABLED=false.
"""
import logging
import os
from backend.extractors.base import BaseExtractor
from backend.extractors.models import ExtractedStream
logger = logging.getLogger(__name__)
# Feature flag for the demo extractor. Read once, at import time, from the
# DEMO_EXTRACTOR_ENABLED environment variable (enabled when unset).
# Only "true", "1" or "yes" (case-insensitive) enable it; any other value,
# e.g. DEMO_EXTRACTOR_ENABLED=false, disables this extractor.
DEMO_ENABLED = os.getenv("DEMO_EXTRACTOR_ENABLED", "true").lower() in ("true", "1", "yes")
class DemoExtractor(BaseExtractor):
    """Demo extractor that returns hardcoded test streams.

    Use this to verify the extraction framework works end-to-end without
    needing a real streaming site. The streams are publicly available HLS
    test streams from Apple and others.
    """

    @property
    def site_key(self) -> str:
        """Return the fixed identifier of the demo extractor."""
        return "demo"

    @property
    def site_name(self) -> str:
        """Return the display name of the demo extractor."""
        return "Demo (Test Streams)"

    async def extract(self) -> list[ExtractedStream]:
        """Return hardcoded test streams for framework testing.

        Returns:
            An empty list when DEMO_ENABLED is false; otherwise the three
            demo streams, each with is_live set from its health check.
        """
        # Local import: this module does not import asyncio at file level.
        import asyncio

        if not DEMO_ENABLED:
            logger.info("[demo] Demo extractor is disabled via DEMO_EXTRACTOR_ENABLED")
            return []

        logger.info("[demo] Returning demo test streams")
        streams = [
            ExtractedStream(
                url="https://test-streams.mux.dev/x36xhzz/x36xhzz.m3u8",
                site_key=self.site_key,
                site_name=self.site_name,
                quality="720p",
                title="Big Buck Bunny (Test Stream)",
                is_live=False,
            ),
            ExtractedStream(
                url="https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/bipbop_16x9_variant.m3u8",
                site_key=self.site_key,
                site_name=self.site_name,
                quality="1080p",
                title="Apple Bipbop (Test Stream)",
                is_live=False,
            ),
            ExtractedStream(
                url="https://cph-p2p-msl.akamaized.net/hls/live/2000341/test/master.m3u8",
                site_key=self.site_key,
                site_name=self.site_name,
                quality="",
                title="Akamai Live Test Stream",
                is_live=False,
            ),
        ]

        # Probe all demo streams concurrently so one slow or unreachable
        # host does not add its full timeout to the others. health_check
        # handles its own errors and returns False, so gather() does not
        # need return_exceptions here.
        results = await asyncio.gather(
            *(self.health_check(stream.url) for stream in streams)
        )
        for stream, is_live in zip(streams, results):
            stream.is_live = is_live

        return streams

View file

@ -0,0 +1,29 @@
"""Data models for the stream extraction framework."""
from dataclasses import dataclass, field
from datetime import datetime, timezone
@dataclass
class ExtractedStream:
    """Represents a single stream URL discovered by an extractor.

    Only ``url`` is required. ``site_key`` and ``site_name`` default to an
    empty string so an extractor can leave them unset; the registry
    back-fills empty values from the extractor that produced the stream.
    ``extracted_at`` defaults to the current UTC time in ISO 8601 format.
    """

    url: str  # The HLS/m3u8 URL
    site_key: str = ""  # Which extractor found it (empty = fill in later)
    site_name: str = ""  # Human-readable name (empty = fill in later)
    quality: str = ""  # e.g., "720p", "1080p", or empty
    title: str = ""  # e.g., "F1 Race Live"
    extracted_at: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
    is_live: bool = False  # Whether it passed health check

    def to_dict(self) -> dict:
        """Serialize to a plain dictionary for JSON responses.

        Returns:
            A dict with one entry per field, in declaration order.
        """
        return {
            "url": self.url,
            "site_key": self.site_key,
            "site_name": self.site_name,
            "quality": self.quality,
            "title": self.title,
            "extracted_at": self.extracted_at,
            "is_live": self.is_live,
        }

View file

@ -0,0 +1,116 @@
"""Central registry for stream extractors."""
import asyncio
import logging
from datetime import datetime, timezone
from backend.extractors.base import BaseExtractor
from backend.extractors.models import ExtractedStream
logger = logging.getLogger(__name__)
class ExtractorRegistry:
    """Holds every site extractor and runs them as a group.

    Extractors are stored by site_key. extract_all() fans out to all of
    them concurrently, isolating failures so one broken extractor does not
    affect the others.
    """

    def __init__(self) -> None:
        # site_key -> extractor instance
        self._extractors: dict[str, BaseExtractor] = {}

    def register(self, extractor: BaseExtractor) -> None:
        """Add an extractor instance to the registry.

        Args:
            extractor: A BaseExtractor subclass instance.

        Raises:
            ValueError: If an extractor with the same site_key is already registered.
        """
        key = extractor.site_key
        existing = self._extractors.get(key)
        if existing is not None:
            raise ValueError(
                f"Extractor with site_key '{key}' is already registered "
                f"(existing: {existing.site_name}, "
                f"new: {extractor.site_name})"
            )
        self._extractors[key] = extractor
        logger.info("Registered extractor: %s (%s)", extractor.site_name, key)

    def get(self, site_key: str) -> BaseExtractor | None:
        """Look up an extractor by site_key.

        Args:
            site_key: The unique identifier of the extractor.

        Returns:
            The matching extractor instance, or None when no extractor is
            registered under that key.
        """
        return self._extractors.get(site_key)

    def list_extractors(self) -> list[dict]:
        """Describe every registered extractor.

        Returns:
            One dict per extractor with its site_key and site_name.
        """
        summaries: list[dict] = []
        for ext in self._extractors.values():
            summaries.append({"site_key": ext.site_key, "site_name": ext.site_name})
        return summaries

    async def extract_all(self) -> list[ExtractedStream]:
        """Run every registered extractor concurrently and merge the results.

        Extractors are independent: an exception in one is logged and
        treated as an empty result, while the rest still contribute.

        Returns:
            A flat list of ExtractedStream from all extractors.
        """
        if not self._extractors:
            logger.warning("No extractors registered, nothing to extract")
            return []

        logger.info(
            "Running extraction across %d extractor(s): %s",
            len(self._extractors),
            ", ".join(self._extractors.keys()),
        )

        async def _safe_extract(extractor: BaseExtractor) -> list[ExtractedStream]:
            """Run one extractor, converting any failure into an empty list."""
            try:
                found = await extractor.extract()
                # Back-fill whatever identifying fields the extractor left empty.
                timestamp = datetime.now(timezone.utc).isoformat()
                for item in found:
                    if not item.site_key:
                        item.site_key = extractor.site_key
                    if not item.site_name:
                        item.site_name = extractor.site_name
                    if not item.extracted_at:
                        item.extracted_at = timestamp
                logger.info(
                    "[%s] Extracted %d stream(s)", extractor.site_key, len(found)
                )
                return found
            except Exception:
                logger.exception(
                    "[%s] Extractor failed during extraction", extractor.site_key
                )
                return []

        # Concurrent fan-out; gather() preserves registration order.
        per_extractor = await asyncio.gather(
            *[_safe_extract(ext) for ext in self._extractors.values()]
        )

        # Merge the per-extractor lists into one flat list.
        all_streams: list[ExtractedStream] = [
            stream for batch in per_extractor for stream in batch
        ]

        logger.info("Extraction complete: %d total stream(s) found", len(all_streams))
        return all_streams

View file

@ -0,0 +1,121 @@
"""Extraction service - manages extraction lifecycle: polling, caching, serving."""
import logging
from datetime import datetime, timezone
from backend.extractors.models import ExtractedStream
from backend.extractors.registry import ExtractorRegistry
logger = logging.getLogger(__name__)
class ExtractionService:
    """Owns the extraction lifecycle: polling, caching, and serving results.

    Extraction is meant to run on a background schedule (via APScheduler)
    rather than on the client request path. Results live in an in-memory
    cache keyed by site_key.
    """

    def __init__(self, registry: ExtractorRegistry) -> None:
        self._registry = registry
        # site_key -> streams from that extractor's latest successful run
        self._cache: dict[str, list[ExtractedStream]] = {}
        # ISO timestamp of the most recent run's start; None before any run
        self._last_run: str | None = None
        self._last_run_stream_count: int = 0

    async def run_extraction(self) -> None:
        """Run all extractors and refresh the cache from their results.

        Called by the background scheduler. For each registered extractor
        the previous cache entry is replaced outright, or removed when the
        extractor produced no streams this run.
        """
        logger.info("Starting extraction run...")
        started_at = datetime.now(timezone.utc)

        streams = await self._registry.extract_all()

        # Bucket the flat result list by the extractor that produced it.
        grouped: dict[str, list[ExtractedStream]] = {}
        for stream in streams:
            if stream.site_key not in grouped:
                grouped[stream.site_key] = []
            grouped[stream.site_key].append(stream)

        # Walk the registered extractors: store fresh results, and drop the
        # entry of any extractor that returned nothing (site went down, etc.)
        for info in self._registry.list_extractors():
            site_key = info["site_key"]
            fresh = grouped.get(site_key)
            if fresh is None:
                self._cache.pop(site_key, None)
            else:
                self._cache[site_key] = fresh

        self._last_run = started_at.isoformat()
        self._last_run_stream_count = len(streams)

        duration = (datetime.now(timezone.utc) - started_at).total_seconds()
        logger.info(
            "Extraction run complete: %d stream(s) from %d extractor(s) in %.1fs",
            len(streams),
            len(grouped),
            duration,
        )

    def get_streams(self) -> list[dict]:
        """Return every cached stream as a flat list of dicts.

        Returns:
            Serialized ExtractedStream dicts from all extractors.
        """
        return [
            stream.to_dict()
            for site_streams in self._cache.values()
            for stream in site_streams
        ]

    def get_streams_for_session(self, session_type: str) -> list[dict]:
        """Return cached streams for a given session type.

        No filtering happens yet: every cached stream is returned
        regardless of session_type. The method is a hook for future
        filtering, e.g. extractors that only carry race streams.

        Args:
            session_type: The F1 session type (e.g., "race", "qualifying", "fp1").

        Returns:
            List of serialized ExtractedStream dicts.
        """
        # Every stream is currently treated as relevant to any session;
        # the session type is only used for the debug log below.
        result = self.get_streams()
        logger.debug(
            "Returning %d stream(s) for session type '%s'",
            len(result),
            session_type,
        )
        return result

    def get_status(self) -> dict:
        """Return extraction service status for the /extractors endpoint.

        Returns:
            A dict with per-extractor cached stream counts, the total
            number of cached streams, and the timestamp and stream count
            of the last run.
        """
        per_extractor = [
            {
                "site_key": info["site_key"],
                "site_name": info["site_name"],
                "cached_streams": len(self._cache.get(info["site_key"], [])),
            }
            for info in self._registry.list_extractors()
        ]
        total_cached = sum(len(entries) for entries in self._cache.values())

        return {
            "extractors": per_extractor,
            "total_cached_streams": total_cached,
            "last_run": self._last_run,
            "last_run_stream_count": self._last_run_stream_count,
        }