infra/stacks/f1-stream/files/backend/extractors/service.py
Viktor Barzin d15337e838 [ci skip] f1-stream: add extractor framework with demo streams (Phase 3)
- BaseExtractor ABC with health_check method
- ExtractorRegistry with concurrent fan-out extraction
- ExtractionService with in-memory cache and background polling
- DemoExtractor with 3 public HLS test streams
- Adaptive polling: 5min during live sessions, 30min otherwise
- GET /streams, GET /extractors, POST /extract endpoints
2026-02-23 23:02:56 +00:00

121 lines
4.4 KiB
Python

"""Extraction service - manages extraction lifecycle: polling, caching, serving."""
import logging
from datetime import datetime, timezone
from backend.extractors.models import ExtractedStream
from backend.extractors.registry import ExtractorRegistry
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class ExtractionService:
    """Owns the extraction lifecycle: run extractors, cache results, serve them.

    Per the design notes for this service, extraction is driven by a
    background schedule (APScheduler) and is never triggered on the client
    request path. Read methods only ever look at the in-memory cache, which
    maps each ``site_key`` to the streams produced for it by the latest run.
    """

    def __init__(self, registry: ExtractorRegistry) -> None:
        """Create a service with an empty cache backed by *registry*.

        Args:
            registry: Registry used to run extractors and enumerate them.
        """
        self._registry = registry
        # site_key -> streams produced for that key by the most recent run.
        self._cache: dict[str, list[ExtractedStream]] = {}
        # ISO-8601 start time of the most recent run; None until one finishes.
        self._last_run: str | None = None
        # Total number of streams returned by the most recent run.
        self._last_run_stream_count: int = 0

    async def run_extraction(self) -> None:
        """Execute every extractor once and refresh the cache from the output.

        For each extractor listed by the registry, the cache entry is
        overwritten with the streams from this run, or removed when this
        run produced no streams for that extractor's ``site_key``.
        """
        logger.info("Starting extraction run...")
        started_at = datetime.now(timezone.utc)

        extracted = await self._registry.extract_all()

        # Bucket the flat result list by the site each stream came from.
        by_site: dict[str, list[ExtractedStream]] = {}
        for item in extracted:
            site_key = item.site_key
            bucket = by_site.get(site_key)
            if bucket is None:
                by_site[site_key] = [item]
            else:
                bucket.append(item)

        # Walk the registered extractors: overwrite entries that produced
        # streams, and drop entries whose extractor came back empty
        # (site went down, etc.).
        for entry in self._registry.list_extractors():
            site_key = entry["site_key"]
            fresh = by_site.get(site_key)
            if fresh is None:
                self._cache.pop(site_key, None)
            else:
                self._cache[site_key] = fresh

        self._last_run = started_at.isoformat()
        self._last_run_stream_count = len(extracted)

        duration = (datetime.now(timezone.utc) - started_at).total_seconds()
        logger.info(
            "Extraction run complete: %d stream(s) from %d extractor(s) in %.1fs",
            len(extracted),
            len(by_site),
            duration,
        )

    def get_streams(self) -> list[dict]:
        """Flatten the cache into a single list of serialized streams.

        Returns:
            One ``to_dict()`` result per cached stream, across all site keys.
        """
        return [
            stream.to_dict()
            for site_streams in self._cache.values()
            for stream in site_streams
        ]

    def get_streams_for_session(self, session_type: str) -> list[dict]:
        """Return the cached streams relevant to *session_type*.

        No filtering is applied at present: every cached stream is returned
        regardless of the session type, which is only used for the debug
        log line. The method is a hook for future per-session filtering.

        Args:
            session_type: The F1 session type (e.g., "race", "qualifying", "fp1").

        Returns:
            List of serialized stream dicts (currently all cached streams).
        """
        result = self.get_streams()
        logger.debug(
            "Returning %d stream(s) for session type '%s'",
            len(result),
            session_type,
        )
        return result

    def get_status(self) -> dict:
        """Summarize the service state for the /extractors endpoint.

        Returns:
            A dict with one status entry per registered extractor, the total
            number of cached streams, and the timestamp and stream count of
            the last extraction run.
        """
        per_extractor = [
            {
                "site_key": entry["site_key"],
                "site_name": entry["site_name"],
                # Extractors with no cache entry report zero streams.
                "cached_streams": len(self._cache.get(entry["site_key"], [])),
            }
            for entry in self._registry.list_extractors()
        ]
        total_cached = sum(
            len(site_streams) for site_streams in self._cache.values()
        )
        return {
            "extractors": per_extractor,
            "total_cached_streams": total_cached,
            "last_run": self._last_run,
            "last_run_stream_count": self._last_run_stream_count,
        }