infra/stacks/f1-stream/files/backend/extractors/registry.py
Viktor Barzin d15337e838 [ci skip] f1-stream: add extractor framework with demo streams (Phase 3)
- BaseExtractor ABC with health_check method
- ExtractorRegistry with concurrent fan-out extraction
- ExtractionService with in-memory cache and background polling
- DemoExtractor with 3 public HLS test streams
- Adaptive polling: 5min during live sessions, 30min otherwise
- GET /streams, GET /extractors, POST /extract endpoints
2026-02-23 23:02:56 +00:00

116 lines
4 KiB
Python

"""Central registry for stream extractors."""
import asyncio
import logging
from datetime import datetime, timezone
from backend.extractors.base import BaseExtractor
from backend.extractors.models import ExtractedStream
logger = logging.getLogger(__name__)
class ExtractorRegistry:
"""Central registry for all site extractors.
Manages extractor instances and provides fan-out extraction across
all registered extractors with independent error handling.
"""
def __init__(self) -> None:
self._extractors: dict[str, BaseExtractor] = {}
def register(self, extractor: BaseExtractor) -> None:
"""Register an extractor instance.
Args:
extractor: A BaseExtractor subclass instance.
Raises:
ValueError: If an extractor with the same site_key is already registered.
"""
key = extractor.site_key
if key in self._extractors:
raise ValueError(
f"Extractor with site_key '{key}' is already registered "
f"(existing: {self._extractors[key].site_name}, "
f"new: {extractor.site_name})"
)
self._extractors[key] = extractor
logger.info("Registered extractor: %s (%s)", extractor.site_name, key)
def get(self, site_key: str) -> BaseExtractor | None:
"""Get an extractor by its site_key.
Args:
site_key: The unique identifier of the extractor.
Returns:
The extractor instance, or None if not found.
"""
return self._extractors.get(site_key)
def list_extractors(self) -> list[dict]:
"""List all registered extractors.
Returns:
A list of dicts with site_key and site_name for each extractor.
"""
return [
{"site_key": ext.site_key, "site_name": ext.site_name}
for ext in self._extractors.values()
]
async def extract_all(self) -> list[ExtractedStream]:
"""Fan-out extraction to all registered extractors concurrently.
Each extractor runs independently. If one fails, the others
continue and their results are still collected.
Returns:
Combined list of ExtractedStream from all extractors.
"""
if not self._extractors:
logger.warning("No extractors registered, nothing to extract")
return []
logger.info(
"Running extraction across %d extractor(s): %s",
len(self._extractors),
", ".join(self._extractors.keys()),
)
async def _safe_extract(extractor: BaseExtractor) -> list[ExtractedStream]:
"""Run a single extractor with error isolation."""
try:
streams = await extractor.extract()
# Fill in site_key/site_name if the extractor didn't set them
now = datetime.now(timezone.utc).isoformat()
for stream in streams:
if not stream.site_key:
stream.site_key = extractor.site_key
if not stream.site_name:
stream.site_name = extractor.site_name
if not stream.extracted_at:
stream.extracted_at = now
logger.info(
"[%s] Extracted %d stream(s)", extractor.site_key, len(streams)
)
return streams
except Exception:
logger.exception(
"[%s] Extractor failed during extraction", extractor.site_key
)
return []
# Run all extractors concurrently
tasks = [_safe_extract(ext) for ext in self._extractors.values()]
results = await asyncio.gather(*tasks)
# Flatten results
all_streams: list[ExtractedStream] = []
for stream_list in results:
all_streams.extend(stream_list)
logger.info("Extraction complete: %d total stream(s) found", len(all_streams))
return all_streams