infra/stacks/f1-stream/files/backend/extractors/timstreams.py
Viktor Barzin 147a8cff40 Restore f1-stream stack — undo accidental bundling into 63fe7d2b
Commit 63fe7d2b (fan-control) was made with a bare `git commit` in the
shared infra working tree and inadvertently swept in a parallel session's
staged f1-stream-extraction work (main.tf repoint, ~48 files/ removals,
ci-cd.md + .claude docs, two extraction plan docs).

This returns every f1-stream-related path to its pre-63fe7d2b state
(3493c347) so that extraction can be committed cleanly by its own
session. The fan-control files added in 63fe7d2b are untouched.

[ci skip]

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-05 09:19:12 +00:00

190 lines
6.9 KiB
Python

"""TimStreams extractor - fetches F1 streams from the TimStreams JSON API.
Returns embed URLs from hmembeds.one for iframe playback.
The public API at stra.viaplus.site/main requires no authentication
and returns all events/channels across Events, Replays, and 24/7 categories.
"""
import logging
from urllib.parse import urlsplit

import httpx

from backend.extractors.base import BaseExtractor
from backend.extractors.models import ExtractedStream
logger = logging.getLogger(__name__)
# Endpoint of the public TimStreams JSON API.
API_URL = "https://stra.viaplus.site/main"

# Browser-like User-Agent header sent with the API request.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)

# Substrings that mark a name as Formula 1 outright (names are lower-cased
# before comparison).
F1_KEYWORDS = {
    "f1",
    "formula 1",
    "formula one",
    "dazn f1",
    "sky sports f1",
}

# "grand prix" only counts as F1 when none of NON_F1_KEYWORDS is present.
GP_KEYWORD = "grand prix"

# Other motorsport series that also use "grand prix" in their event names.
NON_F1_KEYWORDS = {
    # Motorcycle racing
    "motogp",
    "moto gp",
    "moto2",
    "moto3",
    "motoe",
    "superbike",
    # US series
    "indycar",
    "indy car",
    "nascar",
    # Rally and endurance
    "rally",
    "wrc",
    "wec",
    "lemans",
    "le mans",
    # Touring cars
    "dtm",
    "supercars",
}

# hmembeds.one embed hashes of 24/7 channels that are always included,
# regardless of the event name they are listed under.
ALWAYS_INCLUDE_HASHES = {
    "888520f36cd94c5da4c71fddc1a5fc9b",  # Sky Sports F1
    "fc3a54634d0867b0c02ee3223292e7c6",  # DAZN F1
}
def _is_f1_event(name: str) -> bool:
    """Check if an event/channel is Formula 1 related by name.

    Matching is case-insensitive and substring-based. The TimStreams API
    genre field (genre=2) covers ALL sports channels, not just motorsport,
    so the name is the only signal used.

    Args:
        name: Event or channel name as reported by the API. The value comes
            from third-party JSON, so a missing/``null`` or otherwise
            non-string name is tolerated and treated as "not F1".

    Returns:
        True when the name contains a direct F1 keyword, or contains
        "grand prix" without any non-F1 series keyword; False otherwise.
    """
    # Guard first: a JSON null arrives here as None and would otherwise
    # raise AttributeError on .lower().
    if not isinstance(name, str) or not name:
        return False

    lower = name.lower()

    # Direct F1 keyword match
    if any(kw in lower for kw in F1_KEYWORDS):
        return True

    # "Grand prix" counts only when no competing series is named
    return GP_KEYWORD in lower and not any(kw in lower for kw in NON_F1_KEYWORDS)
def _extract_embed_hash(url: str) -> str | None:
"""Extract the hash from an hmembeds.one embed URL.
Expected format: https://hmembeds.one/embed/{hash}
Returns the hash string, or None if the URL is not in the expected format.
"""
if not url:
return None
# Handle both with and without trailing slash
url = url.rstrip("/")
prefix = "https://hmembeds.one/embed/"
alt_prefix = "http://hmembeds.one/embed/"
if url.startswith(prefix):
return url[len(prefix):] or None
if url.startswith(alt_prefix):
return url[len(alt_prefix):] or None
return None
def _is_always_include(url: str) -> bool:
    """Tell whether *url* embeds one of the always-include 24/7 channels.

    The embed hash is pulled out of the URL with ``_extract_embed_hash`` and
    looked up in ``ALWAYS_INCLUDE_HASHES``; a URL with no extractable hash is
    never an always-include channel.
    """
    channel_hash = _extract_embed_hash(url)
    if not channel_hash:
        return False
    return channel_hash in ALWAYS_INCLUDE_HASHES
class TimStreamsExtractor(BaseExtractor):
    """Extracts embed URLs from TimStreams' public JSON API.

    The API at stra.viaplus.site/main returns a JSON array of categories,
    each containing events with stream URLs pointing to hmembeds.one embeds.

    The payload is third-party data, so malformed categories, events or
    streams are skipped one by one instead of aborting the whole extraction.
    """

    @property
    def site_key(self) -> str:
        """Identifier stamped on every stream produced by this extractor."""
        return "timstreams"

    @property
    def site_name(self) -> str:
        """Display name stamped on every stream produced by this extractor."""
        return "TimStreams"

    async def extract(self) -> list[ExtractedStream]:
        """Fetch F1 events/channels and return embed URLs for iframe playback.

        Returns:
            One ExtractedStream per unique stream URL belonging to an event
            that is F1-related by name or that carries an always-include
            channel. An empty list is returned for a non-200 response or a
            payload that is not a JSON array. Timeouts and other errors
            derived from Exception are logged rather than propagated, and
            whatever was collected before the failure is returned.
        """
        streams: list[ExtractedStream] = []
        seen_urls: set[str] = set()

        try:
            async with httpx.AsyncClient(
                timeout=15.0,
                follow_redirects=True,
                headers={"User-Agent": USER_AGENT, "Accept": "application/json"},
            ) as client:
                resp = await client.get(API_URL)
                if resp.status_code != 200:
                    logger.warning(
                        "[timstreams] API returned HTTP %d", resp.status_code
                    )
                    return []
                data = resp.json()

            if not isinstance(data, list):
                logger.warning(
                    "[timstreams] Unexpected API response type: %s",
                    type(data).__name__,
                )
                return []

            logger.info("[timstreams] API returned %d categorie(s)", len(data))

            for category in data:
                # Skip anything that is not a JSON object instead of letting
                # one bad entry abort the remaining categories.
                if not isinstance(category, dict):
                    continue
                category_name = category.get("category", "Unknown")
                events = category.get("events", [])
                if not isinstance(events, list):
                    continue
                for event in events:
                    streams.extend(
                        self._streams_for_event(category_name, event, seen_urls)
                    )
        except httpx.TimeoutException:
            logger.warning("[timstreams] API request timed out")
        except Exception:
            logger.exception("[timstreams] Failed to fetch from API")

        logger.info("[timstreams] Extracted %d stream(s)", len(streams))
        return streams

    def _streams_for_event(
        self,
        category_name: object,
        event: object,
        seen_urls: set[str],
    ) -> list[ExtractedStream]:
        """Build the streams for one API event; return [] if it is filtered out.

        Args:
            category_name: Value of the enclosing category's "category" field;
                used as a "[...]" title prefix when truthy.
            event: Raw event entry from the API. Non-dict entries are ignored.
            seen_urls: URLs already emitted during this extraction. Updated in
                place so duplicates are dropped across events and categories.
        """
        if not isinstance(event, dict):
            return []

        event_name = event.get("name", "Unknown")
        if not isinstance(event_name, str):
            # A JSON null (or other non-string) name is treated like a
            # missing one.
            event_name = "Unknown"

        raw_streams = event.get("streams", [])
        if not isinstance(raw_streams, list):
            return []

        # Keep only well-formed (name, url) pairs with a non-empty string URL.
        candidates: list[tuple[str, str]] = []
        for info in raw_streams:
            if not isinstance(info, dict):
                continue
            stream_url = info.get("url", "")
            if not isinstance(stream_url, str) or not stream_url:
                continue
            stream_name = info.get("name", "")
            if not isinstance(stream_name, str):
                stream_name = ""
            candidates.append((stream_name, stream_url))
        if not candidates:
            return []

        # Filter: must be F1-related or carry an always-include channel
        always_include = any(_is_always_include(url) for _, url in candidates)
        if not always_include and not _is_f1_event(event_name):
            return []

        results: list[ExtractedStream] = []
        for stream_name, stream_url in candidates:
            # Deduplicate by URL
            if stream_url in seen_urls:
                continue
            seen_urls.add(stream_url)

            # Build a descriptive title
            title = event_name
            if stream_name and stream_name.lower() != event_name.lower():
                title = f"{event_name} - {stream_name}"
            if category_name:
                title = f"[{category_name}] {title}"

            results.append(
                ExtractedStream(
                    url=stream_url,
                    site_key=self.site_key,
                    site_name=self.site_name,
                    quality="",
                    title=title,
                    stream_type="embed",
                    embed_url=stream_url,
                )
            )
        return results