Cuts the stream list from 23 mostly-broken entries to ~6 confirmed-playable ones, and adds an iframe-stripping proxy so embed sources (hmembeds, etc.) load through our origin without X-Frame-Options / CSP / JS frame-buster blocks. Why: the previous list was dominated by Discord-shared news article URLs, hardcoded aggregator landing pages, and other non-stream URLs that all sat at is_live=true because embed streams skipped the health check entirely. Users could not tell which links would actually play. What: - backend/playback_verifier.py: new headless-Chromium verifier (Playwright) that polls each candidate stream for a codec-independent "playable" signal (hls.js MANIFEST_PARSED for m3u8; <video>/player div for embed). Replaces the unconditional is_live=True for embed streams in service.py. - backend/embed_proxy.py: new /embed and /embed-asset routes that fetch upstream embed pages, strip X-Frame-Options/CSP/Set-Cookie, and inject a <base href> + frame-buster-defeat <script> that locks down window.top, document.referrer, console.clear/table, and window.location so the hmembeds disable-devtool.js redirect-to-google trap can't fire. - extractors/curated.py: new always-on extractor with two known-good 24/7 hmembeds embeds (Sky Sports F1, DAZN F1) so the list isn't empty between race weekends. - extractors/__init__.py: register CuratedExtractor first; drop FallbackExtractor (its 10 aggregator landing-pages can't iframe-play). - extractors/discord_source.py: positive-match path filter (must look like /embed/, /stream, /watch, /live, /player, *.m3u8, *.php) plus expanded domain blocklist for news sites — was 10 noise URLs, now ~1. - extractors/service.py: run_extraction now health-checks AND verifier-checks both stream types; only verified-playable streams reach is_live. - main.py: register /embed + /embed-asset routes; defer initial extraction by 8s so the verifier can reach the local /embed proxy on 127.0.0.1:8000.
- frontend/lib/api.js + watch/+page.svelte: route embed iframes through /embed proxy instead of the upstream URL, so X-Frame-Options/CSP can't block them. - Dockerfile: install Playwright chromium + system codec-runtime libs. - main.tf: bump pod memory 256Mi → 1Gi for chromium. Verified end-to-end with Playwright against https://f1.viktorbarzin.me/watch — 6/6 streams reach a player UI; the 3 demo m3u8s actually play (codec-bearing browser); the 3 embeds (Sky Sports F1, DAZN F1, sportsurge) render iframes through the proxy. Image: viktorbarzin/f1-stream:v6.0.5 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
190 lines
6.9 KiB
Python
190 lines
6.9 KiB
Python
"""TimStreams extractor - fetches F1 streams from the TimStreams JSON API.

Returns embed URLs from hmembeds.one for iframe playback.
The public API at stra.viaplus.site/main requires no authentication
and returns all events/channels across Events, Replays, and 24/7 categories.
"""
|
|
|
|
import logging
from urllib.parse import urlsplit

import httpx

from backend.extractors.base import BaseExtractor
from backend.extractors.models import ExtractedStream
|
|
|
|
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# TimStreams JSON endpoint fetched by the extractor.
API_URL = "https://stra.viaplus.site/main"

# Desktop Chrome User-Agent string sent with the API request.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    + "AppleWebKit/537.36 (KHTML, like Gecko) "
    + "Chrome/120.0.0.0 Safari/537.36"
)
|
|
|
|
# Direct F1 keyword matches (case-insensitive)
F1_KEYWORDS = {"formula 1", "formula one", "f1", "sky sports f1", "dazn f1"}
# "Grand prix" is F1-related only if non-F1 motorsport keywords are absent
GP_KEYWORD = "grand prix"
# Exclude these motorsport series when matching on "grand prix"
NON_F1_KEYWORDS = {
    "motogp", "moto gp", "moto2", "moto3", "motoe",
    "indycar", "indy car", "nascar",
    "rally", "wrc", "wec", "lemans", "le mans",
    "superbike", "dtm", "supercars",
}

# 24/7 channels that should always be included (embed hashes on hmembeds.one)
ALWAYS_INCLUDE_HASHES = {
    "888520f36cd94c5da4c71fddc1a5fc9b",  # Sky Sports F1
    "fc3a54634d0867b0c02ee3223292e7c6",  # DAZN F1
}


def _is_f1_event(name: str) -> bool:
    """Check if an event/channel is Formula 1 related by name.

    Matching is a case-insensitive substring test against the lower-cased
    name.

    Args:
        name: Event or channel name as delivered by the API.

    Returns:
        True when the name contains a direct F1 keyword, or contains
        "grand prix" without any non-F1 series keyword. False otherwise,
        including when ``name`` is not a string (e.g. a JSON ``null``).

    Note: The TimStreams API genre field (genre=2) covers ALL sports channels,
    not just motorsport, so we rely solely on name-based matching.
    """
    # The name comes straight from decoded JSON, so it may be None or another
    # non-string value; treat that as "not F1" rather than raising.
    if not isinstance(name, str):
        return False

    lower = name.lower()

    # Direct F1 keyword match
    if any(kw in lower for kw in F1_KEYWORDS):
        return True

    # Grand prix without competing series
    return GP_KEYWORD in lower and not any(kw in lower for kw in NON_F1_KEYWORDS)
|
|
|
|
|
|
def _extract_embed_hash(url: str) -> str | None:
|
|
"""Extract the hash from an hmembeds.one embed URL.
|
|
|
|
Expected format: https://hmembeds.one/embed/{hash}
|
|
Returns the hash string, or None if the URL is not in the expected format.
|
|
"""
|
|
if not url:
|
|
return None
|
|
# Handle both with and without trailing slash
|
|
url = url.rstrip("/")
|
|
prefix = "https://hmembeds.one/embed/"
|
|
alt_prefix = "http://hmembeds.one/embed/"
|
|
if url.startswith(prefix):
|
|
return url[len(prefix):] or None
|
|
if url.startswith(alt_prefix):
|
|
return url[len(alt_prefix):] or None
|
|
return None
|
|
|
|
|
|
def _is_always_include(url: str) -> bool:
    """Report whether ``url`` points at one of the always-include 24/7 channels.

    The embed hash is extracted from the URL and looked up in
    ``ALWAYS_INCLUDE_HASHES``; a URL that yields no hash is never included.
    """
    channel_hash = _extract_embed_hash(url)
    if not channel_hash:
        return False
    return channel_hash in ALWAYS_INCLUDE_HASHES
|
|
|
|
|
|
class TimStreamsExtractor(BaseExtractor):
    """Extracts embed URLs from TimStreams' public JSON API.

    The API at stra.viaplus.site/main returns a JSON array of categories,
    each containing events with stream URLs pointing to hmembeds.one embeds.

    Malformed records (non-object categories, events or streams; non-string
    names or URLs) are skipped one by one, so a single bad entry does not
    discard the rest of the payload.
    """

    @property
    def site_key(self) -> str:
        """Key identifying this extractor."""
        return "timstreams"

    @property
    def site_name(self) -> str:
        """Display name of this extractor."""
        return "TimStreams"

    async def extract(self) -> list[ExtractedStream]:
        """Fetch F1 events/channels and return embed URLs for iframe playback.

        Returns:
            One ``ExtractedStream`` per unique stream URL belonging to an
            F1-related event or an always-include channel. An empty list is
            returned on a non-200 response or a non-list payload. On a
            timeout or any other error, whatever was collected before the
            failure is returned and the error is logged.
        """
        streams: list[ExtractedStream] = []
        seen_urls: set[str] = set()

        try:
            async with httpx.AsyncClient(
                timeout=15.0,
                follow_redirects=True,
                headers={"User-Agent": USER_AGENT, "Accept": "application/json"},
            ) as client:
                resp = await client.get(API_URL)

            if resp.status_code != 200:
                logger.warning(
                    "[timstreams] API returned HTTP %d", resp.status_code
                )
                return []

            data = resp.json()
            if not isinstance(data, list):
                logger.warning(
                    "[timstreams] Unexpected API response type: %s",
                    type(data).__name__,
                )
                return []

            logger.info("[timstreams] API returned %d categorie(s)", len(data))

            for category in data:
                # Skip anything that is not a JSON object instead of letting
                # an AttributeError abort the remaining categories.
                if not isinstance(category, dict):
                    continue

                category_name = category.get("category", "Unknown")
                events = category.get("events", [])
                if not isinstance(events, list):
                    continue

                for event in events:
                    if not isinstance(event, dict):
                        continue
                    streams.extend(
                        self._streams_for_event(event, category_name, seen_urls)
                    )

        except httpx.TimeoutException:
            logger.warning("[timstreams] API request timed out")
        except Exception:
            # Top-level boundary: a failing source must not break the caller.
            logger.exception("[timstreams] Failed to fetch from API")

        logger.info("[timstreams] Extracted %d stream(s)", len(streams))
        return streams

    def _streams_for_event(
        self,
        event: dict,
        category_name: object,
        seen_urls: set[str],
    ) -> list[ExtractedStream]:
        """Build the streams of one API event, skipping already-seen URLs.

        Args:
            event: Decoded JSON object describing one event/channel.
            category_name: Value of the enclosing category's "category"
                field; used as a title prefix when truthy.
            seen_urls: URLs already emitted during this extraction; updated
                in place with every URL emitted here.

        Returns:
            Streams for this event, or an empty list when the event has no
            usable streams or is neither F1-related nor always-include.
        """
        event_name = event.get("name", "Unknown")
        if not isinstance(event_name, str):
            # A null/non-string name would otherwise crash on .lower().
            event_name = "Unknown"

        raw_streams = event.get("streams", [])
        if not isinstance(raw_streams, list) or not raw_streams:
            return []

        # Keep only well-formed entries that carry a non-empty string URL.
        candidates = [
            info
            for info in raw_streams
            if isinstance(info, dict)
            and isinstance(info.get("url"), str)
            and info["url"]
        ]

        # Check if any stream URL matches an always-include channel
        always_include = any(_is_always_include(info["url"]) for info in candidates)

        # Filter: must be F1-related or an always-include channel
        if not always_include and not _is_f1_event(event_name):
            return []

        found: list[ExtractedStream] = []
        for info in candidates:
            stream_url = info["url"]

            # Deduplicate by URL
            if stream_url in seen_urls:
                continue
            seen_urls.add(stream_url)

            stream_name = info.get("name", "")
            if not isinstance(stream_name, str):
                stream_name = ""

            # Build a descriptive title
            title = event_name
            if stream_name and stream_name.lower() != event_name.lower():
                title = f"{event_name} - {stream_name}"
            if category_name:
                title = f"[{category_name}] {title}"

            found.append(
                ExtractedStream(
                    url=stream_url,
                    site_key=self.site_key,
                    site_name=self.site_name,
                    quality="",
                    title=title,
                    stream_type="embed",
                    embed_url=stream_url,
                )
            )
        return found
|