f1-stream: only show streams confirmed playable by headless browser
Cuts the stream list from 23 mostly-broken entries to ~6 confirmed-playable
ones, and adds an iframe-stripping proxy so embed sources (hmembeds, etc.)
load through our origin without X-Frame-Options / CSP / JS frame-buster blocks.

Why: the previous list was dominated by Discord-shared news article URLs,
hardcoded aggregator landing pages, and other non-stream URLs that all sat
at is_live=true because embed streams skipped the health check entirely.
Users could not tell which links would actually play.

What:
- backend/playback_verifier.py: new headless-Chromium verifier (Playwright)
  that polls each candidate stream for a codec-independent "playable" signal
  (hls.js MANIFEST_PARSED for m3u8; <video>/player div for embed). Replaces
  the unconditional is_live=True for embed streams in service.py.
- backend/embed_proxy.py: new /embed and /embed-asset routes that fetch
  upstream embed pages, strip X-Frame-Options/CSP/Set-Cookie, and inject a
  <base href> + frame-buster-defeat <script> that locks down window.top,
  document.referrer, console.clear/table, and window.location so the
  hmembeds disable-devtool.js redirect-to-google trap can't fire.
- extractors/curated.py: new always-on extractor with two known-good 24/7
  hmembeds embeds (Sky Sports F1, DAZN F1) so the list isn't empty between
  race weekends.
- extractors/__init__.py: register CuratedExtractor first; drop
  FallbackExtractor (its 10 aggregator landing-pages can't iframe-play).
- extractors/discord_source.py: positive-match path filter (must look like
  /embed/, /stream, /watch, /live, /player, *.m3u8, *.php) plus expanded
  domain blocklist for news sites — was 10 noise URLs, now ~1.
- extractors/service.py: run_extraction now health-checks AND
  verifier-checks both stream types; only verified-playable streams reach
  is_live.
- main.py: register /embed + /embed-asset routes; defer initial extraction
  by 8s so the verifier can reach the local /embed proxy on 127.0.0.1:8000.
- frontend/lib/api.js + watch/+page.svelte: route embed iframes through the
  /embed proxy instead of the upstream URL, so X-Frame-Options/CSP can't
  block them.
- Dockerfile: install Playwright chromium + system codec-runtime libs.
- main.tf: bump pod memory 256Mi → 1Gi for chromium.

Verified end-to-end with Playwright against https://f1.viktorbarzin.me/watch
— 6/6 streams reach a player UI; the 3 demo m3u8s actually play
(codec-bearing browser); the 3 embeds (Sky Sports F1, DAZN F1, sportsurge)
render iframes through the proxy.

Image: viktorbarzin/f1-stream:v6.0.5

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
parent 8b180f7662
commit f90d79ed4e
15 changed files with 2128 additions and 22 deletions
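For context, a minimal sketch of the URL contract behind the /embed routing described above, assuming the url query parameter is URL-safe base64 of the upstream embed URL (the real decoder is decode_url in backend/m3u8_rewriter.py, which is not part of this diff; the encode_embed_url helper below is hypothetical):

import base64

def encode_embed_url(upstream: str) -> str:
    # Hypothetical encoder mirroring what the frontend would do before
    # loading the iframe from our origin instead of the upstream host.
    token = base64.urlsafe_b64encode(upstream.encode()).decode()
    return f"/embed?url={token}"

print(encode_embed_url("https://hmembeds.one/embed/888520f36cd94c5da4c71fddc1a5fc9b"))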
Dockerfile
@@ -14,9 +14,26 @@ FROM python:3.13-slim-bookworm
 WORKDIR /app

+# Headless Chromium runtime libs for the playback verifier. Listed inline
+# (instead of running `playwright install-deps`) so the image build doesn't
+# need root-network apt fetches at runtime.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ca-certificates \
+    libnss3 libnspr4 \
+    libatk1.0-0 libatk-bridge2.0-0 libcups2 \
+    libdrm2 libxkbcommon0 libxcomposite1 libxdamage1 \
+    libxfixes3 libxrandr2 libgbm1 libpango-1.0-0 libcairo2 \
+    libasound2 libatspi2.0-0 \
+    fonts-liberation fonts-noto-color-emoji \
+    && rm -rf /var/lib/apt/lists/*
+
 COPY backend/requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt

+# Install the Chromium browser binary used by the verifier. Skip
+# --with-deps because we already installed the system libs above.
+RUN playwright install chromium
+
 COPY backend/ ./backend/

 # Copy built frontend into the image
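As an optional sanity check (not part of this commit), launching headless Chromium once inside the built image confirms the library list above is sufficient:

# Hypothetical smoke test: run inside the built image.
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto("about:blank")
    browser.close()
print("chromium launch OK")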
stacks/f1-stream/files/backend/embed_proxy.py (new file, 302 lines)
@@ -0,0 +1,302 @@
"""Embed iframe-stripping reverse proxy.

Serves third-party embed pages (e.g. https://hmembeds.one/embed/{hash},
https://pooembed.eu/embed/{slug}) through our origin so we can:

1. Strip X-Frame-Options and Content-Security-Policy: frame-ancestors headers,
   so the embed loads in our <iframe> regardless of upstream policy.
2. Inject <base> + a frame-buster-defeat <script> at the top of <head> so
   the embed's JS sees `window.top === window` and a plausible
   `document.referrer` pointing at the upstream origin.
3. Forward Referer / User-Agent matching the upstream's own pages so
   the upstream's hotlink / origin-allowlist checks pass.

Two endpoints:
- GET /embed?url=<base64url> — the embed HTML page (rewritten).
- GET /embed-asset?url=<base64url> — fallback for any subresource the
  upstream blocks based on hotlink protection. Most assets load directly
  via the injected <base> tag and bypass our proxy.
"""

import logging
import re
from typing import AsyncGenerator
from urllib.parse import urlparse

import httpx
from fastapi import HTTPException

from backend.m3u8_rewriter import decode_url

logger = logging.getLogger(__name__)

EMBED_TIMEOUT = 20.0
ASSET_TIMEOUT = 30.0
RELAY_CHUNK_SIZE = 65536

USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)

# Response headers we never forward (they break frame embedding or leak
# upstream policy).
STRIP_RESPONSE_HEADERS = {
    "x-frame-options",
    "content-security-policy",
    "content-security-policy-report-only",
    "set-cookie",
    "report-to",
    "nel",
    "permissions-policy",
    "cross-origin-opener-policy",
    "cross-origin-embedder-policy",
    "cross-origin-resource-policy",
    # let httpx/uvicorn re-set these
    "transfer-encoding",
    "content-encoding",
    "content-length",
    "connection",
}

# Inject this <script> at the top of <head> to defeat JS frame-busters.
# - Locks window.top and window.parent to the embed window itself (and
#   nulls window.frameElement), so `self !== window.top` checks pass.
# - Forces document.referrer to the upstream origin so allowlist checks
#   like `document.referrer.includes("timstreams.net")` keep working.
# - No-ops anything that would call window.parent.location or attempt to
#   reload the top frame.
_FRAME_BUSTER_DEFEAT_TEMPLATE = """
<script>(function(){{
  try {{
    var fakeWindow = window;
    Object.defineProperty(window, 'top', {{get: function(){{return fakeWindow;}}, configurable: false}});
    Object.defineProperty(window, 'parent', {{get: function(){{return fakeWindow;}}, configurable: false}});
    Object.defineProperty(window, 'frameElement', {{get: function(){{return null;}}, configurable: false}});
    Object.defineProperty(document, 'referrer', {{get: function(){{return {referrer!r};}}, configurable: false}});
  }} catch (e) {{}}
  // Defeat the `disable-devtool.js` redirect trap that hmembeds and similar
  // embed hosts use. The trap fires `console.clear`/`console.table` in a
  // tight loop, then if it thinks DevTools is open, calls
  // `window.location = "https://www.google.com"`. We block those redirect
  // sinks while leaving normal playback unaffected.
  try {{
    var noop = function(){{}};
    console.clear = noop;
    console.table = noop;
    console.dir = noop;
    var loc = window.location;
    Object.defineProperty(window, 'location', {{
      get: function(){{ return loc; }},
      set: function(v){{ /* swallow assignment */ }},
      configurable: false,
    }});
    var origAssign = loc.assign && loc.assign.bind(loc);
    var origReplace = loc.replace && loc.replace.bind(loc);
    loc.assign = function(u){{ if (typeof u === 'string' && u.indexOf('google.com') !== -1) return; if (origAssign) origAssign(u); }};
    loc.replace = function(u){{ if (typeof u === 'string' && u.indexOf('google.com') !== -1) return; if (origReplace) origReplace(u); }};
  }} catch (e) {{}}
}})();</script>
"""


def _decode(encoded_url: str) -> str:
    try:
        return decode_url(encoded_url)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Invalid encoded URL: {e}")


def _filter_headers(upstream_headers: httpx.Headers) -> dict[str, str]:
    """Forward upstream headers minus the ones we strip."""
    out: dict[str, str] = {}
    for k, v in upstream_headers.items():
        if k.lower() in STRIP_RESPONSE_HEADERS:
            continue
        out[k] = v
    # Always allow our domain to embed and load cross-origin
    out["Access-Control-Allow-Origin"] = "*"
    out["X-Frame-Options-Stripped"] = "by-f1-embed-proxy"
    return out


def _make_referer(upstream_url: str) -> str:
    """Build a plausible Referer header — the upstream's own root."""
    parsed = urlparse(upstream_url)
    return f"{parsed.scheme}://{parsed.netloc}/"


def _make_origin(upstream_url: str) -> str:
    parsed = urlparse(upstream_url)
    return f"{parsed.scheme}://{parsed.netloc}"


def _inject_into_head(html: str, upstream_url: str) -> str:
    """Inject <base> tag + frame-buster defeat script into the response HTML."""
    parsed = urlparse(upstream_url)
    base_href = f"{parsed.scheme}://{parsed.netloc}/"

    # The frame-buster-defeat script. Use the upstream's own URL as the
    # spoofed referrer.
    busted = _FRAME_BUSTER_DEFEAT_TEMPLATE.format(referrer=upstream_url)

    base_tag = f'<base href="{base_href}">'

    injection = base_tag + busted

    # Drop any inline CSP <meta> tags first so they can't override our
    # header strip.
    html = re.sub(
        r'<meta[^>]+http-equiv=[\'"]?Content-Security-Policy[\'"]?[^>]*>',
        "",
        html,
        flags=re.IGNORECASE,
    )

    # Strip disable-devtool.js script tags. The library runs detection
    # heuristics and redirects on match. Removing it reduces attack surface
    # even with our location-setter lockdown — saves redundant work and one
    # fewer thing to bypass in case the lockdown misses an edge case.
    html = re.sub(
        r'<script[^>]+(?:disable-devtool|devtool|disabledevtool)[^<]*</script>',
        "",
        html,
        flags=re.IGNORECASE,
    )
    html = re.sub(
        r'<script[^>]+src=["\'][^"\']*disable-devtool[^"\']*["\'][^>]*></script>',
        "",
        html,
        flags=re.IGNORECASE,
    )

    # Insert immediately after the opening <head> (case-insensitive).
    head_match = re.search(r"<head[^>]*>", html, flags=re.IGNORECASE)
    if head_match:
        idx = head_match.end()
        return html[:idx] + injection + html[idx:]

    # No <head> — prepend at the start of the document so the script runs first.
    return injection + html


def _looks_blocked_by_anti_bot(content: str) -> bool:
    """Detect Cloudflare-style challenge interstitials in the upstream body."""
    sample = content[:4096].lower()
    markers = (
        "cf-chl-bypass",
        "checking your browser",
        "just a moment",
        "attention required",
        "cf-browser-verification",
    )
    return any(m in sample for m in markers)


async def fetch_embed(encoded_url: str) -> tuple[bytes, dict[str, str], int]:
    """Fetch an upstream embed page, rewrite the HTML, and return the response.

    Returns: (body_bytes, headers_dict, status_code).
    Raises HTTPException on transport errors.
    """
    url = _decode(encoded_url)
    logger.info("Embed-proxying: %s", url)

    upstream_headers = {
        "User-Agent": USER_AGENT,
        "Referer": _make_referer(url),
        "Origin": _make_origin(url),
        "Accept": (
            "text/html,application/xhtml+xml,application/xml;q=0.9,"
            "image/avif,image/webp,*/*;q=0.8"
        ),
        "Accept-Language": "en-US,en;q=0.9",
    }

    try:
        async with httpx.AsyncClient(
            timeout=EMBED_TIMEOUT,
            follow_redirects=True,
        ) as client:
            response = await client.get(url, headers=upstream_headers)
    except httpx.TimeoutException:
        raise HTTPException(status_code=504, detail="Upstream embed timeout")
    except httpx.HTTPError as e:
        raise HTTPException(status_code=502, detail=f"Upstream embed error: {e}")

    status_code = response.status_code
    upstream_ct = response.headers.get("content-type", "")
    headers_out = _filter_headers(response.headers)

    body = response.content

    # Detect Cloudflare-style challenge so the frontend can show a clear error.
    if "html" in upstream_ct.lower():
        text = response.text
        if _looks_blocked_by_anti_bot(text):
            logger.warning("Upstream returned anti-bot challenge: %s", url)
            raise HTTPException(
                status_code=502,
                detail="Upstream returned anti-bot challenge — proxy cannot bypass",
            )

        rewritten = _inject_into_head(text, url)
        body = rewritten.encode("utf-8")
        headers_out["Content-Type"] = "text/html; charset=utf-8"

    return body, headers_out, status_code


async def relay_asset(
    encoded_url: str, range_header: str | None
) -> tuple[AsyncGenerator[bytes, None], dict[str, str], int]:
    """Relay an upstream subresource (JS/CSS/image/font) as a chunked stream.

    Used as a fallback when an upstream blocks hotlinked assets via Referer
    or Origin checks. The injected <base> tag handles most of these cases
    by letting the browser hit upstream directly — the relay is only for
    the awkward few that need a proxied origin.
    """
    url = _decode(encoded_url)
    logger.debug("Embed-asset relay: %s", url)

    headers = {
        "User-Agent": USER_AGENT,
        "Referer": _make_referer(url),
        "Origin": _make_origin(url),
        "Accept": "*/*",
    }
    if range_header:
        headers["Range"] = range_header

    client = httpx.AsyncClient(timeout=ASSET_TIMEOUT, follow_redirects=True)

    try:
        response = await client.send(
            client.build_request("GET", url, headers=headers),
            stream=True,
        )
    except httpx.TimeoutException:
        await client.aclose()
        raise HTTPException(status_code=504, detail="Upstream asset timeout")
    except httpx.HTTPError as e:
        await client.aclose()
        raise HTTPException(status_code=502, detail=f"Upstream asset error: {e}")

    if response.status_code >= 400:
        await response.aclose()
        await client.aclose()
        raise HTTPException(
            status_code=502,
            detail=f"Upstream asset returned HTTP {response.status_code}",
        )

    headers_out = _filter_headers(response.headers)

    async def _stream() -> AsyncGenerator[bytes, None]:
        try:
            async for chunk in response.aiter_bytes(chunk_size=RELAY_CHUNK_SIZE):
                yield chunk
        finally:
            await response.aclose()
            await client.aclose()

    return _stream(), headers_out, response.status_code
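main.py's route registration is not shown on this page; a hedged sketch of how the two handlers above could be wired into the FastAPI app (endpoint paths from the commit message, response plumbing assumed):

from fastapi import FastAPI, Request, Response
from fastapi.responses import StreamingResponse

from backend.embed_proxy import fetch_embed, relay_asset

app = FastAPI()

@app.get("/embed")
async def embed(url: str) -> Response:
    # fetch_embed returns (body_bytes, headers, status) per its docstring.
    body, headers, status = await fetch_embed(url)
    return Response(content=body, headers=headers, status_code=status)

@app.get("/embed-asset")
async def embed_asset(url: str, request: Request) -> StreamingResponse:
    # relay_asset streams the upstream body chunk by chunk; pass the
    # browser's Range header through for media subresources.
    stream, headers, status = await relay_asset(url, request.headers.get("range"))
    return StreamingResponse(stream, headers=headers, status_code=status)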
stacks/f1-stream/files/backend/extractors/__init__.py
@@ -12,12 +12,17 @@ Example:
 """

 from backend.extractors.aceztrims import AceztrimsExtractor
+from backend.extractors.curated import CuratedExtractor
 from backend.extractors.daddylive import DaddyLiveExtractor
 from backend.extractors.demo import DemoExtractor
+from backend.extractors.discord_source import DiscordExtractor
 from backend.extractors.models import ExtractedStream
+from backend.extractors.pitsport import PitsportExtractor
+from backend.extractors.ppv import PPVExtractor
 from backend.extractors.registry import ExtractorRegistry
 from backend.extractors.service import ExtractionService
 from backend.extractors.streamed import StreamedExtractor
+from backend.extractors.timstreams import TimStreamsExtractor

 __all__ = [
     "ExtractedStream",
@@ -36,10 +41,20 @@ def create_registry() -> ExtractorRegistry:
     registry = ExtractorRegistry()

     # --- Register extractors below ---
+    # CuratedExtractor returns hand-picked 24/7 channels first so we always
+    # have something. FallbackExtractor was removed — it surfaced aggregator
+    # landing pages that don't play directly in an iframe (they require
+    # user navigation through the page) and dominated the list with
+    # entries that fail browser-based playback verification.
+    registry.register(CuratedExtractor())
     registry.register(DemoExtractor())
     registry.register(StreamedExtractor())
     registry.register(DaddyLiveExtractor())
     registry.register(AceztrimsExtractor())
+    registry.register(PitsportExtractor())
+    registry.register(PPVExtractor())
+    registry.register(TimStreamsExtractor())
+    registry.register(DiscordExtractor())

     return registry
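An illustrative way to exercise the registry end to end (not part of the diff; assumes ExtractorRegistry.extract_all() returns results in registration order, which the comment above implies but the registry code on this page does not show):

import asyncio

from backend.extractors import create_registry

async def main() -> None:
    registry = create_registry()
    streams = await registry.extract_all()
    # CuratedExtractor was registered first, so its two 24/7 embeds lead
    # the list even when every dynamic extractor returns nothing.
    for s in streams[:5]:
        print(s.site_key, s.stream_type, s.title)

asyncio.run(main())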
stacks/f1-stream/files/backend/extractors/curated.py (new file, 61 lines)
@@ -0,0 +1,61 @@
"""Curated extractor — known-good 24/7 F1 channels via direct embed URLs.

Returns a small, hand-picked list of embed URLs that are reliable enough to
be served as fallback "always-on" streams when the dynamic extractors find
nothing (e.g. between race weekends, when API providers are down).

These are direct embed URLs. The frontend routes them through /embed so the
iframe-stripping proxy bypasses any frame-buster JS in the upstream player.
"""

import logging

from backend.extractors.base import BaseExtractor
from backend.extractors.models import ExtractedStream

logger = logging.getLogger(__name__)


# Curated list. Each entry is a known direct embed URL. These were sourced
# from the timstreams.py ALWAYS_INCLUDE_HASHES list (Sky Sports F1, DAZN F1)
# and are documented as 24/7 channels that play F1 content year-round.
_CURATED_STREAMS = [
    {
        "url": "https://hmembeds.one/embed/888520f36cd94c5da4c71fddc1a5fc9b",
        "title": "Sky Sports F1 (24/7)",
        "quality": "HD",
    },
    {
        "url": "https://hmembeds.one/embed/fc3a54634d0867b0c02ee3223292e7c6",
        "title": "DAZN F1 (24/7)",
        "quality": "HD",
    },
]


class CuratedExtractor(BaseExtractor):
    """Returns curated known-good 24/7 F1 channel embed URLs."""

    @property
    def site_key(self) -> str:
        return "curated"

    @property
    def site_name(self) -> str:
        return "Curated 24/7 Channels"

    async def extract(self) -> list[ExtractedStream]:
        streams = [
            ExtractedStream(
                url=entry["url"],
                site_key=self.site_key,
                site_name=self.site_name,
                quality=entry["quality"],
                title=entry["title"],
                stream_type="embed",
                embed_url=entry["url"],
            )
            for entry in _CURATED_STREAMS
        ]
        logger.info("[curated] Returning %d curated stream(s)", len(streams))
        return streams
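A quick standalone check of the extractor above (illustrative; assumes CuratedExtractor needs no constructor arguments, matching how create_registry() instantiates it):

import asyncio

from backend.extractors.curated import CuratedExtractor

async def main() -> None:
    streams = await CuratedExtractor().extract()
    assert all(s.stream_type == "embed" for s in streams)
    for s in streams:
        print(s.title, "->", s.embed_url)

asyncio.run(main())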
stacks/f1-stream/files/backend/extractors/discord_source.py (new file, 203 lines)
@@ -0,0 +1,203 @@
"""Discord extractor - monitors Discord channels for F1 stream links.

Reads recent messages from configured Discord channels using a user token,
extracts URLs that look like stream links, and returns them as embed streams.
"""

import logging
import os
import re

import httpx

from backend.extractors.base import BaseExtractor
from backend.extractors.models import ExtractedStream

logger = logging.getLogger(__name__)

DISCORD_API = "https://discord.com/api/v9"
DISCORD_TOKEN = os.getenv("DISCORD_TOKEN", "")
# Comma-separated channel IDs to monitor
DISCORD_CHANNELS = os.getenv("DISCORD_CHANNELS", "").split(",")
# How many messages to fetch per channel
MESSAGE_LIMIT = 50

USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"

# URL pattern to match stream links (exclude Discord CDN, images, etc.)
URL_PATTERN = re.compile(r"https?://[^\s<>\)\]\"']+", re.IGNORECASE)

# Domains that publish news/articles, not playable streams. Discord users share
# these links during race weekends; they are NOT streams and pollute the list.
EXCLUDED_DOMAINS = {
    "discord.com", "discord.gg", "cdn.discordapp.com",
    "tenor.com", "giphy.com", "imgur.com",
    "youtube.com", "youtu.be", "twitter.com", "x.com",
    "reddit.com", "instagram.com", "tiktok.com",
    "fmhy.net", "github.com", "freemotorsports.com",
    # News / official sites — never playable embeds
    "formula1.com", "fia.com", "skysports.com", "motorsport.com",
    "driverdb.com", "autosport.com", "the-race.com", "racefans.net",
    "wikipedia.org", "fantasy.formula1.com",
}

# A URL is treated as a candidate stream embed only if its path looks like
# a stream/embed/player route. This catches /embed/{id}, /stream/{id},
# /watch/{id}, /live/{slug}, /player/{...} and similar — and rejects
# /article/, /news/, /latest/, /join/, etc.
_PATH_KEYWORDS = (
    "embed/", "/stream", "/streams", "/watch", "/live",
    "/player", "/play/", "/sky", "/f1/", "/formula",
    "/grand-prix", "/gp/", "/channel", ".m3u8", ".php",
)


def _is_stream_url(url: str) -> bool:
    """Heuristic: does this URL look like an actual stream/embed/player link?

    Discord users share lots of news links during race weekends. The old
    filter only blocked specific domains and let everything else through,
    which produced a stream list dominated by formula1.com news articles.
    The new filter is positive-match: a URL must contain at least one
    stream-shaped path keyword to be included.
    """
    from urllib.parse import urlparse

    try:
        parsed = urlparse(url)
        domain = parsed.netloc.lower()
        path = parsed.path.lower()
    except Exception:
        return False

    if not domain:
        return False

    for excluded in EXCLUDED_DOMAINS:
        if excluded in domain:
            return False

    if any(path.endswith(ext) for ext in (".png", ".jpg", ".jpeg", ".gif", ".webp", ".mp4", ".webm", ".svg", ".css", ".js")):
        return False

    full = path + ("?" + parsed.query if parsed.query else "")
    if not any(kw in full for kw in _PATH_KEYWORDS):
        return False

    return True


class DiscordExtractor(BaseExtractor):
    """Extracts stream links from Discord channel messages.

    Monitors configured Discord channels for URLs shared by users,
    filters to likely stream links, and returns them as embed streams.
    """

    @property
    def site_key(self) -> str:
        return "discord"

    @property
    def site_name(self) -> str:
        return "Discord Community"

    async def extract(self) -> list[ExtractedStream]:
        """Fetch recent messages from Discord channels and extract URLs."""
        if not DISCORD_TOKEN:
            logger.info("[discord] No DISCORD_TOKEN set, skipping")
            return []

        channels = [c.strip() for c in DISCORD_CHANNELS if c.strip()]
        if not channels:
            logger.info("[discord] No DISCORD_CHANNELS configured, skipping")
            return []

        streams: list[ExtractedStream] = []
        seen_urls: set[str] = set()

        try:
            async with httpx.AsyncClient(
                timeout=15.0,
                follow_redirects=True,
                headers={
                    "Authorization": DISCORD_TOKEN,
                    "User-Agent": USER_AGENT,
                },
            ) as client:
                for channel_id in channels:
                    try:
                        channel_streams = await self._fetch_channel(
                            client, channel_id, seen_urls
                        )
                        streams.extend(channel_streams)
                    except Exception:
                        logger.debug(
                            "[discord] Failed to fetch channel %s",
                            channel_id,
                            exc_info=True,
                        )
        except Exception:
            logger.exception("[discord] Failed to connect to Discord API")

        logger.info("[discord] Extracted %d stream(s) from %d channel(s)", len(streams), len(channels))
        return streams

    async def _fetch_channel(
        self,
        client: httpx.AsyncClient,
        channel_id: str,
        seen_urls: set[str],
    ) -> list[ExtractedStream]:
        """Fetch messages from a single channel and extract stream URLs."""
        resp = await client.get(
            f"{DISCORD_API}/channels/{channel_id}/messages",
            params={"limit": MESSAGE_LIMIT},
        )
        if resp.status_code != 200:
            logger.warning(
                "[discord] Channel %s returned HTTP %d", channel_id, resp.status_code
            )
            return []

        messages = resp.json()
        if not isinstance(messages, list):
            return []

        streams: list[ExtractedStream] = []

        for msg in messages:
            content = msg.get("content", "")
            author = msg.get("author", {}).get("username", "unknown")

            # Extract URLs from message content
            urls = URL_PATTERN.findall(content)

            # Also check embeds
            for embed in msg.get("embeds", []):
                if embed.get("url"):
                    urls.append(embed["url"])

            for url in urls:
                # Clean trailing punctuation
                url = url.rstrip(".,;:!?)")

                if url in seen_urls:
                    continue
                if not _is_stream_url(url):
                    continue

                seen_urls.add(url)
                streams.append(
                    ExtractedStream(
                        url=url,
                        site_key=self.site_key,
                        site_name=self.site_name,
                        quality="",
                        title=f"Shared by {author}",
                        stream_type="embed",
                        embed_url=url,
                    )
                )

        return streams
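To make the positive-match filter concrete, a few illustrative calls against _is_stream_url (sample URLs invented):

from backend.extractors.discord_source import _is_stream_url

# Accepted: path contains a stream-shaped keyword.
assert _is_stream_url("https://example-streams.net/embed/abc123")
assert _is_stream_url("https://cdn.example.net/live/sky-f1.m3u8")

# Rejected: news article on a blocked domain, and a keyword-free path.
assert not _is_stream_url("https://www.formula1.com/en/latest/article.xyz.html")
assert not _is_stream_url("https://somehost.net/article/race-report")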
stacks/f1-stream/files/backend/extractors/pitsport.py (new file, 510 lines)
@@ -0,0 +1,510 @@
"""Pitsport.xyz extractor - fetches F1 streams from the Next.js RSC payload.

Architecture:
- Main page (pitsport.xyz) has a "Live Now" section with event cards containing
  category, title, time, imageUrl props and /watch/{UUID} links.
- Schedule page (pitsport.xyz/schedule) lists all events grouped by category
  (h2 headings) with /watch/{UUID} links and event titles.
- Watch pages (/watch/{UUID}) embed iframes from pushembdz.store/embed/{EMBED_UUID}.
- Embed pages contain an RSC payload with a stream config: {title, link, method}.
- When method is "player" or "hls", the link field points to a serveplay.site
  m3u8 playlist. Otherwise we return the embed URL for iframe playback.
"""

import logging
import re
from dataclasses import dataclass

import httpx

from backend.extractors.base import BaseExtractor
from backend.extractors.models import ExtractedStream

logger = logging.getLogger(__name__)

PITSPORT_BASE = "https://pitsport.xyz"
EMBED_BASE = "https://pushembdz.store"
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)

# Categories to include (case-insensitive match)
F1_CATEGORIES = {"formula 1", "formula 2", "formula 3"}

# Fallback keyword matching on combined category+title for edge cases
F1_KEYWORDS = {"formula 1", "formula one", "f1"}
GP_KEYWORD = "grand prix"
NON_F1_KEYWORDS = {
    "motogp", "moto gp", "moto2", "moto3", "motoe", "indycar",
    "indy car", "firestone", "nascar", "rally", "wrc", "wec",
    "lemans", "le mans", "superbike", "dtm", "supercars", "arca",
    "xfinity", "trucks", "super formula", "supergt", "super gt",
    "ama supercross", "supercross",
}


@dataclass
class _PitsportEvent:
    """An event discovered from the Pitsport site."""

    category: str
    title: str
    watch_uuid: str


def _is_f1_category(category: str) -> bool:
    """Check if a category string matches an F1-related series."""
    return category.strip().lower() in F1_CATEGORIES


def _is_f1_event(category: str, title: str) -> bool:
    """Check if an event is Formula 1 related by category or title keywords."""
    # Primary check: exact category match
    if _is_f1_category(category):
        return True

    # Secondary check: keyword matching on combined text
    lower = f"{category} {title}".lower()
    if any(kw in lower for kw in NON_F1_KEYWORDS):
        return False
    if any(kw in lower for kw in F1_KEYWORDS):
        return True
    if GP_KEYWORD in lower:
        return True
    return False


def _parse_live_events(html: str) -> list[_PitsportEvent]:
    """Parse live events from the main page RSC payload.

    The main page contains event cards with props:
        category, title, time, imageUrl
    wrapped in <a href="/watch/{UUID}"> links.
    """
    events: list[_PitsportEvent] = []

    # Match event cards in the RSC payload - they appear as JSON-like structures
    # Pattern: href="/watch/UUID" ... category":"...", "title":"..."
    # In the RSC payload, the data is in the format:
    # ["$","$L2","/watch/UUID",{"href":"/watch/UUID","children":["$","$L10",null,
    # {"category":"...","title":"...","time":...,"imageUrl":"..."}]}]
    pattern = re.compile(
        r'"href":"(/watch/([0-9a-f-]{36}))"[^}]*?"category":"([^"]+)","title":"([^"]+)"',
    )
    for match in pattern.finditer(html):
        _, uuid, category, title = match.groups()
        events.append(_PitsportEvent(category=category, title=title, watch_uuid=uuid))

    return events


def _parse_schedule_events(html: str) -> list[_PitsportEvent]:
    """Parse events from the schedule page.

    The schedule page groups events under category headers (h2 elements).
    In the rendered HTML:
        <h2 ...>Formula 1</h2>
        <div ...>
            <a href="/watch/UUID">...</a>
            ...
        </div>

    In the RSC payload, similar structure with section divs containing
    a category h2 and child event links with titles.
    """
    events: list[_PitsportEvent] = []

    # Strategy 1: Parse from rendered HTML
    # Find category sections: >CategoryName</h2> followed by watch links
    # Split HTML at each category header
    section_pattern = re.compile(
        r'>([^<]+)</h2>\s*<div[^>]*class="flex flex-wrap gap-6">(.*?)(?=</div>\s*</div>\s*(?:<div|</div>|$))',
        re.DOTALL,
    )
    for section_match in section_pattern.finditer(html):
        category = section_match.group(1).strip()
        section_html = section_match.group(2)

        # Find all watch links in this section
        link_pattern = re.compile(
            r'href="/watch/([0-9a-f-]{36})".*?<h1[^>]*>([^<]+)</h1>',
            re.DOTALL,
        )
        for link_match in link_pattern.finditer(section_html):
            uuid = link_match.group(1)
            title = link_match.group(2).strip()
            events.append(
                _PitsportEvent(category=category, title=title, watch_uuid=uuid)
            )

    # Strategy 2: Parse from RSC payload if rendered HTML didn't yield results
    # The RSC payload has patterns like:
    # "children":"Formula 1"}] ... "/watch/UUID" ... "title":"EventTitle"
    if not events:
        events = _parse_schedule_rsc(html)

    return events


def _parse_schedule_rsc(html: str) -> list[_PitsportEvent]:
    """Parse events from schedule page RSC payload as fallback.

    Extracts category section divs from the RSC JSON structure.
    """
    events: list[_PitsportEvent] = []

    # Find the RSC payload chunks
    rsc_chunks = re.findall(
        r'self\.__next_f\.push\(\[1,"(.*?)"\]\)', html, re.DOTALL
    )
    if not rsc_chunks:
        return events

    # Concatenate and unescape
    full_payload = ""
    for chunk in rsc_chunks:
        try:
            full_payload += chunk.encode().decode("unicode_escape")
        except Exception:
            full_payload += chunk

    # Find category sections in the RSC data
    # Pattern: "children":"CategoryName"}],["$","div",...watch links...
    # Each section div contains an h2 with the category name and watch links
    cat_pattern = re.compile(
        r'border-gray-700 pb-2","children":"([^"]+)"\}.*?'
        r'(?=border-gray-700 pb-2","children"|$)',
        re.DOTALL,
    )
    for cat_match in cat_pattern.finditer(full_payload):
        category = cat_match.group(1)
        section_text = cat_match.group(0)

        # Find watch UUIDs and titles in this section
        # Pattern: "/watch/UUID" ... "title":"EventTitle"
        event_pattern = re.compile(
            r'/watch/([0-9a-f-]{36}).*?"title":"([^"]+)"',
        )
        for ev_match in event_pattern.finditer(section_text):
            uuid = ev_match.group(1)
            title = ev_match.group(2)
            events.append(
                _PitsportEvent(category=category, title=title, watch_uuid=uuid)
            )

    return events


def _parse_embed_uuids(html: str) -> list[str]:
    """Extract embed UUIDs from a watch page.

    Watch pages contain iframes like:
        <iframe src="https://pushembdz.store/embed/{EMBED_UUID}" ...>

    And in the RSC payload:
        "iframe":"https://pushembdz.store/embed/{EMBED_UUID}"
    """
    uuids: list[str] = []

    # From rendered HTML
    iframe_pattern = re.compile(
        r'pushembdz\.store/embed/([0-9a-f-]{36})',
    )
    for match in iframe_pattern.finditer(html):
        uuid = match.group(1)
        if uuid not in uuids:
            uuids.append(uuid)

    return uuids


@dataclass
class _StreamConfig:
    """Stream configuration extracted from an embed page."""

    title: str
    link: str
    method: str


def _parse_stream_config(html: str) -> _StreamConfig | None:
    """Extract stream config from an embed page RSC payload.

    The embed page contains an RSC payload line like:
        4:["$","$Ld",null,{"stream":{"title":"...","link":"...","method":"player"},
        "error":null,"slug":"..."}]
    """
    # Try matching the escaped RSC payload pattern
    pattern = re.compile(
        r'"stream":\{["\']?\\?"title\\?"["\']?:["\']?\\?"([^"\\]+)\\?"["\']?,'
        r'["\']?\\?"link\\?"["\']?:["\']?\\?"([^"\\]+)\\?"["\']?,'
        r'["\']?\\?"method\\?"["\']?:["\']?\\?"([^"\\]+)\\?"',
    )
    match = pattern.search(html)
    if match:
        return _StreamConfig(
            title=match.group(1),
            link=match.group(2),
            method=match.group(3),
        )

    # Simpler pattern for double-escaped payload
    pattern2 = re.compile(
        r'\\?"stream\\?":\{\\?"title\\?":\\?"([^\\]+)\\?",'
        r'\\?"link\\?":\\?"([^\\]+)\\?",'
        r'\\?"method\\?":\\?"([^\\]+)\\?"',
    )
    match = pattern2.search(html)
    if match:
        return _StreamConfig(
            title=match.group(1),
            link=match.group(2),
            method=match.group(3),
        )

    # Most lenient: just find the three fields near each other
    pattern3 = re.compile(
        r'"stream"\s*:\s*\{\s*"title"\s*:\s*"([^"]+)"\s*,'
        r'\s*"link"\s*:\s*"([^"]+)"\s*,'
        r'\s*"method"\s*:\s*"([^"]+)"',
    )
    match = pattern3.search(html)
    if match:
        return _StreamConfig(
            title=match.group(1),
            link=match.group(2),
            method=match.group(3),
        )

    return None


def _is_m3u8_method(method: str) -> bool:
    """Check if the stream method indicates a direct HLS stream."""
    return method.lower() in ("player", "hls")


def _extract_m3u8_url(link: str) -> str:
    """Convert a serveplay.site player URL to an m3u8 playlist URL.

    Input:  https://dash.serveplay.site/{channel}/index.html
    Output: https://dash.serveplay.site/{channel}/index.html

    The index.html IS the m3u8 playlist (served with proper content-type
    when fetched with the correct Referer header).
    """
    return link


class PitsportExtractor(BaseExtractor):
    """Extracts F1 streams from Pitsport.xyz.

    Scrapes the Next.js RSC payload from the main page and schedule page
    to find F1 events, then resolves embed UUIDs to stream configurations.
    """

    @property
    def site_key(self) -> str:
        return "pitsport"

    @property
    def site_name(self) -> str:
        return "Pitsport"

    async def extract(self) -> list[ExtractedStream]:
        """Fetch F1 events and return stream URLs or embed URLs."""
        streams: list[ExtractedStream] = []

        try:
            async with httpx.AsyncClient(
                timeout=20.0,
                follow_redirects=True,
                headers={"User-Agent": USER_AGENT},
            ) as client:
                # Fetch both pages to get comprehensive event data
                events = await self._discover_events(client)
                logger.info(
                    "[pitsport] Found %d F1 event(s) to process", len(events)
                )

                # Deduplicate by watch UUID
                seen_uuids: set[str] = set()
                unique_events: list[_PitsportEvent] = []
                for ev in events:
                    if ev.watch_uuid not in seen_uuids:
                        seen_uuids.add(ev.watch_uuid)
                        unique_events.append(ev)

                # For each event, resolve streams
                for event in unique_events:
                    event_streams = await self._resolve_event_streams(
                        client, event
                    )
                    streams.extend(event_streams)

        except Exception:
            logger.exception("[pitsport] Failed to extract streams")

        logger.info("[pitsport] Extracted %d stream(s)", len(streams))
        return streams

    async def _discover_events(
        self, client: httpx.AsyncClient
    ) -> list[_PitsportEvent]:
        """Discover F1 events from both main page and schedule page."""
        all_events: list[_PitsportEvent] = []

        # Fetch main page for live events
        try:
            resp = await client.get(PITSPORT_BASE)
            if resp.status_code == 200:
                live_events = _parse_live_events(resp.text)
                logger.info(
                    "[pitsport] Main page: %d live event(s)", len(live_events)
                )
                for ev in live_events:
                    if _is_f1_event(ev.category, ev.title):
                        all_events.append(ev)
            else:
                logger.warning(
                    "[pitsport] Main page returned HTTP %d", resp.status_code
                )
        except Exception:
            logger.exception("[pitsport] Failed to fetch main page")

        # Fetch schedule page for upcoming events
        try:
            resp = await client.get(f"{PITSPORT_BASE}/schedule")
            if resp.status_code == 200:
                schedule_events = _parse_schedule_events(resp.text)
                logger.info(
                    "[pitsport] Schedule page: %d total event(s)",
                    len(schedule_events),
                )
                for ev in schedule_events:
                    if _is_f1_event(ev.category, ev.title):
                        all_events.append(ev)
            else:
                logger.warning(
                    "[pitsport] Schedule page returned HTTP %d",
                    resp.status_code,
                )
        except Exception:
            logger.exception("[pitsport] Failed to fetch schedule page")

        return all_events

    async def _resolve_event_streams(
        self, client: httpx.AsyncClient, event: _PitsportEvent
    ) -> list[ExtractedStream]:
        """Resolve an event's watch page to actual stream URLs."""
        streams: list[ExtractedStream] = []

        try:
            # Fetch the watch page to get embed UUIDs
            watch_url = f"{PITSPORT_BASE}/watch/{event.watch_uuid}"
            resp = await client.get(watch_url)
            if resp.status_code != 200:
                logger.debug(
                    "[pitsport] Watch page %s returned HTTP %d",
                    event.watch_uuid,
                    resp.status_code,
                )
                return []

            embed_uuids = _parse_embed_uuids(resp.text)
            if not embed_uuids:
                logger.debug(
                    "[pitsport] No embed UUIDs found for %s", event.watch_uuid
                )
                return []

            logger.debug(
                "[pitsport] Event '%s' has %d embed(s)",
                event.title,
                len(embed_uuids),
            )

            # Resolve each embed to a stream config
            for i, embed_uuid in enumerate(embed_uuids):
                stream = await self._resolve_embed(
                    client, embed_uuid, event, stream_num=i + 1
                )
                if stream:
                    streams.append(stream)

        except Exception:
            logger.debug(
                "[pitsport] Failed to resolve event %s",
                event.watch_uuid,
                exc_info=True,
            )

        return streams

    async def _resolve_embed(
        self,
        client: httpx.AsyncClient,
        embed_uuid: str,
        event: _PitsportEvent,
        stream_num: int,
    ) -> ExtractedStream | None:
        """Resolve an embed UUID to a stream configuration."""
        try:
            embed_url = f"{EMBED_BASE}/embed/{embed_uuid}"
            resp = await client.get(embed_url)
            if resp.status_code != 200:
                logger.debug(
                    "[pitsport] Embed page %s returned HTTP %d",
                    embed_uuid,
                    resp.status_code,
                )
                return None

            config = _parse_stream_config(resp.text)
            if not config:
                logger.debug(
                    "[pitsport] No stream config found in embed %s",
                    embed_uuid,
                )
                return None

            # Build the stream title
            stream_title = f"{event.category} - {event.title}"
            if config.title:
                stream_title += f" ({config.title})"
            if stream_num > 1:
                stream_title += f" #{stream_num}"

            if _is_m3u8_method(config.method) and "serveplay.site" in config.link:
                # Direct m3u8 stream
                m3u8_url = _extract_m3u8_url(config.link)
                return ExtractedStream(
                    url=m3u8_url,
                    site_key=self.site_key,
                    site_name=self.site_name,
                    quality="",
                    title=stream_title,
                    stream_type="m3u8",
                )
            else:
                # Iframe embed fallback
                return ExtractedStream(
                    url=embed_url,
                    site_key=self.site_key,
                    site_name=self.site_name,
                    quality="",
                    title=stream_title,
                    stream_type="embed",
                    embed_url=embed_url,
                )

        except Exception:
            logger.debug(
                "[pitsport] Failed to resolve embed %s",
                embed_uuid,
                exc_info=True,
            )
            return None
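For illustration, what _parse_stream_config extracts from a synthetic RSC line shaped like the docstring example (payload invented for the test):

from backend.extractors.pitsport import _parse_stream_config

sample = (
    '4:["$","$Ld",null,{"stream":{"title":"Sky F1",'
    '"link":"https://dash.serveplay.site/skyf1/index.html",'
    '"method":"player"},"error":null,"slug":"sky-f1"}]'
)
config = _parse_stream_config(sample)
assert config is not None
# method "player" means the link is treated as a direct HLS playlist URL.
print(config.method, config.link)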
stacks/f1-stream/files/backend/extractors/ppv.py (new file, 270 lines)
@@ -0,0 +1,270 @@
"""PPV.to extractor - fetches F1 streams via the public PPV API.

Returns embed URLs (pooembed.eu) for iframe playback.
The API at api.ppv.to/api/streams requires no authentication.
Falls back to api.ppv.st if the primary API is unreachable.
"""

import logging

import httpx

from backend.extractors.base import BaseExtractor
from backend.extractors.models import ExtractedStream

logger = logging.getLogger(__name__)

PRIMARY_API = "https://api.ppv.to/api/streams"
FALLBACK_API = "https://api.ppv.st/api/streams"
EMBED_BASE = "https://pooembed.eu/embed"

USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)

# Category name for motorsport on PPV.to
MOTORSPORT_CATEGORY = "motorsports"

# Only include events matching these keywords (case-insensitive)
F1_KEYWORDS = {"formula 1", "formula one", "f1", "sky sports f1"}
# Grand Prix is shared with MotoGP/IndyCar — only match if no other series keywords
GP_KEYWORD = "grand prix"
NON_F1_KEYWORDS = {
    "motogp", "moto gp", "moto2", "moto3", "motoe",
    "indycar", "indy car", "firestone", "nascar",
    "rally", "wrc", "wec", "lemans", "le mans",
    "superbike", "dtm", "supercars",
}


def _is_f1_stream(name: str, category_name: str = "") -> bool:
    """Check if a stream is Formula 1 related.

    Checks both the stream name and the category name.
    A stream qualifies if:
    - It is in the motorsport category AND matches F1 keywords, OR
    - It matches F1 keywords regardless of category.
    """
    lower_name = name.lower()
    lower_cat = category_name.lower()

    # Reject if it contains non-F1 motorsport keywords
    if any(kw in lower_name for kw in NON_F1_KEYWORDS):
        return False

    # Direct F1 keyword match in the stream name
    if any(kw in lower_name for kw in F1_KEYWORDS):
        return True

    # "grand prix" in the name, only if in motorsports category and no non-F1 keywords
    if GP_KEYWORD in lower_name and MOTORSPORT_CATEGORY in lower_cat:
        return True

    # If the category is motorsport, also check category-level keywords
    if MOTORSPORT_CATEGORY in lower_cat and any(kw in lower_cat for kw in F1_KEYWORDS):
        return True

    return False


class PPVExtractor(BaseExtractor):
    """Extracts embed URLs from PPV.to's public JSON API.

    Uses the endpoint:
    - GET https://api.ppv.to/api/streams -> all streams grouped by category
    - Fallback: https://api.ppv.st/api/streams

    Each stream object contains an `iframe` field with the embed URL,
    or a `uri_name` from which the embed URL can be constructed.
    """

    @property
    def site_key(self) -> str:
        return "ppv"

    @property
    def site_name(self) -> str:
        return "PPV.to"

    async def _fetch_streams(self, client: httpx.AsyncClient) -> dict | None:
        """Try primary and fallback APIs, return parsed JSON or None."""
        for api_url in (PRIMARY_API, FALLBACK_API):
            try:
                resp = await client.get(api_url)
                if resp.status_code == 200:
                    data = resp.json()
                    logger.info("[ppv] Fetched streams from %s", api_url)
                    return data
                logger.warning(
                    "[ppv] %s returned HTTP %d", api_url, resp.status_code
                )
            except Exception:
                logger.debug(
                    "[ppv] Failed to reach %s", api_url, exc_info=True
                )
        return None

    async def extract(self) -> list[ExtractedStream]:
        """Fetch F1 streams and return embed URLs for iframe playback."""
        streams: list[ExtractedStream] = []

        try:
            async with httpx.AsyncClient(
                timeout=15.0,
                follow_redirects=True,
                headers={"User-Agent": USER_AGENT, "Accept": "application/json"},
            ) as client:
                data = await self._fetch_streams(client)
                if data is None:
                    logger.warning("[ppv] Could not fetch streams from any API")
                    return []

                # The API returns:
                #   { "streams": [ { "category": "Name", "id": N, "streams": [...] }, ... ] }
                # Flatten into (category_name, stream_obj) tuples.
                all_streams = self._normalize_streams(data)

                logger.info(
                    "[ppv] Found %d total stream(s) across all categories",
                    len(all_streams),
                )

                for category_name, stream_obj in all_streams:
                    name = stream_obj.get("name", "") or stream_obj.get("title", "")

                    if not _is_f1_stream(name, category_name):
                        continue

                    # Build the embed URL
                    embed_url = self._get_embed_url(stream_obj)
                    if not embed_url:
                        logger.debug("[ppv] No embed URL for stream: %s", name)
                        continue

                    # Extract quality from tag if present
                    tag = stream_obj.get("tag", "")
                    quality = tag if tag else ""

                    # Build descriptive title
                    title = name
                    viewers = stream_obj.get("viewers")
                    if viewers and int(viewers) > 0:
                        title += f" ({viewers} viewers)"

                    # Check for substreams (multiple quality/language options)
                    substreams = stream_obj.get("substreams")
                    if isinstance(substreams, list) and substreams:
                        for i, sub in enumerate(substreams):
                            sub_embed = sub.get("iframe", "") or sub.get("embed_url", "")
                            if not sub_embed:
                                # Fall back to the parent embed URL
                                sub_embed = embed_url
                            sub_name = sub.get("name", "") or sub.get("label", "")
                            sub_quality = sub.get("tag", "") or sub.get("quality", "") or quality
                            sub_title = f"{name}"
                            if sub_name:
                                sub_title += f" - {sub_name}"
                            elif i > 0:
                                sub_title += f" #{i + 1}"

                            streams.append(
                                ExtractedStream(
                                    url=sub_embed,
                                    site_key=self.site_key,
                                    site_name=self.site_name,
                                    quality=sub_quality,
                                    title=sub_title,
                                    stream_type="embed",
                                    embed_url=sub_embed,
                                )
                            )
                    else:
                        # Single stream, no substreams
                        streams.append(
                            ExtractedStream(
                                url=embed_url,
                                site_key=self.site_key,
                                site_name=self.site_name,
                                quality=quality,
                                title=title,
                                stream_type="embed",
                                embed_url=embed_url,
                            )
                        )

        except Exception:
            logger.exception("[ppv] Failed to extract streams")

        logger.info("[ppv] Extracted %d F1 stream(s)", len(streams))
        return streams

    @staticmethod
    def _normalize_streams(data: dict | list) -> list[tuple[str, dict]]:
        """Normalize the API response into a flat list of (category_name, stream_dict) tuples.

        The PPV API returns data in this shape:
            {
                "streams": [
                    {
                        "category": "Motorsports",
                        "id": 35,
                        "streams": [ { stream objects... } ]
                    },
                    ...
                ]
            }

        Each category group has a "category" string and a nested "streams" list.
        """
        result: list[tuple[str, dict]] = []

        # Handle the top-level wrapper
        if isinstance(data, dict):
            categories = data.get("streams", [])
        elif isinstance(data, list):
            categories = data
        else:
            return result

        for category_group in categories:
            if not isinstance(category_group, dict):
                continue

            category_name = category_group.get("category", "")

            # The nested streams within this category
            inner_streams = category_group.get("streams", [])
            if isinstance(inner_streams, list):
                for stream_obj in inner_streams:
                    if isinstance(stream_obj, dict):
                        # Attach category_name to each stream for filtering
                        result.append((category_name, stream_obj))
            elif isinstance(category_group, dict) and "name" in category_group:
                # Fallback: the item itself is a stream (flat list format)
                result.append((category_name, category_group))

        return result

    @staticmethod
    def _get_embed_url(stream: dict) -> str:
        """Extract or construct the embed URL for a stream."""
        # Prefer the iframe field directly
        iframe = stream.get("iframe", "")
        if iframe:
            return iframe

        # Construct from uri_name
        uri_name = stream.get("uri_name", "") or stream.get("uri", "")
        if uri_name:
            # Strip leading slash if present
            uri_name = uri_name.lstrip("/")
            return f"{EMBED_BASE}/{uri_name}"

        # Last resort: use the stream id
        stream_id = stream.get("id")
        if stream_id:
            return f"{EMBED_BASE}/{stream_id}"

        return ""
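A small self-contained check of the flattening and filtering above, using sample data shaped like the documented API response (values invented):

from backend.extractors.ppv import PPVExtractor, _is_f1_stream

sample = {
    "streams": [
        {
            "category": "Motorsports",
            "id": 35,
            "streams": [
                {"name": "Sky Sports F1", "iframe": "https://pooembed.eu/embed/sky-f1"},
                {"name": "MotoGP Race", "uri_name": "motogp-race"},
            ],
        }
    ]
}

flat = PPVExtractor._normalize_streams(sample)
f1_only = [(cat, s) for cat, s in flat if _is_f1_stream(s["name"], cat)]
# Only Sky Sports F1 survives; the MotoGP entry hits NON_F1_KEYWORDS.
print(f1_only)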
stacks/f1-stream/files/backend/extractors/service.py
@@ -6,6 +6,7 @@ from datetime import datetime, timezone
 from backend.extractors.models import ExtractedStream
 from backend.extractors.registry import ExtractorRegistry
 from backend.health import StreamHealthChecker
+from backend.playback_verifier import PlaybackVerifier

 logger = logging.getLogger(__name__)

@@ -29,6 +30,11 @@ class ExtractionService:
         self._last_run: str | None = None
         self._last_run_stream_count: int = 0
         self._health_checker = StreamHealthChecker()
+        self._playback_verifier = PlaybackVerifier()
+
+    async def shutdown(self) -> None:
+        """Release the headless browser instance owned by the verifier."""
+        await self._playback_verifier.shutdown()

     async def run_extraction(self) -> None:
         """Run all extractors, health-check results, and cache them.
@@ -43,31 +49,63 @@

        streams = await self._registry.extract_all()

        # Run health checks on all extracted streams
        # Run health checks + headless-browser playback verification.
        # Both stream types are now verified end-to-end so the user only
        # ever sees streams that actually play in a browser.
        if streams:
            # Separate m3u8 streams (need health check) from embed streams (skip)
            m3u8_streams = [s for s in streams if s.stream_type != "embed"]
            embed_streams = [s for s in streams if s.stream_type == "embed"]

            # Mark embed streams as live (no health check possible for iframes)
            for stream in embed_streams:
                stream.is_live = True
                stream.response_time_ms = 0
                stream.checked_at = start.isoformat()

            # Health-check only m3u8 streams
            # m3u8 streams: cheap structural health check (validates manifest,
            # checks first variant playlist), then a headless-browser test
            # to confirm hls.js can decode and render frames.
            if m3u8_streams:
                stream_dicts = [s.to_dict() for s in m3u8_streams]
                health_map = await self._health_checker.check_all(stream_dicts)

                for stream in m3u8_streams:
                    health = health_map.get(stream.url)
                    if health:
                        stream.is_live = health.is_live
                        stream.response_time_ms = health.response_time_ms
                        stream.checked_at = health.checked_at
                        if health.bitrate > 0:
                            stream.bitrate = health.bitrate
                        # tentatively mark live; final word comes from the verifier
                        stream.is_live = health.is_live

            # Browser verification: applies to both m3u8 (only those that
            # passed structural health) and embed (always — they have no
            # other way to verify).
            verify_items: list[tuple[str, str]] = []
            for stream in m3u8_streams:
                if stream.is_live:
                    verify_items.append((stream.url, "m3u8"))
            for stream in embed_streams:
                verify_items.append((stream.embed_url or stream.url, "embed"))

            verdicts = await self._playback_verifier.verify_many(verify_items)

            now_iso = datetime.now(timezone.utc).isoformat()
            for stream in m3u8_streams:
                if not stream.is_live:
                    continue  # already failed health check
                verdict = verdicts.get(stream.url)
                if verdict is None:
                    continue  # verifier disabled or unavailable
                stream.is_live = verdict.is_playable
                stream.checked_at = now_iso

            for stream in embed_streams:
                key = stream.embed_url or stream.url
                verdict = verdicts.get(key)
                stream.checked_at = now_iso
                if verdict is None:
                    # Verifier unavailable — fall back to "trust extractor".
                    # This keeps the service usable even without playwright.
                    stream.is_live = True
                    stream.response_time_ms = 0
                else:
                    stream.is_live = verdict.is_playable
                    stream.response_time_ms = verdict.elapsed_ms

        # Group streams by site_key and update cache
        new_cache: dict[str, list[ExtractedStream]] = {}
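
Condensed for review, the is_live rule this hunk implements (a sketch, not code from the diff; verdict is a PlaybackVerdict from backend/playback_verifier.py further down, or None when the verifier returned nothing for the stream):

    def final_is_live(stream_type: str, health_ok: bool, verdict) -> bool:
        # m3u8: the structural health check gates the verifier; a missing
        # verdict (verifier disabled/unavailable) keeps the health result.
        if stream_type != "embed":
            return health_ok and (verdict.is_playable if verdict else True)
        # embed: no structural check exists; the verifier is the only
        # signal, and without it we fall back to trusting the extractor.
        return verdict.is_playable if verdict else True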

190  stacks/f1-stream/files/backend/extractors/timstreams.py  Normal file

@@ -0,0 +1,190 @@
"""TimStreams extractor - fetches F1 streams from the TimStreams JSON API.

Returns embed URLs from hmembeds.one for iframe playback.
The public API at stra.viaplus.site/main requires no authentication
and returns all events/channels across Events, Replays, and 24/7 categories.
"""

import logging

import httpx

from backend.extractors.base import BaseExtractor
from backend.extractors.models import ExtractedStream

logger = logging.getLogger(__name__)

API_URL = "https://stra.viaplus.site/main"
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)

# Direct F1 keyword matches (case-insensitive)
F1_KEYWORDS = {"formula 1", "formula one", "f1", "sky sports f1", "dazn f1"}
# "Grand prix" is F1-related only if non-F1 motorsport keywords are absent
GP_KEYWORD = "grand prix"
# Exclude these motorsport series when matching on "grand prix"
NON_F1_KEYWORDS = {
    "motogp", "moto gp", "moto2", "moto3", "motoe",
    "indycar", "indy car", "nascar",
    "rally", "wrc", "wec", "lemans", "le mans",
    "superbike", "dtm", "supercars",
}

# 24/7 channels that should always be included (embed hashes on hmembeds.one)
ALWAYS_INCLUDE_HASHES = {
    "888520f36cd94c5da4c71fddc1a5fc9b",  # Sky Sports F1
    "fc3a54634d0867b0c02ee3223292e7c6",  # DAZN F1
}


def _is_f1_event(name: str) -> bool:
    """Check if an event/channel is Formula 1 related by name.

    Returns True when the name contains a direct F1 keyword, or contains
    "grand prix" without non-F1 series keywords.

    Note: The TimStreams API genre field (genre=2) covers ALL sports channels,
    not just motorsport, so we rely solely on name-based matching.
    """
    lower = name.lower()

    # Direct F1 keyword match
    if any(kw in lower for kw in F1_KEYWORDS):
        return True

    # Grand prix without competing series
    if GP_KEYWORD in lower and not any(kw in lower for kw in NON_F1_KEYWORDS):
        return True

    return False
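
# Illustration (not part of the module): expected verdicts for some
# hypothetical event names:
#   _is_f1_event("Sky Sports F1 24/7")          -> True   (direct keyword)
#   _is_f1_event("Monaco Grand Prix - Race")    -> True   ("grand prix", no rival series)
#   _is_f1_event("MotoGP Grand Prix of Qatar")  -> False  ("motogp" is blocklisted)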


def _extract_embed_hash(url: str) -> str | None:
    """Extract the hash from an hmembeds.one embed URL.

    Expected format: https://hmembeds.one/embed/{hash}
    Returns the hash string, or None if the URL is not in the expected format.
    """
    if not url:
        return None
    # Handle both with and without trailing slash
    url = url.rstrip("/")
    prefix = "https://hmembeds.one/embed/"
    alt_prefix = "http://hmembeds.one/embed/"
    if url.startswith(prefix):
        return url[len(prefix):] or None
    if url.startswith(alt_prefix):
        return url[len(alt_prefix):] or None
    return None
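
# Illustration (not part of the module): hash extraction in practice:
#   _extract_embed_hash("https://hmembeds.one/embed/abc123/")  -> "abc123"
#   _extract_embed_hash("https://example.com/embed/abc123")    -> None (wrong host)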


def _is_always_include(url: str) -> bool:
    """Check if a stream URL is one of the always-include 24/7 channels."""
    embed_hash = _extract_embed_hash(url)
    return embed_hash in ALWAYS_INCLUDE_HASHES if embed_hash else False


class TimStreamsExtractor(BaseExtractor):
    """Extracts embed URLs from TimStreams' public JSON API.

    The API at stra.viaplus.site/main returns a JSON array of categories,
    each containing events with stream URLs pointing to hmembeds.one embeds.
    """

    @property
    def site_key(self) -> str:
        return "timstreams"

    @property
    def site_name(self) -> str:
        return "TimStreams"

    async def extract(self) -> list[ExtractedStream]:
        """Fetch F1 events/channels and return embed URLs for iframe playback."""
        streams: list[ExtractedStream] = []
        seen_urls: set[str] = set()

        try:
            async with httpx.AsyncClient(
                timeout=15.0,
                follow_redirects=True,
                headers={"User-Agent": USER_AGENT, "Accept": "application/json"},
            ) as client:
                resp = await client.get(API_URL)
                if resp.status_code != 200:
                    logger.warning(
                        "[timstreams] API returned HTTP %d", resp.status_code
                    )
                    return []

                data = resp.json()
                if not isinstance(data, list):
                    logger.warning("[timstreams] Unexpected API response type: %s", type(data).__name__)
                    return []

                logger.info("[timstreams] API returned %d category(ies)", len(data))

                for category in data:
                    category_name = category.get("category", "Unknown")
                    events = category.get("events", [])
                    if not isinstance(events, list):
                        continue

                    for event in events:
                        event_name = event.get("name", "Unknown")
                        event_streams = event.get("streams", [])

                        if not isinstance(event_streams, list) or not event_streams:
                            continue

                        # Check if any stream URL matches an always-include channel
                        always_include = any(
                            _is_always_include(s.get("url", ""))
                            for s in event_streams
                        )

                        # Filter: must be F1-related or an always-include channel
                        if not always_include and not _is_f1_event(event_name):
                            continue

                        for stream_info in event_streams:
                            stream_name = stream_info.get("name", "")
                            stream_url = stream_info.get("url", "")

                            if not stream_url:
                                continue

                            # Deduplicate by URL
                            if stream_url in seen_urls:
                                continue
                            seen_urls.add(stream_url)

                            # Build a descriptive title
                            title = event_name
                            if stream_name and stream_name.lower() != event_name.lower():
                                title = f"{event_name} - {stream_name}"
                            if category_name:
                                title = f"[{category_name}] {title}"

                            streams.append(
                                ExtractedStream(
                                    url=stream_url,
                                    site_key=self.site_key,
                                    site_name=self.site_name,
                                    quality="",
                                    title=title,
                                    stream_type="embed",
                                    embed_url=stream_url,
                                )
                            )

        except httpx.TimeoutException:
            logger.warning("[timstreams] API request timed out")
        except Exception:
            logger.exception("[timstreams] Failed to fetch from API")

        logger.info("[timstreams] Extracted %d stream(s)", len(streams))
        return streams
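
The extractor can be smoke-tested in isolation with a minimal sketch, assuming the backend package is importable and BaseExtractor needs no constructor arguments:

    import asyncio

    from backend.extractors.timstreams import TimStreamsExtractor

    async def main() -> None:
        streams = await TimStreamsExtractor().extract()
        for s in streams:
            # Each ExtractedStream carries the embed URL used for iframe playback.
            print(f"{s.title} -> {s.embed_url}")

    asyncio.run(main())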

stacks/f1-stream/files/backend/main.py

@@ -3,6 +3,7 @@
import logging
import os
from contextlib import asynccontextmanager
from datetime import datetime, timedelta, timezone

from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.cron import CronTrigger

@@ -13,6 +14,7 @@ from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel
from starlette.responses import Response, StreamingResponse

from backend.embed_proxy import fetch_embed, relay_asset
from backend.extractors import create_extraction_service
from backend.proxy import proxy_playlist, relay_stream
from backend.schedule import ScheduleService

@@ -117,10 +119,6 @@ async def lifespan(app: FastAPI):
    # Startup: load schedule and start background scheduler
    await schedule_service.initialize()

    # Run initial extraction
    logger.info("Running initial stream extraction...")
    await extraction_service.run_extraction()

    # Schedule daily schedule refresh
    scheduler.add_job(
        _scheduled_refresh,

@@ -130,13 +128,18 @@ async def lifespan(app: FastAPI):
        replace_existing=True,
    )

    # Schedule periodic stream extraction (default: every 30 minutes)
    # Schedule periodic stream extraction (default: every 30 minutes).
    # next_run_time fires the first run 8s after startup. We don't run
    # extraction inline here because it calls the playback verifier,
    # which hits http://127.0.0.1:8000/embed for embed streams — uvicorn
    # isn't listening yet inside the lifespan startup phase.
    scheduler.add_job(
        _scheduled_extraction,
        trigger=IntervalTrigger(minutes=30),
        id="stream_extraction",
        name="Extract streams from all registered sites",
        replace_existing=True,
        next_run_time=datetime.now(timezone.utc) + timedelta(seconds=8),
    )

    # Schedule token refresh every 4 minutes (safe margin for 5-min CDN tokens).

@@ -159,6 +162,10 @@ async def lifespan(app: FastAPI):
    # Shutdown
    scheduler.shutdown(wait=False)
    logger.info("APScheduler shut down")
    try:
        await extraction_service.shutdown()
    except Exception:
        logger.exception("extraction_service shutdown failed")


app = FastAPI(title="F1 Streams", lifespan=lifespan)

@@ -409,6 +416,37 @@ async def relay_endpoint(
    )


# --- Embed iframe-stripping proxy ---


@app.get("/embed")
async def embed_proxy(url: str = Query(..., description="Base64url-encoded embed URL")):
    """Proxy a third-party embed page so it can be iframed in our origin.

    Strips X-Frame-Options and CSP frame-ancestors from the upstream
    response, injects a base href + frame-buster-defeat script, and
    forwards a plausible Referer/Origin to bypass upstream allowlists.
    """
    body, headers, status_code = await fetch_embed(url)
    return Response(content=body, headers=headers, status_code=status_code)


@app.get("/embed-asset")
async def embed_asset(
    request: Request,
    url: str = Query(..., description="Base64url-encoded subresource URL"),
):
    """Relay an upstream subresource (JS/CSS/image/etc.) for the embed proxy.

    Used as a fallback when an upstream blocks hotlinked assets via Origin
    or Referer checks. Most assets load directly via the injected <base>
    tag without going through this endpoint.
    """
    range_header = request.headers.get("range")
    stream_gen, headers, status_code = await relay_asset(url, range_header)
    return StreamingResponse(stream_gen, headers=headers, status_code=status_code)
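
# Illustration (not part of main.py): poking /embed by hand, as a sketch.
# The encoding matches _b64url in playback_verifier.py and toBase64Url in
# the frontend (URL-safe base64, padding stripped); the hash is the
# Sky Sports F1 embed from curated.py.
#
#   import base64, httpx
#
#   def b64url(s: str) -> str:
#       return base64.urlsafe_b64encode(s.encode()).decode().rstrip("=")
#
#   embed = "https://hmembeds.one/embed/888520f36cd94c5da4c71fddc1a5fc9b"
#   resp = httpx.get(f"http://127.0.0.1:8000/embed?url={b64url(embed)}")
#   print(resp.status_code, resp.headers.get("content-type"))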

# --- Frontend Static Files ---
# Mount the SvelteKit static build AFTER all API routes so API endpoints take priority.
# SvelteKit adapter-static with ssr=false produces {page}.html files and a fallback index.html.


445  stacks/f1-stream/files/backend/playback_verifier.py  Normal file

@@ -0,0 +1,445 @@
"""Headless-browser playback verification for extracted streams.

The basic health checker (backend/health.py) only validates m3u8 syntax.
For embed/iframe streams it has nothing to check — the previous code blindly
marked every embed `is_live=True`, which meant the stream list was full of
news articles and aggregator landing pages that never actually played.

This module loads each candidate stream URL in headless Chromium (via
Playwright) and looks for *codec-independent* signals that the upstream
serves a playable stream:

- For m3u8: hls.js receives MANIFEST_PARSED + at least one FRAG_LOADED
  event. We don't wait for `<video>` to gain dimensions, because Playwright's
  chromium build doesn't include the H.264/AAC codecs. The user's real
  browser does, so confirming "manifest + segment fetch succeed" is the
  right server-side signal.
- For embed: a `<video>` element appears at top level OR inside the iframe
  (the embed proxy strips X-Frame-Options + frame-buster JS so we can
  introspect the iframe content), OR the player has set up a MediaSource.

Designed to be called from the extraction service's run_extraction()
hook, with bounded concurrency. Each verification typically takes
4-12 seconds.
"""

import asyncio
import base64
import logging
import os
import time
from dataclasses import dataclass

logger = logging.getLogger(__name__)

# Toggle off in development by setting PLAYBACK_VERIFY_ENABLED=false.
VERIFY_ENABLED = os.getenv("PLAYBACK_VERIFY_ENABLED", "true").lower() in ("true", "1", "yes")

# Maximum number of concurrent browser pages.
MAX_CONCURRENCY = int(os.getenv("PLAYBACK_VERIFY_CONCURRENCY", "2"))

# Per-stream verification budget (seconds). Beyond this we declare unplayable.
PER_STREAM_TIMEOUT = float(os.getenv("PLAYBACK_VERIFY_TIMEOUT", "20"))

# Where the embed proxy lives, used to wrap embed URLs so they bypass
# X-Frame-Options/CSP/JS frame-busters during verification. Defaults to
# loopback because verification runs inside the same FastAPI process.
PROXY_BASE = os.getenv("PLAYBACK_VERIFY_PROXY_BASE", "http://127.0.0.1:8000")
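
# Illustration (not part of the module): a dev override that skips the
# browser entirely, assuming the app module path is backend.main:app:
#   PLAYBACK_VERIFY_ENABLED=false uvicorn backend.main:app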

USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)


@dataclass
class PlaybackVerdict:
    is_playable: bool
    signal: str = ""  # which check triggered the positive verdict
    elapsed_ms: int = 0
    error: str = ""


def _b64url(s: str) -> str:
    """URL-safe base64 with padding stripped — matches m3u8_rewriter.encode_url."""
    return base64.urlsafe_b64encode(s.encode()).decode().rstrip("=")
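
# Note: the matching decode (presumably what backend/embed_proxy.py does)
# must restore the stripped padding first, e.g.:
#   base64.urlsafe_b64decode(s + "=" * (-len(s) % 4)).decode()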


def _hls_test_html(m3u8_url: str) -> str:
    """A self-contained HTML page that loads an m3u8 via hls.js into a <video>.

    The page exposes window._verifier with manifest_parsed / frag_loaded
    booleans the verifier polls. It also marks media-error or fatal-error
    so we can distinguish 'upstream is unreachable' from 'codec missing'.
    """
    return f"""<!doctype html>
<html><head><meta charset="utf-8"><title>verify</title>
<script src="https://cdn.jsdelivr.net/npm/hls.js@1.5/dist/hls.min.js"></script>
</head><body>
<video id="v" muted playsinline width="640" height="360"></video>
<script>
window._verifier = {{
  manifest_parsed: false,
  frag_loaded: false,
  media_loaded: false,           // true when MSE has appended any buffer
  fatal_network_error: false,    // upstream truly unreachable
  manifest_incompatible: false,  // codec missing — separate from network reachability
  hls_error_details: ""
}};
const v = document.getElementById('v');
const url = {m3u8_url!r};
function start() {{
  if (window.Hls && Hls.isSupported()) {{
    const hls = new Hls({{enableWorker: true}});
    hls.on(Hls.Events.MANIFEST_PARSED, () => {{ window._verifier.manifest_parsed = true; }});
    hls.on(Hls.Events.FRAG_LOADED, () => {{ window._verifier.frag_loaded = true; }});
    hls.on(Hls.Events.BUFFER_APPENDED, () => {{ window._verifier.media_loaded = true; }});
    hls.on(Hls.Events.ERROR, (_, d) => {{
      window._verifier.hls_error_details = d.details || "";
      if (d.fatal && d.type === Hls.ErrorTypes.NETWORK_ERROR) {{
        window._verifier.fatal_network_error = true;
      }}
      if (d.details === Hls.ErrorDetails.MANIFEST_INCOMPATIBLE_CODECS_ERROR) {{
        window._verifier.manifest_incompatible = true;
      }}
    }});
    hls.loadSource(url);
    hls.attachMedia(v);
  }} else if (v.canPlayType('application/vnd.apple.mpegurl')) {{
    v.src = url;
    v.addEventListener('loadedmetadata', () => {{ window._verifier.manifest_parsed = true; window._verifier.frag_loaded = true; }});
    v.addEventListener('error', () => {{ window._verifier.fatal_network_error = true; }});
  }} else {{
    window._verifier.hls_error_details = "no hls support";
  }}
}}
window.addEventListener('load', start);
</script></body></html>"""


def _embed_test_html(_proxied_embed_url: str) -> str:
    """No longer used — verifier navigates the page directly to the proxy URL.

    The earlier iframe-wrapper approach hit same-origin policy when inspecting
    the iframe's contentDocument (the wrapper page was a data: URL, the iframe
    was http://127.0.0.1:8000), so we couldn't read the embed's DOM.
    """
    return ""


_M3U8_POLL_JS = """
() => {
  const v = window._verifier || {};
  const vid = document.querySelector('video');
  return {
    manifest_parsed: !!v.manifest_parsed,
    frag_loaded: !!v.frag_loaded,
    media_loaded: !!v.media_loaded,
    fatal_network_error: !!v.fatal_network_error,
    manifest_incompatible: !!v.manifest_incompatible,
    hls_error_details: v.hls_error_details || "",
    video_width: vid ? vid.videoWidth : 0,
    video_ready: vid ? vid.readyState : 0,
  };
}
"""


_EMBED_POLL_JS = """
() => {
  try {
    const vids = document.querySelectorAll('video');
    if (vids.length > 0) {
      const v = vids[0];
      return {
        has_video: true,
        src: v.currentSrc || v.src || "",
        width: v.videoWidth,
        ready: v.readyState,
        duration: isFinite(v.duration) ? v.duration : 0,
        media_keys: !!v.mediaKeys,
        sources: v.querySelectorAll('source').length,
      };
    }
    const player_divs = document.querySelectorAll(
      '[id*="player" i], [class*="player" i], [class*="jwplayer" i], [id*="video" i], [class*="video-js" i]'
    );
    return {has_video: false, has_player_div: player_divs.length > 0};
  } catch (e) {
    return {has_video: false, has_player_div: false, err: String(e)};
  }
}
"""


async def _verify_m3u8(page, m3u8_url: str, deadline: float) -> PlaybackVerdict:
    """Confirm an m3u8 URL is fetchable via hls.js end-to-end.

    Positive signal hierarchy:
      1. media_loaded (MSE buffer appended) — strongest, codec-supported.
      2. frag_loaded (hls.js fetched at least one segment) — upstream is OK
         even if the local browser lacks codecs.
      3. manifest_parsed without media_loaded but with manifest_incompatible
         — indicates upstream playlist is valid; player can't decode here
         but a real user's browser will.
    Negative signal:
      - fatal_network_error: upstream is unreachable.
      - timeout with no manifest_parsed: upstream did not respond.
    """
    start = time.monotonic()
    html = _hls_test_html(m3u8_url)
    data_url = "data:text/html;base64," + base64.b64encode(html.encode()).decode()

    try:
        await page.goto(data_url, wait_until="domcontentloaded", timeout=10_000)
    except Exception as e:
        return PlaybackVerdict(
            is_playable=False, error=f"goto failed: {e}",
            elapsed_ms=int((time.monotonic() - start) * 1000),
        )

    last_state: dict = {}
    while time.monotonic() < deadline:
        try:
            state = await page.evaluate(_M3U8_POLL_JS)
        except Exception as e:
            return PlaybackVerdict(
                is_playable=False, error=f"evaluate failed: {e}",
                elapsed_ms=int((time.monotonic() - start) * 1000),
            )
        last_state = state
        if state.get("media_loaded"):
            return PlaybackVerdict(
                is_playable=True, signal="media_loaded",
                elapsed_ms=int((time.monotonic() - start) * 1000),
            )
        if state.get("frag_loaded"):
            return PlaybackVerdict(
                is_playable=True, signal="frag_loaded",
                elapsed_ms=int((time.monotonic() - start) * 1000),
            )
        # MANIFEST_INCOMPATIBLE_CODECS_ERROR fires after hls.js successfully
        # fetched and parsed the manifest — the failure is purely local
        # (chromium lacks H.264). The user's real browser has codecs, so
        # this URL is playable from the user's perspective.
        if state.get("manifest_incompatible"):
            return PlaybackVerdict(
                is_playable=True, signal="manifest_parsed_codec_missing_in_verifier",
                elapsed_ms=int((time.monotonic() - start) * 1000),
            )
        if state.get("manifest_parsed"):
            return PlaybackVerdict(
                is_playable=True, signal="manifest_parsed",
                elapsed_ms=int((time.monotonic() - start) * 1000),
            )
        if state.get("fatal_network_error"):
            return PlaybackVerdict(
                is_playable=False, error="upstream network error",
                elapsed_ms=int((time.monotonic() - start) * 1000),
            )
        await asyncio.sleep(0.25)

    err = "no playback signal"
    if last_state.get("hls_error_details"):
        err = f"hls.js error: {last_state['hls_error_details']}"
    return PlaybackVerdict(
        is_playable=False, error=err,
        elapsed_ms=int((time.monotonic() - start) * 1000),
    )


async def _verify_embed(page, proxied_url: str, deadline: float) -> PlaybackVerdict:
    """Navigate directly to the proxied embed and confirm a player rendered.

    Positive signals (in priority order):
      - <video> with src/sources/mediaKeys set (player wired up).
      - <video> element exists with any state (script ran, player attaching).
      - A player container div (jwplayer, video-js, [id*=player], etc.).

    Loading the embed page directly (not via iframe wrapper) avoids the
    same-origin policy that prevented earlier iframe-introspection runs
    from seeing the embed DOM.
    """
    start = time.monotonic()
    try:
        await page.goto(proxied_url, wait_until="domcontentloaded", timeout=15_000)
    except Exception as e:
        return PlaybackVerdict(
            is_playable=False, error=f"goto failed: {e}",
            elapsed_ms=int((time.monotonic() - start) * 1000),
        )

    # Track the best state seen across all polls. Some embeds load a player
    # div briefly then anti-bot JS tears the DOM down (hmembeds redirects
    # to google.com if its devtool-detection trips). We accept any positive
    # signal observed during the window, even if it's gone by timeout.
    seen_video_wired = False
    seen_video_tag = False
    seen_player_div = False
    last_err = ""

    while time.monotonic() < deadline:
        try:
            r = await page.evaluate(_EMBED_POLL_JS)
        except Exception as e:
            return PlaybackVerdict(
                is_playable=False, error=f"evaluate failed: {e}",
                elapsed_ms=int((time.monotonic() - start) * 1000),
            )
        if r.get("has_video"):
            seen_video_tag = True
            if r.get("src") or r.get("width", 0) > 0 or r.get("media_keys") or r.get("sources", 0) > 0:
                seen_video_wired = True
                return PlaybackVerdict(
                    is_playable=True, signal="video.wired",
                    elapsed_ms=int((time.monotonic() - start) * 1000),
                )
        if r.get("has_player_div"):
            seen_player_div = True
        last_err = r.get("err", "")
        await asyncio.sleep(0.5)

    if seen_video_wired:
        return PlaybackVerdict(is_playable=True, signal="video.wired",
                               elapsed_ms=int((time.monotonic() - start) * 1000))
    if seen_video_tag:
        return PlaybackVerdict(is_playable=True, signal="video.tag_only",
                               elapsed_ms=int((time.monotonic() - start) * 1000))
    if seen_player_div:
        return PlaybackVerdict(is_playable=True, signal="player_div",
                               elapsed_ms=int((time.monotonic() - start) * 1000))

    err = "no <video> or player container found"
    if last_err:
        err += f"; last_err: {last_err}"
    return PlaybackVerdict(is_playable=False, error=err,
                           elapsed_ms=int((time.monotonic() - start) * 1000))


class PlaybackVerifier:
    """Verifies playability of m3u8 and embed URLs via headless Chromium.

    Manages a single browser instance for the process lifetime (cheap per-page
    contexts) and bounds concurrency with a semaphore.
    """

    def __init__(self) -> None:
        self._browser = None
        self._playwright = None
        self._sem = asyncio.Semaphore(MAX_CONCURRENCY)
        self._lock = asyncio.Lock()

    async def _ensure_browser(self):
        if self._browser is not None:
            return self._browser
        async with self._lock:
            if self._browser is not None:
                return self._browser
            try:
                from playwright.async_api import async_playwright
            except ImportError:
                logger.error("playwright not installed — playback verification disabled")
                return None
            self._playwright = await async_playwright().start()
            self._browser = await self._playwright.chromium.launch(
                headless=True,
                args=[
                    "--disable-dev-shm-usage",
                    "--disable-web-security",
                    "--no-sandbox",
                    "--disable-setuid-sandbox",
                    "--disable-features=IsolateOrigins,site-per-process",
                    "--autoplay-policy=no-user-gesture-required",
                ],
            )
            logger.info("Playwright browser launched (concurrency=%d)", MAX_CONCURRENCY)
            return self._browser

    async def shutdown(self) -> None:
        if self._browser is not None:
            try:
                await self._browser.close()
            except Exception:
                logger.exception("error closing browser")
        if self._playwright is not None:
            try:
                await self._playwright.stop()
            except Exception:
                logger.exception("error stopping playwright")
        self._browser = None
        self._playwright = None

    async def verify(self, url: str, stream_type: str) -> PlaybackVerdict:
        if not VERIFY_ENABLED:
            return PlaybackVerdict(is_playable=True, error="disabled")

        browser = await self._ensure_browser()
        if browser is None:
            return PlaybackVerdict(is_playable=False, error="playwright unavailable")

        is_m3u8 = stream_type == "m3u8"
        if not is_m3u8:
            url = f"{PROXY_BASE}/embed?url={_b64url(url)}"

        async with self._sem:
            # Set the per-stream deadline AFTER acquiring the semaphore.
            # Otherwise queued streams that wait behind earlier ones
            # would have already-expired deadlines when they start.
            deadline = time.monotonic() + PER_STREAM_TIMEOUT
            try:
                context = await browser.new_context(
                    user_agent=USER_AGENT,
                    viewport={"width": 1280, "height": 720},
                    bypass_csp=True,
                )
                page = await context.new_page()
            except Exception as e:
                return PlaybackVerdict(
                    is_playable=False, error=f"context create failed: {e}",
                )
            try:
                if is_m3u8:
                    verdict = await _verify_m3u8(page, url, deadline)
                else:
                    verdict = await _verify_embed(page, url, deadline)
            except asyncio.TimeoutError:
                verdict = PlaybackVerdict(is_playable=False, error="overall timeout")
            except Exception as e:
                verdict = PlaybackVerdict(
                    is_playable=False, error=f"verify exception: {e}",
                )
            finally:
                try:
                    await page.close()
                    await context.close()
                except Exception:
                    pass
            logger.info(
                "[verify] %s -> playable=%s signal=%s err=%s elapsed=%dms",
                url[:120], verdict.is_playable, verdict.signal,
                verdict.error, verdict.elapsed_ms,
            )
            return verdict

    async def verify_many(self, items: list[tuple[str, str]]) -> dict[str, PlaybackVerdict]:
        if not items:
            return {}
        if not VERIFY_ENABLED:
            return {url: PlaybackVerdict(is_playable=True, error="disabled") for url, _ in items}

        async def _run(url: str, stream_type: str):
            verdict = await self.verify(url, stream_type)
            return url, verdict

        results = await asyncio.gather(
            *[_run(url, st) for url, st in items], return_exceptions=True
        )
        out: dict[str, PlaybackVerdict] = {}
        for r in results:
            if isinstance(r, Exception):
                logger.exception("verify task crashed: %s", r)
                continue
            url, verdict = r
            out[url] = verdict
        return out
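
The verifier can be exercised standalone with a minimal sketch. The m3u8 URL below is a placeholder, the embed hash is the Sky Sports F1 channel from curated.py, and embed checks additionally need the FastAPI app (and thus the /embed proxy at PROXY_BASE) listening on 127.0.0.1:8000:

    import asyncio

    from backend.playback_verifier import PlaybackVerifier

    async def main() -> None:
        verifier = PlaybackVerifier()
        verdicts = await verifier.verify_many([
            ("https://example.com/live/master.m3u8", "m3u8"),  # placeholder URL
            ("https://hmembeds.one/embed/888520f36cd94c5da4c71fddc1a5fc9b", "embed"),
        ])
        for url, v in verdicts.items():
            print(url[:60], v.is_playable, v.signal or v.error, f"{v.elapsed_ms}ms")
        await verifier.shutdown()

    asyncio.run(main())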

stacks/f1-stream/files/backend/requirements.txt

@@ -3,3 +3,4 @@ uvicorn[standard]
httpx>=0.27.0
apscheduler>=3.10.0,<4.0
pydantic>=2.0.0
playwright==1.48.0

frontend/lib/api.js

@@ -44,6 +44,20 @@ export function getProxyUrl(m3u8Url) {
  return `${API_BASE}/proxy?url=${encoded}`;
}

/**
 * Get the embed-proxy URL for an upstream iframe embed page.
 *
 * The proxy strips X-Frame-Options / CSP frame-ancestors and injects a
 * frame-buster-defeat script so the embed renders inside our iframe even
 * when the upstream tries to block it.
 * @param {string} embedUrl - The original embed page URL
 * @returns {string} URL pointing at our /embed proxy
 */
export function getEmbedProxyUrl(embedUrl) {
  const encoded = toBase64Url(embedUrl);
  return `${API_BASE}/embed?url=${encoded}`;
}

/**
 * Mark a stream as actively being watched (enables token refresh).
 * @param {string} url - The stream URL

watch/+page.svelte

@@ -1,5 +1,5 @@
<script>
  import { fetchStreams, fetchSchedule, getProxyUrl, activateStream, deactivateStream } from '$lib/api.js';
  import { fetchStreams, fetchSchedule, getProxyUrl, getEmbedProxyUrl, activateStream, deactivateStream } from '$lib/api.js';
  import { onMount, onDestroy } from 'svelte';
  import { page } from '$app/state';

@@ -107,12 +107,14 @@
    }

    if (stream.stream_type === 'embed') {
      // Embed/iframe player — no hls.js needed
      // Embed/iframe player — route through our /embed proxy so the
      // upstream's X-Frame-Options / CSP / JS frame-busters can't
      // block the iframe.
      const newPlayer = {
        id: Date.now(),
        proxyUrl: '',
        originalUrl: stream.embed_url,
        embedUrl: stream.embed_url,
        embedUrl: getEmbedProxyUrl(stream.embed_url),
        streamType: 'embed',
        siteKey: stream.site_key || '',
        siteName: stream.site_name || stream.site_key || 'Unknown',

main.tf

@@ -104,11 +104,11 @@ resource "kubernetes_deployment" "f1-stream" {
          name = "f1-stream"
          resources {
            limits = {
              memory = "256Mi"
              memory = "1Gi"
            }
            requests = {
              cpu    = "25m"
              memory = "256Mi"
              cpu    = "100m"
              memory = "1Gi"
            }
          }
          port {