infra/stacks/f1-stream/files/backend/playback_verifier.py
Viktor Barzin d77a02357c chrome-service: in-cluster headed Chromium pool for f1-stream verifier
The f1-stream verifier's in-process headless Chromium kept tripping
hmembeds' disable-devtool.js Performance detector (CDP latency on
console.log vs console.table) and getting redirected to google.com.

This adds a single-replica chrome-service stack running Playwright
launch-server under Xvfb so callers can connect via WS+token to a
shared headed browser. f1-stream's _ensure_browser now prefers
chromium.connect(CHROME_WS_URL/CHROME_WS_TOKEN) and adds a vendored
stealth init script (webdriver/plugins/languages/Permissions/WebGL
spoofs + querySelector hijack to disarm disable-devtool-auto) on
every new context. Falls back to in-process headless if the env
vars aren't set.

Encrypted PVC for profile + npm cache, NetworkPolicy to TCP/3000
gated by client-namespace label, 6h tar.gz backup CronJob to NFS,
Authentik-gated nginx sidecar at chrome.viktorbarzin.me for human
liveness checks. Image pinned to playwright:v1.48.0-noble in
lockstep with the Python client's playwright==1.48.0.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-07 10:43:40 +00:00

449 lines
18 KiB
Python

"""Headless-browser playback verification for extracted streams.
The basic health checker (backend/health.py) only validates m3u8 syntax.
For embed/iframe streams it has nothing to check — the previous code blindly
marked every embed `is_live=True`, which meant the stream list was full of
news articles and aggregator landing pages that never actually played.
This module loads each candidate stream URL in headless Chromium (via
Playwright) and looks for *codec-independent* signals that the upstream
serves a playable stream:
- For m3u8: hls.js receives MANIFEST_PARSED + at least one FRAG_LOADED
event. We don't wait for `<video>` to gain dimensions, because Playwright's
chromium build doesn't include the H.264/AAC codecs. The user's real
browser does, so confirming "manifest + segment fetch succeed" is the
right server-side signal.
- For embed: a `<video>` element appears at top level OR inside the iframe
(the embed proxy strips X-Frame-Options + frame-buster JS so we can
introspect the iframe content), OR the player has set up a MediaSource.
Designed to be called from the extraction service's run_extraction()
hook, with bounded concurrency. Each verification typically takes
4-12 seconds.
"""
import asyncio
import base64
import logging
import os
import time
from dataclasses import dataclass
# Module-level logger shared by every function in this file.
logger = logging.getLogger(__name__)

# Verification is on by default; export PLAYBACK_VERIFY_ENABLED=false (for
# example in development) to switch it off.
VERIFY_ENABLED = os.environ.get("PLAYBACK_VERIFY_ENABLED", "true").lower() in {
    "true",
    "1",
    "yes",
}

# Upper bound on the number of browser pages open at the same time.
MAX_CONCURRENCY = int(os.environ.get("PLAYBACK_VERIFY_CONCURRENCY", "2"))

# Seconds each stream gets before it is declared unplayable.
PER_STREAM_TIMEOUT = float(os.environ.get("PLAYBACK_VERIFY_TIMEOUT", "20"))

# Base URL of the embed proxy. Embed URLs are wrapped through it during
# verification so X-Frame-Options/CSP/JS frame-busters do not get in the way.
# Loopback is the default because verification runs inside the same FastAPI
# process as the proxy.
PROXY_BASE = os.environ.get("PLAYBACK_VERIFY_PROXY_BASE", "http://127.0.0.1:8000")

# User-Agent presented by every verification browser context.
USER_AGENT = " ".join(
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/120.0.0.0 Safari/537.36",
    )
)
@dataclass
class PlaybackVerdict:
    """Outcome of a single playback verification attempt."""

    # True when the stream is considered playable.
    is_playable: bool
    # Name of the check that triggered a positive verdict (empty otherwise).
    signal: str = ""
    # How long the verification took, in milliseconds.
    elapsed_ms: int = 0
    # Explanatory message, e.g. the failure reason or "disabled".
    error: str = ""
def _b64url(s: str) -> str:
"""URL-safe base64 with padding stripped — matches m3u8_rewriter.encode_url."""
return base64.urlsafe_b64encode(s.encode()).decode().rstrip("=")
def _hls_test_html(m3u8_url: str) -> str:
"""A self-contained HTML page that loads an m3u8 via hls.js into a <video>.
The page exposes window._verifier with manifest_parsed / frag_loaded
booleans the verifier polls. It also marks media-error or fatal-error
so we can distinguish 'upstream is unreachable' from 'codec missing'.
"""
return f"""<!doctype html>
<html><head><meta charset="utf-8"><title>verify</title>
<script src="https://cdn.jsdelivr.net/npm/hls.js@1.5/dist/hls.min.js"></script>
</head><body>
<video id="v" muted playsinline width="640" height="360"></video>
<script>
window._verifier = {{
manifest_parsed: false,
frag_loaded: false,
media_loaded: false, // true when MSE has appended any buffer
fatal_network_error: false, // upstream truly unreachable
manifest_incompatible: false, // codec missing — separate from network reachability
hls_error_details: ""
}};
const v = document.getElementById('v');
const url = {m3u8_url!r};
function start() {{
if (window.Hls && Hls.isSupported()) {{
const hls = new Hls({{enableWorker: true}});
hls.on(Hls.Events.MANIFEST_PARSED, () => {{ window._verifier.manifest_parsed = true; }});
hls.on(Hls.Events.FRAG_LOADED, () => {{ window._verifier.frag_loaded = true; }});
hls.on(Hls.Events.BUFFER_APPENDED, () => {{ window._verifier.media_loaded = true; }});
hls.on(Hls.Events.ERROR, (_, d) => {{
window._verifier.hls_error_details = d.details || "";
if (d.fatal && d.type === Hls.ErrorTypes.NETWORK_ERROR) {{
window._verifier.fatal_network_error = true;
}}
if (d.details === Hls.ErrorDetails.MANIFEST_INCOMPATIBLE_CODECS_ERROR) {{
window._verifier.manifest_incompatible = true;
}}
}});
hls.loadSource(url);
hls.attachMedia(v);
}} else if (v.canPlayType('application/vnd.apple.mpegurl')) {{
v.src = url;
v.addEventListener('loadedmetadata', () => {{ window._verifier.manifest_parsed = true; window._verifier.frag_loaded = true; }});
v.addEventListener('error', () => {{ window._verifier.fatal_network_error = true; }});
}} else {{
window._verifier.hls_error_details = "no hls support";
}}
}}
window.addEventListener('load', start);
</script></body></html>"""
def _embed_test_html(_proxied_embed_url: str) -> str:
"""No longer used — verifier navigates the page directly to the proxy URL.
The earlier iframe-wrapper approach hit same-origin policy when inspecting
the iframe's contentDocument (the wrapper page was a data: URL, the iframe
was http://127.0.0.1:8000), so we couldn't read the embed's DOM.
"""
return ""
# JavaScript arrow function evaluated in the hls.js test page on every poll.
# It snapshots the window._verifier flags (coerced to booleans), the last
# hls.js error detail string, and the <video> element's videoWidth/readyState
# (0 when no <video> exists).
_M3U8_POLL_JS = """
() => {
const v = window._verifier || {};
const vid = document.querySelector('video');
return {
manifest_parsed: !!v.manifest_parsed,
frag_loaded: !!v.frag_loaded,
media_loaded: !!v.media_loaded,
fatal_network_error: !!v.fatal_network_error,
manifest_incompatible: !!v.manifest_incompatible,
hls_error_details: v.hls_error_details || "",
video_width: vid ? vid.videoWidth : 0,
video_ready: vid ? vid.readyState : 0,
};
}
"""
# JavaScript arrow function evaluated in the embed page on every poll.
# It looks only at the first <video> in the page's own document and reports
# whether one exists plus its source, dimensions, readiness, duration,
# mediaKeys presence and <source> child count. Any exception thrown inside
# the page is caught and returned as {has_video: false, err: ...}.
_EMBED_POLL_JS = """
() => {
try {
const vids = document.querySelectorAll('video');
if (vids.length > 0) {
const v = vids[0];
return {
has_video: true,
src: v.currentSrc || v.src || "",
width: v.videoWidth,
ready: v.readyState,
duration: isFinite(v.duration) ? v.duration : 0,
media_keys: !!v.mediaKeys,
sources: v.querySelectorAll('source').length,
};
}
return {has_video: false};
} catch (e) {
return {has_video: false, err: String(e)};
}
}
"""
async def _verify_m3u8(page, m3u8_url: str, deadline: float) -> PlaybackVerdict:
    """Check that an m3u8 URL can be loaded end-to-end by hls.js.

    The hls.js test page is loaded as a base64 data: URL and its state is
    polled every 250 ms until ``deadline`` (a ``time.monotonic()`` value).

    Positive signals, checked in this order on every poll:
      1. media_loaded — an MSE buffer was appended (strongest).
      2. frag_loaded — hls.js fetched at least one segment.
      3. manifest_incompatible — the manifest was fetched and parsed but the
         verifier's Chromium cannot decode it; a real browser with codecs can.
      4. manifest_parsed — the playlist was fetched and parsed.

    Negative signals:
      - fatal_network_error (only consulted when no positive flag is set).
      - navigation or evaluate failure.
      - deadline reached with none of the above.

    Args:
        page: Playwright page used for the check.
        m3u8_url: Playlist URL to verify.
        deadline: Monotonic-clock time at which polling stops.

    Returns:
        A PlaybackVerdict carrying the winning signal or the failure reason.
    """
    started = time.monotonic()

    def _finish(playable: bool, *, signal: str = "", error: str = "") -> PlaybackVerdict:
        # Every exit path stamps the elapsed time the same way.
        return PlaybackVerdict(
            is_playable=playable,
            signal=signal,
            error=error,
            elapsed_ms=int((time.monotonic() - started) * 1000),
        )

    page_html = _hls_test_html(m3u8_url)
    encoded_page = base64.b64encode(page_html.encode()).decode()
    try:
        await page.goto(
            "data:text/html;base64," + encoded_page,
            wait_until="domcontentloaded",
            timeout=10_000,
        )
    except Exception as exc:
        return _finish(False, error=f"goto failed: {exc}")

    # (state flag, reported signal) pairs in priority order. The codec error
    # counts as positive because it fires only after hls.js fetched and parsed
    # the manifest — the failure is local to the verifier's Chromium build.
    positive_signals = (
        ("media_loaded", "media_loaded"),
        ("frag_loaded", "frag_loaded"),
        ("manifest_incompatible", "manifest_parsed_codec_missing_in_verifier"),
        ("manifest_parsed", "manifest_parsed"),
    )

    snapshot: dict = {}
    while time.monotonic() < deadline:
        try:
            snapshot = await page.evaluate(_M3U8_POLL_JS)
        except Exception as exc:
            return _finish(False, error=f"evaluate failed: {exc}")

        for flag, signal_name in positive_signals:
            if snapshot.get(flag):
                return _finish(True, signal=signal_name)

        if snapshot.get("fatal_network_error"):
            return _finish(False, error="upstream network error")

        await asyncio.sleep(0.25)

    # Timed out: surface the last hls.js error detail if there was one.
    details = snapshot.get("hls_error_details")
    message = f"hls.js error: {details}" if details else "no playback signal"
    return _finish(False, error=message)
async def _verify_embed(page, proxied_url: str, deadline: float) -> PlaybackVerdict:
    """Navigate directly to the proxied embed and confirm a player rendered.

    Positive signals (in priority order):
      - "video.wired": a <video> with a src, non-zero width, mediaKeys or
        <source> children (player wired up) — returned as soon as it is seen.
      - "video.tag_only": a <video> element was seen at any point during the
        polling window, even if it is gone by the time polling ends.

    A "player container div" is deliberately not accepted: it is too weak
    (sportsurge has player-class divs but no real player).

    Loading the embed page directly (not via an iframe wrapper) avoids the
    same-origin policy that prevented earlier iframe-introspection runs from
    seeing the embed DOM.

    Args:
        page: Playwright page used for the check.
        proxied_url: Embed URL already wrapped through the embed proxy.
        deadline: ``time.monotonic()`` value at which polling stops.

    Returns:
        A PlaybackVerdict carrying the winning signal or the failure reason.
    """
    start = time.monotonic()

    def _verdict(is_playable: bool, *, signal: str = "", error: str = "") -> PlaybackVerdict:
        # Single place that stamps elapsed time onto every outcome.
        return PlaybackVerdict(
            is_playable=is_playable,
            signal=signal,
            error=error,
            elapsed_ms=int((time.monotonic() - start) * 1000),
        )

    try:
        await page.goto(proxied_url, wait_until="domcontentloaded", timeout=15_000)
    except Exception as e:
        return _verdict(False, error=f"goto failed: {e}")

    # Track the best state seen across all polls. Some embeds load a player
    # briefly then anti-bot JS tears the DOM down (hmembeds redirects to
    # google.com if its devtool-detection trips). Any positive signal observed
    # during the window is accepted, even if it is gone later.
    seen_video_tag = False
    last_err = ""
    while time.monotonic() < deadline:
        try:
            r = await page.evaluate(_EMBED_POLL_JS)
        except Exception as e:
            # evaluate can fail when the page navigates away mid-poll (e.g.
            # the anti-bot redirect described above). A <video> observed
            # earlier still counts, so do not discard it here.
            if seen_video_tag:
                return _verdict(True, signal="video.tag_only")
            return _verdict(False, error=f"evaluate failed: {e}")
        if r.get("has_video"):
            seen_video_tag = True
            wired = (
                r.get("src")
                or r.get("width", 0) > 0
                or r.get("media_keys")
                or r.get("sources", 0) > 0
            )
            if wired:
                return _verdict(True, signal="video.wired")
        last_err = r.get("err", "")
        await asyncio.sleep(0.5)

    if seen_video_tag:
        return _verdict(True, signal="video.tag_only")
    err = "no <video> element rendered"
    if last_err:
        err += f"; last_err: {last_err}"
    return _verdict(False, error=err)
class PlaybackVerifier:
    """Verifies playability of m3u8 and embed URLs via Chromium.

    Holds a single browser instance for the process lifetime (each
    verification gets its own short-lived browser context) and bounds
    concurrency with a semaphore sized by MAX_CONCURRENCY.
    """

    def __init__(self) -> None:
        self._browser = None
        self._playwright = None
        self._sem = asyncio.Semaphore(MAX_CONCURRENCY)
        # Serialises first-time browser start-up across concurrent callers.
        self._lock = asyncio.Lock()

    async def _ensure_browser(self):
        """Return the shared browser, starting or connecting to it on first use.

        Connects to the remote chrome-service when both CHROME_WS_URL and
        CHROME_WS_TOKEN are set; otherwise launches in-process headless
        Chromium.

        Returns:
            The browser, or None when the playwright package is not installed.

        Raises:
            Exception: whatever Playwright raises when connecting/launching
                fails. The Playwright driver started for that attempt is
                stopped first so a failed attempt does not leak it.
        """
        if self._browser is not None:
            return self._browser
        async with self._lock:
            # Re-check: another task may have finished start-up while this
            # one was waiting for the lock.
            if self._browser is not None:
                return self._browser
            try:
                from playwright.async_api import async_playwright
            except ImportError:
                logger.error("playwright not installed — playback verification disabled")
                return None
            playwright = await async_playwright().start()
            try:
                ws_base = os.getenv("CHROME_WS_URL")
                ws_token = os.getenv("CHROME_WS_TOKEN")
                if ws_base and ws_token:
                    browser = await playwright.chromium.connect(
                        f"{ws_base.rstrip('/')}/{ws_token}", timeout=15_000,
                    )
                    logger.info("connected to remote chrome-service (concurrency=%d)", MAX_CONCURRENCY)
                else:
                    browser = await playwright.chromium.launch(
                        headless=True,
                        args=[
                            "--disable-dev-shm-usage",
                            "--disable-web-security",
                            "--no-sandbox",
                            "--disable-setuid-sandbox",
                            "--disable-features=IsolateOrigins,site-per-process",
                            "--autoplay-policy=no-user-gesture-required",
                        ],
                    )
                    logger.warning("CHROME_WS_URL not set — using in-process Chromium (concurrency=%d)", MAX_CONCURRENCY)
            except Exception:
                # Start-up failed: stop the driver we just started, otherwise
                # every retry would leave another one running.
                try:
                    await playwright.stop()
                except Exception:
                    logger.exception("error stopping playwright after failed browser start")
                raise
            # Publish only once both pieces are known-good.
            self._playwright = playwright
            self._browser = browser
            return self._browser

    async def shutdown(self) -> None:
        """Close the browser and stop Playwright; errors are logged, not raised."""
        if self._browser is not None:
            try:
                await self._browser.close()
            except Exception:
                logger.exception("error closing browser")
        if self._playwright is not None:
            try:
                await self._playwright.stop()
            except Exception:
                logger.exception("error stopping playwright")
        self._browser = None
        self._playwright = None

    async def verify(self, url: str, stream_type: str) -> PlaybackVerdict:
        """Verify a single stream URL.

        Args:
            url: Stream URL. Anything that is not an m3u8 is wrapped through
                the embed proxy before being loaded.
            stream_type: "m3u8" selects the hls.js check; any other value
                selects the embed check.

        Returns:
            A PlaybackVerdict. When verification is disabled the verdict is
            playable with error="disabled".
        """
        if not VERIFY_ENABLED:
            return PlaybackVerdict(is_playable=True, error="disabled")
        browser = await self._ensure_browser()
        if browser is None:
            return PlaybackVerdict(is_playable=False, error="playwright unavailable")
        is_m3u8 = stream_type == "m3u8"
        if not is_m3u8:
            url = f"{PROXY_BASE}/embed?url={_b64url(url)}"
        async with self._sem:
            # Set the per-stream deadline AFTER acquiring the semaphore.
            # Otherwise queued streams that wait behind earlier ones
            # would have already-expired deadlines when they start.
            deadline = time.monotonic() + PER_STREAM_TIMEOUT
            context = None
            try:
                try:
                    context = await browser.new_context(
                        user_agent=USER_AGENT,
                        viewport={"width": 1280, "height": 720},
                        bypass_csp=True,
                    )
                    from backend.stealth import STEALTH_JS
                    await context.add_init_script(STEALTH_JS)
                    page = await context.new_page()
                except Exception as e:
                    return PlaybackVerdict(
                        is_playable=False, error=f"context create failed: {e}",
                    )
                try:
                    if is_m3u8:
                        verdict = await _verify_m3u8(page, url, deadline)
                    else:
                        verdict = await _verify_embed(page, url, deadline)
                except asyncio.TimeoutError:
                    verdict = PlaybackVerdict(is_playable=False, error="overall timeout")
                except Exception as e:
                    verdict = PlaybackVerdict(
                        is_playable=False, error=f"verify exception: {e}",
                    )
            finally:
                # Always release the context, including when set-up failed
                # half-way (init script / new_page). Closing the context also
                # closes the pages that belong to it.
                if context is not None:
                    try:
                        await context.close()
                    except Exception:
                        # Best-effort cleanup; keep the verdict, note the error.
                        logger.debug("error closing browser context", exc_info=True)
            logger.info(
                "[verify] %s -> playable=%s signal=%s err=%s elapsed=%dms",
                url[:120], verdict.is_playable, verdict.signal,
                verdict.error, verdict.elapsed_ms,
            )
            return verdict

    async def verify_many(self, items: list[tuple[str, str]]) -> dict[str, PlaybackVerdict]:
        """Verify several streams concurrently.

        Args:
            items: (url, stream_type) pairs.

        Returns:
            Mapping of original URL to its verdict. URLs whose verification
            task raised are logged and left out of the mapping.
        """
        if not items:
            return {}
        if not VERIFY_ENABLED:
            return {url: PlaybackVerdict(is_playable=True, error="disabled") for url, _ in items}

        async def _run(url: str, stream_type: str):
            verdict = await self.verify(url, stream_type)
            return url, verdict

        results = await asyncio.gather(
            *[_run(url, st) for url, st in items], return_exceptions=True
        )
        out: dict[str, PlaybackVerdict] = {}
        for r in results:
            # BaseException so a CancelledError returned by gather() is
            # skipped too instead of failing the tuple unpack below.
            if isinstance(r, BaseException):
                # Not inside an except block, so pass the exception explicitly
                # to get its traceback into the log.
                logger.error("verify task crashed: %s", r, exc_info=r)
                continue
            url, verdict = r
            out[url] = verdict
        return out