f1-stream: drop demo + landing-page extractors, add fetch-proxy injection

Per user feedback: the demo Big Buck Bunny / Apple test streams aren't
useful in an F1-streams app. Removed DemoExtractor entirely. Tightened
the discord-extractor path filter from "any stream-shaped path" to
"direct embed/player path only" — the previous filter still let
sportsurge `/event/...` landing pages through, which the verifier
mistook for playable because they render player-class divs without a
real player.

Embed proxy now also rewrites window.fetch + XMLHttpRequest.open inside
the upstream HTML so that cross-origin XHRs (e.g. the hmembeds
`/sec/<JWT>` token-binding endpoint) go through our /embed-asset relay.
This avoids the CORS rejection that fired when the player JS tried to
call hghndasw.gbgdhdffhf.shop/sec/... from an `f1.viktorbarzin.me`
origin.

The verifier now requires a `<video>` element to mark embed streams
playable (not just a player-class div). Curated streams bypass the
verifier — hmembeds aggressively detects headless Chromium (devtool
trap, console-clear timing, automation flags) and won't progress past
JW Player init in our pod, but the user's real browser should clear
those checks. We can't reliably verify hmembeds from a headless browser,
so we trust the curator instead of falsely rejecting curated streams.

Image: viktorbarzin/f1-stream:v6.1.1
This commit is contained in:
Viktor Barzin 2026-05-06 21:50:54 +00:00
parent f90d79ed4e
commit 574cdf08d2
5 changed files with 87 additions and 27 deletions

View file

@ -96,6 +96,63 @@ _FRAME_BUSTER_DEFEAT_TEMPLATE = """
loc.assign = function(u){{ if (typeof u === 'string' && u.indexOf('google.com') !== -1) return; if (origAssign) origAssign(u); }};
loc.replace = function(u){{ if (typeof u === 'string' && u.indexOf('google.com') !== -1) return; if (origReplace) origReplace(u); }};
}} catch (e) {{}}
// Route all cross-origin fetch/XHR requests through our /embed-asset
// proxy. The hmembeds player calls a token-binding endpoint
// (hghndasw.gbgdhdffhf.shop/sec/<JWT>) that CORS-rejects requests from
// any origin other than hmembeds.one. By rewriting the URL to
// /embed-asset?url=..., the browser fetches our same-origin endpoint
// (no CORS issue), and our backend fetches the upstream with the
// correct Referer/Origin server-side (no CORS issue there either).
try {{
var b64url = function(s) {{
return btoa(unescape(encodeURIComponent(s)))
.replace(/\\+/g, '-').replace(/\\//g, '_').replace(/=+$/, '');
}};
var sameOrigin = function(u) {{
try {{ return (new URL(u, document.baseURI || location.href)).origin === location.origin; }}
catch (_) {{ return true; }}
}};
var toAbsolute = function(u) {{
try {{ return (new URL(u, document.baseURI || location.href)).toString(); }}
catch (_) {{ return u; }}
}};
var proxify = function(u) {{
var abs = toAbsolute(u);
if (sameOrigin(abs)) return u;
// Don't double-proxy.
if (abs.indexOf('/embed-asset?') !== -1 || abs.indexOf('/embed?') !== -1) return u;
return location.origin + '/embed-asset?url=' + b64url(abs);
}};
var _fetch = window.fetch && window.fetch.bind(window);
if (_fetch) {{
window.fetch = function(input, init) {{
try {{
if (typeof input === 'string') {{
return _fetch(proxify(input), init);
}} else if (input && input.url) {{
var newUrl = proxify(input.url);
if (newUrl !== input.url) {{
return _fetch(new Request(newUrl, input), init);
}}
}}
}} catch (e) {{}}
return _fetch(input, init);
}};
}}
var XHR = window.XMLHttpRequest;
if (XHR && XHR.prototype && XHR.prototype.open) {{
var _open = XHR.prototype.open;
XHR.prototype.open = function(method, url) {{
try {{ url = proxify(url); }} catch (e) {{}}
var args = Array.prototype.slice.call(arguments);
args[1] = url;
return _open.apply(this, args);
}};
}}
}} catch (e) {{}}
}})();</script>
"""

View file

@ -14,7 +14,6 @@ Example:
from backend.extractors.aceztrims import AceztrimsExtractor
from backend.extractors.curated import CuratedExtractor
from backend.extractors.daddylive import DaddyLiveExtractor
from backend.extractors.demo import DemoExtractor
from backend.extractors.discord_source import DiscordExtractor
from backend.extractors.models import ExtractedStream
from backend.extractors.pitsport import PitsportExtractor
@ -42,12 +41,11 @@ def create_registry() -> ExtractorRegistry:
# --- Register extractors below ---
# CuratedExtractor returns hand-picked 24/7 channels first so we always
# have something. FallbackExtractor was removed — it surfaced aggregator
# landing pages that don't play directly in an iframe (they require
# user navigation through the page) and dominated the list with
# entries that fail browser-based playback verification.
# have something. DemoExtractor and FallbackExtractor were removed —
# demo streams aren't F1 content (just Big Buck Bunny etc.) and
# FallbackExtractor surfaced aggregator landing pages that don't play
# directly in an iframe.
registry.register(CuratedExtractor())
registry.register(DemoExtractor())
registry.register(StreamedExtractor())
registry.register(DaddyLiveExtractor())
registry.register(AceztrimsExtractor())

View file

@ -42,13 +42,13 @@ EXCLUDED_DOMAINS = {
}
# A URL is treated as a candidate stream embed only if its path looks like
# a stream/embed/player route. This catches /embed/{id}, /stream/{id},
# /watch/{id}, /live/{slug}, /player/{...} and similar — and rejects
# /article/, /news/, /latest/, /join/, etc.
# a *direct* player/embed page — `/embed/{id}`, `/player/{...}`, `*.m3u8`,
# `*.php` (legacy iframe1.php style). Aggregator landing pages
# (`/event/...`, `/watch?session=...`, etc.) are rejected because they
# show a list of links instead of playing automatically — those produce
# verifier-passing UI without actual playback.
_PATH_KEYWORDS = (
"embed/", "/stream", "/streams", "/watch", "/live",
"/player", "/play/", "/sky", "/f1/", "/formula",
"/grand-prix", "/gp/", "/channel", ".m3u8", ".php",
"/embed/", "/player/", ".m3u8", ".php",
)

View file

@ -94,10 +94,21 @@ class ExtractionService:
stream.is_live = verdict.is_playable
stream.checked_at = now_iso
# Curated streams skip the verifier — they are hand-picked
# 24/7 channels whose embed pages aggressively detect headless
# automation. We can't reliably confirm playback server-side,
# but we trust the curator. The user's real browser does NOT
# trigger the same anti-bot heuristics (real plugins, real
# mouse movements, etc.).
CURATED_BYPASS = {"curated"}
for stream in embed_streams:
stream.checked_at = now_iso
if stream.site_key in CURATED_BYPASS:
stream.is_live = True
stream.response_time_ms = 0
continue
key = stream.embed_url or stream.url
verdict = verdicts.get(key)
stream.checked_at = now_iso
if verdict is None:
# Verifier unavailable — fall back to "trust extractor".
# This keeps the service usable even without playwright.

View file

@ -162,12 +162,9 @@ _EMBED_POLL_JS = """
sources: v.querySelectorAll('source').length,
};
}
const player_divs = document.querySelectorAll(
'[id*="player" i], [class*="player" i], [class*="jwplayer" i], [id*="video" i], [class*="video-js" i]'
);
return {has_video: false, has_player_div: player_divs.length > 0};
return {has_video: false};
} catch (e) {
return {has_video: false, has_player_div: false, err: String(e)};
return {has_video: false, err: String(e)};
}
}
"""
@ -271,12 +268,14 @@ async def _verify_embed(page, proxied_url: str, deadline: float) -> PlaybackVerd
)
# Track the best state seen across all polls. Some embeds load a player
# div briefly then anti-bot JS tears the DOM down (hmembeds redirects
# to google.com if its devtool-detection trips). We accept any positive
# briefly then anti-bot JS tears the DOM down (hmembeds redirects to
# google.com if its devtool-detection trips). We accept any positive
# signal observed during the window, even if it's gone by timeout.
#
# We require an actual <video> element — a "player container div"
# is too weak (sportsurge has player-class divs but no real player).
seen_video_wired = False
seen_video_tag = False
seen_player_div = False
last_err = ""
while time.monotonic() < deadline:
@ -295,8 +294,6 @@ async def _verify_embed(page, proxied_url: str, deadline: float) -> PlaybackVerd
is_playable=True, signal="video.wired",
elapsed_ms=int((time.monotonic() - start) * 1000),
)
if r.get("has_player_div"):
seen_player_div = True
last_err = r.get("err", "")
await asyncio.sleep(0.5)
@ -306,11 +303,8 @@ async def _verify_embed(page, proxied_url: str, deadline: float) -> PlaybackVerd
if seen_video_tag:
return PlaybackVerdict(is_playable=True, signal="video.tag_only",
elapsed_ms=int((time.monotonic() - start) * 1000))
if seen_player_div:
return PlaybackVerdict(is_playable=True, signal="player_div",
elapsed_ms=int((time.monotonic() - start) * 1000))
err = "no <video> or player container found"
err = "no <video> element rendered"
if last_err:
err += f"; last_err: {last_err}"
return PlaybackVerdict(is_playable=False, error=err,