f1-stream: revive aceztrims + pitsport, more ppv variants

- aceztrims: scrape /f11/ (the actual stream page), not /f1/ (the
  cross-sport schedule). Drop the dead /iframe1?s= + onclick m3u8
  regexes (site moved to `getElementById('iframe').src = '...'` ~20
  channels ago). Strip HTML comments first so the ~20 legacy buttons
  kept inside <!-- ... --> stop showing up as false positives.
  Also pick up the default inline <iframe id='iframe' src='...'>.
  Local run: 11 channels (was 0).

- pitsport: decode the RSC payload before regex-matching in
  _parse_live_events (raw HTML had it escape-encoded, so the homepage
  card path silently yielded 0 events). Add the new /live-now route (canonical
  what's-live-right-now list). Add "f1" to MOTORSPORT_CATEGORIES — the
  site labels Formula 1 events as just "F1". Refresh the stale
  serveplay.site docstring (host rotates; pushembdz's api/stream link
  is authoritative).
  Local run: 7 m3u8 streams covering Canadian GP (EN1/EN2/MULTI/ITA/ESP)
  + NASCAR Coke 600 (was 0).

- ppv: always emit the parent embed alongside substreams (was dropping
  it whenever substreams existed). Prefer source_tag in substream titles
  so users see "Sky Sport 1 NZ" / "Apple TV (F1TV)" instead of generic
  #1/#2 suffixes.

Diagnosed against the live cluster, where curated + 7 other extractors
were returning 0 cached streams and only 2 dead hmembeds curated 24/7
channels were visible to users. Each fix was verified by running the
extractor against the live sites this turn.
This commit is contained in:
Viktor Barzin 2026-05-24 22:05:37 +00:00
parent d5f73ce109
commit 5a0e4b3dac
3 changed files with 147 additions and 125 deletions

View file

@ -1,13 +1,24 @@
"""Aceztrims extractor - scrapes F1 streaming links from Aceztrims pages. """Aceztrims extractor — scrapes embed URLs from acestrlms.pages.dev/f11/.
Parses HTML for iframe button onclick handlers and extracts streams from: The page (Cloudflare Pages, no anti-bot) hosts an iframe + a strip of
- /iframe1?s=<m3u8_url> direct m3u8 onclick channel-switcher buttons. Each button rewrites the iframe via
- https://pooembed.eu/embed/... embed URL `document.getElementById('iframe').src = '<embed_url>'`. The initial
channel is hard-coded as `<iframe id='iframe' src='...'>`.
We strip HTML comments first because the page keeps ~20 legacy channel
buttons inside `<!-- ... -->` blocks for easy re-enablement; the previous
loose regex picked them up as false positives.
All channels are iframe embeds (no direct m3u8), so `stream_type='embed'`.
Site naming note: the extractor key stays `aceztrims` (the previous
domain) so registry/cache identifiers don't churn. The current domain
is `acestrlms.pages.dev` and the F1 path is `/f11/` (two ones — `/f1/`
is the cross-sport schedule page and has no stream buttons).
""" """
import logging import logging
import re import re
from urllib.parse import parse_qs, urlparse
import httpx import httpx
@ -17,9 +28,8 @@ from backend.extractors.models import ExtractedStream
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
BASE_URL = "https://acestrlms.pages.dev" BASE_URL = "https://acestrlms.pages.dev"
# Pages to scrape for streams
F1_PAGES = [ F1_PAGES = [
("/f1/", "Formula 1"), ("/f11/", "Formula 1"),
] ]
USER_AGENT = ( USER_AGENT = (
@ -28,13 +38,21 @@ USER_AGENT = (
"Chrome/120.0.0.0 Safari/537.36" "Chrome/120.0.0.0 Safari/537.36"
) )
# `document.getElementById('iframe').src = '<URL>'` — current channel-switcher
# format. Group 1 is the embed URL assigned to the iframe; either quote style
# is accepted around both the element id and the URL.
_ONCLICK_IFRAME_SRC = re.compile(
    r"""document\.getElementById\(['"]iframe['"]\)\.src\s*=\s*['"]([^'"]+)['"]""",
    re.IGNORECASE,
)
# `<iframe id='iframe' src='<URL>'>` — the default/initial channel. Group 1 is
# the URL. The lookahead checks for `id='iframe'` anywhere inside the tag, so
# the match no longer depends on `id` being written before `src`. The
# `(?<![\w-])` lookbehinds keep suffix matches such as `data-id=` or
# `data-src=` from being mistaken for the real `id=` / `src=` attributes.
_DEFAULT_IFRAME = re.compile(
    r"""<iframe\b(?=[^>]*?(?<![\w-])id\s*=\s*['"]iframe['"])[^>]*?(?<![\w-])src\s*=\s*['"]([^'"]+)['"]""",
    re.IGNORECASE,
)
# Non-greedy + DOTALL: each comment (possibly spanning several lines) is
# matched on its own instead of swallowing everything between the first
# `<!--` and the last `-->`.
_HTML_COMMENT = re.compile(r"<!--.*?-->", re.DOTALL)
class AceztrimsExtractor(BaseExtractor): class AceztrimsExtractor(BaseExtractor):
"""Extracts streams from Aceztrims pages by parsing HTML for iframe URLs. """Pulls iframe embed URLs out of the acestrlms.pages.dev F1 page."""
Looks for onclick handlers on buttons/links that open iframes, and
extracts the stream URLs from them.
"""
@property @property
def site_key(self) -> str: def site_key(self) -> str:
@ -45,7 +63,6 @@ class AceztrimsExtractor(BaseExtractor):
return "Aceztrims" return "Aceztrims"
async def extract(self) -> list[ExtractedStream]: async def extract(self) -> list[ExtractedStream]:
"""Scrape all configured F1 pages for stream URLs."""
streams: list[ExtractedStream] = [] streams: list[ExtractedStream] = []
async with httpx.AsyncClient( async with httpx.AsyncClient(
@ -55,12 +72,9 @@ class AceztrimsExtractor(BaseExtractor):
) as client: ) as client:
for path, category in F1_PAGES: for path, category in F1_PAGES:
try: try:
page_streams = await self._scrape_page(client, path, category) streams.extend(await self._scrape_page(client, path, category))
streams.extend(page_streams)
except Exception: except Exception:
logger.exception( logger.exception("[aceztrims] Failed to scrape %s", path)
"[aceztrims] Failed to scrape page %s", path
)
logger.info("[aceztrims] Extracted %d stream(s)", len(streams)) logger.info("[aceztrims] Extracted %d stream(s)", len(streams))
return streams return streams
@ -68,85 +82,39 @@ class AceztrimsExtractor(BaseExtractor):
async def _scrape_page( async def _scrape_page(
self, client: httpx.AsyncClient, path: str, category: str self, client: httpx.AsyncClient, path: str, category: str
) -> list[ExtractedStream]: ) -> list[ExtractedStream]:
"""Scrape a single page for stream URLs."""
url = f"{BASE_URL}{path}" url = f"{BASE_URL}{path}"
resp = await client.get(url) resp = await client.get(url)
if resp.status_code != 200: if resp.status_code != 200:
logger.warning( logger.warning(
"[aceztrims] Page %s returned HTTP %d", path, resp.status_code "[aceztrims] %s returned HTTP %d", path, resp.status_code
) )
return [] return []
html = resp.text # The page keeps a block of legacy channel buttons inside
# `<!-- ... -->` for quick re-enablement. Strip comments first so
# the regex only sees live buttons.
html = _HTML_COMMENT.sub("", resp.text)
seen: set[str] = set()
streams: list[ExtractedStream] = [] streams: list[ExtractedStream] = []
seen_urls: set[str] = set()
# Pattern 1: /iframe1?s=<m3u8_url> — direct m3u8 for pattern in (_DEFAULT_IFRAME, _ONCLICK_IFRAME_SRC):
iframe1_pattern = re.compile( for match in pattern.finditer(html):
r"""['"]((?:https?://[^'"]*)?/iframe1\?s=([^'"&]+))['""]""", embed_url = match.group(1).strip()
re.IGNORECASE, if not embed_url or embed_url in seen:
) continue
for match in iframe1_pattern.finditer(html): seen.add(embed_url)
m3u8_url = match.group(2) streams.append(
if m3u8_url in seen_urls: ExtractedStream(
continue url=embed_url,
seen_urls.add(m3u8_url) site_key=self.site_key,
site_name=self.site_name,
streams.append( quality="",
ExtractedStream( title=f"{category} Stream",
url=m3u8_url, stream_type="embed",
site_key=self.site_key, embed_url=embed_url,
site_name=self.site_name, )
quality="",
title=f"{category} Stream",
stream_type="m3u8",
) )
)
# Pattern 2: embed URLs (pooembed.eu or similar)
embed_pattern = re.compile(
r"""['"]((https?://(?:pooembed\.eu|[^'"]*embed)[^'"]*))['"]""",
re.IGNORECASE,
)
for match in embed_pattern.finditer(html):
embed_url = match.group(1)
if embed_url in seen_urls:
continue
seen_urls.add(embed_url)
streams.append(
ExtractedStream(
url=embed_url,
site_key=self.site_key,
site_name=self.site_name,
quality="",
title=f"{category} Stream (Embed)",
stream_type="embed",
embed_url=embed_url,
)
)
# Pattern 3: Generic onclick handlers with URLs
onclick_pattern = re.compile(
r"""onclick\s*=\s*['"].*?['"]?(https?://[^'")\s]+\.m3u8[^'")\s]*)['"]?""",
re.IGNORECASE,
)
for match in onclick_pattern.finditer(html):
m3u8_url = match.group(1)
if m3u8_url in seen_urls:
continue
seen_urls.add(m3u8_url)
streams.append(
ExtractedStream(
url=m3u8_url,
site_key=self.site_key,
site_name=self.site_name,
quality="",
title=f"{category} Stream",
stream_type="m3u8",
)
)
logger.info( logger.info(
"[aceztrims] Found %d stream(s) on %s", len(streams), path "[aceztrims] Found %d stream(s) on %s", len(streams), path

View file

@ -34,7 +34,7 @@ USER_AGENT = (
# to also surface MotoGP and adjacent motorsports — keeps the f1-stream # to also surface MotoGP and adjacent motorsports — keeps the f1-stream
# UI useful between race weekends and during the off-season. # UI useful between race weekends and during the off-season.
MOTORSPORT_CATEGORIES = { MOTORSPORT_CATEGORIES = {
"formula 1", "formula 2", "formula 3", "f1", "formula 1", "formula 2", "formula 3",
"motogp", "moto gp", "moto2", "moto3", "motoe", "motogp", "moto gp", "moto2", "moto3", "motoe",
"world rally championship", "wrc", "world rally championship", "wrc",
"world endurance championship", "wec", "world endurance championship", "wec",
@ -85,27 +85,61 @@ _is_f1_category = _is_motorsport_category
_is_f1_event = _is_motorsport_event _is_f1_event = _is_motorsport_event
def _parse_live_events(html: str) -> list[_PitsportEvent]: def _decode_rsc_payload(html: str) -> str:
"""Parse live events from the main page RSC payload. """Concatenate and unescape all `self.__next_f.push([1, "..."])` chunks.
The main page contains event cards with props: Next.js RSC ships its tree as escape-encoded strings inside repeated
category, title, time, imageUrl `self.__next_f.push` calls. Regex over the raw HTML misses everything
wrapped in <a href="/watch/{UUID}"> links. interesting; we have to decode unicode escapes first.
""" """
chunks = re.findall(r'self\.__next_f\.push\(\[1,"(.*?)"\]\)', html, re.DOTALL)
if not chunks:
return ""
payload = ""
for chunk in chunks:
try:
payload += chunk.encode().decode("unicode_escape")
except Exception:
payload += chunk
return payload
def _parse_live_events(html: str) -> list[_PitsportEvent]:
"""Parse live events from the main page (or `/live-now`) RSC payload.
The pages embed event cards inside the Next.js RSC payload; the raw
HTML keeps it escape-encoded so we decode first, then match.
Two shapes are common:
1) Older card props: "category":"...","title":"..." next to
"href":"/watch/UUID".
2) Newer `event` prop: an `event` object with `uri:"/watch/UUID"`
carrying `category` and `title`.
"""
payload = _decode_rsc_payload(html) or html
events: list[_PitsportEvent] = [] events: list[_PitsportEvent] = []
# Match event cards in the RSC payload - they appear as JSON-like structures href_pattern = re.compile(
# Pattern: href="/watch/UUID" ... category":"...", "title":"..."
# In the RSC payload, the data is in the format:
# ["$","$L2","/watch/UUID",{"href":"/watch/UUID","children":["$","$L10",null,
# {"category":"...","title":"...","time":...,"imageUrl":"..."}]}]
pattern = re.compile(
r'"href":"(/watch/([0-9a-f-]{36}))"[^}]*?"category":"([^"]+)","title":"([^"]+)"', r'"href":"(/watch/([0-9a-f-]{36}))"[^}]*?"category":"([^"]+)","title":"([^"]+)"',
) )
for match in pattern.finditer(html): for match in href_pattern.finditer(payload):
_, uuid, category, title = match.groups() _, uuid, category, title = match.groups()
events.append(_PitsportEvent(category=category, title=title, watch_uuid=uuid)) events.append(_PitsportEvent(category=category, title=title, watch_uuid=uuid))
event_pattern = re.compile(
r'"event":\{[^{}]*?"title":"([^"]+)"[^{}]*?"uri":"/watch/([0-9a-f-]{36})"[^{}]*?"category":"([^"]+)"',
)
for match in event_pattern.finditer(payload):
title, uuid, category = match.groups()
events.append(_PitsportEvent(category=category, title=title, watch_uuid=uuid))
event_pattern_alt = re.compile(
r'"event":\{[^{}]*?"category":"([^"]+)"[^{}]*?"title":"([^"]+)"[^{}]*?"uri":"/watch/([0-9a-f-]{36})"',
)
for match in event_pattern_alt.finditer(payload):
category, title, uuid = match.groups()
events.append(_PitsportEvent(category=category, title=title, watch_uuid=uuid))
return events return events
@ -301,13 +335,12 @@ def _is_m3u8_method(method: str) -> bool:
def _extract_m3u8_url(link: str) -> str: def _extract_m3u8_url(link: str) -> str:
"""Convert a serveplay.site player URL to an m3u8 playlist URL. """Pass through the link from pushembdz's `api/stream/<slug>` response.
Input: https://dash.serveplay.site/{channel}/index.html The host has rotated over time (serveplay.site oe1.ossfeed.store
Output: https://dash.serveplay.site/{channel}/index.html ); the response is always a master playlist URL we hand to the
player as-is. Content-Type may be `text/css` or `application/json`
The index.html IS the m3u8 playlist (served with proper content-type treat as HLS based on body sniffing (`#EXTM3U`), not MIME.
when fetched with the correct Referer header).
""" """
return link return link
@ -388,6 +421,24 @@ class PitsportExtractor(BaseExtractor):
except Exception: except Exception:
logger.exception("[pitsport] Failed to fetch main page") logger.exception("[pitsport] Failed to fetch main page")
# Fetch /live-now — canonical "currently live" list, added 2026.
try:
resp = await client.get(f"{PITSPORT_BASE}/live-now")
if resp.status_code == 200:
live_now_events = _parse_live_events(resp.text)
logger.info(
"[pitsport] Live-now page: %d event(s)", len(live_now_events)
)
for ev in live_now_events:
if _is_f1_event(ev.category, ev.title):
all_events.append(ev)
else:
logger.warning(
"[pitsport] Live-now page returned HTTP %d", resp.status_code
)
except Exception:
logger.exception("[pitsport] Failed to fetch live-now page")
# Fetch schedule page for upcoming events # Fetch schedule page for upcoming events
try: try:
resp = await client.get(f"{PITSPORT_BASE}/schedule") resp = await client.get(f"{PITSPORT_BASE}/schedule")

View file

@ -153,21 +153,37 @@ class PPVExtractor(BaseExtractor):
if viewers and int(viewers) > 0: if viewers and int(viewers) > 0:
title += f" ({viewers} viewers)" title += f" ({viewers} viewers)"
# Check for substreams (multiple quality/language options) # Always emit the parent stream — substreams are
# additional language/source variants, not replacements.
streams.append(
ExtractedStream(
url=embed_url,
site_key=self.site_key,
site_name=self.site_name,
quality=quality,
title=title,
stream_type="embed",
embed_url=embed_url,
)
)
substreams = stream_obj.get("substreams") substreams = stream_obj.get("substreams")
if isinstance(substreams, list) and substreams: if isinstance(substreams, list):
for i, sub in enumerate(substreams): for i, sub in enumerate(substreams):
sub_embed = sub.get("iframe", "") or sub.get("embed_url", "") sub_embed = sub.get("iframe", "") or sub.get("embed_url", "")
if not sub_embed: if not sub_embed:
# Fall back to the parent embed URL
sub_embed = embed_url sub_embed = embed_url
sub_name = sub.get("name", "") or sub.get("label", "") sub_name = (
sub.get("source_tag", "")
or sub.get("name", "")
or sub.get("label", "")
)
sub_quality = sub.get("tag", "") or sub.get("quality", "") or quality sub_quality = sub.get("tag", "") or sub.get("quality", "") or quality
sub_title = f"{name}" sub_title = f"{name}"
if sub_name: if sub_name:
sub_title += f" - {sub_name}" sub_title += f" - {sub_name}"
elif i > 0: else:
sub_title += f" #{i + 1}" sub_title += f" #{i + 2}"
streams.append( streams.append(
ExtractedStream( ExtractedStream(
@ -180,19 +196,6 @@ class PPVExtractor(BaseExtractor):
embed_url=sub_embed, embed_url=sub_embed,
) )
) )
else:
# Single stream, no substreams
streams.append(
ExtractedStream(
url=embed_url,
site_key=self.site_key,
site_name=self.site_name,
quality=quality,
title=title,
stream_type="embed",
embed_url=embed_url,
)
)
except Exception: except Exception:
logger.exception("[ppv] Failed to extract streams") logger.exception("[ppv] Failed to extract streams")