f1-stream: revive aceztrims + pitsport, more ppv variants
- aceztrims: scrape /f11/ (the actual stream page), not /f1/ (the
cross-sport schedule). Drop the dead /iframe1?s= + onclick m3u8
regexes (site moved to `getElementById('iframe').src = '...'` ~20
channels ago). Strip HTML comments first so the ~20 legacy buttons
kept inside <!-- ... --> stop showing up as false positives.
Also pick up the default inline <iframe id='iframe' src='...'>.
Local run: 11 channels (was 0).
- pitsport: decode the RSC payload before regex-matching in
_parse_live_events (the raw HTML keeps it escape-encoded, so the
homepage card path silently matched 0 events). Add the new /live-now
route (the canonical what's-live-right-now list). Add "f1" to
MOTORSPORT_CATEGORIES — the
site labels Formula 1 events as just "F1". Refresh the stale
serveplay.site docstring (host rotates; pushembdz's api/stream link
is authoritative).
Local run: 7 m3u8 streams covering Canadian GP (EN1/EN2/MULTI/ITA/ESP)
+ NASCAR Coke 600 (was 0).
- ppv: always emit the parent embed alongside substreams (was dropping
it whenever substreams existed). Prefer source_tag in substream titles
so users see "Sky Sport 1 NZ" / "Apple TV (F1TV)" instead of generic
#1/#2 suffixes.
Diagnosed against the live cluster (curated + 7 other extractors were
returning 0 cached streams; only 2 dead hmembeds curated 24/7 channels
were visible to users). Each fix was verified by running the extractor
against the live sites this turn.
This commit is contained in:
parent
d5f73ce109
commit
5a0e4b3dac
3 changed files with 147 additions and 125 deletions
|
|
@ -1,13 +1,24 @@
|
||||||
"""Aceztrims extractor - scrapes F1 streaming links from Aceztrims pages.
|
"""Aceztrims extractor — scrapes embed URLs from acestrlms.pages.dev/f11/.
|
||||||
|
|
||||||
Parses HTML for iframe button onclick handlers and extracts streams from:
|
The page (Cloudflare Pages, no anti-bot) hosts an iframe + a strip of
|
||||||
- /iframe1?s=<m3u8_url> → direct m3u8
|
onclick channel-switcher buttons. Each button rewrites the iframe via
|
||||||
- https://pooembed.eu/embed/... → embed URL
|
`document.getElementById('iframe').src = '<embed_url>'`. The initial
|
||||||
|
channel is hard-coded as `<iframe id='iframe' src='...'>`.
|
||||||
|
|
||||||
|
We strip HTML comments first because the page keeps ~20 legacy channel
|
||||||
|
buttons inside `<!-- ... -->` blocks for easy re-enablement; the previous
|
||||||
|
loose regex picked them up as false positives.
|
||||||
|
|
||||||
|
All channels are iframe embeds (no direct m3u8) — `stream_type='embed'`.
|
||||||
|
|
||||||
|
Site naming note: the extractor key stays `aceztrims` (the previous
|
||||||
|
domain) so registry/cache identifiers don't churn. The current domain
|
||||||
|
is `acestrlms.pages.dev` and the F1 path is `/f11/` (two ones — `/f1/`
|
||||||
|
is the cross-sport schedule page and has no stream buttons).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
from urllib.parse import parse_qs, urlparse
|
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
|
|
||||||
|
|
@ -17,9 +28,8 @@ from backend.extractors.models import ExtractedStream
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
BASE_URL = "https://acestrlms.pages.dev"
|
BASE_URL = "https://acestrlms.pages.dev"
|
||||||
# Pages to scrape for streams
|
|
||||||
F1_PAGES = [
|
F1_PAGES = [
|
||||||
("/f1/", "Formula 1"),
|
("/f11/", "Formula 1"),
|
||||||
]
|
]
|
||||||
|
|
||||||
USER_AGENT = (
|
USER_AGENT = (
|
||||||
|
|
@ -28,13 +38,21 @@ USER_AGENT = (
|
||||||
"Chrome/120.0.0.0 Safari/537.36"
|
"Chrome/120.0.0.0 Safari/537.36"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# `document.getElementById('iframe').src = '<URL>'` — current channel-switcher format.
|
||||||
|
_ONCLICK_IFRAME_SRC = re.compile(
|
||||||
|
r"""document\.getElementById\(['"]iframe['"]\)\.src\s*=\s*['"]([^'"]+)['"]""",
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
# `<iframe id='iframe' src='<URL>'>` — the default/initial channel.
|
||||||
|
_DEFAULT_IFRAME = re.compile(
|
||||||
|
r"""<iframe[^>]*id\s*=\s*['"]iframe['"][^>]*src\s*=\s*['"]([^'"]+)['"]""",
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
_HTML_COMMENT = re.compile(r"<!--.*?-->", re.DOTALL)
|
||||||
|
|
||||||
|
|
||||||
class AceztrimsExtractor(BaseExtractor):
|
class AceztrimsExtractor(BaseExtractor):
|
||||||
"""Extracts streams from Aceztrims pages by parsing HTML for iframe URLs.
|
"""Pulls iframe embed URLs out of the acestrlms.pages.dev F1 page."""
|
||||||
|
|
||||||
Looks for onclick handlers on buttons/links that open iframes, and
|
|
||||||
extracts the stream URLs from them.
|
|
||||||
"""
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def site_key(self) -> str:
|
def site_key(self) -> str:
|
||||||
|
|
@ -45,7 +63,6 @@ class AceztrimsExtractor(BaseExtractor):
|
||||||
return "Aceztrims"
|
return "Aceztrims"
|
||||||
|
|
||||||
async def extract(self) -> list[ExtractedStream]:
|
async def extract(self) -> list[ExtractedStream]:
|
||||||
"""Scrape all configured F1 pages for stream URLs."""
|
|
||||||
streams: list[ExtractedStream] = []
|
streams: list[ExtractedStream] = []
|
||||||
|
|
||||||
async with httpx.AsyncClient(
|
async with httpx.AsyncClient(
|
||||||
|
|
@ -55,12 +72,9 @@ class AceztrimsExtractor(BaseExtractor):
|
||||||
) as client:
|
) as client:
|
||||||
for path, category in F1_PAGES:
|
for path, category in F1_PAGES:
|
||||||
try:
|
try:
|
||||||
page_streams = await self._scrape_page(client, path, category)
|
streams.extend(await self._scrape_page(client, path, category))
|
||||||
streams.extend(page_streams)
|
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.exception(
|
logger.exception("[aceztrims] Failed to scrape %s", path)
|
||||||
"[aceztrims] Failed to scrape page %s", path
|
|
||||||
)
|
|
||||||
|
|
||||||
logger.info("[aceztrims] Extracted %d stream(s)", len(streams))
|
logger.info("[aceztrims] Extracted %d stream(s)", len(streams))
|
||||||
return streams
|
return streams
|
||||||
|
|
@ -68,85 +82,39 @@ class AceztrimsExtractor(BaseExtractor):
|
||||||
async def _scrape_page(
|
async def _scrape_page(
|
||||||
self, client: httpx.AsyncClient, path: str, category: str
|
self, client: httpx.AsyncClient, path: str, category: str
|
||||||
) -> list[ExtractedStream]:
|
) -> list[ExtractedStream]:
|
||||||
"""Scrape a single page for stream URLs."""
|
|
||||||
url = f"{BASE_URL}{path}"
|
url = f"{BASE_URL}{path}"
|
||||||
resp = await client.get(url)
|
resp = await client.get(url)
|
||||||
if resp.status_code != 200:
|
if resp.status_code != 200:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"[aceztrims] Page %s returned HTTP %d", path, resp.status_code
|
"[aceztrims] %s returned HTTP %d", path, resp.status_code
|
||||||
)
|
)
|
||||||
return []
|
return []
|
||||||
|
|
||||||
html = resp.text
|
# The page keeps a block of legacy channel buttons inside
|
||||||
|
# `<!-- ... -->` for quick re-enablement. Strip comments first so
|
||||||
|
# the regex only sees live buttons.
|
||||||
|
html = _HTML_COMMENT.sub("", resp.text)
|
||||||
|
|
||||||
|
seen: set[str] = set()
|
||||||
streams: list[ExtractedStream] = []
|
streams: list[ExtractedStream] = []
|
||||||
seen_urls: set[str] = set()
|
|
||||||
|
|
||||||
# Pattern 1: /iframe1?s=<m3u8_url> — direct m3u8
|
for pattern in (_DEFAULT_IFRAME, _ONCLICK_IFRAME_SRC):
|
||||||
iframe1_pattern = re.compile(
|
for match in pattern.finditer(html):
|
||||||
r"""['"]((?:https?://[^'"]*)?/iframe1\?s=([^'"&]+))['""]""",
|
embed_url = match.group(1).strip()
|
||||||
re.IGNORECASE,
|
if not embed_url or embed_url in seen:
|
||||||
)
|
continue
|
||||||
for match in iframe1_pattern.finditer(html):
|
seen.add(embed_url)
|
||||||
m3u8_url = match.group(2)
|
streams.append(
|
||||||
if m3u8_url in seen_urls:
|
ExtractedStream(
|
||||||
continue
|
url=embed_url,
|
||||||
seen_urls.add(m3u8_url)
|
site_key=self.site_key,
|
||||||
|
site_name=self.site_name,
|
||||||
streams.append(
|
quality="",
|
||||||
ExtractedStream(
|
title=f"{category} Stream",
|
||||||
url=m3u8_url,
|
stream_type="embed",
|
||||||
site_key=self.site_key,
|
embed_url=embed_url,
|
||||||
site_name=self.site_name,
|
)
|
||||||
quality="",
|
|
||||||
title=f"{category} Stream",
|
|
||||||
stream_type="m3u8",
|
|
||||||
)
|
)
|
||||||
)
|
|
||||||
|
|
||||||
# Pattern 2: embed URLs (pooembed.eu or similar)
|
|
||||||
embed_pattern = re.compile(
|
|
||||||
r"""['"]((https?://(?:pooembed\.eu|[^'"]*embed)[^'"]*))['"]""",
|
|
||||||
re.IGNORECASE,
|
|
||||||
)
|
|
||||||
for match in embed_pattern.finditer(html):
|
|
||||||
embed_url = match.group(1)
|
|
||||||
if embed_url in seen_urls:
|
|
||||||
continue
|
|
||||||
seen_urls.add(embed_url)
|
|
||||||
|
|
||||||
streams.append(
|
|
||||||
ExtractedStream(
|
|
||||||
url=embed_url,
|
|
||||||
site_key=self.site_key,
|
|
||||||
site_name=self.site_name,
|
|
||||||
quality="",
|
|
||||||
title=f"{category} Stream (Embed)",
|
|
||||||
stream_type="embed",
|
|
||||||
embed_url=embed_url,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
# Pattern 3: Generic onclick handlers with URLs
|
|
||||||
onclick_pattern = re.compile(
|
|
||||||
r"""onclick\s*=\s*['"].*?['"]?(https?://[^'")\s]+\.m3u8[^'")\s]*)['"]?""",
|
|
||||||
re.IGNORECASE,
|
|
||||||
)
|
|
||||||
for match in onclick_pattern.finditer(html):
|
|
||||||
m3u8_url = match.group(1)
|
|
||||||
if m3u8_url in seen_urls:
|
|
||||||
continue
|
|
||||||
seen_urls.add(m3u8_url)
|
|
||||||
|
|
||||||
streams.append(
|
|
||||||
ExtractedStream(
|
|
||||||
url=m3u8_url,
|
|
||||||
site_key=self.site_key,
|
|
||||||
site_name=self.site_name,
|
|
||||||
quality="",
|
|
||||||
title=f"{category} Stream",
|
|
||||||
stream_type="m3u8",
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
"[aceztrims] Found %d stream(s) on %s", len(streams), path
|
"[aceztrims] Found %d stream(s) on %s", len(streams), path
|
||||||
|
|
|
||||||
|
|
@ -34,7 +34,7 @@ USER_AGENT = (
|
||||||
# to also surface MotoGP and adjacent motorsports — keeps the f1-stream
|
# to also surface MotoGP and adjacent motorsports — keeps the f1-stream
|
||||||
# UI useful between race weekends and during the off-season.
|
# UI useful between race weekends and during the off-season.
|
||||||
MOTORSPORT_CATEGORIES = {
|
MOTORSPORT_CATEGORIES = {
|
||||||
"formula 1", "formula 2", "formula 3",
|
"f1", "formula 1", "formula 2", "formula 3",
|
||||||
"motogp", "moto gp", "moto2", "moto3", "motoe",
|
"motogp", "moto gp", "moto2", "moto3", "motoe",
|
||||||
"world rally championship", "wrc",
|
"world rally championship", "wrc",
|
||||||
"world endurance championship", "wec",
|
"world endurance championship", "wec",
|
||||||
|
|
@ -85,27 +85,61 @@ _is_f1_category = _is_motorsport_category
|
||||||
_is_f1_event = _is_motorsport_event
|
_is_f1_event = _is_motorsport_event
|
||||||
|
|
||||||
|
|
||||||
def _parse_live_events(html: str) -> list[_PitsportEvent]:
|
def _decode_rsc_payload(html: str) -> str:
|
||||||
"""Parse live events from the main page RSC payload.
|
"""Concatenate and unescape all `self.__next_f.push([1, "..."])` chunks.
|
||||||
|
|
||||||
The main page contains event cards with props:
|
Next.js RSC ships its tree as escape-encoded strings inside repeated
|
||||||
category, title, time, imageUrl
|
`self.__next_f.push` calls. Regex over the raw HTML misses everything
|
||||||
wrapped in <a href="/watch/{UUID}"> links.
|
interesting; we have to decode unicode escapes first.
|
||||||
"""
|
"""
|
||||||
|
chunks = re.findall(r'self\.__next_f\.push\(\[1,"(.*?)"\]\)', html, re.DOTALL)
|
||||||
|
if not chunks:
|
||||||
|
return ""
|
||||||
|
payload = ""
|
||||||
|
for chunk in chunks:
|
||||||
|
try:
|
||||||
|
payload += chunk.encode().decode("unicode_escape")
|
||||||
|
except Exception:
|
||||||
|
payload += chunk
|
||||||
|
return payload
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_live_events(html: str) -> list[_PitsportEvent]:
|
||||||
|
"""Parse live events from the main page (or `/live-now`) RSC payload.
|
||||||
|
|
||||||
|
The pages embed event cards inside the Next.js RSC payload; the raw
|
||||||
|
HTML keeps it escape-encoded so we decode first, then match.
|
||||||
|
Two shapes are common:
|
||||||
|
1) Older card props: "category":"...","title":"..." next to
|
||||||
|
"href":"/watch/UUID".
|
||||||
|
2) Newer `event` prop: an `event` object with `uri:"/watch/UUID"`
|
||||||
|
carrying `category` and `title`.
|
||||||
|
"""
|
||||||
|
payload = _decode_rsc_payload(html) or html
|
||||||
|
|
||||||
events: list[_PitsportEvent] = []
|
events: list[_PitsportEvent] = []
|
||||||
|
|
||||||
# Match event cards in the RSC payload - they appear as JSON-like structures
|
href_pattern = re.compile(
|
||||||
# Pattern: href="/watch/UUID" ... category":"...", "title":"..."
|
|
||||||
# In the RSC payload, the data is in the format:
|
|
||||||
# ["$","$L2","/watch/UUID",{"href":"/watch/UUID","children":["$","$L10",null,
|
|
||||||
# {"category":"...","title":"...","time":...,"imageUrl":"..."}]}]
|
|
||||||
pattern = re.compile(
|
|
||||||
r'"href":"(/watch/([0-9a-f-]{36}))"[^}]*?"category":"([^"]+)","title":"([^"]+)"',
|
r'"href":"(/watch/([0-9a-f-]{36}))"[^}]*?"category":"([^"]+)","title":"([^"]+)"',
|
||||||
)
|
)
|
||||||
for match in pattern.finditer(html):
|
for match in href_pattern.finditer(payload):
|
||||||
_, uuid, category, title = match.groups()
|
_, uuid, category, title = match.groups()
|
||||||
events.append(_PitsportEvent(category=category, title=title, watch_uuid=uuid))
|
events.append(_PitsportEvent(category=category, title=title, watch_uuid=uuid))
|
||||||
|
|
||||||
|
event_pattern = re.compile(
|
||||||
|
r'"event":\{[^{}]*?"title":"([^"]+)"[^{}]*?"uri":"/watch/([0-9a-f-]{36})"[^{}]*?"category":"([^"]+)"',
|
||||||
|
)
|
||||||
|
for match in event_pattern.finditer(payload):
|
||||||
|
title, uuid, category = match.groups()
|
||||||
|
events.append(_PitsportEvent(category=category, title=title, watch_uuid=uuid))
|
||||||
|
|
||||||
|
event_pattern_alt = re.compile(
|
||||||
|
r'"event":\{[^{}]*?"category":"([^"]+)"[^{}]*?"title":"([^"]+)"[^{}]*?"uri":"/watch/([0-9a-f-]{36})"',
|
||||||
|
)
|
||||||
|
for match in event_pattern_alt.finditer(payload):
|
||||||
|
category, title, uuid = match.groups()
|
||||||
|
events.append(_PitsportEvent(category=category, title=title, watch_uuid=uuid))
|
||||||
|
|
||||||
return events
|
return events
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -301,13 +335,12 @@ def _is_m3u8_method(method: str) -> bool:
|
||||||
|
|
||||||
|
|
||||||
def _extract_m3u8_url(link: str) -> str:
|
def _extract_m3u8_url(link: str) -> str:
|
||||||
"""Convert a serveplay.site player URL to an m3u8 playlist URL.
|
"""Pass through the link from pushembdz's `api/stream/<slug>` response.
|
||||||
|
|
||||||
Input: https://dash.serveplay.site/{channel}/index.html
|
The host has rotated over time (serveplay.site → oe1.ossfeed.store →
|
||||||
Output: https://dash.serveplay.site/{channel}/index.html
|
…); the response is always a master playlist URL we hand to the
|
||||||
|
player as-is. Content-Type may be `text/css` or `application/json` —
|
||||||
The index.html IS the m3u8 playlist (served with proper content-type
|
treat as HLS based on body sniffing (`#EXTM3U`), not MIME.
|
||||||
when fetched with the correct Referer header).
|
|
||||||
"""
|
"""
|
||||||
return link
|
return link
|
||||||
|
|
||||||
|
|
@ -388,6 +421,24 @@ class PitsportExtractor(BaseExtractor):
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.exception("[pitsport] Failed to fetch main page")
|
logger.exception("[pitsport] Failed to fetch main page")
|
||||||
|
|
||||||
|
# Fetch /live-now — canonical "currently live" list, added 2026.
|
||||||
|
try:
|
||||||
|
resp = await client.get(f"{PITSPORT_BASE}/live-now")
|
||||||
|
if resp.status_code == 200:
|
||||||
|
live_now_events = _parse_live_events(resp.text)
|
||||||
|
logger.info(
|
||||||
|
"[pitsport] Live-now page: %d event(s)", len(live_now_events)
|
||||||
|
)
|
||||||
|
for ev in live_now_events:
|
||||||
|
if _is_f1_event(ev.category, ev.title):
|
||||||
|
all_events.append(ev)
|
||||||
|
else:
|
||||||
|
logger.warning(
|
||||||
|
"[pitsport] Live-now page returned HTTP %d", resp.status_code
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
logger.exception("[pitsport] Failed to fetch live-now page")
|
||||||
|
|
||||||
# Fetch schedule page for upcoming events
|
# Fetch schedule page for upcoming events
|
||||||
try:
|
try:
|
||||||
resp = await client.get(f"{PITSPORT_BASE}/schedule")
|
resp = await client.get(f"{PITSPORT_BASE}/schedule")
|
||||||
|
|
|
||||||
|
|
@ -153,21 +153,37 @@ class PPVExtractor(BaseExtractor):
|
||||||
if viewers and int(viewers) > 0:
|
if viewers and int(viewers) > 0:
|
||||||
title += f" ({viewers} viewers)"
|
title += f" ({viewers} viewers)"
|
||||||
|
|
||||||
# Check for substreams (multiple quality/language options)
|
# Always emit the parent stream — substreams are
|
||||||
|
# additional language/source variants, not replacements.
|
||||||
|
streams.append(
|
||||||
|
ExtractedStream(
|
||||||
|
url=embed_url,
|
||||||
|
site_key=self.site_key,
|
||||||
|
site_name=self.site_name,
|
||||||
|
quality=quality,
|
||||||
|
title=title,
|
||||||
|
stream_type="embed",
|
||||||
|
embed_url=embed_url,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
substreams = stream_obj.get("substreams")
|
substreams = stream_obj.get("substreams")
|
||||||
if isinstance(substreams, list) and substreams:
|
if isinstance(substreams, list):
|
||||||
for i, sub in enumerate(substreams):
|
for i, sub in enumerate(substreams):
|
||||||
sub_embed = sub.get("iframe", "") or sub.get("embed_url", "")
|
sub_embed = sub.get("iframe", "") or sub.get("embed_url", "")
|
||||||
if not sub_embed:
|
if not sub_embed:
|
||||||
# Fall back to the parent embed URL
|
|
||||||
sub_embed = embed_url
|
sub_embed = embed_url
|
||||||
sub_name = sub.get("name", "") or sub.get("label", "")
|
sub_name = (
|
||||||
|
sub.get("source_tag", "")
|
||||||
|
or sub.get("name", "")
|
||||||
|
or sub.get("label", "")
|
||||||
|
)
|
||||||
sub_quality = sub.get("tag", "") or sub.get("quality", "") or quality
|
sub_quality = sub.get("tag", "") or sub.get("quality", "") or quality
|
||||||
sub_title = f"{name}"
|
sub_title = f"{name}"
|
||||||
if sub_name:
|
if sub_name:
|
||||||
sub_title += f" - {sub_name}"
|
sub_title += f" - {sub_name}"
|
||||||
elif i > 0:
|
else:
|
||||||
sub_title += f" #{i + 1}"
|
sub_title += f" #{i + 2}"
|
||||||
|
|
||||||
streams.append(
|
streams.append(
|
||||||
ExtractedStream(
|
ExtractedStream(
|
||||||
|
|
@ -180,19 +196,6 @@ class PPVExtractor(BaseExtractor):
|
||||||
embed_url=sub_embed,
|
embed_url=sub_embed,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
else:
|
|
||||||
# Single stream, no substreams
|
|
||||||
streams.append(
|
|
||||||
ExtractedStream(
|
|
||||||
url=embed_url,
|
|
||||||
site_key=self.site_key,
|
|
||||||
site_name=self.site_name,
|
|
||||||
quality=quality,
|
|
||||||
title=title,
|
|
||||||
stream_type="embed",
|
|
||||||
embed_url=embed_url,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.exception("[ppv] Failed to extract streams")
|
logger.exception("[ppv] Failed to extract streams")
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue