infra/stacks/f1-stream/files/backend/extractors/aceztrims.py
Viktor Barzin 5a0e4b3dac f1-stream: revive aceztrims + pitsport, more ppv variants
- aceztrims: scrape /f11/ (the actual stream page), not /f1/ (the
  cross-sport schedule). Drop the dead /iframe1?s= + onclick m3u8
  regexes (site moved to `getElementById('iframe').src = '...'` ~20
  channels ago). Strip HTML comments first so the ~20 legacy buttons
  kept inside <!-- ... --> stop showing up as false positives.
  Also pick up the default inline <iframe id='iframe' src='...'>.
  Local run: 11 channels (was 0).

- pitsport: decode the RSC payload before regex-matching in
  _parse_live_events (raw HTML had it escape-encoded, so the homepage
  card path was silently 0). Add the new /live-now route (canonical
  what's-live-right-now list). Add "f1" to MOTORSPORT_CATEGORIES — the
  site labels Formula 1 events as just "F1". Refresh the stale
  serveplay.site docstring (host rotates; pushembdz's api/stream link
  is authoritative).
  Local run: 7 m3u8 streams covering Canadian GP (EN1/EN2/MULTI/ITA/ESP)
  + NASCAR Coke 600 (was 0).

- ppv: always emit the parent embed alongside substreams (was dropping
  it whenever substreams existed). Prefer source_tag in substream titles
  so users see "Sky Sport 1 NZ" / "Apple TV (F1TV)" instead of generic
  #1/#2 suffixes.

Diagnosed against the live cluster (curated + 7 other extractors
returning 0 cached streams, only 2 dead hmembeds curated 24/7 channels
visible to users). Each fix verified with the extractor run against
live sites this turn.
2026-05-24 22:05:37 +00:00

122 lines
4.1 KiB
Python

"""Aceztrims extractor — scrapes embed URLs from acestrlms.pages.dev/f11/.
The page (Cloudflare Pages, no anti-bot) hosts an iframe + a strip of
onclick channel-switcher buttons. Each button rewrites the iframe via
`document.getElementById('iframe').src = '<embed_url>'`. The initial
channel is hard-coded as `<iframe id='iframe' src='...'>`.
We strip HTML comments first because the page keeps ~20 legacy channel
buttons inside `<!-- ... -->` blocks for easy re-enablement; the previous
loose regex picked them up as false positives.
All channels are iframe embeds (no direct m3u8) — `stream_type='embed'`.
Site naming note: the extractor key stays `aceztrims` (the previous
domain) so registry/cache identifiers don't churn. The current domain
is `acestrlms.pages.dev` and the F1 path is `/f11/` (two ones — `/f1/`
is the cross-sport schedule page and has no stream buttons).
"""
import logging
import re
from urllib.parse import urljoin, urlsplit

import httpx

from backend.extractors.base import BaseExtractor
from backend.extractors.models import ExtractedStream
logger = logging.getLogger(__name__)

# Origin of the site being scraped; page paths below are appended to it.
BASE_URL = "https://acestrlms.pages.dev"

# (path, category) pairs to scrape. The category is used to build the
# title of every stream found on that page.
F1_PAGES = [
    ("/f11/", "Formula 1"),
]

# Browser-like User-Agent sent with every request.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)

# `document.getElementById('iframe').src = '<URL>'` — current channel-switcher format.
# Group 1 is the URL.
_ONCLICK_IFRAME_SRC = re.compile(
    r"""document\.getElementById\(['"]iframe['"]\)\.src\s*=\s*['"]([^'"]+)['"]""",
    re.IGNORECASE,
)

# `<iframe id='iframe' src='<URL>'>` — the default/initial channel.
# Group 1 is the URL.
#
# The lookahead asserts that the tag carries id='iframe' anywhere among its
# attributes, so the match does not depend on `id` appearing before `src`.
# `(?<![\w-])` keeps `id` / `src` from matching as the tail of another
# attribute name (`data-id`, `data-src`), and the lazy `[^>]*?` picks the
# first real `src` in the tag rather than the last `...src=` it can find.
_DEFAULT_IFRAME = re.compile(
    r"""<iframe\b(?=[^>]*?(?<![\w-])id\s*=\s*['"]iframe['"])"""
    r"""[^>]*?(?<![\w-])src\s*=\s*['"]([^'"]+)['"]""",
    re.IGNORECASE,
)

# Non-greedy and DOTALL so each (possibly multi-line) comment is removed on
# its own instead of swallowing everything between the first `<!--` and the
# last `-->`.
_HTML_COMMENT = re.compile(r"<!--.*?-->", re.DOTALL)
class AceztrimsExtractor(BaseExtractor):
    """Pulls iframe embed URLs out of the acestrlms.pages.dev F1 page.

    Every page listed in ``F1_PAGES`` is fetched, HTML comments are removed,
    and both the default ``<iframe id='iframe'>`` source and the onclick
    channel-switcher targets are collected. Each distinct URL becomes one
    ``ExtractedStream`` with ``stream_type="embed"``.
    """

    @property
    def site_key(self) -> str:
        """Identifier stamped on every stream this extractor emits."""
        return "aceztrims"

    @property
    def site_name(self) -> str:
        """Display name stamped on every stream this extractor emits."""
        return "Aceztrims"

    async def extract(self) -> list[ExtractedStream]:
        """Scrape every page in ``F1_PAGES`` and return the streams found.

        Best-effort: a failure on one page is logged with its traceback and
        does not prevent the remaining pages from being scraped.

        Returns:
            All streams found across the configured pages; empty when
            nothing could be scraped.
        """
        streams: list[ExtractedStream] = []
        async with httpx.AsyncClient(
            timeout=15.0,
            follow_redirects=True,
            headers={"User-Agent": USER_AGENT},
        ) as client:
            for path, category in F1_PAGES:
                try:
                    streams.extend(await self._scrape_page(client, path, category))
                except Exception:
                    # Deliberate catch-all at the extractor boundary: one
                    # broken page must not fail the whole extraction run.
                    logger.exception("[aceztrims] Failed to scrape %s", path)
        logger.info("[aceztrims] Extracted %d stream(s)", len(streams))
        return streams

    @staticmethod
    def _resolve_embed_url(page_url: str, raw_url: str) -> str:
        """Turn a captured ``src`` value into an absolute http(s) URL.

        Args:
            page_url: URL of the page the value was scraped from; used as
                the base for relative references.
            raw_url: The value captured by one of the embed regexes.

        Returns:
            The absolute URL, or ``""`` when the value is blank, malformed,
            or does not resolve to an ``http``/``https`` URL (for example
            ``about:blank`` or ``javascript:`` placeholders).
        """
        candidate = raw_url.strip()
        if not candidate:
            return ""
        try:
            # Handles protocol-relative (`//host/x`) and site-relative
            # (`/x`) values the same way a browser would for this page.
            absolute = urljoin(page_url, candidate)
            scheme = urlsplit(absolute).scheme
        except ValueError:
            # urllib raises ValueError on malformed netlocs; skip the value
            # rather than losing every other channel on the page.
            return ""
        return absolute if scheme in ("http", "https") else ""

    async def _scrape_page(
        self, client: httpx.AsyncClient, path: str, category: str
    ) -> list[ExtractedStream]:
        """Fetch one page and build a stream for each distinct embed URL.

        Args:
            client: HTTP client used for the request.
            path: Page path appended to ``BASE_URL``.
            category: Label used to build each stream's title.

        Returns:
            One ``ExtractedStream`` per distinct embed URL, default iframe
            first, then onclick targets in page order. Empty when the page
            does not answer with HTTP 200.
        """
        url = f"{BASE_URL}{path}"
        resp = await client.get(url)
        if resp.status_code != 200:
            logger.warning(
                "[aceztrims] %s returned HTTP %d", path, resp.status_code
            )
            return []

        # The page keeps a block of legacy channel buttons inside
        # `<!-- ... -->` for quick re-enablement. Strip comments first so
        # the regexes only see live markup.
        page = _HTML_COMMENT.sub("", resp.text)

        seen: set[str] = set()
        streams: list[ExtractedStream] = []
        for pattern in (_DEFAULT_IFRAME, _ONCLICK_IFRAME_SRC):
            for match in pattern.finditer(page):
                embed_url = self._resolve_embed_url(url, match.group(1))
                # Dedupe on the resolved URL so a relative and an absolute
                # spelling of the same target collapse into one stream.
                if not embed_url or embed_url in seen:
                    continue
                seen.add(embed_url)
                streams.append(
                    ExtractedStream(
                        url=embed_url,
                        site_key=self.site_key,
                        site_name=self.site_name,
                        quality="",
                        title=f"{category} Stream",
                        stream_type="embed",
                        embed_url=embed_url,
                    )
                )

        logger.info(
            "[aceztrims] Found %d stream(s) on %s", len(streams), path
        )
        return streams