f1-stream: revive aceztrims + pitsport, more ppv variants

- aceztrims: scrape /f11/ (the actual stream page), not /f1/ (the cross-sport schedule). Drop the dead /iframe1?s= + onclick m3u8 regexes (site moved to `getElementById('iframe').src = '...'` ~20 channels ago). Strip HTML comments first so the ~20 legacy buttons kept inside `<!-- ... -->` stop showing up as false positives. Also pick up the default inline <iframe id='iframe' src='...'>. Local run: 11 channels (was 0). - pitsport: decode the RSC payload before regex-matching in _parse_live_events (raw HTML had it escape-encoded, so the homepage card path was silently 0). Add the new /live-now route (canonical what's-live-right-now list). Add "f1" to MOTORSPORT_CATEGORIES — the site labels Formula 1 events as just "F1". Refresh the stale serveplay.site docstring (host rotates; pushembdz's api/stream link is authoritative). Local run: 7 m3u8 streams covering Canadian GP (EN1/EN2/MULTI/ITA/ESP) + NASCAR Coke 600 (was 0). - ppv: always emit the parent embed alongside substreams (was dropping it whenever substreams existed). Prefer source_tag in substream titles so users see "Sky Sport 1 NZ" / "Apple TV (F1TV)" instead of generic #1/#2 suffixes. Diagnosed against the live cluster (curated + 7 other extractors returning 0 cached streams, only 2 dead hmembeds curated 24/7 channels visible to users). Each fix verified with the extractor run against live sites this turn.
2026-05-24 22:05:37 +00:00 · 2026-05-24 22:05:37 +00:00 · 5a0e4b3dac
commit 5a0e4b3dac
parent d5f73ce109
3 changed files with 147 additions and 125 deletions
--- a/stacks/f1-stream/files/backend/extractors/aceztrims.py
+++ b/stacks/f1-stream/files/backend/extractors/aceztrims.py
@ -1,13 +1,24 @@
-"""Aceztrims extractor - scrapes F1 streaming links from Aceztrims pages.
+"""Aceztrims extractor — scrapes embed URLs from acestrlms.pages.dev/f11/.
-Parses HTML for iframe button onclick handlers and extracts streams from:
+The page (Cloudflare Pages, no anti-bot) hosts an iframe + a strip of
- /iframe1?s=<m3u8_url> → direct m3u8
+onclick channel-switcher buttons. Each button rewrites the iframe via
- https://pooembed.eu/embed/... → embed URL
+`document.getElementById('iframe').src = '<embed_url>'`. The initial
 channel is hard-coded as `<iframe id='iframe' src='...'>`.
 We strip HTML comments first because the page keeps ~20 legacy channel
 buttons inside `<!-- ... -->` blocks for easy re-enablement; the previous
 loose regex picked them up as false positives.
 All channels are iframe embeds (no direct m3u8) — `stream_type='embed'`.
 Site naming note: the extractor key stays `aceztrims` (the previous
 domain) so registry/cache identifiers don't churn. The current domain
 is `acestrlms.pages.dev` and the F1 path is `/f11/` (two ones — `/f1/`
 is the cross-sport schedule page and has no stream buttons).
 """
 import logging
 import re
 from urllib.parse import parse_qs, urlparse
 import httpx
@ -17,9 +28,8 @@ from backend.extractors.models import ExtractedStream
 logger = logging.getLogger(__name__)
 BASE_URL = "https://acestrlms.pages.dev"
 # Pages to scrape for streams
 F1_PAGES = [
-    ("/f1/", "Formula 1"),
+    ("/f11/", "Formula 1"),
 ]
 USER_AGENT = (
@ -28,13 +38,21 @@ USER_AGENT = (
    "Chrome/120.0.0.0 Safari/537.36"
 )
 # `document.getElementById('iframe').src = '<URL>'` — current channel-switcher format.
 _ONCLICK_IFRAME_SRC = re.compile(
    r"""document\.getElementById\(['"]iframe['"]\)\.src\s*=\s*['"]([^'"]+)['"]""",
    re.IGNORECASE,
 )
 # `<iframe id='iframe' src='<URL>'>` — the default/initial channel.
 _DEFAULT_IFRAME = re.compile(
    r"""<iframe[^>]*id\s*=\s*['"]iframe['"][^>]*src\s*=\s*['"]([^'"]+)['"]""",
    re.IGNORECASE,
 )
 _HTML_COMMENT = re.compile(r"<!--.*?-->", re.DOTALL)
 class AceztrimsExtractor(BaseExtractor):
-    """Extracts streams from Aceztrims pages by parsing HTML for iframe URLs.
+    """Pulls iframe embed URLs out of the acestrlms.pages.dev F1 page."""
    Looks for onclick handlers on buttons/links that open iframes, and
    extracts the stream URLs from them.
    """
    @property
    def site_key(self) -> str:
@ -45,7 +63,6 @@ class AceztrimsExtractor(BaseExtractor):
        return "Aceztrims"
    async def extract(self) -> list[ExtractedStream]:
        """Scrape all configured F1 pages for stream URLs."""
        streams: list[ExtractedStream] = []
        async with httpx.AsyncClient(
@ -55,12 +72,9 @@ class AceztrimsExtractor(BaseExtractor):
        ) as client:
            for path, category in F1_PAGES:
                try:
-                    page_streams = await self._scrape_page(client, path, category)
+                    streams.extend(await self._scrape_page(client, path, category))
                    streams.extend(page_streams)
                except Exception:
-                    logger.exception(
+                    logger.exception("[aceztrims] Failed to scrape %s", path)
                        "[aceztrims] Failed to scrape page %s", path
                    )
        logger.info("[aceztrims] Extracted %d stream(s)", len(streams))
        return streams
@ -68,85 +82,39 @@ class AceztrimsExtractor(BaseExtractor):
    async def _scrape_page(
        self, client: httpx.AsyncClient, path: str, category: str
    ) -> list[ExtractedStream]:
        """Scrape a single page for stream URLs."""
        url = f"{BASE_URL}{path}"
        resp = await client.get(url)
        if resp.status_code != 200:
            logger.warning(
-                "[aceztrims] Page %s returned HTTP %d", path, resp.status_code
+                "[aceztrims] %s returned HTTP %d", path, resp.status_code
            )
            return []
-        html = resp.text
+        # The page keeps a block of legacy channel buttons inside
        # `<!-- ... -->` for quick re-enablement. Strip comments first so
        # the regex only sees live buttons.
        html = _HTML_COMMENT.sub("", resp.text)
        seen: set[str] = set()
        streams: list[ExtractedStream] = []
        seen_urls: set[str] = set()
-        # Pattern 1: /iframe1?s=<m3u8_url> — direct m3u8
+        for pattern in (_DEFAULT_IFRAME, _ONCLICK_IFRAME_SRC):
-        iframe1_pattern = re.compile(
+            for match in pattern.finditer(html):
-            r"""['"]((?:https?://[^'"]*)?/iframe1\?s=([^'"&]+))['""]""",
+                embed_url = match.group(1).strip()
-            re.IGNORECASE,
+                if not embed_url or embed_url in seen:
-        )
+                    continue
-        for match in iframe1_pattern.finditer(html):
+                seen.add(embed_url)
-            m3u8_url = match.group(2)
+                streams.append(
-            if m3u8_url in seen_urls:
+                    ExtractedStream(
-                continue
+                        url=embed_url,
-            seen_urls.add(m3u8_url)
+                        site_key=self.site_key,
-
+                        site_name=self.site_name,
-            streams.append(
+                        quality="",
-                ExtractedStream(
+                        title=f"{category} Stream",
-                    url=m3u8_url,
+                        stream_type="embed",
-                    site_key=self.site_key,
+                        embed_url=embed_url,
-                    site_name=self.site_name,
+                    )
                    quality="",
                    title=f"{category} Stream",
                    stream_type="m3u8",
                )
            )
        # Pattern 2: embed URLs (pooembed.eu or similar)
        embed_pattern = re.compile(
            r"""['"]((https?://(?:pooembed\.eu|[^'"]*embed)[^'"]*))['"]""",
            re.IGNORECASE,
        )
        for match in embed_pattern.finditer(html):
            embed_url = match.group(1)
            if embed_url in seen_urls:
                continue
            seen_urls.add(embed_url)
            streams.append(
                ExtractedStream(
                    url=embed_url,
                    site_key=self.site_key,
                    site_name=self.site_name,
                    quality="",
                    title=f"{category} Stream (Embed)",
                    stream_type="embed",
                    embed_url=embed_url,
                )
            )
        # Pattern 3: Generic onclick handlers with URLs
        onclick_pattern = re.compile(
            r"""onclick\s*=\s*['"].*?['"]?(https?://[^'")\s]+\.m3u8[^'")\s]*)['"]?""",
            re.IGNORECASE,
        )
        for match in onclick_pattern.finditer(html):
            m3u8_url = match.group(1)
            if m3u8_url in seen_urls:
                continue
            seen_urls.add(m3u8_url)
            streams.append(
                ExtractedStream(
                    url=m3u8_url,
                    site_key=self.site_key,
                    site_name=self.site_name,
                    quality="",
                    title=f"{category} Stream",
                    stream_type="m3u8",
                )
            )
        logger.info(
            "[aceztrims] Found %d stream(s) on %s", len(streams), path
--- a/stacks/f1-stream/files/backend/extractors/pitsport.py
+++ b/stacks/f1-stream/files/backend/extractors/pitsport.py
@ -34,7 +34,7 @@ USER_AGENT = (
 # to also surface MotoGP and adjacent motorsports — keeps the f1-stream
 # UI useful between race weekends and during the off-season.
 MOTORSPORT_CATEGORIES = {
-    "formula 1", "formula 2", "formula 3",
+    "f1", "formula 1", "formula 2", "formula 3",
    "motogp", "moto gp", "moto2", "moto3", "motoe",
    "world rally championship", "wrc",
    "world endurance championship", "wec",
@ -85,27 +85,61 @@ _is_f1_category = _is_motorsport_category
 _is_f1_event = _is_motorsport_event
-def _parse_live_events(html: str) -> list[_PitsportEvent]:
+def _decode_rsc_payload(html: str) -> str:
-    """Parse live events from the main page RSC payload.
+    """Concatenate and unescape all `self.__next_f.push([1, "..."])` chunks.
-    The main page contains event cards with props:
+    Next.js RSC ships its tree as escape-encoded strings inside repeated
-        category, title, time, imageUrl
+    `self.__next_f.push` calls. Regex over the raw HTML misses everything
-    wrapped in <a href="/watch/{UUID}"> links.
+    interesting; we have to decode unicode escapes first.
    """
    chunks = re.findall(r'self\.__next_f\.push\(\[1,"(.*?)"\]\)', html, re.DOTALL)
    if not chunks:
        return ""
    payload = ""
    for chunk in chunks:
        try:
            payload += chunk.encode().decode("unicode_escape")
        except Exception:
            payload += chunk
    return payload
 def _parse_live_events(html: str) -> list[_PitsportEvent]:
    """Parse live events from the main page (or `/live-now`) RSC payload.
    The pages embed event cards inside the Next.js RSC payload; the raw
    HTML keeps it escape-encoded so we decode first, then match.
    Two shapes are common:
      1) Older card props: "category":"...","title":"..." next to
         "href":"/watch/UUID".
      2) Newer `event` prop: an `event` object with `uri:"/watch/UUID"`
         carrying `category` and `title`.
    """
    payload = _decode_rsc_payload(html) or html
    events: list[_PitsportEvent] = []
-    # Match event cards in the RSC payload - they appear as JSON-like structures
+    href_pattern = re.compile(
    # Pattern: href="/watch/UUID" ... category":"...", "title":"..."
    # In the RSC payload, the data is in the format:
    #   ["$","$L2","/watch/UUID",{"href":"/watch/UUID","children":["$","$L10",null,
    #     {"category":"...","title":"...","time":...,"imageUrl":"..."}]}]
    pattern = re.compile(
        r'"href":"(/watch/([0-9a-f-]{36}))"[^}]*?"category":"([^"]+)","title":"([^"]+)"',
    )
-    for match in pattern.finditer(html):
+    for match in href_pattern.finditer(payload):
        _, uuid, category, title = match.groups()
        events.append(_PitsportEvent(category=category, title=title, watch_uuid=uuid))
    event_pattern = re.compile(
        r'"event":\{[^{}]*?"title":"([^"]+)"[^{}]*?"uri":"/watch/([0-9a-f-]{36})"[^{}]*?"category":"([^"]+)"',
    )
    for match in event_pattern.finditer(payload):
        title, uuid, category = match.groups()
        events.append(_PitsportEvent(category=category, title=title, watch_uuid=uuid))
    event_pattern_alt = re.compile(
        r'"event":\{[^{}]*?"category":"([^"]+)"[^{}]*?"title":"([^"]+)"[^{}]*?"uri":"/watch/([0-9a-f-]{36})"',
    )
    for match in event_pattern_alt.finditer(payload):
        category, title, uuid = match.groups()
        events.append(_PitsportEvent(category=category, title=title, watch_uuid=uuid))
    return events
@ -301,13 +335,12 @@ def _is_m3u8_method(method: str) -> bool:
 def _extract_m3u8_url(link: str) -> str:
-    """Convert a serveplay.site player URL to an m3u8 playlist URL.
+    """Pass through the link from pushembdz's `api/stream/<slug>` response.
-    Input:  https://dash.serveplay.site/{channel}/index.html
+    The host has rotated over time (serveplay.site → oe1.ossfeed.store →
-    Output: https://dash.serveplay.site/{channel}/index.html
+    …); the response is always a master playlist URL we hand to the
-
+    player as-is. Content-Type may be `text/css` or `application/json` —
-    The index.html IS the m3u8 playlist (served with proper content-type
+    treat as HLS based on body sniffing (`#EXTM3U`), not MIME.
    when fetched with the correct Referer header).
    """
    return link
@ -388,6 +421,24 @@ class PitsportExtractor(BaseExtractor):
        except Exception:
            logger.exception("[pitsport] Failed to fetch main page")
        # Fetch /live-now — canonical "currently live" list, added 2026.
        try:
            resp = await client.get(f"{PITSPORT_BASE}/live-now")
            if resp.status_code == 200:
                live_now_events = _parse_live_events(resp.text)
                logger.info(
                    "[pitsport] Live-now page: %d event(s)", len(live_now_events)
                )
                for ev in live_now_events:
                    if _is_f1_event(ev.category, ev.title):
                        all_events.append(ev)
            else:
                logger.warning(
                    "[pitsport] Live-now page returned HTTP %d", resp.status_code
                )
        except Exception:
            logger.exception("[pitsport] Failed to fetch live-now page")
        # Fetch schedule page for upcoming events
        try:
            resp = await client.get(f"{PITSPORT_BASE}/schedule")
--- a/stacks/f1-stream/files/backend/extractors/ppv.py
+++ b/stacks/f1-stream/files/backend/extractors/ppv.py
@ -153,21 +153,37 @@ class PPVExtractor(BaseExtractor):
                    if viewers and int(viewers) > 0:
                        title += f" ({viewers} viewers)"
-                    # Check for substreams (multiple quality/language options)
+                    # Always emit the parent stream — substreams are
                    # additional language/source variants, not replacements.
                    streams.append(
                        ExtractedStream(
                            url=embed_url,
                            site_key=self.site_key,
                            site_name=self.site_name,
                            quality=quality,
                            title=title,
                            stream_type="embed",
                            embed_url=embed_url,
                        )
                    )
                    substreams = stream_obj.get("substreams")
-                    if isinstance(substreams, list) and substreams:
+                    if isinstance(substreams, list):
                        for i, sub in enumerate(substreams):
                            sub_embed = sub.get("iframe", "") or sub.get("embed_url", "")
                            if not sub_embed:
                                # Fall back to the parent embed URL
                                sub_embed = embed_url
-                            sub_name = sub.get("name", "") or sub.get("label", "")
+                            sub_name = (
                                sub.get("source_tag", "")
                                or sub.get("name", "")
                                or sub.get("label", "")
                            )
                            sub_quality = sub.get("tag", "") or sub.get("quality", "") or quality
                            sub_title = f"{name}"
                            if sub_name:
                                sub_title += f" - {sub_name}"
-                            elif i > 0:
+                            else:
-                                sub_title += f" #{i + 1}"
+                                sub_title += f" #{i + 2}"
                            streams.append(
                                ExtractedStream(
@ -180,19 +196,6 @@ class PPVExtractor(BaseExtractor):
                                    embed_url=sub_embed,
                                )
                            )
                    else:
                        # Single stream, no substreams
                        streams.append(
                            ExtractedStream(
                                url=embed_url,
                                site_key=self.site_key,
                                site_name=self.site_name,
                                quality=quality,
                                title=title,
                                stream_type="embed",
                                embed_url=embed_url,
                            )
                        )
        except Exception:
            logger.exception("[ppv] Failed to extract streams")