f1-stream: revive aceztrims + pitsport, more ppv variants

- aceztrims: scrape /f11/ (the actual stream page), not /f1/ (the cross-sport schedule). Drop the dead /iframe1?s= + onclick m3u8 regexes (site moved to `getElementById('iframe').src = '...'` ~20 channels ago). Strip HTML comments first so the ~20 legacy buttons kept inside `<!-- ... -->` stop showing up as false positives. Also pick up the default inline <iframe id='iframe' src='...'>. Local run: 11 channels (was 0).
- pitsport: decode the RSC payload before regex-matching in _parse_live_events (raw HTML had it escape-encoded, so the homepage card path was silently 0). Add the new /live-now route (canonical what's-live-right-now list). Add "f1" to MOTORSPORT_CATEGORIES — the site labels Formula 1 events as just "F1". Refresh the stale serveplay.site docstring (host rotates; pushembdz's api/stream link is authoritative). Local run: 7 m3u8 streams covering Canadian GP (EN1/EN2/MULTI/ITA/ESP) + NASCAR Coke 600 (was 0).
- ppv: always emit the parent embed alongside substreams (was dropping it whenever substreams existed). Prefer source_tag in substream titles so users see "Sky Sport 1 NZ" / "Apple TV (F1TV)" instead of generic #1/#2 suffixes.

Diagnosed against the live cluster (curated + 7 other extractors returning 0 cached streams, only 2 dead hmembeds curated 24/7 channels visible to users). Each fix verified with the extractor run against live sites this turn.
2026-05-24 22:05:37 +00:00 · 2026-05-24 22:05:37 +00:00 · 5a0e4b3dac
commit 5a0e4b3dac
parent d5f73ce109
3 changed files with 147 additions and 125 deletions
--- a/stacks/f1-stream/files/backend/extractors/aceztrims.py
+++ b/stacks/f1-stream/files/backend/extractors/aceztrims.py
@ -1,13 +1,24 @@
-"""Aceztrims extractor - scrapes F1 streaming links from Aceztrims pages.
+"""Aceztrims extractor — scrapes embed URLs from acestrlms.pages.dev/f11/.

-Parses HTML for iframe button onclick handlers and extracts streams from:
- /iframe1?s=<m3u8_url> → direct m3u8
- https://pooembed.eu/embed/... → embed URL
+The page (Cloudflare Pages, no anti-bot) hosts an iframe + a strip of
+onclick channel-switcher buttons. Each button rewrites the iframe via
+`document.getElementById('iframe').src = '<embed_url>'`. The initial
+channel is hard-coded as `<iframe id='iframe' src='...'>`.
+
+We strip HTML comments first because the page keeps ~20 legacy channel
+buttons inside `<!-- ... -->` blocks for easy re-enablement; the previous
+loose regex picked them up as false positives.
+
+All channels are iframe embeds (no direct m3u8) — `stream_type='embed'`.
+
+Site naming note: the extractor key stays `aceztrims` (the previous
+domain) so registry/cache identifiers don't churn. The current domain
+is `acestrlms.pages.dev` and the F1 path is `/f11/` (two ones — `/f1/`
+is the cross-sport schedule page and has no stream buttons).
 """

 import logging
 import re
-from urllib.parse import parse_qs, urlparse

 import httpx

@ -17,9 +28,8 @@ from backend.extractors.models import ExtractedStream
 logger = logging.getLogger(__name__)

 BASE_URL = "https://acestrlms.pages.dev"
-# Pages to scrape for streams
 F1_PAGES = [
-    ("/f1/", "Formula 1"),
+    ("/f11/", "Formula 1"),
 ]

 USER_AGENT = (
@ -28,13 +38,21 @@ USER_AGENT = (
    "Chrome/120.0.0.0 Safari/537.36"
 )

+# `document.getElementById('iframe').src = '<URL>'` — current channel-switcher format.
+_ONCLICK_IFRAME_SRC = re.compile(
+    r"""document\.getElementById\(['"]iframe['"]\)\.src\s*=\s*['"]([^'"]+)['"]""",
+    re.IGNORECASE,
+)
+# `<iframe id='iframe' src='<URL>'>` — the default/initial channel.
+_DEFAULT_IFRAME = re.compile(
+    r"""<iframe[^>]*id\s*=\s*['"]iframe['"][^>]*src\s*=\s*['"]([^'"]+)['"]""",
+    re.IGNORECASE,
+)
+_HTML_COMMENT = re.compile(r"<!--.*?-->", re.DOTALL)
+

 class AceztrimsExtractor(BaseExtractor):
-    """Extracts streams from Aceztrims pages by parsing HTML for iframe URLs.
-
-    Looks for onclick handlers on buttons/links that open iframes, and
-    extracts the stream URLs from them.
-    """
+    """Pulls iframe embed URLs out of the acestrlms.pages.dev F1 page."""

    @property
    def site_key(self) -> str:
@ -45,7 +63,6 @@ class AceztrimsExtractor(BaseExtractor):
        return "Aceztrims"

    async def extract(self) -> list[ExtractedStream]:
-        """Scrape all configured F1 pages for stream URLs."""
        streams: list[ExtractedStream] = []

        async with httpx.AsyncClient(
@ -55,12 +72,9 @@ class AceztrimsExtractor(BaseExtractor):
        ) as client:
            for path, category in F1_PAGES:
                try:
-                    page_streams = await self._scrape_page(client, path, category)
-                    streams.extend(page_streams)
+                    streams.extend(await self._scrape_page(client, path, category))
                except Exception:
-                    logger.exception(
-                        "[aceztrims] Failed to scrape page %s", path
-                    )
+                    logger.exception("[aceztrims] Failed to scrape %s", path)

        logger.info("[aceztrims] Extracted %d stream(s)", len(streams))
        return streams
@ -68,85 +82,39 @@ class AceztrimsExtractor(BaseExtractor):
    async def _scrape_page(
        self, client: httpx.AsyncClient, path: str, category: str
    ) -> list[ExtractedStream]:
-        """Scrape a single page for stream URLs."""
        url = f"{BASE_URL}{path}"
        resp = await client.get(url)
        if resp.status_code != 200:
            logger.warning(
-                "[aceztrims] Page %s returned HTTP %d", path, resp.status_code
+                "[aceztrims] %s returned HTTP %d", path, resp.status_code
            )
            return []

-        html = resp.text
+        # The page keeps a block of legacy channel buttons inside
+        # `<!-- ... -->` for quick re-enablement. Strip comments first so
+        # the regex only sees live buttons.
+        html = _HTML_COMMENT.sub("", resp.text)
+
+        seen: set[str] = set()
        streams: list[ExtractedStream] = []
-        seen_urls: set[str] = set()

-        # Pattern 1: /iframe1?s=<m3u8_url> — direct m3u8
-        iframe1_pattern = re.compile(
-            r"""['"]((?:https?://[^'"]*)?/iframe1\?s=([^'"&]+))['""]""",
-            re.IGNORECASE,
-        )
-        for match in iframe1_pattern.finditer(html):
-            m3u8_url = match.group(2)
-            if m3u8_url in seen_urls:
-                continue
-            seen_urls.add(m3u8_url)
-
-            streams.append(
-                ExtractedStream(
-                    url=m3u8_url,
-                    site_key=self.site_key,
-                    site_name=self.site_name,
-                    quality="",
-                    title=f"{category} Stream",
-                    stream_type="m3u8",
+        for pattern in (_DEFAULT_IFRAME, _ONCLICK_IFRAME_SRC):
+            for match in pattern.finditer(html):
+                embed_url = match.group(1).strip()
+                if not embed_url or embed_url in seen:
+                    continue
+                seen.add(embed_url)
+                streams.append(
+                    ExtractedStream(
+                        url=embed_url,
+                        site_key=self.site_key,
+                        site_name=self.site_name,
+                        quality="",
+                        title=f"{category} Stream",
+                        stream_type="embed",
+                        embed_url=embed_url,
+                    )
                )
-            )
-
-        # Pattern 2: embed URLs (pooembed.eu or similar)
-        embed_pattern = re.compile(
-            r"""['"]((https?://(?:pooembed\.eu|[^'"]*embed)[^'"]*))['"]""",
-            re.IGNORECASE,
-        )
-        for match in embed_pattern.finditer(html):
-            embed_url = match.group(1)
-            if embed_url in seen_urls:
-                continue
-            seen_urls.add(embed_url)
-
-            streams.append(
-                ExtractedStream(
-                    url=embed_url,
-                    site_key=self.site_key,
-                    site_name=self.site_name,
-                    quality="",
-                    title=f"{category} Stream (Embed)",
-                    stream_type="embed",
-                    embed_url=embed_url,
-                )
-            )
-
-        # Pattern 3: Generic onclick handlers with URLs
-        onclick_pattern = re.compile(
-            r"""onclick\s*=\s*['"].*?['"]?(https?://[^'")\s]+\.m3u8[^'")\s]*)['"]?""",
-            re.IGNORECASE,
-        )
-        for match in onclick_pattern.finditer(html):
-            m3u8_url = match.group(1)
-            if m3u8_url in seen_urls:
-                continue
-            seen_urls.add(m3u8_url)
-
-            streams.append(
-                ExtractedStream(
-                    url=m3u8_url,
-                    site_key=self.site_key,
-                    site_name=self.site_name,
-                    quality="",
-                    title=f"{category} Stream",
-                    stream_type="m3u8",
-                )
-            )

        logger.info(
            "[aceztrims] Found %d stream(s) on %s", len(streams), path
--- a/stacks/f1-stream/files/backend/extractors/pitsport.py
+++ b/stacks/f1-stream/files/backend/extractors/pitsport.py
@ -34,7 +34,7 @@ USER_AGENT = (
 # to also surface MotoGP and adjacent motorsports — keeps the f1-stream
 # UI useful between race weekends and during the off-season.
 MOTORSPORT_CATEGORIES = {
-    "formula 1", "formula 2", "formula 3",
+    "f1", "formula 1", "formula 2", "formula 3",
    "motogp", "moto gp", "moto2", "moto3", "motoe",
    "world rally championship", "wrc",
    "world endurance championship", "wec",
@ -85,27 +85,61 @@ _is_f1_category = _is_motorsport_category
 _is_f1_event = _is_motorsport_event


-def _parse_live_events(html: str) -> list[_PitsportEvent]:
-    """Parse live events from the main page RSC payload.
+def _decode_rsc_payload(html: str) -> str:
+    """Concatenate and unescape all `self.__next_f.push([1, "..."])` chunks.

-    The main page contains event cards with props:
-        category, title, time, imageUrl
-    wrapped in <a href="/watch/{UUID}"> links.
+    Next.js RSC ships its tree as escape-encoded strings inside repeated
+    `self.__next_f.push` calls. Regex over the raw HTML misses everything
+    interesting; we have to decode unicode escapes first.
    """
+    chunks = re.findall(r'self\.__next_f\.push\(\[1,"(.*?)"\]\)', html, re.DOTALL)
+    if not chunks:
+        return ""
+    payload = ""
+    for chunk in chunks:
+        try:
+            payload += chunk.encode().decode("unicode_escape")
+        except Exception:
+            payload += chunk
+    return payload
+
+
+def _parse_live_events(html: str) -> list[_PitsportEvent]:
+    """Parse live events from the main page (or `/live-now`) RSC payload.
+
+    The pages embed event cards inside the Next.js RSC payload; the raw
+    HTML keeps it escape-encoded so we decode first, then match.
+    Two shapes are common:
+      1) Older card props: "category":"...","title":"..." next to
+         "href":"/watch/UUID".
+      2) Newer `event` prop: an `event` object with `uri:"/watch/UUID"`
+         carrying `category` and `title`.
+    """
+    payload = _decode_rsc_payload(html) or html
+
    events: list[_PitsportEvent] = []

-    # Match event cards in the RSC payload - they appear as JSON-like structures
-    # Pattern: href="/watch/UUID" ... category":"...", "title":"..."
-    # In the RSC payload, the data is in the format:
-    #   ["$","$L2","/watch/UUID",{"href":"/watch/UUID","children":["$","$L10",null,
-    #     {"category":"...","title":"...","time":...,"imageUrl":"..."}]}]
-    pattern = re.compile(
+    href_pattern = re.compile(
        r'"href":"(/watch/([0-9a-f-]{36}))"[^}]*?"category":"([^"]+)","title":"([^"]+)"',
    )
-    for match in pattern.finditer(html):
+    for match in href_pattern.finditer(payload):
        _, uuid, category, title = match.groups()
        events.append(_PitsportEvent(category=category, title=title, watch_uuid=uuid))

+    event_pattern = re.compile(
+        r'"event":\{[^{}]*?"title":"([^"]+)"[^{}]*?"uri":"/watch/([0-9a-f-]{36})"[^{}]*?"category":"([^"]+)"',
+    )
+    for match in event_pattern.finditer(payload):
+        title, uuid, category = match.groups()
+        events.append(_PitsportEvent(category=category, title=title, watch_uuid=uuid))
+
+    event_pattern_alt = re.compile(
+        r'"event":\{[^{}]*?"category":"([^"]+)"[^{}]*?"title":"([^"]+)"[^{}]*?"uri":"/watch/([0-9a-f-]{36})"',
+    )
+    for match in event_pattern_alt.finditer(payload):
+        category, title, uuid = match.groups()
+        events.append(_PitsportEvent(category=category, title=title, watch_uuid=uuid))
+
    return events


@ -301,13 +335,12 @@ def _is_m3u8_method(method: str) -> bool:


 def _extract_m3u8_url(link: str) -> str:
-    """Convert a serveplay.site player URL to an m3u8 playlist URL.
+    """Pass through the link from pushembdz's `api/stream/<slug>` response.

-    Input:  https://dash.serveplay.site/{channel}/index.html
-    Output: https://dash.serveplay.site/{channel}/index.html
-
-    The index.html IS the m3u8 playlist (served with proper content-type
-    when fetched with the correct Referer header).
+    The host has rotated over time (serveplay.site → oe1.ossfeed.store →
+    …); the response is always a master playlist URL we hand to the
+    player as-is. Content-Type may be `text/css` or `application/json` —
+    treat as HLS based on body sniffing (`#EXTM3U`), not MIME.
    """
    return link

@ -388,6 +421,24 @@ class PitsportExtractor(BaseExtractor):
        except Exception:
            logger.exception("[pitsport] Failed to fetch main page")

+        # Fetch /live-now — canonical "currently live" list, added 2026.
+        try:
+            resp = await client.get(f"{PITSPORT_BASE}/live-now")
+            if resp.status_code == 200:
+                live_now_events = _parse_live_events(resp.text)
+                logger.info(
+                    "[pitsport] Live-now page: %d event(s)", len(live_now_events)
+                )
+                for ev in live_now_events:
+                    if _is_f1_event(ev.category, ev.title):
+                        all_events.append(ev)
+            else:
+                logger.warning(
+                    "[pitsport] Live-now page returned HTTP %d", resp.status_code
+                )
+        except Exception:
+            logger.exception("[pitsport] Failed to fetch live-now page")
+
        # Fetch schedule page for upcoming events
        try:
            resp = await client.get(f"{PITSPORT_BASE}/schedule")
--- a/stacks/f1-stream/files/backend/extractors/ppv.py
+++ b/stacks/f1-stream/files/backend/extractors/ppv.py
@ -153,21 +153,37 @@ class PPVExtractor(BaseExtractor):
                    if viewers and int(viewers) > 0:
                        title += f" ({viewers} viewers)"

-                    # Check for substreams (multiple quality/language options)
+                    # Always emit the parent stream — substreams are
+                    # additional language/source variants, not replacements.
+                    streams.append(
+                        ExtractedStream(
+                            url=embed_url,
+                            site_key=self.site_key,
+                            site_name=self.site_name,
+                            quality=quality,
+                            title=title,
+                            stream_type="embed",
+                            embed_url=embed_url,
+                        )
+                    )
+
                    substreams = stream_obj.get("substreams")
-                    if isinstance(substreams, list) and substreams:
+                    if isinstance(substreams, list):
                        for i, sub in enumerate(substreams):
                            sub_embed = sub.get("iframe", "") or sub.get("embed_url", "")
                            if not sub_embed:
-                                # Fall back to the parent embed URL
                                sub_embed = embed_url
-                            sub_name = sub.get("name", "") or sub.get("label", "")
+                            sub_name = (
+                                sub.get("source_tag", "")
+                                or sub.get("name", "")
+                                or sub.get("label", "")
+                            )
                            sub_quality = sub.get("tag", "") or sub.get("quality", "") or quality
                            sub_title = f"{name}"
                            if sub_name:
                                sub_title += f" - {sub_name}"
-                            elif i > 0:
-                                sub_title += f" #{i + 1}"
+                            else:
+                                sub_title += f" #{i + 2}"

                            streams.append(
                                ExtractedStream(
@ -180,19 +196,6 @@ class PPVExtractor(BaseExtractor):
                                    embed_url=sub_embed,
                                )
                            )
-                    else:
-                        # Single stream, no substreams
-                        streams.append(
-                            ExtractedStream(
-                                url=embed_url,
-                                site_key=self.site_key,
-                                site_name=self.site_name,
-                                quality=quality,
-                                title=title,
-                                stream_type="embed",
-                                embed_url=embed_url,
-                            )
-                        )

        except Exception:
            logger.exception("[ppv] Failed to extract streams")