f1-stream: pitsport extractor — broaden categories + new safeStream payload

The previous extractor only surfaced Formula 1/2/3 and never returned anything outside race weekends. Two fixes:

1. Broadened the category filter from {formula 1/2/3} to a motorsport set (MotoGP/Moto2/Moto3, WRC/WEC/IndyCar/NASCAR + the F1 series). Replaces the NON_F1_KEYWORDS exclusion list with a positive-match MOTORSPORT_KEYWORDS set; removes the F1-specific filter on title keywords. Old `_is_f1_*` aliases retained as compat shims.

2. Updated `_parse_stream_config` for the current pushembdz.store embed payload — Next.js now serves `safeStream` (just title + method) and the actual stream URL is fetched at runtime from `pushembdz.store/api/stream/<slug>`. The extractor now hits that endpoint when the inline link is missing. Treats `method=jwp` as HLS and accepts URLs ending in `.css` (pushembdz disguises some HLS playlists with a `.css` extension).

End-to-end result: /streams went from 2 streams (curated, broken JW decoder) to 24 streams marked `is_live=True`. The verifier confirms each via `manifest_parsed_codec_missing_in_verifier` (Playwright Chromium has no H.264, so the manifest fetch alone is the codec-independent positive signal). Currently surfaces Rally de Portugal SS1–SS22 (WRC); MotoGP starts appearing once the French GP weekend goes live tomorrow.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-07 15:25:27 +00:00 · 2026-05-07 15:25:27 +00:00 · 18d96712c7
commit 18d96712c7
parent 8146d05191
1 changed files with 86 additions and 49 deletions
--- a/stacks/f1-stream/files/backend/extractors/pitsport.py
+++ b/stacks/f1-stream/files/backend/extractors/pitsport.py
@ -30,20 +30,31 @@ USER_AGENT = (
    "Chrome/120.0.0.0 Safari/537.36"
 )
-# Categories to include (case-insensitive match)
+# Categories to include (case-insensitive match). Broadened beyond F1
-F1_CATEGORIES = {"formula 1", "formula 2", "formula 3"}
+# to also surface MotoGP and adjacent motorsports — keeps the f1-stream
-
+# UI useful between race weekends and during the off-season.
-# Fallback keyword matching on combined category+title for edge cases
+MOTORSPORT_CATEGORIES = {
-F1_KEYWORDS = {"formula 1", "formula one", "f1"}
+    "formula 1", "formula 2", "formula 3",
-GP_KEYWORD = "grand prix"
+    "motogp", "moto gp", "moto2", "moto3", "motoe",
-NON_F1_KEYWORDS = {
+    "world rally championship", "wrc",
-    "motogp", "moto gp", "moto2", "moto3", "motoe", "indycar",
+    "world endurance championship", "wec",
-    "indy car", "firestone", "nascar", "rally", "wrc", "wec",
+    "indycar series", "indycar", "indynxt",
-    "lemans", "le mans", "superbike", "dtm", "supercars", "arca",
+    "nascar cup series", "nascar truck series", "nascar o'reilly auto parts series",
-    "xfinity", "trucks", "super formula", "supergt", "super gt",
+    "nascar xfinity series", "nascar",
    "ama supercross", "supercross",
 }
 # Title keywords that are strong positives even when the category text
 # is missing (live-now cards sometimes elide it).
 MOTORSPORT_KEYWORDS = {
    "formula 1", "formula one", "f1",
    "motogp", "moto gp", "moto2", "moto3",
    "rally", "wrc",
    "indycar", "indy car",
    "nascar",
    "le mans", "lemans", "wec", "endurance",
 }
 GP_KEYWORD = "grand prix"
@dataclass
 class _PitsportEvent:
@ -54,28 +65,29 @@ class _PitsportEvent:
    watch_uuid: str
-def _is_f1_category(category: str) -> bool:
+def _is_motorsport_category(category: str) -> bool:
-    """Check if a category string matches an F1-related series."""
+    """Check if a category string matches an included motorsport series."""
-    return category.strip().lower() in F1_CATEGORIES
+    return category.strip().lower() in MOTORSPORT_CATEGORIES
-def _is_f1_event(category: str, title: str) -> bool:
+def _is_motorsport_event(category: str, title: str) -> bool:
-    """Check if an event is Formula 1 related by category or title keywords."""
+    """Check if an event is a motorsport we want to surface (F1 + adjacent)."""
-    # Primary check: exact category match
+    if _is_motorsport_category(category):
    if _is_f1_category(category):
        return True
    # Secondary check: keyword matching on combined text
    lower = f"{category} {title}".lower()
-    if any(kw in lower for kw in NON_F1_KEYWORDS):
+    if any(kw in lower for kw in MOTORSPORT_KEYWORDS):
        return False
    if any(kw in lower for kw in F1_KEYWORDS):
        return True
    if GP_KEYWORD in lower:
        return True
    return False
 # Aliases kept so that older call sites keep working. Both now point at
 # the broadened motorsport filter.
 _is_f1_category = _is_motorsport_category
 _is_f1_event = _is_motorsport_event
 def _parse_live_events(html: str) -> list[_PitsportEvent]:
    """Parse live events from the main page RSC payload.
@ -232,11 +244,28 @@ class _StreamConfig:
 def _parse_stream_config(html: str) -> _StreamConfig | None:
    """Extract stream config from an embed page RSC payload.
-    The embed page contains an RSC payload line like:
+    The embed page now uses a `safeStream` payload that elides the link:
-        4:["$","$Ld",null,{"stream":{"title":"...","link":"...","method":"player"},
+        4:["$","$Ld",null,{"safeStream":{"title":"Rally TV","method":"jwp"},
           "error":null,"slug":"..."}]
    The actual stream URL is fetched at runtime via
    pushembdz.store/api/stream/<slug>. Older payloads used "stream" with
    inline title+link+method — kept as fallback.
    """
-    # Try matching the escaped RSC payload pattern
+    # Current format: safeStream with title + method only (link via API).
    pattern_safe = re.compile(
        r'\\?"safeStream\\?"\s*:\s*\{'
        r'\\?"title\\?"\s*:\s*\\?"([^"\\]+)\\?"\s*,\s*'
        r'\\?"method\\?"\s*:\s*\\?"([^"\\]+)\\?"',
    )
    match = pattern_safe.search(html)
    if match:
        return _StreamConfig(
            title=match.group(1),
            link="",  # filled in by the caller via the api/stream endpoint
            method=match.group(2),
        )
    # Legacy: escaped RSC payload with inline link.
    pattern = re.compile(
        r'"stream":\{["\']?\\?"title\\?"["\']?:["\']?\\?"([^"\\]+)\\?"["\']?,'
        r'["\']?\\?"link\\?"["\']?:["\']?\\?"([^"\\]+)\\?"["\']?,'
@ -244,13 +273,8 @@ def _parse_stream_config(html: str) -> _StreamConfig | None:
    )
    match = pattern.search(html)
    if match:
-        return _StreamConfig(
+        return _StreamConfig(title=match.group(1), link=match.group(2), method=match.group(3))
            title=match.group(1),
            link=match.group(2),
            method=match.group(3),
        )
    # Simpler pattern for double-escaped payload
    pattern2 = re.compile(
        r'\\?"stream\\?":\{\\?"title\\?":\\?"([^\\]+)\\?",'
        r'\\?"link\\?":\\?"([^\\]+)\\?",'
@ -258,13 +282,8 @@ def _parse_stream_config(html: str) -> _StreamConfig | None:
    )
    match = pattern2.search(html)
    if match:
-        return _StreamConfig(
+        return _StreamConfig(title=match.group(1), link=match.group(2), method=match.group(3))
            title=match.group(1),
            link=match.group(2),
            method=match.group(3),
        )
    # Most lenient: just find the three fields near each other
    pattern3 = re.compile(
        r'"stream"\s*:\s*\{\s*"title"\s*:\s*"([^"]+)"\s*,'
        r'\s*"link"\s*:\s*"([^"]+)"\s*,'
@ -272,18 +291,16 @@ def _parse_stream_config(html: str) -> _StreamConfig | None:
    )
    match = pattern3.search(html)
    if match:
-        return _StreamConfig(
+        return _StreamConfig(title=match.group(1), link=match.group(2), method=match.group(3))
            title=match.group(1),
            link=match.group(2),
            method=match.group(3),
        )
    return None
 def _is_m3u8_method(method: str) -> bool:
    """Check if the stream method indicates a direct HLS stream."""
-    return method.lower() in ("player", "hls")
+    # `jwp` (current pushembdz format) returns an m3u8 from the api/stream
    # endpoint regardless of player UI; treat it as HLS.
    return method.lower() in ("player", "hls", "jwp")
 def _extract_m3u8_url(link: str) -> str:
@ -478,11 +495,31 @@ class PitsportExtractor(BaseExtractor):
            if stream_num > 1:
                stream_title += f" #{stream_num}"
-            if _is_m3u8_method(config.method) and "serveplay.site" in config.link:
+            # `safeStream` payload elides the link — fetch it from the
-                # Direct m3u8 stream
+            # pushembdz.store/api/stream/<slug> endpoint. Older `stream`
-                m3u8_url = _extract_m3u8_url(config.link)
+            # payloads provided the link inline.
            link = config.link
            if not link and _is_m3u8_method(config.method):
                api_url = f"{EMBED_BASE}/api/stream/{embed_uuid}"
                try:
                    api_resp = await client.get(
                        api_url,
                        headers={"Referer": embed_url, "Accept": "application/json"},
                    )
                    if api_resp.status_code == 200:
                        link = (api_resp.json() or {}).get("link", "")
                except Exception:
                    logger.debug(
                        "[pitsport] api/stream lookup failed for %s",
                        embed_uuid,
                        exc_info=True,
                    )
            # Treat any HLS-ish URL (m3u8, or pushembdz's .css disguise) as m3u8.
            looks_hls = link and (".m3u8" in link or link.endswith(".css") or "serveplay.site" in link)
            if _is_m3u8_method(config.method) and looks_hls:
                return ExtractedStream(
-                    url=m3u8_url,
+                    url=link,
                    site_key=self.site_key,
                    site_name=self.site_name,
                    quality="",