f1-stream: pitsport extractor — broaden categories + new safeStream payload

The previous extractor only surfaced Formula 1/2/3 and never returned
anything outside race weekends. Two fixes:

1. Broadened the category filter from {formula 1/2/3} to a motorsport set
   (MotoGP/Moto2/Moto3/MotoE, WRC, WEC, IndyCar, NASCAR and Supercross,
   plus the F1 series). Replaced the NON_F1_KEYWORDS exclusion list with
   a positive-match MOTORSPORT_KEYWORDS set and removed the F1-specific
   filter on title keywords. The old `_is_f1_*` names are retained as
   compat aliases.

2. Updated `_parse_stream_config` for the current pushembdz.store embed
   payload — Next.js now serves `safeStream` (just title + method) and
   the actual stream URL is fetched at runtime from
   `pushembdz.store/api/stream/<slug>`. Extractor now hits that endpoint
   when the inline link is missing. Treats `method=jwp` as HLS and
   accepts URLs ending in `.css` (pushembdz disguises some HLS playlists
   with a `.css` extension).

End-to-end result: /streams went from 2 streams (curated, broken JW
decoder) to 24 streams marked `is_live=True`. The verifier confirms each
via `manifest_parsed_codec_missing_in_verifier` (Playwright Chromium has
no H.264, so a successful manifest fetch alone is the codec-independent
positive signal). Currently surfaces Rally de Portugal SS1–SS22 (WRC);
MotoGP should start appearing once the French GP weekend goes live
tomorrow (2026-05-08).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-05-07 15:25:27 +00:00
parent 8146d05191
commit 18d96712c7

View file

@ -30,20 +30,31 @@ USER_AGENT = (
"Chrome/120.0.0.0 Safari/537.36" "Chrome/120.0.0.0 Safari/537.36"
) )
# Categories to include (case-insensitive match) # Categories to include (case-insensitive match). Broadened beyond F1
F1_CATEGORIES = {"formula 1", "formula 2", "formula 3"} # to also surface MotoGP and adjacent motorsports — keeps the f1-stream
# UI useful between race weekends and during the off-season.
# Fallback keyword matching on combined category+title for edge cases MOTORSPORT_CATEGORIES = {
F1_KEYWORDS = {"formula 1", "formula one", "f1"} "formula 1", "formula 2", "formula 3",
GP_KEYWORD = "grand prix" "motogp", "moto gp", "moto2", "moto3", "motoe",
NON_F1_KEYWORDS = { "world rally championship", "wrc",
"motogp", "moto gp", "moto2", "moto3", "motoe", "indycar", "world endurance championship", "wec",
"indy car", "firestone", "nascar", "rally", "wrc", "wec", "indycar series", "indycar", "indynxt",
"lemans", "le mans", "superbike", "dtm", "supercars", "arca", "nascar cup series", "nascar truck series", "nascar o'reilly auto parts series",
"xfinity", "trucks", "super formula", "supergt", "super gt", "nascar xfinity series", "nascar",
"ama supercross", "supercross",
} }
# Title keywords that are strong positives even when the category text
# is missing (live-now cards sometimes elide it).
MOTORSPORT_KEYWORDS = {
"formula 1", "formula one", "f1",
"motogp", "moto gp", "moto2", "moto3",
"rally", "wrc",
"indycar", "indy car",
"nascar",
"le mans", "lemans", "wec", "endurance",
}
GP_KEYWORD = "grand prix"
@dataclass @dataclass
class _PitsportEvent: class _PitsportEvent:
@ -54,28 +65,29 @@ class _PitsportEvent:
watch_uuid: str watch_uuid: str
def _is_f1_category(category: str) -> bool: def _is_motorsport_category(category: str) -> bool:
"""Check if a category string matches an F1-related series.""" """Check if a category string matches an included motorsport series."""
return category.strip().lower() in F1_CATEGORIES return category.strip().lower() in MOTORSPORT_CATEGORIES
def _is_f1_event(category: str, title: str) -> bool: def _is_motorsport_event(category: str, title: str) -> bool:
"""Check if an event is Formula 1 related by category or title keywords.""" """Check if an event is a motorsport we want to surface (F1 + adjacent)."""
# Primary check: exact category match if _is_motorsport_category(category):
if _is_f1_category(category):
return True return True
# Secondary check: keyword matching on combined text
lower = f"{category} {title}".lower() lower = f"{category} {title}".lower()
if any(kw in lower for kw in NON_F1_KEYWORDS): if any(kw in lower for kw in MOTORSPORT_KEYWORDS):
return False
if any(kw in lower for kw in F1_KEYWORDS):
return True return True
if GP_KEYWORD in lower: if GP_KEYWORD in lower:
return True return True
return False return False
# Aliases kept so older call-sites keep working. Both now point at the
# broadened motorsport filter.
_is_f1_category = _is_motorsport_category
_is_f1_event = _is_motorsport_event
def _parse_live_events(html: str) -> list[_PitsportEvent]: def _parse_live_events(html: str) -> list[_PitsportEvent]:
"""Parse live events from the main page RSC payload. """Parse live events from the main page RSC payload.
@ -232,11 +244,28 @@ class _StreamConfig:
def _parse_stream_config(html: str) -> _StreamConfig | None: def _parse_stream_config(html: str) -> _StreamConfig | None:
"""Extract stream config from an embed page RSC payload. """Extract stream config from an embed page RSC payload.
The embed page contains an RSC payload line like: The embed page now uses a `safeStream` payload that elides the link:
4:["$","$Ld",null,{"stream":{"title":"...","link":"...","method":"player"}, 4:["$","$Ld",null,{"safeStream":{"title":"Rally TV","method":"jwp"},
"error":null,"slug":"..."}] "error":null,"slug":"..."}]
The actual stream URL is fetched at runtime via
pushembdz.store/api/stream/<slug>. Older payloads used "stream" with
inline title+link+method kept as fallback.
""" """
# Try matching the escaped RSC payload pattern # Current format: safeStream with title + method only (link via API).
pattern_safe = re.compile(
r'\\?"safeStream\\?"\s*:\s*\{'
r'\\?"title\\?"\s*:\s*\\?"([^"\\]+)\\?"\s*,\s*'
r'\\?"method\\?"\s*:\s*\\?"([^"\\]+)\\?"',
)
match = pattern_safe.search(html)
if match:
return _StreamConfig(
title=match.group(1),
link="", # filled in by the caller via the api/stream endpoint
method=match.group(2),
)
# Legacy: escaped RSC payload with inline link.
pattern = re.compile( pattern = re.compile(
r'"stream":\{["\']?\\?"title\\?"["\']?:["\']?\\?"([^"\\]+)\\?"["\']?,' r'"stream":\{["\']?\\?"title\\?"["\']?:["\']?\\?"([^"\\]+)\\?"["\']?,'
r'["\']?\\?"link\\?"["\']?:["\']?\\?"([^"\\]+)\\?"["\']?,' r'["\']?\\?"link\\?"["\']?:["\']?\\?"([^"\\]+)\\?"["\']?,'
@ -244,13 +273,8 @@ def _parse_stream_config(html: str) -> _StreamConfig | None:
) )
match = pattern.search(html) match = pattern.search(html)
if match: if match:
return _StreamConfig( return _StreamConfig(title=match.group(1), link=match.group(2), method=match.group(3))
title=match.group(1),
link=match.group(2),
method=match.group(3),
)
# Simpler pattern for double-escaped payload
pattern2 = re.compile( pattern2 = re.compile(
r'\\?"stream\\?":\{\\?"title\\?":\\?"([^\\]+)\\?",' r'\\?"stream\\?":\{\\?"title\\?":\\?"([^\\]+)\\?",'
r'\\?"link\\?":\\?"([^\\]+)\\?",' r'\\?"link\\?":\\?"([^\\]+)\\?",'
@ -258,13 +282,8 @@ def _parse_stream_config(html: str) -> _StreamConfig | None:
) )
match = pattern2.search(html) match = pattern2.search(html)
if match: if match:
return _StreamConfig( return _StreamConfig(title=match.group(1), link=match.group(2), method=match.group(3))
title=match.group(1),
link=match.group(2),
method=match.group(3),
)
# Most lenient: just find the three fields near each other
pattern3 = re.compile( pattern3 = re.compile(
r'"stream"\s*:\s*\{\s*"title"\s*:\s*"([^"]+)"\s*,' r'"stream"\s*:\s*\{\s*"title"\s*:\s*"([^"]+)"\s*,'
r'\s*"link"\s*:\s*"([^"]+)"\s*,' r'\s*"link"\s*:\s*"([^"]+)"\s*,'
@ -272,18 +291,16 @@ def _parse_stream_config(html: str) -> _StreamConfig | None:
) )
match = pattern3.search(html) match = pattern3.search(html)
if match: if match:
return _StreamConfig( return _StreamConfig(title=match.group(1), link=match.group(2), method=match.group(3))
title=match.group(1),
link=match.group(2),
method=match.group(3),
)
return None return None
def _is_m3u8_method(method: str) -> bool: def _is_m3u8_method(method: str) -> bool:
"""Check if the stream method indicates a direct HLS stream.""" """Check if the stream method indicates a direct HLS stream."""
return method.lower() in ("player", "hls") # `jwp` (current pushembdz format) returns an m3u8 from the api/stream
# endpoint regardless of player UI; treat it as HLS.
return method.lower() in ("player", "hls", "jwp")
def _extract_m3u8_url(link: str) -> str: def _extract_m3u8_url(link: str) -> str:
@ -478,11 +495,31 @@ class PitsportExtractor(BaseExtractor):
if stream_num > 1: if stream_num > 1:
stream_title += f" #{stream_num}" stream_title += f" #{stream_num}"
if _is_m3u8_method(config.method) and "serveplay.site" in config.link: # `safeStream` payload elides the link — fetch it from the
# Direct m3u8 stream # pushembdz.store/api/stream/<slug> endpoint. Older `stream`
m3u8_url = _extract_m3u8_url(config.link) # payloads provided the link inline.
link = config.link
if not link and _is_m3u8_method(config.method):
api_url = f"{EMBED_BASE}/api/stream/{embed_uuid}"
try:
api_resp = await client.get(
api_url,
headers={"Referer": embed_url, "Accept": "application/json"},
)
if api_resp.status_code == 200:
link = (api_resp.json() or {}).get("link", "")
except Exception:
logger.debug(
"[pitsport] api/stream lookup failed for %s",
embed_uuid,
exc_info=True,
)
# Treat any HLS-ish URL (m3u8, or pushembdz's .css disguise) as m3u8.
looks_hls = link and (".m3u8" in link or link.endswith(".css") or "serveplay.site" in link)
if _is_m3u8_method(config.method) and looks_hls:
return ExtractedStream( return ExtractedStream(
url=m3u8_url, url=link,
site_key=self.site_key, site_key=self.site_key,
site_name=self.site_name, site_name=self.site_name,
quality="", quality="",