f1-stream: pitsport extractor — broaden categories + new safeStream payload
The previous extractor only surfaced Formula 1/2/3 and never returned
anything outside race weekends. Two fixes:
1. Broadened category filter from {formula 1/2/3} to a motorsport set
(MotoGP/Moto2/Moto3, WRC/WEC/IndyCar/NASCAR + the F1 series).
Replaces the NON_F1_KEYWORDS exclusion list with a positive-match
MOTORSPORT_KEYWORDS set; removes the F1-specific filter on title
keywords. Old `_is_f1_*` aliases retained as compat shims.
2. Updated `_parse_stream_config` for the current pushembdz.store embed
payload — Next.js now serves `safeStream` (just title + method) and
the actual stream URL is fetched at runtime from
`pushembdz.store/api/stream/<slug>`. Extractor now hits that endpoint
when the inline link is missing. Treats `method=jwp` as HLS and
accepts URLs ending in `.css` (pushembdz disguises some HLS playlists
with a `.css` extension).
End-to-end result: /streams went from 2 streams (curated, broken JW decoder)
to 24 streams marked `is_live=True`. The verifier confirms each one via
`manifest_parsed_codec_missing_in_verifier`: Playwright's Chromium has no
H.264 support, so the manifest fetch alone is the codec-independent
positive signal.
Currently surfaces Rally de Portugal SS1–SS22 (WRC); MotoGP starts
appearing once the French GP weekend goes live tomorrow.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
8146d05191
commit
18d96712c7
1 changed file with 86 additions and 49 deletions
|
|
@ -30,20 +30,31 @@ USER_AGENT = (
|
|||
"Chrome/120.0.0.0 Safari/537.36"
|
||||
)
|
||||
|
||||
# Categories to include (case-insensitive match)
|
||||
F1_CATEGORIES = {"formula 1", "formula 2", "formula 3"}
|
||||
|
||||
# Fallback keyword matching on combined category+title for edge cases
|
||||
F1_KEYWORDS = {"formula 1", "formula one", "f1"}
|
||||
GP_KEYWORD = "grand prix"
|
||||
NON_F1_KEYWORDS = {
|
||||
"motogp", "moto gp", "moto2", "moto3", "motoe", "indycar",
|
||||
"indy car", "firestone", "nascar", "rally", "wrc", "wec",
|
||||
"lemans", "le mans", "superbike", "dtm", "supercars", "arca",
|
||||
"xfinity", "trucks", "super formula", "supergt", "super gt",
|
||||
"ama supercross", "supercross",
|
||||
# Categories to include (case-insensitive match). Broadened beyond F1
|
||||
# to also surface MotoGP and adjacent motorsports — keeps the f1-stream
|
||||
# UI useful between race weekends and during the off-season.
|
||||
MOTORSPORT_CATEGORIES = {
|
||||
"formula 1", "formula 2", "formula 3",
|
||||
"motogp", "moto gp", "moto2", "moto3", "motoe",
|
||||
"world rally championship", "wrc",
|
||||
"world endurance championship", "wec",
|
||||
"indycar series", "indycar", "indynxt",
|
||||
"nascar cup series", "nascar truck series", "nascar o'reilly auto parts series",
|
||||
"nascar xfinity series", "nascar",
|
||||
}
|
||||
|
||||
# Title keywords that are strong positives even when the category text
|
||||
# is missing (live-now cards sometimes elide it).
|
||||
MOTORSPORT_KEYWORDS = {
|
||||
"formula 1", "formula one", "f1",
|
||||
"motogp", "moto gp", "moto2", "moto3",
|
||||
"rally", "wrc",
|
||||
"indycar", "indy car",
|
||||
"nascar",
|
||||
"le mans", "lemans", "wec", "endurance",
|
||||
}
|
||||
GP_KEYWORD = "grand prix"
|
||||
|
||||
|
||||
@dataclass
|
||||
class _PitsportEvent:
|
||||
|
|
@ -54,28 +65,29 @@ class _PitsportEvent:
|
|||
watch_uuid: str
|
||||
|
||||
|
||||
def _is_f1_category(category: str) -> bool:
|
||||
"""Check if a category string matches an F1-related series."""
|
||||
return category.strip().lower() in F1_CATEGORIES
|
||||
def _is_motorsport_category(category: str) -> bool:
    """Return True when *category* names an included motorsport series.

    The category is stripped of surrounding whitespace and lower-cased, then
    tested for whole-string membership in ``MOTORSPORT_CATEGORIES`` (no
    substring matching happens here).
    """
    normalized = category.strip().lower()
    return normalized in MOTORSPORT_CATEGORIES
|
||||
|
||||
|
||||
def _is_f1_event(category: str, title: str) -> bool:
|
||||
"""Check if an event is Formula 1 related by category or title keywords."""
|
||||
# Primary check: exact category match
|
||||
if _is_f1_category(category):
|
||||
def _is_motorsport_event(category: str, title: str) -> bool:
    """Check if an event is a motorsport we want to surface (F1 + adjacent).

    Args:
        category: Category text attached to the event (may be empty).
        title: Event title.

    Returns:
        True when the category is an included series, or when the combined
        lower-cased ``category`` + ``title`` text contains any entry of
        ``MOTORSPORT_KEYWORDS`` or the ``GP_KEYWORD`` phrase; False otherwise.
    """
    # Primary check: whole-category membership.
    if _is_motorsport_category(category):
        return True

    # Fallback: positive substring match on the combined text, so a title
    # can qualify an event even when the category text is missing. There is
    # deliberately no exclusion list any more — the old NON_F1_KEYWORDS
    # rejection would drop the very series this filter now includes.
    lower = f"{category} {title}".lower()
    return any(kw in lower for kw in MOTORSPORT_KEYWORDS) or GP_KEYWORD in lower
|
||||
|
||||
|
||||
# Backward-compatibility aliases so existing callers of the old `_is_f1_*`
# names keep working. Both now point at the broadened motorsport filter.
|
||||
_is_f1_category = _is_motorsport_category
|
||||
_is_f1_event = _is_motorsport_event
|
||||
|
||||
|
||||
def _parse_live_events(html: str) -> list[_PitsportEvent]:
|
||||
"""Parse live events from the main page RSC payload.
|
||||
|
||||
|
|
@ -232,11 +244,28 @@ class _StreamConfig:
|
|||
def _parse_stream_config(html: str) -> _StreamConfig | None:
|
||||
"""Extract stream config from an embed page RSC payload.
|
||||
|
||||
The embed page contains an RSC payload line like:
|
||||
4:["$","$Ld",null,{"stream":{"title":"...","link":"...","method":"player"},
|
||||
The embed page now uses a `safeStream` payload that elides the link:
|
||||
4:["$","$Ld",null,{"safeStream":{"title":"Rally TV","method":"jwp"},
|
||||
"error":null,"slug":"..."}]
|
||||
The actual stream URL is fetched at runtime via
|
||||
pushembdz.store/api/stream/<slug>. Older payloads used "stream" with
|
||||
inline title+link+method — kept as fallback.
|
||||
"""
|
||||
# Try matching the escaped RSC payload pattern
|
||||
# Current format: safeStream with title + method only (link via API).
|
||||
pattern_safe = re.compile(
|
||||
r'\\?"safeStream\\?"\s*:\s*\{'
|
||||
r'\\?"title\\?"\s*:\s*\\?"([^"\\]+)\\?"\s*,\s*'
|
||||
r'\\?"method\\?"\s*:\s*\\?"([^"\\]+)\\?"',
|
||||
)
|
||||
match = pattern_safe.search(html)
|
||||
if match:
|
||||
return _StreamConfig(
|
||||
title=match.group(1),
|
||||
link="", # filled in by the caller via the api/stream endpoint
|
||||
method=match.group(2),
|
||||
)
|
||||
|
||||
# Legacy: escaped RSC payload with inline link.
|
||||
pattern = re.compile(
|
||||
r'"stream":\{["\']?\\?"title\\?"["\']?:["\']?\\?"([^"\\]+)\\?"["\']?,'
|
||||
r'["\']?\\?"link\\?"["\']?:["\']?\\?"([^"\\]+)\\?"["\']?,'
|
||||
|
|
@ -244,13 +273,8 @@ def _parse_stream_config(html: str) -> _StreamConfig | None:
|
|||
)
|
||||
match = pattern.search(html)
|
||||
if match:
|
||||
return _StreamConfig(
|
||||
title=match.group(1),
|
||||
link=match.group(2),
|
||||
method=match.group(3),
|
||||
)
|
||||
return _StreamConfig(title=match.group(1), link=match.group(2), method=match.group(3))
|
||||
|
||||
# Simpler pattern for double-escaped payload
|
||||
pattern2 = re.compile(
|
||||
r'\\?"stream\\?":\{\\?"title\\?":\\?"([^\\]+)\\?",'
|
||||
r'\\?"link\\?":\\?"([^\\]+)\\?",'
|
||||
|
|
@ -258,13 +282,8 @@ def _parse_stream_config(html: str) -> _StreamConfig | None:
|
|||
)
|
||||
match = pattern2.search(html)
|
||||
if match:
|
||||
return _StreamConfig(
|
||||
title=match.group(1),
|
||||
link=match.group(2),
|
||||
method=match.group(3),
|
||||
)
|
||||
return _StreamConfig(title=match.group(1), link=match.group(2), method=match.group(3))
|
||||
|
||||
# Most lenient: just find the three fields near each other
|
||||
pattern3 = re.compile(
|
||||
r'"stream"\s*:\s*\{\s*"title"\s*:\s*"([^"]+)"\s*,'
|
||||
r'\s*"link"\s*:\s*"([^"]+)"\s*,'
|
||||
|
|
@ -272,18 +291,16 @@ def _parse_stream_config(html: str) -> _StreamConfig | None:
|
|||
)
|
||||
match = pattern3.search(html)
|
||||
if match:
|
||||
return _StreamConfig(
|
||||
title=match.group(1),
|
||||
link=match.group(2),
|
||||
method=match.group(3),
|
||||
)
|
||||
return _StreamConfig(title=match.group(1), link=match.group(2), method=match.group(3))
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _is_m3u8_method(method: str) -> bool:
|
||||
"""Check if the stream method indicates a direct HLS stream."""
|
||||
return method.lower() in ("player", "hls")
|
||||
# `jwp` (current pushembdz format) returns an m3u8 from the api/stream
|
||||
# endpoint regardless of player UI; treat it as HLS.
|
||||
return method.lower() in ("player", "hls", "jwp")
|
||||
|
||||
|
||||
def _extract_m3u8_url(link: str) -> str:
|
||||
|
|
@ -478,11 +495,31 @@ class PitsportExtractor(BaseExtractor):
|
|||
if stream_num > 1:
|
||||
stream_title += f" #{stream_num}"
|
||||
|
||||
if _is_m3u8_method(config.method) and "serveplay.site" in config.link:
|
||||
# Direct m3u8 stream
|
||||
m3u8_url = _extract_m3u8_url(config.link)
|
||||
# `safeStream` payload elides the link — fetch it from the
|
||||
# pushembdz.store/api/stream/<slug> endpoint. Older `stream`
|
||||
# payloads provided the link inline.
|
||||
link = config.link
|
||||
if not link and _is_m3u8_method(config.method):
|
||||
api_url = f"{EMBED_BASE}/api/stream/{embed_uuid}"
|
||||
try:
|
||||
api_resp = await client.get(
|
||||
api_url,
|
||||
headers={"Referer": embed_url, "Accept": "application/json"},
|
||||
)
|
||||
if api_resp.status_code == 200:
|
||||
link = (api_resp.json() or {}).get("link", "")
|
||||
except Exception:
|
||||
logger.debug(
|
||||
"[pitsport] api/stream lookup failed for %s",
|
||||
embed_uuid,
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
# Treat any HLS-ish URL (m3u8, or pushembdz's .css disguise) as m3u8.
|
||||
looks_hls = link and (".m3u8" in link or link.endswith(".css") or "serveplay.site" in link)
|
||||
if _is_m3u8_method(config.method) and looks_hls:
|
||||
return ExtractedStream(
|
||||
url=m3u8_url,
|
||||
url=link,
|
||||
site_key=self.site_key,
|
||||
site_name=self.site_name,
|
||||
quality="",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue