"""Pitsport.xyz extractor - fetches F1 streams from the Next.js RSC payload.

Architecture:
- Main page (pitsport.xyz) has a "Live Now" section with event cards
  containing category, title, time, imageUrl props and /watch/{UUID} links.
- Schedule page (pitsport.xyz/schedule) lists all events grouped by
  category (h2 headings) with /watch/{UUID} links and event titles.
- Watch pages (/watch/{UUID}) embed iframes from
  pushembdz.store/embed/{EMBED_UUID}.
- Embed pages contain an RSC payload with a stream config:
  {title, link, method}.
- When method is "player" or "hls", the link field points to a
  serveplay.site m3u8 playlist. Otherwise we return the embed URL for
  iframe playback.
"""

import logging
import re
from dataclasses import dataclass

import httpx

from backend.extractors.base import BaseExtractor
from backend.extractors.models import ExtractedStream

logger = logging.getLogger(__name__)

# Site root that serves the main, schedule and /watch/{UUID} pages.
PITSPORT_BASE = "https://pitsport.xyz"
# Host of the embed pages that watch pages load in an iframe.
EMBED_BASE = "https://pushembdz.store"
# Desktop Chrome user-agent string.
# NOTE(review): presumably sent with outgoing HTTP requests; the call sites
# are outside this section - confirm.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)

# Categories to include (case-insensitive match). Broadened beyond F1
# to also surface MotoGP and adjacent motorsports — keeps the f1-stream
# UI useful between race weekends and during the off-season.
# Entries are stored lower-case because `_is_motorsport_category` strips and
# lower-cases its input before the membership test.
MOTORSPORT_CATEGORIES = {
    "f1",
    "formula 1",
    "formula 2",
    "formula 3",
    "motogp",
    "moto gp",
    "moto2",
    "moto3",
    "motoe",
    "world rally championship",
    "wrc",
    "world endurance championship",
    "wec",
    "indycar series",
    "indycar",
    "indynxt",
    "nascar cup series",
    "nascar truck series",
    "nascar o'reilly auto parts series",
    "nascar xfinity series",
    "nascar",
}

# Title keywords that are strong positives even when the category text
# is missing (live-now cards sometimes elide it).
MOTORSPORT_KEYWORDS = { "formula 1", "formula one", "f1", "motogp", "moto gp", "moto2", "moto3", "rally", "wrc", "indycar", "indy car", "nascar", "le mans", "lemans", "wec", "endurance", } GP_KEYWORD = "grand prix" @dataclass class _PitsportEvent: """An event discovered from the Pitsport site.""" category: str title: str watch_uuid: str def _is_motorsport_category(category: str) -> bool: """Check if a category string matches an included motorsport series.""" return category.strip().lower() in MOTORSPORT_CATEGORIES def _is_motorsport_event(category: str, title: str) -> bool: """Accept anything pitsport.xyz lists. Pitsport curates sports broadcasts (WRC, MotoGP, IndyCar, NASCAR, Premier League Darts, Premier League football, etc.) — the site's own selection is the filter we want. Empty/garbage events still get filtered downstream when `_resolve_event_streams` produces no playable URL.""" return bool(category or title) # Aliases kept so older call-sites stay compiling. Both now point at the # broadened motorsport filter. _is_f1_category = _is_motorsport_category _is_f1_event = _is_motorsport_event def _decode_rsc_payload(html: str) -> str: """Concatenate and unescape all `self.__next_f.push([1, "..."])` chunks. Next.js RSC ships its tree as escape-encoded strings inside repeated `self.__next_f.push` calls. Regex over the raw HTML misses everything interesting; we have to decode unicode escapes first. """ chunks = re.findall(r'self\.__next_f\.push\(\[1,"(.*?)"\]\)', html, re.DOTALL) if not chunks: return "" payload = "" for chunk in chunks: try: payload += chunk.encode().decode("unicode_escape") except Exception: payload += chunk return payload def _parse_live_events(html: str) -> list[_PitsportEvent]: """Parse live events from the main page (or `/live-now`) RSC payload. The pages embed event cards inside the Next.js RSC payload; the raw HTML keeps it escape-encoded so we decode first, then match. 
Two shapes are common: 1) Older card props: "category":"...","title":"..." next to "href":"/watch/UUID". 2) Newer `event` prop: an `event` object with `uri:"/watch/UUID"` carrying `category` and `title`. """ payload = _decode_rsc_payload(html) or html events: list[_PitsportEvent] = [] href_pattern = re.compile( r'"href":"(/watch/([0-9a-f-]{36}))"[^}]*?"category":"([^"]+)","title":"([^"]+)"', ) for match in href_pattern.finditer(payload): _, uuid, category, title = match.groups() events.append(_PitsportEvent(category=category, title=title, watch_uuid=uuid)) event_pattern = re.compile( r'"event":\{[^{}]*?"title":"([^"]+)"[^{}]*?"uri":"/watch/([0-9a-f-]{36})"[^{}]*?"category":"([^"]+)"', ) for match in event_pattern.finditer(payload): title, uuid, category = match.groups() events.append(_PitsportEvent(category=category, title=title, watch_uuid=uuid)) event_pattern_alt = re.compile( r'"event":\{[^{}]*?"category":"([^"]+)"[^{}]*?"title":"([^"]+)"[^{}]*?"uri":"/watch/([0-9a-f-]{36})"', ) for match in event_pattern_alt.finditer(payload): category, title, uuid = match.groups() events.append(_PitsportEvent(category=category, title=title, watch_uuid=uuid)) return events def _parse_schedule_events(html: str) -> list[_PitsportEvent]: """Parse events from the schedule page. The schedule page groups events under category headers (h2 elements). In the rendered HTML: