"""Pitsport.xyz extractor - fetches F1 streams from the Next.js RSC payload.

Architecture:
- Main page (pitsport.xyz) has a "Live Now" section with event cards
  containing category, title, time, imageUrl props and /watch/{UUID} links.
- Schedule page (pitsport.xyz/schedule) lists all events grouped by category
  (h2 headings) with /watch/{UUID} links and event titles.
- Watch pages (/watch/{UUID}) embed iframes from
  pushembdz.store/embed/{EMBED_UUID}.
- Embed pages contain an RSC payload with a stream config:
  {title, link, method}.
- When method is "player" or "hls", the link field points to a
  serveplay.site m3u8 playlist. Otherwise we return the embed URL for
  iframe playback.
"""

import logging
import re
from dataclasses import dataclass

import httpx

from backend.extractors.base import BaseExtractor
from backend.extractors.models import ExtractedStream

logger = logging.getLogger(__name__)

# Site roots: the listing/watch pages and the host serving the embedded player.
PITSPORT_BASE = "https://pitsport.xyz"
EMBED_BASE = "https://pushembdz.store"

# Desktop Chrome user-agent string, assembled from adjacent string literals.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)

# Categories to include (case-insensitive match). Broadened beyond F1
# to also surface MotoGP and adjacent motorsports — keeps the f1-stream
# UI useful between race weekends and during the off-season.
# Entries must stay lowercase: candidates are compared after
# ``strip().lower()``.
MOTORSPORT_CATEGORIES = {
    "formula 1",
    "formula 2",
    "formula 3",
    "motogp",
    "moto gp",
    "moto2",
    "moto3",
    "motoe",
    "world rally championship",
    "wrc",
    "world endurance championship",
    "wec",
    "indycar series",
    "indycar",
    "indynxt",
    "nascar cup series",
    "nascar truck series",
    "nascar o'reilly auto parts series",
    "nascar xfinity series",
    "nascar",
}

# Title keywords that are strong positives even when the category text
# is missing (live-now cards sometimes elide it).
MOTORSPORT_KEYWORDS = { "formula 1", "formula one", "f1", "motogp", "moto gp", "moto2", "moto3", "rally", "wrc", "indycar", "indy car", "nascar", "le mans", "lemans", "wec", "endurance", } GP_KEYWORD = "grand prix" @dataclass class _PitsportEvent: """An event discovered from the Pitsport site.""" category: str title: str watch_uuid: str def _is_motorsport_category(category: str) -> bool: """Check if a category string matches an included motorsport series.""" return category.strip().lower() in MOTORSPORT_CATEGORIES def _is_motorsport_event(category: str, title: str) -> bool: """Accept anything pitsport.xyz lists. Pitsport curates sports broadcasts (WRC, MotoGP, IndyCar, NASCAR, Premier League Darts, Premier League football, etc.) — the site's own selection is the filter we want. Empty/garbage events still get filtered downstream when `_resolve_event_streams` produces no playable URL.""" return bool(category or title) # Aliases kept so older call-sites stay compiling. Both now point at the # broadened motorsport filter. _is_f1_category = _is_motorsport_category _is_f1_event = _is_motorsport_event def _parse_live_events(html: str) -> list[_PitsportEvent]: """Parse live events from the main page RSC payload. The main page contains event cards with props: category, title, time, imageUrl wrapped in links. """ events: list[_PitsportEvent] = [] # Match event cards in the RSC payload - they appear as JSON-like structures # Pattern: href="/watch/UUID" ... category":"...", "title":"..." 
# In the RSC payload, the data is in the format: # ["$","$L2","/watch/UUID",{"href":"/watch/UUID","children":["$","$L10",null, # {"category":"...","title":"...","time":...,"imageUrl":"..."}]}] pattern = re.compile( r'"href":"(/watch/([0-9a-f-]{36}))"[^}]*?"category":"([^"]+)","title":"([^"]+)"', ) for match in pattern.finditer(html): _, uuid, category, title = match.groups() events.append(_PitsportEvent(category=category, title=title, watch_uuid=uuid)) return events def _parse_schedule_events(html: str) -> list[_PitsportEvent]: """Parse events from the schedule page. The schedule page groups events under category headers (h2 elements). In the rendered HTML:

Formula 1

... ...
In the RSC payload, similar structure with section divs containing a category h2 and child event links with titles. """ events: list[_PitsportEvent] = [] # Strategy 1: Parse from rendered HTML # Find category sections: >CategoryName followed by watch links # Split HTML at each category header section_pattern = re.compile( r'>([^<]+)\s*]*class="flex flex-wrap gap-6">(.*?)(?=\s*\s*(?:|$))', re.DOTALL, ) for section_match in section_pattern.finditer(html): category = section_match.group(1).strip() section_html = section_match.group(2) # Find all watch links in this section link_pattern = re.compile( r'href="/watch/([0-9a-f-]{36})".*?]*>([^<]+)', re.DOTALL, ) for link_match in link_pattern.finditer(section_html): uuid = link_match.group(1) title = link_match.group(2).strip() events.append( _PitsportEvent(category=category, title=title, watch_uuid=uuid) ) # Strategy 2: Parse from RSC payload if rendered HTML didn't yield results # The RSC payload has patterns like: # "children":"Formula 1"}] ... "/watch/UUID" ... "title":"EventTitle" if not events: events = _parse_schedule_rsc(html) return events def _parse_schedule_rsc(html: str) -> list[_PitsportEvent]: """Parse events from schedule page RSC payload as fallback. Extracts category section divs from the RSC JSON structure. """ events: list[_PitsportEvent] = [] # Find the RSC payload chunks rsc_chunks = re.findall( r'self\.__next_f\.push\(\[1,"(.*?)"\]\)', html, re.DOTALL ) if not rsc_chunks: return events # Concatenate and unescape full_payload = "" for chunk in rsc_chunks: try: full_payload += chunk.encode().decode("unicode_escape") except Exception: full_payload += chunk # Find category sections in the RSC data # Pattern: "children":"CategoryName"}],["$","div",...watch links... # Each section div contains an h2 with the category name and watch links cat_pattern = re.compile( r'border-gray-700 pb-2","children":"([^"]+)"\}.*?' 
r'(?=border-gray-700 pb-2","children"|$)', re.DOTALL, ) for cat_match in cat_pattern.finditer(full_payload): category = cat_match.group(1) section_text = cat_match.group(0) # Find watch UUIDs and titles in this section # Pattern: "/watch/UUID" ... "title":"EventTitle" event_pattern = re.compile( r'/watch/([0-9a-f-]{36}).*?"title":"([^"]+)"', ) for ev_match in event_pattern.finditer(section_text): uuid = ev_match.group(1) title = ev_match.group(2) events.append( _PitsportEvent(category=category, title=title, watch_uuid=uuid) ) return events def _parse_embed_uuids(html: str) -> list[str]: """Extract embed UUIDs from a watch page. Watch pages contain iframes like: