"""Pitsport.xyz extractor - fetches F1 streams from the Next.js RSC payload. Architecture: - Main page (pitsport.xyz) has a "Live Now" section with event cards containing category, title, time, imageUrl props and /watch/{UUID} links. - Schedule page (pitsport.xyz/schedule) lists all events grouped by category (h2 headings) with /watch/{UUID} links and event titles. - Watch pages (/watch/{UUID}) embed iframes from pushembdz.store/embed/{EMBED_UUID}. - Embed pages contain an RSC payload with a stream config: {title, link, method}. - When method is "player" or "hls", the link field points to a serveplay.site m3u8 playlist. Otherwise we return the embed URL for iframe playback. """ import logging import re from dataclasses import dataclass import httpx from backend.extractors.base import BaseExtractor from backend.extractors.models import ExtractedStream logger = logging.getLogger(__name__) PITSPORT_BASE = "https://pitsport.xyz" EMBED_BASE = "https://pushembdz.store" USER_AGENT = ( "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " "AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/120.0.0.0 Safari/537.36" ) # Categories to include (case-insensitive match) F1_CATEGORIES = {"formula 1", "formula 2", "formula 3"} # Fallback keyword matching on combined category+title for edge cases F1_KEYWORDS = {"formula 1", "formula one", "f1"} GP_KEYWORD = "grand prix" NON_F1_KEYWORDS = { "motogp", "moto gp", "moto2", "moto3", "motoe", "indycar", "indy car", "firestone", "nascar", "rally", "wrc", "wec", "lemans", "le mans", "superbike", "dtm", "supercars", "arca", "xfinity", "trucks", "super formula", "supergt", "super gt", "ama supercross", "supercross", } @dataclass class _PitsportEvent: """An event discovered from the Pitsport site.""" category: str title: str watch_uuid: str def _is_f1_category(category: str) -> bool: """Check if a category string matches an F1-related series.""" return category.strip().lower() in F1_CATEGORIES def _is_f1_event(category: str, title: str) -> bool: """Check if an event is Formula 1 related by category or title keywords.""" # Primary check: exact category match if _is_f1_category(category): return True # Secondary check: keyword matching on combined text lower = f"{category} {title}".lower() if any(kw in lower for kw in NON_F1_KEYWORDS): return False if any(kw in lower for kw in F1_KEYWORDS): return True if GP_KEYWORD in lower: return True return False def _parse_live_events(html: str) -> list[_PitsportEvent]: """Parse live events from the main page RSC payload. The main page contains event cards with props: category, title, time, imageUrl wrapped in links. """ events: list[_PitsportEvent] = [] # Match event cards in the RSC payload - they appear as JSON-like structures # Pattern: href="/watch/UUID" ... category":"...", "title":"..." # In the RSC payload, the data is in the format: # ["$","$L2","/watch/UUID",{"href":"/watch/UUID","children":["$","$L10",null, # {"category":"...","title":"...","time":...,"imageUrl":"..."}]}] pattern = re.compile( r'"href":"(/watch/([0-9a-f-]{36}))"[^}]*?"category":"([^"]+)","title":"([^"]+)"', ) for match in pattern.finditer(html): _, uuid, category, title = match.groups() events.append(_PitsportEvent(category=category, title=title, watch_uuid=uuid)) return events def _parse_schedule_events(html: str) -> list[_PitsportEvent]: """Parse events from the schedule page. The schedule page groups events under category headers (h2 elements). In the rendered HTML:

Formula 1

... ...
In the RSC payload, similar structure with section divs containing a category h2 and child event links with titles. """ events: list[_PitsportEvent] = [] # Strategy 1: Parse from rendered HTML # Find category sections: >CategoryName followed by watch links # Split HTML at each category header section_pattern = re.compile( r'>([^<]+)\s*]*class="flex flex-wrap gap-6">(.*?)(?=\s*\s*(?:|$))', re.DOTALL, ) for section_match in section_pattern.finditer(html): category = section_match.group(1).strip() section_html = section_match.group(2) # Find all watch links in this section link_pattern = re.compile( r'href="/watch/([0-9a-f-]{36})".*?]*>([^<]+)', re.DOTALL, ) for link_match in link_pattern.finditer(section_html): uuid = link_match.group(1) title = link_match.group(2).strip() events.append( _PitsportEvent(category=category, title=title, watch_uuid=uuid) ) # Strategy 2: Parse from RSC payload if rendered HTML didn't yield results # The RSC payload has patterns like: # "children":"Formula 1"}] ... "/watch/UUID" ... "title":"EventTitle" if not events: events = _parse_schedule_rsc(html) return events def _parse_schedule_rsc(html: str) -> list[_PitsportEvent]: """Parse events from schedule page RSC payload as fallback. Extracts category section divs from the RSC JSON structure. """ events: list[_PitsportEvent] = [] # Find the RSC payload chunks rsc_chunks = re.findall( r'self\.__next_f\.push\(\[1,"(.*?)"\]\)', html, re.DOTALL ) if not rsc_chunks: return events # Concatenate and unescape full_payload = "" for chunk in rsc_chunks: try: full_payload += chunk.encode().decode("unicode_escape") except Exception: full_payload += chunk # Find category sections in the RSC data # Pattern: "children":"CategoryName"}],["$","div",...watch links... # Each section div contains an h2 with the category name and watch links cat_pattern = re.compile( r'border-gray-700 pb-2","children":"([^"]+)"\}.*?' r'(?=border-gray-700 pb-2","children"|$)', re.DOTALL, ) for cat_match in cat_pattern.finditer(full_payload): category = cat_match.group(1) section_text = cat_match.group(0) # Find watch UUIDs and titles in this section # Pattern: "/watch/UUID" ... "title":"EventTitle" event_pattern = re.compile( r'/watch/([0-9a-f-]{36}).*?"title":"([^"]+)"', ) for ev_match in event_pattern.finditer(section_text): uuid = ev_match.group(1) title = ev_match.group(2) events.append( _PitsportEvent(category=category, title=title, watch_uuid=uuid) ) return events def _parse_embed_uuids(html: str) -> list[str]: """Extract embed UUIDs from a watch page. Watch pages contain iframes like: