infra/stacks/f1-stream/files/backend/extractors/dd12.py

"""DD12Streams extractor — scrapes inline m3u8 URLs from per-channel pages.

Each DD12 sport page (`/nas`, `/f1`, `/sky`, etc.) renders an iframe to
`/<channel>c1` which 302-redirects to `/new-<channel>/jwplayer`. That
page contains a JW Player setup with the m3u8 URL hard-coded inline:

    playerInstance.setup({
      file: "https://...b-cdn.net/.../master.m3u8",
      ...
    });

The JW Player runtime fails in our cluster (same fingerprint trap as
hmembeds), but we don't need it — the file URL is in the HTML and any
browser with H.264 codecs can play it directly via hls.js.

Channel discovery: probe a known list. New ones can be added by checking
DD12's own homepage / nav.
"""

import logging
import re
from urllib.parse import urljoin

import httpx

from backend.extractors.base import BaseExtractor
from backend.extractors.models import ExtractedStream

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Site origin; each channel page URL is built as f"{BASE}/{path}".
BASE = "https://dd12streams.com"
# Desktop Safari (macOS) User-Agent string, installed as a default header on
# the extractor's HTTP client so it is sent with every request.
USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/605.1.15 (KHTML, like Gecko) "
    "Version/17.4 Safari/605.1.15"
)

# Channels to probe, one (path, channel_label, title) tuple each:
#   path          - URL path segment of the channel page, appended to BASE
#   channel_label - reported as the extracted stream's site_name
#   title         - reported as the extracted stream's title
# Add entries as DD12 surfaces new channels.
CHANNELS = (
    ("nas", "DD12Streams", "NASCAR Cup Series (24/7) — DD12"),
)

# Matches an inline `file: "<url>"` assignment (single- or double-quoted value)
# whose value contains ".m3u8". Group 1 is the URL, including anything that
# follows ".m3u8" up to the closing quote (e.g. a query string).
_FILE_URL_RE = re.compile(r"""file\s*:\s*["']([^"']+\.m3u8[^"']*)["']""")


class DD12Extractor(BaseExtractor):
    """Extractor for the DD12Streams channels listed in ``CHANNELS``.

    For every channel the extractor fetches the channel page, locates the
    ``vplayer`` iframe, fetches the iframe document (following redirects),
    and reads the HLS playlist URL from the inline ``file: "....m3u8"``
    assignment in that document's HTML.
    """

    # Patterns locating the ``src`` of the ``<iframe id="vplayer">`` tag.
    # The first is the historical "id before src" form; the second accepts
    # the same two attributes in the opposite order. ``[^>]`` keeps every
    # match inside a single tag.
    _IFRAME_SRC_RES = (
        re.compile(
            r'<iframe[^>]+id=["\']vplayer["\'][^>]+src=["\']([^"\']+)["\']'
        ),
        re.compile(
            r'<iframe[^>]+src=["\']([^"\']+)["\'][^>]+id=["\']vplayer["\']'
        ),
    )

    @property
    def site_key(self) -> str:
        """Short identifier for this source (``"dd12"``)."""
        return "dd12"

    @property
    def site_name(self) -> str:
        """Human-readable name of this source."""
        return "DD12Streams"

    async def extract(self) -> list[ExtractedStream]:
        """Probe every channel in ``CHANNELS`` and return the streams found.

        Extraction is best-effort: a channel that fails for any reason
        (network error, unexpected markup, non-200 response) is logged at
        debug level and skipped, so one broken channel never prevents the
        others from being returned.

        Returns:
            One ``ExtractedStream`` per channel whose m3u8 URL was found;
            an empty list when nothing could be extracted.
        """
        results: list[ExtractedStream] = []
        async with httpx.AsyncClient(
            timeout=15.0,
            follow_redirects=True,
            headers={"User-Agent": USER_AGENT},
        ) as client:
            for path, label, title in CHANNELS:
                try:
                    stream = await self._extract_channel(
                        client, path, label, title
                    )
                except Exception:
                    # Deliberately broad: this is the per-channel boundary
                    # and a failure here must not abort the remaining ones.
                    logger.debug(
                        "[dd12] /%s extraction failed", path, exc_info=True
                    )
                    continue
                if stream is not None:
                    results.append(stream)
        logger.info("[dd12] Extracted %d stream(s)", len(results))
        return results

    async def _extract_channel(
        self,
        client: httpx.AsyncClient,
        path: str,
        label: str,
        title: str,
    ) -> ExtractedStream | None:
        """Resolve a single channel page to its stream.

        Args:
            client: Open HTTP client used for both requests.
            path: Channel path segment appended to ``BASE``.
            label: Value reported as the stream's ``site_name``.
            title: Value reported as the stream's ``title``.

        Returns:
            The extracted stream, or ``None`` when a response is not HTTP
            200, the iframe is missing, or no m3u8 URL is present.

        Raises:
            Exception: Anything raised by the HTTP client propagates to the
                caller, which treats it as a skipped channel.
        """
        page_url = f"{BASE}/{path}"
        resp = await client.get(page_url)
        if resp.status_code != 200:
            logger.debug(
                "[dd12] /%s: channel page returned HTTP %d",
                path,
                resp.status_code,
            )
            return None

        iframe_src = self._extract_iframe(resp.text)
        if not iframe_src:
            logger.debug("[dd12] /%s: no vplayer iframe found", path)
            return None

        # Resolve against the final (post-redirect) page URL so absolute,
        # root-relative ("/x"), document-relative ("x") and
        # protocol-relative ("//host/x") srcs are all handled correctly.
        iframe_url = urljoin(str(resp.url), iframe_src)
        iframe_resp = await client.get(
            iframe_url, headers={"Referer": page_url}
        )
        if iframe_resp.status_code != 200:
            logger.debug(
                "[dd12] /%s: iframe %s returned HTTP %d",
                path,
                iframe_url,
                iframe_resp.status_code,
            )
            return None

        m3u8 = self._find_m3u8(iframe_resp.text)
        if not m3u8:
            logger.debug(
                "[dd12] /%s: no m3u8 file URL in %s", path, iframe_resp.url
            )
            return None

        return ExtractedStream(
            # An absolute URL passes through; a relative one is resolved
            # against the document that declared it.
            url=urljoin(str(iframe_resp.url), m3u8),
            site_key=self.site_key,
            site_name=label,
            quality="",
            title=title,
            stream_type="m3u8",
        )

    @staticmethod
    def _extract_iframe(html: str) -> str | None:
        """Return the ``src`` of the ``vplayer`` iframe in *html*, if any.

        The ``id`` and ``src`` attributes may appear in either order; the
        "id first" form is tried first.
        """
        for pattern in DD12Extractor._IFRAME_SRC_RES:
            m = pattern.search(html)
            if m:
                return m.group(1)
        return None

    @staticmethod
    def _find_m3u8(html: str) -> str | None:
        """Return the first inline ``file: "....m3u8"`` URL in *html*, if any.

        JSON-style escaped slashes (``\\/``) are unescaped so the result is
        directly usable as a URL.
        """
        m = _FILE_URL_RE.search(html)
        if not m:
            return None
        return m.group(1).replace("\\/", "/")
f1-stream: add chrome-browser, subreddit, dd12 extractors; fix streamed.pk User asked to broaden the source pipeline so f1-stream can find F1 (and adjacent motorsport) streams from Sky Sports / DAZN / Reddit / etc., using the in-cluster chrome-service headed browser where needed. Four changes: 1. streamed.py: BASE_URL streamed.su → streamed.pk. The .su domain stopped serving the API host in 2026 (only the marketing page is left); .pk hosts the JSON API now. Adds 3 events/round (currently all routed through embedsports.top — see #2 caveat). 2. chrome_browser.py (new): generic chrome-service-driven extractor. Connects to the existing chrome-service WS (CHROME_WS_URL + CHROME_WS_TOKEN env), navigates a list of TARGETS, captures any HLS playlist URL the page fetches at runtime, returns one ExtractedStream per discovery. Uses the same stealth init script as the verifier so anti-bot checks don't trip the page. Handles iframes (DD12-style /nas → /new-nas/jwplayer) and probes child-frame <video>/source elements after settle. Caveat: most aggregator sites (pooembed, embedsports, hmembeds, even DD12's JW Player path) use a broken runtime decoder that produces no m3u8 in our environment, so the TARGETS list is currently 0-yielding; the framework is the contribution and concrete sites can be added as they're discovered. 3. subreddit.py (new): scans r/MotorsportsReplays, r/motorsports, r/formula1, r/motogp via the public old.reddit.com JSON API for posts whose flair/title indicates a live stream. Discovered URLs are returned as embed-type streams; the verifier visits each via chrome-service to confirm playability. Note: Reddit currently HTTP 403's our cluster outbound IP for anonymous JSON requests; the extractor returns 0 in that state and logs a debug message. Will work from any IP Reddit isn't blocking. 4. dd12.py (new): inline-HTML scraper for DD12Streams. The site embeds `playerInstance.setup({file: "..."})` directly in HTML — no JS decoder needed. 
Currently surfaces NASCAR Cup Series 24/7 (clean BunnyCDN-hosted HLS at w9329432hnf3h34.b-cdn.net/pdfs/master.m3u8); add new `(path, label, title)` tuples to CHANNELS as DD12 expands. Result: /streams now shows 2 verified live streams (Rally TV via pitsport + DD12 NASCAR Cup 24/7). When the next F1 weekend (Canadian GP, May 22-24) goes live, pitsport will surface F1 sessions automatically via the existing pushembdz path. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> 2026-05-07 16:05:25 +00:00			`"""DD12Streams extractor — scrapes inline m3u8 URLs from per-channel pages.`

			Each DD12 sport page (`/nas`, `/f1`, `/sky`, etc.) renders an iframe to
			`/<channel>c1` which 302-redirects to `/new-<channel>/jwplayer`. That
			`page contains a JW Player setup with the m3u8 URL hard-coded inline:`

			`playerInstance.setup({`
			`file: "https://...b-cdn.net/.../master.m3u8",`
			`...`
			`});`

			`The JW Player runtime fails in our cluster (same fingerprint trap as`
			`hmembeds), but we don't need it — the file URL is in the HTML and any`
			`browser with H.264 codecs can play it directly via hls.js.`

			`Channel discovery: probe a known list. New ones can be added by checking`
			`DD12's own homepage / nav.`
			`"""`

			`import logging`
			`import re`

			`import httpx`

			`from backend.extractors.base import BaseExtractor`
			`from backend.extractors.models import ExtractedStream`

			`logger = logging.getLogger(__name__)`

			`BASE = "https://dd12streams.com"`
			`USER_AGENT = (`
			`"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "`
			`"AppleWebKit/605.1.15 (KHTML, like Gecko) "`
			`"Version/17.4 Safari/605.1.15"`
			`)`

			`# (path, channel_label, title). Add as DD12 surfaces new channels.`
			`CHANNELS = (`
			`("nas", "DD12Streams", "NASCAR Cup Series (24/7) — DD12"),`
			`)`

			`_FILE_URL_RE = re.compile(r"""file\s*:\s*["']([^"']+\.m3u8[^"']*)["']""")`


			`class DD12Extractor(BaseExtractor):`
			`@property`
			`def site_key(self) -> str:`
			`return "dd12"`

			`@property`
			`def site_name(self) -> str:`
			`return "DD12Streams"`

			`async def extract(self) -> list[ExtractedStream]:`
			`results: list[ExtractedStream] = []`
			`async with httpx.AsyncClient(`
			`timeout=15.0,`
			`follow_redirects=True,`
			`headers={"User-Agent": USER_AGENT},`
			`) as client:`
			`for path, label, title in CHANNELS:`
			`try:`
			`page_url = f"{BASE}/{path}"`
			`resp = await client.get(page_url)`
			`if resp.status_code != 200:`
			`continue`
			`iframe_path = self._extract_iframe(resp.text)`
			`if not iframe_path:`
			`continue`
			`iframe_url = (`
			`iframe_path`
			`if iframe_path.startswith("http")`
			`else f"{BASE}{iframe_path}"`
			`)`
			`iframe_resp = await client.get(`
			`iframe_url, headers={"Referer": page_url}`
			`)`
			`if iframe_resp.status_code != 200:`
			`continue`
			`m3u8 = self._find_m3u8(iframe_resp.text)`
			`if not m3u8:`
			`continue`
			`results.append(`
			`ExtractedStream(`
			`url=m3u8,`
			`site_key=self.site_key,`
			`site_name=label,`
			`quality="",`
			`title=title,`
			`stream_type="m3u8",`
			`)`
			`)`
			`except Exception:`
			`logger.debug(`
			`"[dd12] /%s extraction failed", path, exc_info=True`
			`)`
			`logger.info("[dd12] Extracted %d stream(s)", len(results))`
			`return results`

			`@staticmethod`
			`def _extract_iframe(html: str) -> str | None:`
			`m = re.search(`
			`r'<iframe[^>]+id=["\']vplayer["\'][^>]+src=["\']([^"\']+)["\']',`
			`html,`
			`)`
			`return m.group(1) if m else None`

			`@staticmethod`
			`def _find_m3u8(html: str) -> str | None:`
			`m = _FILE_URL_RE.search(html)`
			`return m.group(1) if m else None`