# infra/stacks/f1-stream/files/backend/extractors/base.py

"""Base class for all site-specific stream extractors."""

import logging
from abc import ABC, abstractmethod

import httpx

from backend.extractors.models import ExtractedStream

logger = logging.getLogger(__name__)


class BaseExtractor(ABC):
    """Abstract base class for site-specific stream extractors.

    To create a new extractor:
    1. Create a new file in backend/extractors/
    2. Subclass BaseExtractor
    3. Implement site_key, site_name, and extract()
    4. Register it in backend/extractors/__init__.py

    Besides the abstract members, the class provides health_check(), a
    best-effort liveness probe for a stream URL that subclasses inherit.
    """

    # Substrings of a (lower-cased) Content-Type value that make a response
    # look like a stream. m3u8 playlists are typically served with one of the
    # mpegurl types; "octet-stream" covers servers that use a generic binary
    # type. Matching is by substring, so parameters such as "; charset=utf-8"
    # do not interfere.
    _MEDIA_CONTENT_TYPE_MARKERS = (
        "application/vnd.apple.mpegurl",
        "application/x-mpegurl",
        "video/",
        "audio/",
        "octet-stream",
    )

    # Substrings that mark a response as a text/HTML page rather than a
    # stream. Only consulted when none of the media markers matched.
    _NON_MEDIA_CONTENT_TYPE_MARKERS = ("text/", "html")

    @property
    @abstractmethod
    def site_key(self) -> str:
        """Unique identifier for this site (e.g., 'sportsurge').

        Must be lowercase, alphanumeric with hyphens/underscores only.
        Used as the cache key and in API responses.
        """

    @property
    @abstractmethod
    def site_name(self) -> str:
        """Human-readable name (e.g., 'SportSurge').

        Displayed in the UI and API responses.
        """

    @abstractmethod
    async def extract(self) -> list[ExtractedStream]:
        """Extract stream URLs from this site.

        Returns a list of ExtractedStream objects. Each represents a
        discovered stream URL. The extractor should set url, quality,
        and title fields; site_key, site_name, and extracted_at are
        auto-populated if left empty.

        Implementations should:
        - Use httpx for HTTP requests
        - Handle their own errors gracefully (log and return empty list)
        - Set quality when detectable from the source
        - Set title to something descriptive
        """

    @classmethod
    def _content_type_allows_stream(cls, content_type: str) -> bool:
        """Decide whether a Content-Type value is compatible with a stream.

        Args:
            content_type: Raw Content-Type header value; may be empty and
                may carry parameters. Compared case-insensitively.

        Returns:
            True when the value is empty, contains one of the media markers,
            or is an unrecognised type that is neither text nor HTML. False
            only when no media marker matches and the value contains a
            text/HTML marker.
        """
        normalized = content_type.lower()

        # Some servers omit Content-Type on HEAD responses; a missing header
        # is not treated as evidence against the URL.
        if not normalized:
            return True

        if any(marker in normalized for marker in cls._MEDIA_CONTENT_TYPE_MARKERS):
            return True

        # NOTE(review): this also rejects playlists served as text/plain,
        # which some servers do for m3u8 -- confirm that is intended.
        return not any(
            marker in normalized for marker in cls._NON_MEDIA_CONTENT_TYPE_MARKERS
        )

    async def health_check(self, url: str) -> bool:
        """Probe a URL with a HEAD request and report whether it looks live.

        The request follows redirects and uses a 10 second timeout. The URL
        is considered live when:
        1. The final response status is exactly HTTP 200, and
        2. The Content-Type is missing, looks like HLS/media content, or is
           at least not a text/HTML type (see _content_type_allows_stream).

        Args:
            url: The stream URL to probe.

        Returns:
            True if the URL appears to be a live stream, False otherwise.
            Timeouts, HTTP errors and unexpected exceptions are logged and
            reported as False rather than raised.
        """
        try:
            async with httpx.AsyncClient(
                timeout=10.0,
                follow_redirects=True,
                headers={"User-Agent": "Mozilla/5.0"},
            ) as client:
                response = await client.head(url)

                if response.status_code != 200:
                    logger.debug(
                        "[%s] Health check failed for %s: HTTP %d",
                        self.site_key,
                        url,
                        response.status_code,
                    )
                    return False

                content_type = response.headers.get("content-type", "").lower()
                if self._content_type_allows_stream(content_type):
                    return True

                logger.debug(
                    "[%s] Health check suspect for %s: content-type=%s",
                    self.site_key,
                    url,
                    content_type,
                )
                return False

        except httpx.TimeoutException:
            # Must precede HTTPError: timeouts are a subclass of it and get
            # their own, more specific log line.
            logger.debug("[%s] Health check timed out for %s", self.site_key, url)
            return False
        except httpx.HTTPError as e:
            logger.debug("[%s] Health check error for %s: %s", self.site_key, url, e)
            return False
        except Exception:
            # Best-effort probe: anything unexpected is logged with a
            # traceback and treated as "not live" instead of propagating.
            logger.exception("[%s] Unexpected error during health check for %s", self.site_key, url)
            return False