130 lines
4 KiB
Python
130 lines
4 KiB
Python
"""YouTube RSS feed poller for Meet Kevin channel."""
|
|
|
|
import logging
|
|
from dataclasses import dataclass
|
|
from datetime import datetime
|
|
from xml.etree import ElementTree as ET
|
|
|
|
import httpx
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Prefix -> namespace URI map handed to ElementTree find()/findall() so that
# queries can be written as "a:entry", "yt:videoId", "m:group", etc.
#   a  - Atom
#   yt - YouTube feed extensions
#   m  - Media RSS
_NAMESPACES = {
    "a": "http://www.w3.org/2005/Atom",
    "yt": "http://www.youtube.com/xml/schemas/2015",
    "m": "http://search.yahoo.com/mrss/",
}
|
|
|
|
|
|
@dataclass(frozen=True)
class DiscoveredVideo:
    """A video discovered from YouTube RSS feed.

    Instances are immutable (``frozen=True``); attempting to assign to a
    field raises ``dataclasses.FrozenInstanceError``.
    """

    # Text of the entry's <yt:videoId> element.
    youtube_video_id: str
    # Text of the entry's Atom <title> element.
    title: str
    # Text of <media:description>; empty string when the feed omits it.
    description: str
    # Parsed value of the entry's Atom <published> element.
    published_at: datetime
    # "url" attribute of the entry's <media:thumbnail> element.
    thumbnail_url: str
|
|
|
|
|
|
async def fetch_feed(channel_id: str, client: httpx.AsyncClient) -> bytes:
    """Fetch YouTube RSS feed for a channel.

    Args:
        channel_id: YouTube channel ID (e.g., "UCUvvj5lwue7PspotMDjk5UA").
            The value is percent-encoded before it is placed in the query
            string, so reserved characters cannot change the request URL.
        client: httpx AsyncClient for HTTP requests

    Returns:
        Raw XML bytes from the feed, or empty bytes on error.
        HTTP errors are logged but do not raise.
    """
    # Function-scope import: the only new name this function needs.
    from urllib.parse import quote

    # safe="" also encodes "/" so the ID can never escape the query value.
    # Letters, digits, "-" and "_" are left untouched by quote().
    url = (
        "https://www.youtube.com/feeds/videos.xml"
        f"?channel_id={quote(channel_id, safe='')}"
    )

    try:
        response = await client.get(url, timeout=15.0)
        # Turn 4xx/5xx responses into httpx.HTTPStatusError so they take the
        # same logging path as transport failures.
        response.raise_for_status()
    except httpx.HTTPError as e:
        logger.warning("Failed to fetch feed from %s: %s", url, e)
        return b""

    return response.content
|
|
|
|
|
|
def _stripped_text(elem: "ET.Element | None") -> "str | None":
    """Return the element's text without surrounding whitespace.

    Returns None when the element is missing, has no text, or its text is
    only whitespace, so callers can treat all three as "field absent".
    """
    if elem is None or elem.text is None:
        return None
    return elem.text.strip() or None


def _parse_entry(entry: "ET.Element") -> "DiscoveredVideo | None":
    """Build a DiscoveredVideo from one Atom ``<entry>`` element.

    Returns:
        The video, or None when a required field (video ID, title,
        published timestamp, thumbnail URL) is missing or blank.

    Raises:
        ValueError: If the published timestamp cannot be parsed by
            ``datetime.fromisoformat``.
    """
    # Required fields
    video_id = _stripped_text(entry.find("yt:videoId", _NAMESPACES))
    title = _stripped_text(entry.find("a:title", _NAMESPACES))
    published_text = _stripped_text(entry.find("a:published", _NAMESPACES))

    # Description and thumbnail both live inside the media group.
    media_group = entry.find("m:group", _NAMESPACES)
    desc_elem = None
    thumb_elem = None
    if media_group is not None:
        desc_elem = media_group.find("m:description", _NAMESPACES)
        thumb_elem = media_group.find("m:thumbnail", _NAMESPACES)

    if (
        video_id is None
        or title is None
        or published_text is None
        or thumb_elem is None
    ):
        return None

    # datetime.fromisoformat() only accepts a "Z" designator from Python 3.11
    # on, so rewrite it as an explicit UTC offset. Only the trailing
    # character is touched; the rest of the timestamp is left as-is.
    if published_text.endswith(("Z", "z")):
        published_text = published_text[:-1] + "+00:00"
    published_at = datetime.fromisoformat(published_text)

    # Description is optional; its text is kept verbatim.
    description = ""
    if desc_elem is not None and desc_elem.text is not None:
        description = desc_elem.text

    thumbnail_url = thumb_elem.get("url", "").strip()
    if not thumbnail_url:
        return None

    return DiscoveredVideo(
        youtube_video_id=video_id,
        title=title,
        description=description,
        published_at=published_at,
        thumbnail_url=thumbnail_url,
    )


def parse_feed(xml_bytes: bytes) -> list[DiscoveredVideo]:
    """Parse YouTube RSS feed XML and extract videos.

    Args:
        xml_bytes: Raw XML bytes from YouTube RSS feed

    Returns:
        List of DiscoveredVideo objects, in feed order. Returns empty list
        on parse error, empty input, or if no valid entries found.
        Individual entries with missing or blank required fields are
        skipped silently; entries whose timestamp cannot be parsed are
        skipped with a warning. Video ID, title, timestamp and thumbnail
        URL have surrounding whitespace removed.
    """
    if not xml_bytes:
        return []

    # NOTE(review): the stdlib documentation warns that xml.etree is not
    # hardened against maliciously constructed documents. Confirm that is
    # acceptable for the source these bytes come from.
    try:
        root = ET.fromstring(xml_bytes)
    except ET.ParseError as e:
        logger.warning("Failed to parse feed XML: %s", e)
        return []

    videos: list[DiscoveredVideo] = []

    for entry in root.findall("a:entry", _NAMESPACES):
        # One malformed entry must not discard the rest of the feed.
        try:
            video = _parse_entry(entry)
        except (ValueError, AttributeError) as e:
            logger.warning("Failed to parse entry in feed: %s", e)
            continue
        if video is not None:
            videos.append(video)

    return videos
|