# trading/services/meet_kevin_watcher/rss_poller.py
#
# 130 lines
# 4 KiB
# Python

"""YouTube RSS feed poller for Meet Kevin channel."""
import logging
from dataclasses import dataclass
from datetime import datetime
from xml.etree import ElementTree as ET
import httpx
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Prefix -> namespace URI map handed to ElementTree find()/findall() so that
# paths can be written with short prefixes such as "a:entry" or "yt:videoId".
_NAMESPACES = {
    # Atom elements: entry, title, published.
    "a": "http://www.w3.org/2005/Atom",
    # YouTube extension elements: videoId.
    "yt": "http://www.youtube.com/xml/schemas/2015",
    # Media RSS elements: group, description, thumbnail.
    "m": "http://search.yahoo.com/mrss/",
}
@dataclass(frozen=True)
class DiscoveredVideo:
    """Immutable record for one video found in a YouTube RSS feed.

    Attributes:
        youtube_video_id: Text of the entry's ``yt:videoId`` element.
        title: Text of the entry's Atom ``title`` element.
        description: Text of the entry's media description; an empty string
            when the feed entry carries none.
        published_at: Timestamp parsed from the entry's ``published`` element.
        thumbnail_url: ``url`` attribute of the entry's media thumbnail.
    """

    youtube_video_id: str
    title: str
    description: str
    published_at: datetime
    thumbnail_url: str
async def fetch_feed(channel_id: str, client: httpx.AsyncClient) -> bytes:
    """Download the YouTube RSS feed of a single channel.

    Args:
        channel_id: YouTube channel ID (e.g., "UCUvvj5lwue7PspotMDjk5UA").
        client: httpx AsyncClient used to issue the GET request.

    Returns:
        The response body as raw bytes, or ``b""`` when the request fails.
        Any ``httpx.HTTPError`` -- including the one raised by
        ``raise_for_status()`` -- is logged as a warning and not re-raised.
    """
    feed_url = f"https://www.youtube.com/feeds/videos.xml?channel_id={channel_id}"
    try:
        reply = await client.get(feed_url, timeout=15.0)
        reply.raise_for_status()
    except httpx.HTTPError as exc:
        # Best effort: callers receive empty bytes instead of an exception.
        logger.warning("Failed to fetch feed from %s: %s", feed_url, exc)
        return b""
    return reply.content
def _required_text(elem: "ET.Element | None") -> "str | None":
    """Return *elem*'s text without surrounding whitespace, or None if missing or blank."""
    if elem is None or elem.text is None:
        return None
    return elem.text.strip() or None


def parse_feed(xml_bytes: bytes) -> list[DiscoveredVideo]:
    """Parse YouTube RSS feed XML and extract videos.

    Args:
        xml_bytes: Raw XML bytes from YouTube RSS feed.

    Returns:
        List of DiscoveredVideo objects in feed order. Returns an empty list
        on parse error, empty input, or if no valid entries are found.

        An entry is skipped when its video ID, title, published timestamp or
        thumbnail URL is missing or blank, or when the timestamp cannot be
        parsed (the latter is logged as a warning). Those four values are
        stripped of surrounding whitespace; the description is kept verbatim
        and defaults to "" when absent.
    """
    if not xml_bytes:
        return []

    # NOTE(review): xml.etree is not hardened against maliciously constructed
    # XML (see "XML vulnerabilities" in the Python docs). Presumably fine
    # while the bytes come straight from YouTube -- confirm before feeding
    # this function data from any other source.
    try:
        root = ET.fromstring(xml_bytes)
    except ET.ParseError as e:
        logger.warning("Failed to parse feed XML: %s", e)
        return []

    videos: list[DiscoveredVideo] = []
    for entry in root.findall("a:entry", _NAMESPACES):
        video_id = _required_text(entry.find("yt:videoId", _NAMESPACES))
        title = _required_text(entry.find("a:title", _NAMESPACES))
        published_text = _required_text(entry.find("a:published", _NAMESPACES))

        # Description and thumbnail both live inside the media group element.
        media_group = entry.find("m:group", _NAMESPACES)
        desc_elem = thumb_elem = None
        if media_group is not None:
            desc_elem = media_group.find("m:description", _NAMESPACES)
            thumb_elem = media_group.find("m:thumbnail", _NAMESPACES)

        thumbnail_url = ""
        if thumb_elem is not None:
            thumbnail_url = thumb_elem.get("url", "").strip()

        # Validate every required field up front so an incomplete entry is
        # skipped before any parsing work is attempted on it.
        if not (video_id and title and published_text and thumbnail_url):
            continue

        # datetime.fromisoformat() only understands a trailing "Z" from
        # Python 3.11 on, so spell the UTC designator as an explicit offset.
        # Only the final character is rewritten, never a "Z" elsewhere.
        if published_text.endswith("Z"):
            published_text = f"{published_text[:-1]}+00:00"
        try:
            published_at = datetime.fromisoformat(published_text)
        except ValueError as e:
            logger.warning("Failed to parse entry in feed: %s", e)
            continue

        # Description is optional and deliberately not stripped.
        description = ""
        if desc_elem is not None and desc_elem.text is not None:
            description = desc_elem.text

        videos.append(
            DiscoveredVideo(
                youtube_video_id=video_id,
                title=title,
                description=description,
                published_at=published_at,
                thumbnail_url=thumbnail_url,
            )
        )
    return videos