trading/services/meet_kevin_watcher/rss_poller.py

131 lines
4 KiB
Python
Raw Normal View History

"""YouTube RSS feed poller for Meet Kevin channel."""
import logging
from dataclasses import dataclass
from datetime import datetime
from xml.etree import ElementTree as ET
import httpx
logger = logging.getLogger(__name__)
# Prefix -> namespace URI table handed to ElementTree's find()/findall() so
# the feed can be queried with short prefixes ("a:entry") instead of the
# fully qualified "{uri}tag" form.
_NAMESPACES = {
    # Atom core elements: entry, title, published.
    "a": "http://www.w3.org/2005/Atom",
    # YouTube extension elements: videoId.
    "yt": "http://www.youtube.com/xml/schemas/2015",
    # Media RSS elements: group, description, thumbnail.
    "m": "http://search.yahoo.com/mrss/",
}
@dataclass(frozen=True)
class DiscoveredVideo:
    """A video discovered from YouTube RSS feed.

    Immutable (``frozen=True``) value object. ``parse_feed`` builds one
    instance per feed entry that carries every required field.

    Attributes:
        youtube_video_id: Text of the entry's ``yt:videoId`` element.
        title: Text of the entry's Atom ``title`` element.
        description: Text of the entry's Media RSS ``description`` element;
            empty string when the entry has none.
        published_at: The entry's Atom ``published`` timestamp, parsed with
            ``datetime.fromisoformat``.
        thumbnail_url: ``url`` attribute of the entry's Media RSS
            ``thumbnail`` element.
    """

    youtube_video_id: str
    title: str
    description: str
    published_at: datetime
    thumbnail_url: str
async def fetch_feed(channel_id: str, client: httpx.AsyncClient) -> bytes:
    """Download the public YouTube feed XML for one channel.

    The fetch is best-effort: any ``httpx.HTTPError`` raised by the request
    itself or by ``raise_for_status()`` is logged at WARNING level and
    swallowed, so callers never see an exception from this category.

    Args:
        channel_id: YouTube channel ID (e.g., "UCUvvj5lwue7PspotMDjk5UA").
        client: Caller-supplied ``httpx.AsyncClient`` used for the GET request.

    Returns:
        The raw response body, or ``b""`` when the request failed.
    """
    feed_url = f"https://www.youtube.com/feeds/videos.xml?channel_id={channel_id}"
    try:
        reply = await client.get(feed_url, timeout=15.0)
        # Turn 4xx/5xx responses into exceptions so they take the same
        # log-and-return-empty path as transport failures.
        reply.raise_for_status()
    except httpx.HTTPError as exc:
        logger.warning("Failed to fetch feed from %s: %s", feed_url, exc)
        return b""
    else:
        return reply.content
def _parse_published(raw: str) -> datetime:
    """Parse a feed timestamp, treating a trailing "Z" as UTC.

    Raises:
        ValueError: If *raw* is not a timestamp ``datetime.fromisoformat``
            can parse.
    """
    # fromisoformat() only accepts the "Z" designator from Python 3.11 on.
    # Rewrite the suffix -- and only the suffix -- to an explicit offset so
    # older interpreters parse it too and no other character is touched.
    if raw.endswith(("Z", "z")):
        raw = raw[:-1] + "+00:00"
    return datetime.fromisoformat(raw)


def _parse_entry(entry: ET.Element) -> "DiscoveredVideo | None":
    """Convert one Atom ``entry`` element to a DiscoveredVideo, or None to skip it."""
    # findtext() yields "" both when the element is missing and when it has
    # no text, so a single truthiness test after strip() covers "absent",
    # "empty" and "whitespace only".
    video_id = entry.findtext("yt:videoId", "", _NAMESPACES).strip()
    title = entry.findtext("a:title", "", _NAMESPACES).strip()
    published_raw = entry.findtext("a:published", "", _NAMESPACES).strip()

    # Description and thumbnail both live under the media group.
    media_group = entry.find("m:group", _NAMESPACES)
    thumbnail = None
    if media_group is not None:
        thumbnail = media_group.find("m:thumbnail", _NAMESPACES)
    thumbnail_url = "" if thumbnail is None else thumbnail.get("url", "").strip()

    if not (video_id and title and published_raw and thumbnail_url):
        logger.debug(
            "Skipping feed entry with missing required fields (video id: %r)",
            video_id,
        )
        return None

    # Only the timestamp conversion can legitimately fail here; keeping the
    # try this narrow avoids hiding unrelated bugs behind a warning.
    try:
        published_at = _parse_published(published_raw)
    except ValueError as e:
        logger.warning("Failed to parse entry in feed: %s", e)
        return None

    # A non-empty thumbnail URL implies media_group is not None.
    # The description is optional and kept verbatim (not stripped).
    description = media_group.findtext("m:description", "", _NAMESPACES)

    return DiscoveredVideo(
        youtube_video_id=video_id,
        title=title,
        description=description,
        published_at=published_at,
        thumbnail_url=thumbnail_url,
    )


def parse_feed(xml_bytes: bytes) -> list[DiscoveredVideo]:
    """Parse YouTube RSS feed XML and extract videos.

    Args:
        xml_bytes: Raw XML bytes from YouTube RSS feed.

    Returns:
        List of DiscoveredVideo objects in feed order. Returns an empty list
        on empty input, on an XML parse error (logged at WARNING), or when no
        entry is usable. An entry is skipped when its video id, title,
        published timestamp or thumbnail URL is missing or blank, or when its
        timestamp cannot be parsed (logged at WARNING).
    """
    if not xml_bytes:
        return []

    # NOTE(review): xml.etree.ElementTree is documented as not hardened
    # against maliciously constructed XML; acceptable only as long as the
    # bytes come from a trusted feed endpoint.
    try:
        root = ET.fromstring(xml_bytes)
    except ET.ParseError as e:
        logger.warning("Failed to parse feed XML: %s", e)
        return []

    return [
        video
        for entry in root.findall("a:entry", _NAMESPACES)
        if (video := _parse_entry(entry)) is not None
    ]