"""RSS feed source — fetches articles from configurable RSS feed URLs.""" import hashlib import logging from datetime import datetime, timezone from email.utils import parsedate_to_datetime import feedparser from shared.schemas.news import RawArticle logger = logging.getLogger(__name__) class RSSSource: """Fetches and converts RSS feed entries to :class:`RawArticle` instances.""" def __init__(self, feeds: list[str]) -> None: self.feeds = feeds async def fetch(self) -> list[RawArticle]: """Parse every configured feed and return a list of raw articles. Feeds that fail to parse are logged and skipped so that a single broken feed does not prevent the others from being collected. """ articles: list[RawArticle] = [] now = datetime.now(timezone.utc) for feed_url in self.feeds: try: parsed = feedparser.parse(feed_url) if parsed.bozo and not parsed.entries: logger.warning("Feed %s returned bozo error: %s", feed_url, parsed.bozo_exception) continue for entry in parsed.entries: title = entry.get("title", "") link = entry.get("link", "") content = entry.get("summary", "") or entry.get("description", "") published_at = self._parse_published(entry) content_hash = hashlib.sha256(f"{link}{title}".encode()).hexdigest() articles.append( RawArticle( source="rss", url=link, title=title, content=content, published_at=published_at, fetched_at=now, content_hash=content_hash, ) ) except Exception: logger.exception("Failed to fetch RSS feed %s", feed_url) continue return articles @staticmethod def _parse_published(entry: dict) -> datetime | None: """Best-effort parsing of the entry's publication date.""" raw = entry.get("published") or entry.get("updated") if not raw: return None try: return parsedate_to_datetime(raw) except Exception: return None