trading/services/news_fetcher/sources/rss.py

71 lines
2.4 KiB
Python

"""RSS feed source — fetches articles from configurable RSS feed URLs."""
import asyncio
import hashlib
import logging
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime

import feedparser

from shared.schemas.news import RawArticle

logger = logging.getLogger(__name__)
class RSSSource:
    """Fetches and converts RSS feed entries to :class:`RawArticle` instances."""

    def __init__(self, feeds: list[str]) -> None:
        """Remember the feed locations to poll.

        Args:
            feeds: Feed URLs; each one is handed to ``feedparser.parse``.
        """
        self.feeds = feeds

    async def fetch(self) -> list[RawArticle]:
        """Parse every configured feed and return a list of raw articles.

        Feeds that fail to parse are logged and skipped so that a single
        broken feed does not prevent the others from being collected. The
        same isolation applies per entry: an entry that cannot be converted
        is logged and skipped without discarding the rest of its feed.

        Returns:
            Articles in feed order, then entry order. All of them share one
            ``fetched_at`` timestamp taken when the call started.
        """
        articles: list[RawArticle] = []
        now = datetime.now(timezone.utc)
        for feed_url in self.feeds:
            try:
                # feedparser.parse is synchronous (HTTP fetch + parsing);
                # run it in a worker thread so the event loop is not stalled.
                parsed = await asyncio.to_thread(feedparser.parse, feed_url)
                if parsed.bozo and not parsed.entries:
                    logger.warning(
                        "Feed %s returned bozo error: %s",
                        feed_url,
                        parsed.bozo_exception,
                    )
                    continue
                entries = parsed.entries
            except Exception:
                logger.exception("Failed to fetch RSS feed %s", feed_url)
                continue

            for entry in entries:
                # Guard each entry separately: one malformed item must not
                # drop the remaining items of an otherwise healthy feed.
                try:
                    articles.append(self._build_article(entry, now))
                except Exception:
                    logger.exception(
                        "Skipping malformed entry in RSS feed %s", feed_url
                    )
        return articles

    def _build_article(self, entry: dict, fetched_at: datetime) -> RawArticle:
        """Convert one parsed feed entry into a :class:`RawArticle`."""
        title = entry.get("title", "")
        link = entry.get("link", "")
        content = entry.get("summary", "") or entry.get("description", "")
        # The hash input stays the plain link+title concatenation so values
        # remain comparable with hashes produced by earlier runs.
        content_hash = hashlib.sha256(f"{link}{title}".encode()).hexdigest()
        return RawArticle(
            source="rss",
            url=link,
            title=title,
            content=content,
            published_at=self._parse_published(entry),
            fetched_at=fetched_at,
            content_hash=content_hash,
        )

    @staticmethod
    def _parse_published(entry: dict) -> datetime | None:
        """Best-effort parsing of the entry's publication date.

        The raw ``published`` (or ``updated``) string is tried first as an
        RFC 2822 date. If that fails, the ``published_parsed`` /
        ``updated_parsed`` time tuples are used; feedparser documents these
        as normalised to UTC, which also covers ISO 8601 / Atom dates.

        Returns:
            A timezone-aware datetime, or ``None`` when no usable date is
            present.
        """
        raw = entry.get("published") or entry.get("updated")
        if raw:
            try:
                parsed = parsedate_to_datetime(raw)
            except Exception:
                # Deliberately broad: the error type for unparseable input
                # differs between Python versions (TypeError vs ValueError).
                parsed = None
            if parsed is not None:
                if parsed.tzinfo is None:
                    # "-0000" / missing zone yields a naive value that is
                    # defined to be UTC; tag it so it is never mixed with
                    # the aware ``fetched_at``.
                    parsed = parsed.replace(tzinfo=timezone.utc)
                return parsed

        for key in ("published_parsed", "updated_parsed"):
            parts = entry.get(key)
            if not parts:
                continue
            try:
                return datetime(*parts[:6], tzinfo=timezone.utc)
            except (TypeError, ValueError):
                continue
        return None