71 lines
2.4 KiB
Python
71 lines
2.4 KiB
Python
"""RSS feed source — fetches articles from configurable RSS feed URLs."""
|
|
|
|
import hashlib
|
|
import logging
|
|
from datetime import datetime, timezone
|
|
from email.utils import parsedate_to_datetime
|
|
|
|
import feedparser
|
|
|
|
from shared.schemas.news import RawArticle
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class RSSSource:
    """Fetches and converts RSS feed entries to :class:`RawArticle` instances."""

    def __init__(self, feeds: list[str]) -> None:
        """Store the feed URLs that :meth:`fetch` will poll.

        Args:
            feeds: URLs (or any other source accepted by ``feedparser.parse``)
                of the feeds to collect.
        """
        self.feeds = feeds

    async def fetch(self) -> list[RawArticle]:
        """Parse every configured feed and return a list of raw articles.

        Feeds that fail to parse are logged and skipped so that a single
        broken feed does not prevent the others from being collected.
        Likewise, an entry that cannot be converted is logged and skipped
        without discarding the remaining entries of its feed.

        Returns:
            One :class:`RawArticle` per successfully converted entry, in feed
            order. All articles of one call share the same ``fetched_at``.
        """
        # Function-scope import: ``asyncio`` is only needed by this method.
        import asyncio

        articles: list[RawArticle] = []
        now = datetime.now(timezone.utc)

        for feed_url in self.feeds:
            try:
                # ``feedparser.parse`` downloads and parses synchronously; run
                # it in a worker thread so the event loop is not blocked.
                parsed = await asyncio.to_thread(feedparser.parse, feed_url)
                entries = parsed.entries
                if parsed.bozo and not entries:
                    logger.warning("Feed %s returned bozo error: %s", feed_url, parsed.bozo_exception)
                    continue
            except Exception:
                logger.exception("Failed to fetch RSS feed %s", feed_url)
                continue

            for entry in entries:
                try:
                    articles.append(self._build_article(entry, now))
                except Exception:
                    # Isolate failures per entry so one bad item does not
                    # drop the rest of an otherwise healthy feed.
                    logger.exception("Skipping malformed entry in RSS feed %s", feed_url)

        return articles

    @staticmethod
    def _build_article(entry: dict, fetched_at: datetime) -> RawArticle:
        """Convert one parsed feed entry into a :class:`RawArticle`."""
        title = entry.get("title", "")
        link = entry.get("link", "")
        content = entry.get("summary", "") or entry.get("description", "")

        # NOTE: the hash input format is kept unchanged so that hashes stay
        # comparable with ones computed by earlier versions of this code.
        content_hash = hashlib.sha256(f"{link}{title}".encode()).hexdigest()

        return RawArticle(
            source="rss",
            url=link,
            title=title,
            content=content,
            published_at=RSSSource._parse_published(entry),
            fetched_at=fetched_at,
            content_hash=content_hash,
        )

    @staticmethod
    def _parse_published(entry: dict) -> datetime | None:
        """Best-effort parsing of the entry's publication date.

        ``published`` is preferred over ``updated``. For each of them the raw
        string is tried first (RFC 2822, then ISO 8601), followed by the
        pre-parsed ``<field>_parsed`` time tuple that feedparser may attach.

        Returns:
            A timezone-aware datetime, or ``None`` when no usable date exists.
            Values without timezone information are taken to be UTC.
        """
        for field in ("published", "updated"):
            parsed = RSSSource._parse_date_string(entry.get(field))
            if parsed is None:
                parsed = RSSSource._from_time_tuple(entry.get(f"{field}_parsed"))
            if parsed is not None:
                return parsed
        return None

    @staticmethod
    def _parse_date_string(raw: object) -> datetime | None:
        """Parse an RFC 2822 or ISO 8601 date string; ``None`` on failure."""
        if not isinstance(raw, str) or not raw.strip():
            return None
        text = raw.strip()

        parsed = None
        try:
            parsed = parsedate_to_datetime(text)
        except (TypeError, ValueError, IndexError, OverflowError):
            # Not RFC 2822 -- fall back to ISO 8601 (common in Atom feeds).
            # ``fromisoformat`` only accepts a trailing "Z" on newer Python
            # versions, so spell it out as an explicit UTC offset.
            iso_text = text[:-1] + "+00:00" if text[-1] in "Zz" else text
            try:
                parsed = datetime.fromisoformat(iso_text)
            except ValueError:
                return None

        if parsed is not None and parsed.tzinfo is None:
            # ``parsedate_to_datetime`` yields a naive value for "-0000" or a
            # missing zone; tag it as UTC so it can be compared with the
            # timezone-aware ``fetched_at``.
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed

    @staticmethod
    def _from_time_tuple(value: object) -> datetime | None:
        """Convert a feedparser ``*_parsed`` UTC time tuple to a datetime."""
        if not value:
            return None
        try:
            return datetime(*value[:6], tzinfo=timezone.utc)
        except (TypeError, ValueError, OverflowError):
            return None
|