feat: news fetcher service — RSS and Reddit sources

2026-02-22 15:25:27 +00:00 · 2026-02-22 15:25:27 +00:00 · 90b52a5144
commit 90b52a5144
parent 9f46071502
10 changed files with 722 additions and 2 deletions
--- a/services/news_fetcher/sources/__init__.py
+++ b/services/news_fetcher/sources/__init__.py
@ -0,0 +1 @@
+"""News source adapters (RSS, Reddit)."""
--- a/services/news_fetcher/sources/reddit.py
+++ b/services/news_fetcher/sources/reddit.py
@ -0,0 +1,76 @@
+"""Reddit source — fetches hot posts from financial subreddits via asyncpraw."""
+
+import hashlib
+import logging
+from datetime import datetime, timezone
+
+from shared.schemas.news import RawArticle
+
+logger = logging.getLogger(__name__)
+
+
+class RedditSource:
+    """Fetches hot posts from Reddit and converts them to :class:`RawArticle`.
+
+    Args:
+        subreddits: Subreddit names, each passed to ``reddit.subreddit()``.
+        client_id: Forwarded to ``asyncpraw.Reddit`` as ``client_id``.
+        client_secret: Forwarded to ``asyncpraw.Reddit`` as ``client_secret``.
+        user_agent: Forwarded to ``asyncpraw.Reddit`` as ``user_agent``.
+        min_score: Posts whose ``score`` is below this value are skipped.
+        limit: Maximum number of hot posts requested per subreddit.
+            Defaults to 25, the value that was previously hard-coded.
+    """
+
+    def __init__(
+        self,
+        subreddits: list[str],
+        client_id: str,
+        client_secret: str,
+        user_agent: str,
+        min_score: int = 10,
+        limit: int = 25,
+    ) -> None:
+        self.subreddits = subreddits
+        self.client_id = client_id
+        self.client_secret = client_secret
+        self.user_agent = user_agent
+        self.min_score = min_score
+        self.limit = limit
+
+    async def fetch(self) -> list[RawArticle]:
+        """Return hot posts scoring at least *min_score* from each subreddit.
+
+        Uses ``asyncpraw`` so the caller must run within an ``async`` context.
+        The Reddit client is created and closed within this call to avoid
+        leaking sessions across poll cycles.
+
+        A subreddit that raises while being fetched is logged and skipped;
+        articles collected before the failure are kept.
+
+        Returns:
+            One :class:`RawArticle` per accepted post, in iteration order.
+            Empty when no subreddits are configured.
+        """
+        if not self.subreddits:
+            # Nothing to poll: do not open (and immediately close) a client.
+            return []
+
+        import asyncpraw  # lazy import so the dep is optional at import time
+
+        articles: list[RawArticle] = []
+        # One timestamp for the whole poll so every article shares fetched_at.
+        now = datetime.now(timezone.utc)
+
+        reddit = asyncpraw.Reddit(
+            client_id=self.client_id,
+            client_secret=self.client_secret,
+            user_agent=self.user_agent,
+        )
+        try:
+            for sub_name in self.subreddits:
+                try:
+                    subreddit = await reddit.subreddit(sub_name)
+                    async for post in subreddit.hot(limit=self.limit):
+                        if post.score < self.min_score:
+                            continue
+
+                        # Link posts have an empty selftext; use the target URL.
+                        content = post.selftext or post.url
+                        permalink = post.permalink
+                        # The permalink identifies the post, so it is the hash key.
+                        content_hash = hashlib.sha256(permalink.encode()).hexdigest()
+                        published_at = datetime.fromtimestamp(
+                            post.created_utc, tz=timezone.utc
+                        )
+
+                        articles.append(
+                            RawArticle(
+                                source="reddit",
+                                url=f"https://reddit.com{permalink}",
+                                title=post.title,
+                                content=content,
+                                published_at=published_at,
+                                fetched_at=now,
+                                content_hash=content_hash,
+                            )
+                        )
+                except Exception:
+                    # Deliberate best-effort: one failing subreddit must not
+                    # prevent the remaining ones from being collected.
+                    logger.exception("Failed to fetch subreddit r/%s", sub_name)
+                    continue
+        finally:
+            await reddit.close()
+
+        return articles
--- a/services/news_fetcher/sources/rss.py
+++ b/services/news_fetcher/sources/rss.py
@ -0,0 +1,71 @@
+"""RSS feed source — fetches articles from configurable RSS feed URLs."""
+
+import hashlib
+import logging
+from datetime import datetime, timezone
+from email.utils import parsedate_to_datetime
+
+import feedparser
+
+from shared.schemas.news import RawArticle
+
+logger = logging.getLogger(__name__)
+
+
+class RSSSource:
+    """Fetches and converts RSS feed entries to :class:`RawArticle` instances.
+
+    Args:
+        feeds: Feed URLs; each is handed to ``feedparser.parse`` on every
+            :meth:`fetch` call.
+    """
+
+    def __init__(self, feeds: list[str]) -> None:
+        self.feeds = feeds
+
+    async def fetch(self) -> list[RawArticle]:
+        """Parse every configured feed and return a list of raw articles.
+
+        Feeds that fail to parse are logged and skipped so that a single
+        broken feed does not prevent the others from being collected.
+
+        Returns:
+            One :class:`RawArticle` per feed entry, in feed order.
+        """
+        import asyncio  # function-scope import keeps this change self-contained
+
+        articles: list[RawArticle] = []
+        # One timestamp for the whole poll so every article shares fetched_at.
+        now = datetime.now(timezone.utc)
+
+        for feed_url in self.feeds:
+            try:
+                # feedparser.parse() downloads and parses synchronously; run it
+                # in a worker thread so the event loop is not blocked meanwhile.
+                parsed = await asyncio.to_thread(feedparser.parse, feed_url)
+                # A "bozo" feed that still yielded entries is usable; only a
+                # malformed feed with nothing in it is skipped.
+                if parsed.bozo and not parsed.entries:
+                    logger.warning(
+                        "Feed %s returned bozo error: %s",
+                        feed_url,
+                        parsed.bozo_exception,
+                    )
+                    continue
+
+                for entry in parsed.entries:
+                    title = entry.get("title", "")
+                    link = entry.get("link", "")
+                    content = entry.get("summary", "") or entry.get("description", "")
+
+                    published_at = self._parse_published(entry)
+                    content_hash = hashlib.sha256(f"{link}{title}".encode()).hexdigest()
+
+                    articles.append(
+                        RawArticle(
+                            source="rss",
+                            url=link,
+                            title=title,
+                            content=content,
+                            published_at=published_at,
+                            fetched_at=now,
+                            content_hash=content_hash,
+                        )
+                    )
+            except Exception:
+                # Deliberate best-effort: log the failing feed and move on.
+                logger.exception("Failed to fetch RSS feed %s", feed_url)
+                continue
+
+        return articles
+
+    @staticmethod
+    def _parse_published(entry: dict) -> datetime | None:
+        """Best-effort parsing of the entry's publication date.
+
+        The ``published`` (or ``updated``) string is parsed as an RFC 2822
+        date first. If that fails, for example because an Atom feed supplied
+        an ISO 8601 date, the ``published_parsed`` / ``updated_parsed``
+        time tuple is used instead and interpreted as UTC.
+
+        Returns:
+            A timezone-aware datetime, or ``None`` when no usable date exists.
+        """
+        raw = entry.get("published") or entry.get("updated")
+        if raw:
+            try:
+                parsed = parsedate_to_datetime(raw)
+            except Exception:
+                # Not an RFC 2822 date; fall through to the time-tuple fallback.
+                parsed = None
+            if parsed is not None:
+                if parsed.tzinfo is None:
+                    # A "-0000" zone yields a naive datetime that represents
+                    # UTC; tag it so it never mixes with aware datetimes.
+                    parsed = parsed.replace(tzinfo=timezone.utc)
+                return parsed
+
+        # NOTE(review): assumes these tuples are UTC, as feedparser documents.
+        time_tuple = entry.get("published_parsed") or entry.get("updated_parsed")
+        if time_tuple:
+            try:
+                return datetime(*time_tuple[:6], tzinfo=timezone.utc)
+            except (TypeError, ValueError):
+                return None
+        return None