feat: news fetcher service — RSS and Reddit sources

This commit is contained in:
Viktor Barzin 2026-02-22 15:25:27 +00:00
parent 9f46071502
commit 90b52a5144
No known key found for this signature in database
GPG key ID: 0EB088298288D958
10 changed files with 722 additions and 2 deletions

View file

@ -0,0 +1 @@
"""News source adapters (RSS, Reddit)."""

View file

@ -0,0 +1,76 @@
"""Reddit source — fetches hot posts from financial subreddits via asyncpraw."""
import hashlib
import logging
from datetime import datetime, timezone
from shared.schemas.news import RawArticle
logger = logging.getLogger(__name__)
class RedditSource:
    """Fetch hot posts from Reddit and convert them to :class:`RawArticle`.

    Args:
        subreddits: Names of the subreddits to poll (logged as ``r/<name>``).
        client_id: Client id passed to ``asyncpraw.Reddit``.
        client_secret: Client secret passed to ``asyncpraw.Reddit``.
        user_agent: User-agent string passed to ``asyncpraw.Reddit``.
        min_score: Posts whose score is below this value are skipped.
        limit: Maximum number of hot posts requested per subreddit.
    """

    def __init__(
        self,
        subreddits: list[str],
        client_id: str,
        client_secret: str,
        user_agent: str,
        min_score: int = 10,
        limit: int = 25,
    ) -> None:
        self.subreddits = subreddits
        self.client_id = client_id
        self.client_secret = client_secret
        self.user_agent = user_agent
        self.min_score = min_score
        self.limit = limit

    async def fetch(self) -> list[RawArticle]:
        """Return hot posts scoring at least *min_score* from each subreddit.

        Uses ``asyncpraw`` so the caller must run within an ``async`` context.
        The Reddit client is created and closed within this call to avoid
        leaking sessions across poll cycles.

        A subreddit that raises while being fetched is logged and skipped;
        posts already collected from it (and from other subreddits) are kept.

        Returns:
            The collected articles, in subreddit order. Empty when no
            subreddits are configured.
        """
        # Nothing to poll: skip importing asyncpraw and opening a session.
        if not self.subreddits:
            return []

        import asyncpraw  # lazy import so the dep is optional at import time

        articles: list[RawArticle] = []
        # One timestamp for the whole poll so every article shares fetched_at.
        now = datetime.now(timezone.utc)
        reddit = asyncpraw.Reddit(
            client_id=self.client_id,
            client_secret=self.client_secret,
            user_agent=self.user_agent,
        )
        try:
            for sub_name in self.subreddits:
                try:
                    subreddit = await reddit.subreddit(sub_name)
                    async for post in subreddit.hot(limit=self.limit):
                        if post.score < self.min_score:
                            continue
                        articles.append(self._to_article(post, now))
                except Exception:
                    # Best-effort per subreddit: one failure must not abort
                    # the remaining subreddits.
                    logger.exception("Failed to fetch subreddit r/%s", sub_name)
        finally:
            await reddit.close()
        return articles

    @staticmethod
    def _to_article(post, fetched_at: datetime) -> RawArticle:
        """Convert one Reddit submission into a :class:`RawArticle`."""
        permalink = post.permalink
        return RawArticle(
            source="reddit",
            url=f"https://reddit.com{permalink}",
            title=post.title,
            # Link posts have no self text, so fall back to the linked URL.
            content=post.selftext or post.url,
            published_at=datetime.fromtimestamp(post.created_utc, tz=timezone.utc),
            fetched_at=fetched_at,
            # The hash is derived from the permalink, not from the body text.
            content_hash=hashlib.sha256(permalink.encode()).hexdigest(),
        )

View file

@ -0,0 +1,71 @@
"""RSS feed source — fetches articles from configurable RSS feed URLs."""
import hashlib
import logging
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
import feedparser
from shared.schemas.news import RawArticle
logger = logging.getLogger(__name__)
class RSSSource:
    """Fetch RSS/Atom feeds and convert their entries to :class:`RawArticle`.

    Args:
        feeds: Feed URLs handed to ``feedparser.parse`` one at a time.
    """

    def __init__(self, feeds: list[str]) -> None:
        self.feeds = feeds

    async def fetch(self) -> list[RawArticle]:
        """Parse every configured feed and return a list of raw articles.

        Feeds that fail to parse are logged and skipped so that a single
        broken feed does not prevent the others from being collected.

        Returns:
            The collected articles, in feed order. Empty when no feeds are
            configured.
        """
        import asyncio  # stdlib; only this method needs it

        articles: list[RawArticle] = []
        # One timestamp for the whole poll so every article shares fetched_at.
        now = datetime.now(timezone.utc)
        for feed_url in self.feeds:
            try:
                # feedparser.parse is synchronous (network + parsing); run it
                # in a worker thread so it does not stall the event loop.
                parsed = await asyncio.to_thread(feedparser.parse, feed_url)
                if parsed.bozo and not parsed.entries:
                    logger.warning("Feed %s returned bozo error: %s", feed_url, parsed.bozo_exception)
                    continue
                for entry in parsed.entries:
                    articles.append(self._to_article(entry, now))
            except Exception:
                # Best-effort per feed: one failure must not abort the rest.
                logger.exception("Failed to fetch RSS feed %s", feed_url)
        return articles

    @staticmethod
    def _to_article(entry: dict, fetched_at: datetime) -> RawArticle:
        """Convert one feed entry into a :class:`RawArticle`."""
        title = entry.get("title", "")
        link = entry.get("link", "")
        return RawArticle(
            source="rss",
            url=link,
            title=title,
            content=entry.get("summary", "") or entry.get("description", ""),
            published_at=RSSSource._parse_published(entry),
            fetched_at=fetched_at,
            # The hash is derived from link + title, not from the body text.
            content_hash=hashlib.sha256(f"{link}{title}".encode()).hexdigest(),
        )

    @staticmethod
    def _parse_published(entry: dict) -> datetime | None:
        """Best-effort parsing of the entry's publication date.

        ``published`` is preferred over ``updated``. For each of them the
        pre-parsed ``*_parsed`` time tuple is tried first, then the raw text
        as an RFC 2822 date, then as an ISO 8601 timestamp.

        Returns:
            A timezone-aware datetime, or ``None`` when no usable date exists.
        """
        for key in ("published", "updated"):
            moment = RSSSource._from_time_tuple(entry.get(f"{key}_parsed"))
            if moment is None:
                moment = RSSSource._from_date_text(entry.get(key))
            if moment is not None:
                return moment
        return None

    @staticmethod
    def _from_time_tuple(parsed) -> datetime | None:
        """Build a UTC datetime from a ``(Y, M, D, h, m, s, ...)`` time tuple."""
        if not parsed:
            return None
        try:
            # NOTE(review): feedparser documents *_parsed tuples as UTC.
            return datetime(*parsed[:6], tzinfo=timezone.utc)
        except (TypeError, ValueError):
            # Out-of-range fields (e.g. a leap second) or an unexpected shape.
            return None

    @staticmethod
    def _from_date_text(raw) -> datetime | None:
        """Parse an RFC 2822 or ISO 8601 date string into an aware datetime."""
        if not isinstance(raw, str):
            return None
        text = raw.strip()
        if not text:
            return None
        try:
            moment = parsedate_to_datetime(text)
        except Exception:
            # Deliberately broad: a bad date must never drop the whole feed.
            moment = None
        if moment is None:
            # Atom feeds use ISO 8601; map a trailing "Z" to an explicit
            # offset, which older datetime.fromisoformat versions require.
            iso_text = f"{text[:-1]}+00:00" if text[-1] in "Zz" else text
            try:
                moment = datetime.fromisoformat(iso_text)
            except ValueError:
                return None
        if moment.tzinfo is None:
            # RFC 2822 "-0000" yields a naive UTC value; an ISO string without
            # an offset is assumed to be UTC as well, so results never mix
            # naive and aware datetimes.
            moment = moment.replace(tzinfo=timezone.utc)
        return moment