"""Tests for the news fetcher service — RSS, Reddit, deduplication, publishing.""" import hashlib from datetime import datetime, timezone from types import SimpleNamespace from unittest.mock import AsyncMock, MagicMock, patch import pytest from shared.schemas.news import RawArticle from services.news_fetcher.sources.rss import RSSSource from services.news_fetcher.sources.reddit import RedditSource from services.news_fetcher.main import _deduplicate_and_publish, SEEN_HASHES_KEY # --------------------------------------------------------------------------- # Fixtures # --------------------------------------------------------------------------- FIXTURE_RSS_XML = """\ Test Finance Feed AAPL hits record high https://example.com/aapl Apple stock reached an all-time high today. Sat, 22 Feb 2026 12:00:00 GMT TSLA earnings beat https://example.com/tsla Tesla reported stronger-than-expected earnings. Sat, 22 Feb 2026 13:00:00 GMT """ def _make_fake_entry(title: str, link: str, summary: str, published: str | None = None) -> dict: """Return a dict matching feedparser entry structure.""" entry = {"title": title, "link": link, "summary": summary} if published: entry["published"] = published return entry def _make_reddit_post(title, selftext, url, permalink, score, created_utc): """Return a SimpleNamespace mimicking an asyncpraw Submission.""" return SimpleNamespace( title=title, selftext=selftext, url=url, permalink=permalink, score=score, created_utc=created_utc, ) # --------------------------------------------------------------------------- # RSS tests # --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_rss_source_parses_feed(): """feedparser.parse is called for each feed and entries become RawArticles.""" fake_parsed = MagicMock() fake_parsed.bozo = False fake_parsed.entries = [ _make_fake_entry( "AAPL hits record high", "https://example.com/aapl", "Apple stock reached an all-time high today.", "Sat, 22 Feb 2026 12:00:00 GMT", ), _make_fake_entry( "TSLA earnings beat", "https://example.com/tsla", "Tesla reported stronger-than-expected earnings.", "Sat, 22 Feb 2026 13:00:00 GMT", ), ] with patch("services.news_fetcher.sources.rss.feedparser.parse", return_value=fake_parsed) as mock_parse: source = RSSSource(feeds=["https://example.com/feed"]) articles = await source.fetch() mock_parse.assert_called_once_with("https://example.com/feed") assert len(articles) == 2 assert articles[0].source == "rss" assert articles[0].title == "AAPL hits record high" assert articles[0].url == "https://example.com/aapl" assert articles[0].content == "Apple stock reached an all-time high today." expected_hash = hashlib.sha256("https://example.com/aaplAAPL hits record high".encode()).hexdigest() assert articles[0].content_hash == expected_hash assert articles[0].published_at is not None assert articles[1].title == "TSLA earnings beat" @pytest.mark.asyncio async def test_rss_source_handles_bad_feed(): """A feed that raises an exception is skipped; an empty list is returned.""" with patch( "services.news_fetcher.sources.rss.feedparser.parse", side_effect=Exception("network timeout"), ): source = RSSSource(feeds=["https://bad-feed.example.com/rss"]) articles = await source.fetch() assert articles == [] @pytest.mark.asyncio async def test_rss_source_handles_bozo_feed(): """A bozo feed with no entries is skipped gracefully.""" fake_parsed = MagicMock() fake_parsed.bozo = True fake_parsed.entries = [] fake_parsed.bozo_exception = "malformed XML" with patch("services.news_fetcher.sources.rss.feedparser.parse", return_value=fake_parsed): source = RSSSource(feeds=["https://broken.example.com/rss"]) articles = await source.fetch() assert articles == [] @pytest.mark.asyncio async def test_rss_source_multiple_feeds(): """Articles from multiple feeds are combined.""" feed1 = MagicMock() feed1.bozo = False feed1.entries = [_make_fake_entry("A", "https://a.com", "content a")] feed2 = MagicMock() feed2.bozo = False feed2.entries = [_make_fake_entry("B", "https://b.com", "content b")] with patch( "services.news_fetcher.sources.rss.feedparser.parse", side_effect=[feed1, feed2], ): source = RSSSource(feeds=["https://feed1.com", "https://feed2.com"]) articles = await source.fetch() assert len(articles) == 2 assert {a.title for a in articles} == {"A", "B"} # --------------------------------------------------------------------------- # Reddit tests # --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_reddit_source_converts_posts(): """Hot posts are converted to RawArticle with correct fields.""" post = _make_reddit_post( title="GME to the moon", selftext="Diamond hands forever", url="https://reddit.com/r/wsb/1", permalink="/r/wallstreetbets/comments/abc123/gme_to_the_moon/", score=500, created_utc=1740200000.0, ) # Create an async iterator from the posts async def _hot(limit=25): for p in [post]: yield p fake_subreddit = AsyncMock() fake_subreddit.hot = _hot fake_reddit = AsyncMock() fake_reddit.subreddit = AsyncMock(return_value=fake_subreddit) fake_reddit.close = AsyncMock() with patch("asyncpraw.Reddit", return_value=fake_reddit): source = RedditSource( subreddits=["wallstreetbets"], client_id="test_id", client_secret="test_secret", user_agent="test-agent", min_score=10, ) articles = await source.fetch() assert len(articles) == 1 assert articles[0].source == "reddit" assert articles[0].title == "GME to the moon" assert articles[0].content == "Diamond hands forever" expected_hash = hashlib.sha256( "/r/wallstreetbets/comments/abc123/gme_to_the_moon/".encode() ).hexdigest() assert articles[0].content_hash == expected_hash assert "reddit.com" in articles[0].url fake_reddit.close.assert_awaited_once() @pytest.mark.asyncio async def test_reddit_source_filters_by_score(): """Posts below min_score are excluded.""" high_score = _make_reddit_post("High", "text", "url", "/r/stocks/high", 100, 1740200000.0) low_score = _make_reddit_post("Low", "text", "url", "/r/stocks/low", 5, 1740200000.0) async def _hot(limit=25): for p in [high_score, low_score]: yield p fake_subreddit = AsyncMock() fake_subreddit.hot = _hot fake_reddit = AsyncMock() fake_reddit.subreddit = AsyncMock(return_value=fake_subreddit) fake_reddit.close = AsyncMock() with patch("asyncpraw.Reddit", return_value=fake_reddit): source = RedditSource( subreddits=["stocks"], client_id="id", client_secret="secret", user_agent="agent", min_score=10, ) articles = await source.fetch() assert len(articles) == 1 assert articles[0].title == "High" @pytest.mark.asyncio async def test_reddit_source_uses_url_when_no_selftext(): """When selftext is empty, the post URL is used as content.""" post = _make_reddit_post( title="Link post", selftext="", url="https://example.com/article", permalink="/r/investing/link", score=50, created_utc=1740200000.0, ) async def _hot(limit=25): yield post fake_subreddit = AsyncMock() fake_subreddit.hot = _hot fake_reddit = AsyncMock() fake_reddit.subreddit = AsyncMock(return_value=fake_subreddit) fake_reddit.close = AsyncMock() with patch("asyncpraw.Reddit", return_value=fake_reddit): source = RedditSource( subreddits=["investing"], client_id="id", client_secret="secret", user_agent="agent", min_score=10, ) articles = await source.fetch() assert len(articles) == 1 assert articles[0].content == "https://example.com/article" # --------------------------------------------------------------------------- # Deduplication tests # --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_deduplication_skips_seen_hashes(): """Articles with previously-seen content_hash are not published.""" now = datetime.now(timezone.utc) articles = [ RawArticle( source="rss", url="https://example.com/1", title="First", content="Content 1", fetched_at=now, content_hash="hash_new", ), RawArticle( source="rss", url="https://example.com/2", title="Already seen", content="Content 2", fetched_at=now, content_hash="hash_old", ), ] redis = AsyncMock() # First article is new (SADD returns 1), second is duplicate (returns 0) redis.sadd = AsyncMock(side_effect=[1, 0]) publisher = AsyncMock() publisher.publish = AsyncMock() counter = MagicMock() counter.add = MagicMock() error_counter = MagicMock() count = await _deduplicate_and_publish(articles, redis, publisher, counter, error_counter) assert count == 1 publisher.publish.assert_called_once() # Verify the hash was checked for both articles assert redis.sadd.call_count == 2 @pytest.mark.asyncio async def test_deduplication_publishes_all_new(): """When all articles are new, all are published.""" now = datetime.now(timezone.utc) articles = [ RawArticle( source="rss", url=f"https://example.com/{i}", title=f"Article {i}", content=f"Content {i}", fetched_at=now, content_hash=f"hash_{i}", ) for i in range(3) ] redis = AsyncMock() redis.sadd = AsyncMock(return_value=1) publisher = AsyncMock() publisher.publish = AsyncMock() counter = MagicMock() counter.add = MagicMock() error_counter = MagicMock() count = await _deduplicate_and_publish(articles, redis, publisher, counter, error_counter) assert count == 3 assert publisher.publish.call_count == 3 counter.add.assert_called_once_with(3) # --------------------------------------------------------------------------- # Main service integration test (mocked sources + redis) # --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_main_publishes_to_stream(): """End-to-end: mocked sources produce articles which get published.""" now = datetime.now(timezone.utc) fake_article = RawArticle( source="rss", url="https://example.com/test", title="Test", content="Test content", fetched_at=now, content_hash="unique_hash", ) redis = AsyncMock() redis.sadd = AsyncMock(return_value=1) publisher = AsyncMock() publisher.publish = AsyncMock() counter = MagicMock() counter.add = MagicMock() error_counter = MagicMock() count = await _deduplicate_and_publish( [fake_article], redis, publisher, counter, error_counter ) assert count == 1 publisher.publish.assert_called_once() # Verify the published data matches the article call_args = publisher.publish.call_args[0][0] assert call_args["title"] == "Test" assert call_args["source"] == "rss" assert call_args["content_hash"] == "unique_hash"