"""News fetcher service entry point.

Polls RSS feeds and Reddit on independent schedules, deduplicates articles
by content hash (via a Redis SET), and publishes new articles to the
``news:raw`` Redis Stream.
"""

import asyncio
import logging

from redis.asyncio import Redis

from shared.redis_streams import StreamPublisher
from shared.telemetry import setup_telemetry
from services.news_fetcher.config import NewsFetcherConfig
from services.news_fetcher.sources.rss import RSSSource
from services.news_fetcher.sources.reddit import RedditSource
from shared.schemas.news import RawArticle

logger = logging.getLogger(__name__)

SEEN_HASHES_KEY = "news:seen_hashes"
NEWS_RAW_STREAM = "news:raw"


async def _forget_hash(redis: Redis, content_hash) -> None:
    """Best-effort removal of *content_hash* from the seen-hashes SET.

    Failures are logged and swallowed so that the caller can re-raise the
    original publish error instead of the rollback error.
    """
    try:
        await redis.srem(SEEN_HASHES_KEY, content_hash)
    except Exception:
        logger.exception(
            "Failed to roll back seen-hash %s after publish failure",
            content_hash,
        )


async def _deduplicate_and_publish(
    articles: list[RawArticle],
    redis: Redis,
    publisher: StreamPublisher,
    articles_fetched_counter,
    fetch_errors_counter,
) -> int:
    """Add unseen articles to the ``news:raw`` stream.

    Each article's ``content_hash`` is added to the ``SEEN_HASHES_KEY`` SET;
    only articles whose hash was not already present are published. If
    publishing an article fails, its hash is removed from the SET again so
    the article is retried on a later poll rather than being dropped.

    Args:
        articles: Articles returned by a source's ``fetch()``.
        redis: Client used for the seen-hashes SET.
        publisher: Publisher the new articles are sent to.
        articles_fetched_counter: Metric counter; ``add(n)`` is called with
            the number of articles published, even when the batch is
            aborted by an error part-way through.
        fetch_errors_counter: Unused here; kept so the signature stays
            compatible with existing callers.

    Returns:
        The number of newly published articles.

    Raises:
        Exception: Whatever ``redis.sadd`` or ``publisher.publish`` raised;
            the remaining articles of the batch are not processed.
    """
    published = 0
    try:
        for article in articles:
            # SADD returns 1 if the member was added (i.e. not already present)
            added = await redis.sadd(SEEN_HASHES_KEY, article.content_hash)
            if not added:
                continue
            try:
                await publisher.publish(article.model_dump(mode="json"))
            except Exception:
                # The hash is already marked as seen; undo that, otherwise
                # this article would be skipped forever.
                await _forget_hash(redis, article.content_hash)
                raise
            published += 1
    finally:
        # Count what actually reached the stream, even on a partial batch.
        if published:
            articles_fetched_counter.add(published)
    return published


async def _poll_source(
    name: str,
    start_message: str,
    source: RSSSource | RedditSource,
    interval: int,
    redis: Redis,
    publisher: StreamPublisher,
    articles_fetched_counter,
    fetch_errors_counter,
) -> None:
    """Fetch from *source* forever, sleeping *interval* seconds per cycle.

    A failed cycle is logged, counted on *fetch_errors_counter*, and does not
    stop the loop. Cancellation is not an ``Exception`` and therefore still
    terminates the loop.

    Args:
        name: Short source label used in log messages (e.g. ``"RSS"``).
        start_message: Message logged at the start of every cycle.
        source: Object whose awaitable ``fetch()`` returns the articles.
        interval: Seconds to sleep between cycles.
        redis: Client used for deduplication.
        publisher: Publisher for new articles.
        articles_fetched_counter: Metric counter for published articles.
        fetch_errors_counter: Metric counter incremented once per failed cycle.
    """
    while True:
        try:
            logger.info(start_message)
            articles = await source.fetch()
            count = await _deduplicate_and_publish(
                articles,
                redis,
                publisher,
                articles_fetched_counter,
                fetch_errors_counter,
            )
            logger.info("%s poll complete: %d new articles published", name, count)
        except Exception:
            # Top-level boundary of the poller: log, count, keep polling.
            logger.exception("%s poll cycle failed", name)
            fetch_errors_counter.add(1)
        await asyncio.sleep(interval)


async def _poll_rss(
    source: RSSSource,
    interval: int,
    redis: Redis,
    publisher: StreamPublisher,
    articles_fetched_counter,
    fetch_errors_counter,
) -> None:
    """Continuously poll RSS feeds at *interval* seconds."""
    await _poll_source(
        "RSS",
        "Polling RSS feeds …",
        source,
        interval,
        redis,
        publisher,
        articles_fetched_counter,
        fetch_errors_counter,
    )


async def _poll_reddit(
    source: RedditSource,
    interval: int,
    redis: Redis,
    publisher: StreamPublisher,
    articles_fetched_counter,
    fetch_errors_counter,
) -> None:
    """Continuously poll Reddit at *interval* seconds."""
    await _poll_source(
        "Reddit",
        "Polling Reddit …",
        source,
        interval,
        redis,
        publisher,
        articles_fetched_counter,
        fetch_errors_counter,
    )


async def run() -> None:
    """Boot the news fetcher and start polling.

    Builds configuration, telemetry counters, the Redis client and both
    sources, then runs the RSS and Reddit pollers concurrently. The Redis
    client is closed when the pollers stop (error or cancellation).
    """
    config = NewsFetcherConfig()
    logging.basicConfig(level=config.log_level)

    logger.info("Starting news fetcher service")

    # Telemetry
    meter = setup_telemetry("news-fetcher", config.otel_metrics_port)
    articles_fetched_counter = meter.create_counter(
        "news.articles_fetched",
        description="Total articles fetched and published",
    )
    fetch_errors_counter = meter.create_counter(
        "news.fetch_errors",
        description="Total fetch-cycle errors",
    )

    # Redis
    redis = Redis.from_url(config.redis_url, decode_responses=True)
    try:
        publisher = StreamPublisher(redis, NEWS_RAW_STREAM)

        # Sources
        rss_source = RSSSource(feeds=config.rss_feeds)
        reddit_source = RedditSource(
            subreddits=config.reddit_subreddits,
            client_id=config.reddit_client_id,
            client_secret=config.reddit_client_secret,
            user_agent=config.reddit_user_agent,
            min_score=config.reddit_min_score,
        )

        # Run pollers concurrently
        async with asyncio.TaskGroup() as tg:
            tg.create_task(
                _poll_rss(
                    rss_source,
                    config.rss_poll_interval_seconds,
                    redis,
                    publisher,
                    articles_fetched_counter,
                    fetch_errors_counter,
                )
            )
            tg.create_task(
                _poll_reddit(
                    reddit_source,
                    config.reddit_poll_interval_seconds,
                    redis,
                    publisher,
                    articles_fetched_counter,
                    fetch_errors_counter,
                )
            )
    finally:
        # Release the connection pool. Newer redis-py names this aclose();
        # older releases only provide close().
        close = getattr(redis, "aclose", None) or redis.close
        await close()


if __name__ == "__main__":
    asyncio.run(run())