# trading/services/news_fetcher/main.py
#
# 152 lines
# 4.6 KiB
# Python

"""News fetcher service entry point.
Polls RSS feeds and Reddit on independent schedules, deduplicates
articles by content hash (via a Redis SET), and publishes new articles
to the ``news:raw`` Redis Stream.
"""
import asyncio
import logging
from redis.asyncio import Redis
from shared.redis_streams import StreamPublisher
from shared.telemetry import setup_telemetry
from services.news_fetcher.config import NewsFetcherConfig
from services.news_fetcher.sources.rss import RSSSource
from services.news_fetcher.sources.reddit import RedditSource
from shared.schemas.news import RawArticle
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Redis key of the SET holding the content hashes of articles already seen;
# it is what the deduplication step SADDs into.
SEEN_HASHES_KEY = "news:seen_hashes"
# Name of the Redis Stream that newly seen articles are published to.
NEWS_RAW_STREAM = "news:raw"
async def _deduplicate_and_publish(
    articles: list[RawArticle],
    redis: Redis,
    publisher: StreamPublisher,
    articles_fetched_counter,
    fetch_errors_counter,
) -> int:
    """Publish previously unseen articles to the ``news:raw`` stream.

    Each article's ``content_hash`` is SADD-ed to the ``news:seen_hashes``
    set; only articles whose hash was newly added are published.

    If the publish call fails, the hash that was just recorded is removed
    again (best effort) so the article is retried on a later poll instead of
    being skipped forever as "already seen". The original exception is then
    re-raised, which aborts the rest of the batch. Delivery is therefore
    at-least-once: a publish that succeeded but reported an error may be
    repeated on the next poll.

    Args:
        articles: Candidate articles from one fetch cycle.
        redis: Async Redis client holding the seen-hashes set.
        publisher: Publisher used to emit each new article.
        articles_fetched_counter: Counter incremented by the number of
            articles published, even when the batch is cut short by an error.
        fetch_errors_counter: Not used here; callers count failed cycles
            themselves. Kept so the call signature stays unchanged.

    Returns:
        The number of newly published articles.

    Raises:
        Exception: Whatever the Redis client, the serialization or the
            publisher raises; nothing is swallowed except a failed rollback.
    """
    published = 0
    try:
        for article in articles:
            content_hash = article.content_hash
            # SADD returns 1 if the member was added (i.e. not already present)
            if not await redis.sadd(SEEN_HASHES_KEY, content_hash):
                continue
            # Serialization stays outside the rollback below: a payload that
            # cannot be dumped would fail identically on every retry.
            payload = article.model_dump(mode="json")
            try:
                await publisher.publish(payload)
            except Exception:
                # Un-mark the article so a later poll can publish it.
                try:
                    await redis.srem(SEEN_HASHES_KEY, content_hash)
                except Exception:
                    logger.warning(
                        "Could not roll back seen-hash %s after publish failure",
                        content_hash,
                        exc_info=True,
                    )
                raise
            published += 1
    finally:
        # Record what actually went out, including on a partial batch.
        if published:
            articles_fetched_counter.add(published)
    return published
async def _poll_rss(
    source: RSSSource,
    interval: int,
    redis: Redis,
    publisher: StreamPublisher,
    articles_fetched_counter,
    fetch_errors_counter,
) -> None:
    """Poll the RSS source forever, sleeping *interval* seconds between cycles.

    Each cycle fetches from *source* and hands the result to
    ``_deduplicate_and_publish``. Any ``Exception`` raised during a cycle is
    logged with its traceback and counted on *fetch_errors_counter*; the loop
    then continues with the next cycle. This coroutine never returns normally.
    """
    # Arguments forwarded unchanged to the dedupe/publish step on every cycle.
    forwarded = (redis, publisher, articles_fetched_counter, fetch_errors_counter)

    while True:
        try:
            logger.info("Polling RSS feeds …")
            new_count = await _deduplicate_and_publish(
                await source.fetch(),
                *forwarded,
            )
            logger.info("RSS poll complete: %d new articles published", new_count)
        except Exception:
            logger.exception("RSS poll cycle failed")
            fetch_errors_counter.add(1)

        # The pause applies after successful and failed cycles alike.
        await asyncio.sleep(interval)
async def _poll_reddit(
    source: RedditSource,
    interval: int,
    redis: Redis,
    publisher: StreamPublisher,
    articles_fetched_counter,
    fetch_errors_counter,
) -> None:
    """Poll the Reddit source forever, sleeping *interval* seconds between cycles.

    Each cycle fetches from *source* and hands the result to
    ``_deduplicate_and_publish``. Any ``Exception`` raised during a cycle is
    logged with its traceback and counted on *fetch_errors_counter*; the loop
    then continues with the next cycle. This coroutine never returns normally.
    """
    # Arguments forwarded unchanged to the dedupe/publish step on every cycle.
    forwarded = (redis, publisher, articles_fetched_counter, fetch_errors_counter)

    while True:
        try:
            logger.info("Polling Reddit …")
            new_count = await _deduplicate_and_publish(
                await source.fetch(),
                *forwarded,
            )
            logger.info("Reddit poll complete: %d new articles published", new_count)
        except Exception:
            logger.exception("Reddit poll cycle failed")
            fetch_errors_counter.add(1)

        # The pause applies after successful and failed cycles alike.
        await asyncio.sleep(interval)
async def run() -> None:
    """Boot the news fetcher and poll every source until the service stops.

    Steps, in order:

    1. Load ``NewsFetcherConfig`` and configure logging at its ``log_level``.
    2. Set up telemetry and create the ``news.articles_fetched`` and
       ``news.fetch_errors`` counters.
    3. Create the Redis client and a ``StreamPublisher`` for ``news:raw``.
    4. Build the RSS and Reddit sources from the config.
    5. Run both pollers concurrently in an ``asyncio.TaskGroup``.

    The Redis client is entered as an async context manager so that it is
    closed when polling ends for any reason (cancellation at shutdown, or an
    error escaping a poller) instead of being left open.
    """
    config = NewsFetcherConfig()
    logging.basicConfig(level=config.log_level)
    logger.info("Starting news fetcher service")

    # Telemetry
    meter = setup_telemetry("news-fetcher", config.otel_metrics_port)
    articles_fetched_counter = meter.create_counter(
        "news.articles_fetched",
        description="Total articles fetched and published",
    )
    fetch_errors_counter = meter.create_counter(
        "news.fetch_errors",
        description="Total fetch-cycle errors",
    )

    # Redis
    redis = Redis.from_url(config.redis_url, decode_responses=True)
    async with redis:
        publisher = StreamPublisher(redis, NEWS_RAW_STREAM)

        # Sources
        rss_source = RSSSource(feeds=config.rss_feeds)
        reddit_source = RedditSource(
            subreddits=config.reddit_subreddits,
            client_id=config.reddit_client_id,
            client_secret=config.reddit_client_secret,
            user_agent=config.reddit_user_agent,
            min_score=config.reddit_min_score,
        )

        # Both pollers share the same client, publisher and counters.
        shared = (redis, publisher, articles_fetched_counter, fetch_errors_counter)

        # Run pollers concurrently; the group is only left once both tasks
        # have finished or been cancelled.
        async with asyncio.TaskGroup() as tg:
            tg.create_task(
                _poll_rss(rss_source, config.rss_poll_interval_seconds, *shared)
            )
            tg.create_task(
                _poll_reddit(
                    reddit_source, config.reddit_poll_interval_seconds, *shared
                )
            )
if __name__ == "__main__":
    # Executed as a script: build the service coroutine, then let asyncio
    # drive it to completion on a fresh event loop.
    service = run()
    asyncio.run(service)