"""Listing fetcher service - fetches listing data from Rightmove API.""" import asyncio import logging from config.scraper_config import ScraperConfig from listing_processor import ListingProcessor from rec.query import create_session, listing_query from rec.exceptions import CircuitBreakerOpenError, ThrottlingError from rec.throttle_detector import get_throttle_metrics, reset_throttle_metrics from models.listing import Listing, QueryParameters from repositories import ListingRepository from services.query_splitter import QuerySplitter, SubQuery logger = logging.getLogger("uvicorn.error") # Number of concurrent workers that process listing details (fetch details, # download images, run OCR) from the streaming queue in parallel. NUM_WORKERS = 20 async def dump_listings_full( parameters: QueryParameters, repository: ListingRepository, ) -> list[Listing]: """Fetches all listings, images as well as detects floorplans.""" new_listings = await dump_listings(parameters, repository) logger.debug(f"Upserted {len(new_listings)} new listings") new_listing_ids = [listing.id for listing in new_listings] return await repository.get_listings(only_ids=new_listing_ids) async def _fetch_subquery( sq: SubQuery, parameters: QueryParameters, session: object, config: ScraperConfig, semaphore: asyncio.Semaphore, existing_ids: set[int], queue: asyncio.Queue[int | None], ) -> int: """Fetch listing IDs for a single subquery and enqueue new ones. Iterates through pages of results for the given subquery, adding any newly discovered listing IDs to the processing queue. Args: sq: The subquery to fetch results for. parameters: The original query parameters (for page_size, etc.). session: The aiohttp session for making requests. config: Scraper configuration. semaphore: Concurrency limiter for HTTP requests. existing_ids: Set of already-known listing IDs (mutated in place). queue: Queue to push new listing IDs onto for processing. Returns: The number of new IDs discovered and enqueued. """ estimated = sq.estimated_results or 0 if estimated == 0: return 0 ids_found = 0 page_size = parameters.page_size max_pages = min( config.max_pages_per_query, (estimated // page_size) + 1, ) for page_id in range(1, max_pages + 1): async with semaphore: await asyncio.sleep(config.request_delay_ms / 1000) try: result = await listing_query( page=page_id, channel=parameters.listing_type, min_bedrooms=sq.min_bedrooms, max_bedrooms=sq.max_bedrooms, radius=parameters.radius, min_price=sq.min_price, max_price=sq.max_price, district=sq.district, page_size=page_size, max_days_since_added=parameters.max_days_since_added, furnish_types=parameters.furnish_types or [], session=session, config=config, ) # Extract and enqueue new IDs inline properties = result.get("properties", []) for prop in properties: identifier = prop.get("identifier") if identifier and identifier not in existing_ids: existing_ids.add(identifier) ids_found += 1 await queue.put(identifier) if len(properties) < page_size: break except CircuitBreakerOpenError as e: logger.error(f"Circuit breaker open: {e}") break except ThrottlingError as e: logger.warning( f"Throttling error on page {page_id} for " f"{sq.district}: {e}" ) break except Exception as e: # Rightmove returns GENERIC_ERROR when requesting pages # past the last page of results. This is expected behavior # and signals we've exhausted this subquery's results. if "GENERIC_ERROR" in str(e): logger.debug( f"Max page for {sq.district}: {page_id - 1}" ) break logger.warning( f"Error fetching page {page_id} for " f"{sq.district}: {e}" ) break return ids_found async def dump_listings( parameters: QueryParameters, repository: ListingRepository, ) -> list[Listing]: """Fetch listings from Rightmove API and process them. Uses intelligent query splitting and a streaming pipeline so that listing processing starts as soon as IDs become available. """ config = ScraperConfig.from_env() splitter = QuerySplitter(config) # Reset throttle metrics at start reset_throttle_metrics() try: async with create_session(config) as session: # Phase 1: Split and probe queries logger.info("Splitting query and probing result counts...") subqueries = await splitter.split(parameters, session) total_estimated = splitter.calculate_total_estimated_results(subqueries) logger.info( f"Split into {len(subqueries)} subqueries, " f"estimated {total_estimated} total results" ) # Load existing IDs (fast, ID-only projection) existing_ids = repository.get_listing_ids(parameters.listing_type) logger.info(f"Found {len(existing_ids)} existing listings in DB") # Phase 2: Streaming fetch & process queue: asyncio.Queue[int | None] = asyncio.Queue() semaphore = asyncio.Semaphore(config.max_concurrent_requests) processed_listings: list[Listing] = [] async def producer() -> int: """Fetch all subqueries and send sentinel values to workers.""" tasks = [ _fetch_subquery( sq, parameters, session, config, semaphore, existing_ids, queue, ) for sq in subqueries ] counts = await asyncio.gather(*tasks) ids_collected = sum(counts) logger.info(f"Fetch complete: {ids_collected} new IDs found") for _ in range(NUM_WORKERS): await queue.put(None) return ids_collected async def worker() -> None: while True: listing_id = await queue.get() if listing_id is None: break listing_processor = ListingProcessor(repository) listing = await listing_processor.process_listing(listing_id) if listing is not None: processed_listings.append(listing) results = await asyncio.gather( producer(), *[worker() for _ in range(NUM_WORKERS)], ) ids_collected = results[0] except CircuitBreakerOpenError as e: logger.error(f"Circuit breaker prevented listing fetch: {e}") logger.info(get_throttle_metrics().summary()) return [] finally: # Log throttle metrics at end metrics = get_throttle_metrics() if metrics.total_requests > 0: logger.info("\n" + metrics.summary()) logger.info( f"Processed {len(processed_listings)} new listings " f"({ids_collected} total found)" ) return processed_listings