"""Listing fetcher service - fetches listing data from Rightmove API."""

import asyncio
import logging

from config.scraper_config import ScraperConfig
from listing_processor import ListingProcessor
from rec.query import create_session, listing_query
from rec.exceptions import CircuitBreakerOpenError, ThrottlingError
from rec.throttle_detector import get_throttle_metrics, reset_throttle_metrics
from models.listing import Listing, QueryParameters
from repositories import ListingRepository
from services.query_splitter import QuerySplitter, SubQuery

logger = logging.getLogger("uvicorn.error")

# Number of worker coroutines that drain the listing-ID queue concurrently.
NUM_WORKERS = 20


async def dump_listings_full(
    parameters: QueryParameters,
    repository: ListingRepository,
) -> list[Listing]:
    """Fetch new listings, then return them as re-read from the repository.

    Runs ``dump_listings`` and afterwards reloads the listings matching
    ``parameters`` from the repository, keeping only those whose ``id``
    belongs to a listing processed during this run.

    Args:
        parameters: Query parameters forwarded to ``dump_listings`` and to
            ``repository.get_listings``.
        repository: Repository used for processing and for the refresh read.

    Returns:
        The refreshed listings that were newly processed in this run.
    """
    new_listings = await dump_listings(parameters, repository)
    logger.debug(f"Upserted {len(new_listings)} new listings")

    # dump_listings returns Listing objects, so membership has to be tested
    # against their ids; comparing an id with the Listing objects themselves
    # never matches. A set also keeps each lookup O(1).
    new_ids = {listing.id for listing in new_listings}

    # Refresh listings from the repository.
    listings = await repository.get_listings(parameters)
    return [x for x in listings if x.id in new_ids]


async def dump_listings(
    parameters: QueryParameters,
    repository: ListingRepository,
) -> list[Listing]:
    """Fetch listings from Rightmove API and process them.

    Uses query splitting and a streaming pipeline so that listing
    processing starts as soon as IDs become available: subquery pages are
    fetched concurrently, unseen identifiers are pushed onto a queue, and
    ``NUM_WORKERS`` workers process them via ``ListingProcessor``.

    Args:
        parameters: Query parameters (listing type, radius, page size, ...).
        repository: Repository providing the already-known listing IDs and
            handed to each ``ListingProcessor``.

    Returns:
        The listings for which ``ListingProcessor.process_listing`` returned
        a non-``None`` result. An empty list is returned when a
        ``CircuitBreakerOpenError`` escapes the pipeline.
    """
    config = ScraperConfig.from_env()
    splitter = QuerySplitter(config)

    # Reset throttle metrics at start
    reset_throttle_metrics()

    try:
        async with create_session(config) as session:
            # Phase 1: Split and probe queries
            logger.info("Splitting query and probing result counts...")
            subqueries = await splitter.split(parameters, session)

            total_estimated = splitter.calculate_total_estimated_results(subqueries)
            logger.info(
                f"Split into {len(subqueries)} subqueries, "
                f"estimated {total_estimated} total results"
            )

            # Load existing IDs (fast, ID-only projection)
            # NOTE(review): called without ``await`` (unlike get_listings) and
            # used as a set below - confirm it is synchronous and returns a set.
            existing_ids = repository.get_listing_ids(parameters.listing_type)
            logger.info(f"Found {len(existing_ids)} existing listings in DB")

            # Phase 2: Streaming fetch & process
            # Listing IDs flow through the queue; ``None`` is the stop sentinel.
            queue: asyncio.Queue[int | None] = asyncio.Queue()
            semaphore = asyncio.Semaphore(config.max_concurrent_requests)
            ids_collected = 0
            processed_listings: list[Listing] = []

            async def fetch_subquery(sq: SubQuery) -> None:
                """Page through one subquery and enqueue unseen listing IDs."""
                nonlocal ids_collected
                estimated = sq.estimated_results or 0
                if estimated == 0:
                    return

                page_size = parameters.page_size
                max_pages = min(
                    config.max_pages_per_query,
                    (estimated // page_size) + 1,
                )

                for page_id in range(1, max_pages + 1):
                    # The semaphore is held across the delay and the request,
                    # capping in-flight pages at max_concurrent_requests.
                    async with semaphore:
                        await asyncio.sleep(config.request_delay_ms / 1000)
                        try:
                            result = await listing_query(
                                page=page_id,
                                channel=parameters.listing_type,
                                min_bedrooms=sq.min_bedrooms,
                                max_bedrooms=sq.max_bedrooms,
                                radius=parameters.radius,
                                min_price=sq.min_price,
                                max_price=sq.max_price,
                                district=sq.district,
                                page_size=page_size,
                                max_days_since_added=parameters.max_days_since_added,
                                furnish_types=parameters.furnish_types or [],
                                session=session,
                                config=config,
                            )

                            # Extract and enqueue new IDs inline
                            properties = result.get("properties", [])
                            for prop in properties:
                                identifier = prop.get("identifier")
                                if identifier and identifier not in existing_ids:
                                    # Record immediately so the same ID is not
                                    # enqueued again by another page/subquery.
                                    existing_ids.add(identifier)
                                    ids_collected += 1
                                    await queue.put(identifier)

                            # A short page means there are no further pages.
                            if len(properties) < page_size:
                                break

                        except CircuitBreakerOpenError as e:
                            logger.error(f"Circuit breaker open: {e}")
                            break
                        except ThrottlingError as e:
                            logger.warning(
                                f"Throttling error on page {page_id} for "
                                f"{sq.district}: {e}"
                            )
                            break
                        except Exception as e:
                            # Any failure ends paging for this subquery; a
                            # GENERIC_ERROR is logged as "ran past last page".
                            if "GENERIC_ERROR" in str(e):
                                logger.debug(
                                    f"Max page for {sq.district}: {page_id - 1}"
                                )
                                break
                            logger.warning(
                                f"Error fetching page {page_id} for "
                                f"{sq.district}: {e}"
                            )
                            break

            async def producer() -> None:
                """Fetch every subquery, then signal all workers to stop."""
                await asyncio.gather(
                    *[fetch_subquery(sq) for sq in subqueries]
                )
                logger.info(f"Fetch complete: {ids_collected} new IDs found")
                # One sentinel per worker so each of them terminates.
                for _ in range(NUM_WORKERS):
                    await queue.put(None)

            async def worker() -> None:
                """Process queued listing IDs until a ``None`` sentinel arrives."""
                while True:
                    listing_id = await queue.get()
                    if listing_id is None:
                        break
                    listing_processor = ListingProcessor(repository)
                    listing = await listing_processor.process_listing(listing_id)
                    if listing is not None:
                        processed_listings.append(listing)

            # Producer and workers run together so processing overlaps fetching.
            await asyncio.gather(
                producer(),
                *[worker() for _ in range(NUM_WORKERS)],
            )

    except CircuitBreakerOpenError as e:
        logger.error(f"Circuit breaker prevented listing fetch: {e}")
        logger.info(get_throttle_metrics().summary())
        return []
    finally:
        # Log throttle metrics at end
        metrics = get_throttle_metrics()
        if metrics.total_requests > 0:
            logger.info("\n" + metrics.summary())

    logger.info(
        f"Processed {len(processed_listings)} new listings "
        f"({ids_collected} total found)"
    )
    return processed_listings