"""Listing fetcher service - fetches listing data from Rightmove API."""
import asyncio
import logging
from typing import Any

from config.scraper_config import ScraperConfig
from listing_processor import ListingProcessor
from rec.query import create_session, listing_query
from models.listing import QueryParameters
from repositories import ListingRepository
from tqdm.asyncio import tqdm
from models import Listing as modelListing
from services.query_splitter import QuerySplitter, SubQuery

logger = logging.getLogger("uvicorn.error")


async def dump_listings_full(
    parameters: QueryParameters,
    repository: ListingRepository,
) -> list[modelListing]:
    """Fetch new listings, then return their refreshed copies from the repository.

    Runs ``dump_listings`` and afterwards re-reads the listings matching
    ``parameters`` from ``repository``, keeping only those that were newly
    processed by this call.

    NOTE(review): the previous docstring said this also fetches images and
    detects floorplans; presumably that happens inside
    ``ListingProcessor.process_listing`` - confirm.

    Args:
        parameters: Search parameters forwarded to ``dump_listings`` and to
            ``repository.get_listings``.
        repository: Storage backend used for processing and for the refresh.

    Returns:
        The refreshed listings whose ``id`` belongs to a newly processed
        listing.
    """
    new_listings = await dump_listings(parameters, repository)
    logger.debug(f"Upserted {len(new_listings)} new listings")

    # Compare by id: ``dump_listings`` is annotated as returning listing
    # models, so testing ``x.id in new_listings`` would compare an id against
    # model objects. Elements without an ``id`` attribute are used as-is so a
    # list of plain identifiers keeps working.
    new_ids = {getattr(item, "id", item) for item in new_listings}

    # Refresh listings from the repository (this can be better).
    listings = await repository.get_listings(parameters)
    return [listing for listing in listings if listing.id in new_ids]


def _extract_listing_ids(
    all_results: list[list[dict[str, Any]]],
) -> list[int]:
    """Collect every truthy property ``identifier`` from the raw page responses."""
    listing_ids: list[int] = []
    for subquery_results in all_results:
        for response_json in subquery_results:
            if not response_json:
                continue
            if response_json.get("totalAvailableResults", 0) == 0:
                continue
            for property_data in response_json.get("properties", []):
                identifier = property_data.get("identifier")
                if identifier:
                    listing_ids.append(identifier)
    return listing_ids


async def dump_listings(
    parameters: QueryParameters,
    repository: ListingRepository,
) -> list[modelListing]:
    """Fetch listings from Rightmove API and process them.

    The query is split into subqueries by ``QuerySplitter``; every subquery
    is paged through concurrently, the listing identifiers are collected and
    de-duplicated, and identifiers not yet present in ``repository`` are
    handed to ``ListingProcessor.process_listing``.

    Args:
        parameters: Search parameters (listing type, radius, page size, ...).
        repository: Storage backend used to look up known listings and given
            to the ``ListingProcessor``.

    Returns:
        The non-``None`` results of processing each previously unknown
        listing.
    """
    config = ScraperConfig.from_env()
    splitter = QuerySplitter(config)

    async with create_session(config) as session:
        # Phase 1 & 2: Split and probe queries
        logger.info("Splitting query and probing result counts...")
        subqueries = await splitter.split(parameters, session)

        total_estimated = splitter.calculate_total_estimated_results(subqueries)
        logger.info(
            f"Split into {len(subqueries)} subqueries, "
            f"estimated {total_estimated} total results"
        )

        # Phase 3: Fetch all pages for each subquery
        semaphore = asyncio.Semaphore(config.max_concurrent_requests)

        async def fetch_subquery(sq: SubQuery) -> list[dict[str, Any]]:
            """Fetch all pages for a single subquery."""
            results: list[dict[str, Any]] = []
            estimated = sq.estimated_results or 0
            if estimated == 0:
                return results

            page_size = parameters.page_size
            max_pages = min(
                config.max_pages_per_query,
                (estimated // page_size) + 1,
            )

            for page_id in range(1, max_pages + 1):
                # The semaphore is held across both the delay and the request,
                # so at most ``max_concurrent_requests`` pages are in flight.
                async with semaphore:
                    await asyncio.sleep(config.request_delay_ms / 1000)
                    try:
                        result = await listing_query(
                            page=page_id,
                            channel=parameters.listing_type,
                            min_bedrooms=sq.min_bedrooms,
                            max_bedrooms=sq.max_bedrooms,
                            radius=parameters.radius,
                            min_price=sq.min_price,
                            max_price=sq.max_price,
                            district=sq.district,
                            page_size=page_size,
                            max_days_since_added=parameters.max_days_since_added,
                            furnish_types=parameters.furnish_types or [],
                            session=session,
                        )
                        results.append(result)
                        properties = result.get("properties", [])
                        # A short page is taken to be the last one.
                        if len(properties) < page_size:
                            break
                    except Exception as e:
                        # Best effort: any failure ends paging for this
                        # subquery but keeps the pages fetched so far.
                        # "GENERIC_ERROR" is treated as having paged past the
                        # last available page and is only logged at debug.
                        if "GENERIC_ERROR" in str(e):
                            logger.debug(
                                f"Max page for {sq.district}: {page_id - 1}"
                            )
                            break
                        logger.warning(
                            f"Error fetching page {page_id} for {sq.district}: {e}"
                        )
                        break
            return results

        # Fetch all subqueries with progress bar
        all_results = await tqdm.gather(
            *[fetch_subquery(sq) for sq in subqueries],
            desc="Fetching listings",
        )

    # Nothing below uses the HTTP session.

    # Extract listing identifiers from results
    listing_ids = _extract_listing_ids(all_results)
    logger.info(f"Found {len(listing_ids)} total listings")

    # Deduplicate, keeping first-seen order so processing order is stable.
    unique_ids = list(dict.fromkeys(listing_ids))
    logger.info(f"After deduplication: {len(unique_ids)} unique listings")

    # Filter out listings already in database; a set keeps each membership
    # test O(1) instead of scanning a list for every candidate.
    known_listings = await repository.get_listings()
    known_ids = {listing.id for listing in known_listings}
    missing_ids = [
        listing_id for listing_id in unique_ids if listing_id not in known_ids
    ]

    listing_processor = ListingProcessor(repository)
    logger.info(f"Starting processing {len(missing_ids)} new listings")
    processed_listings = await tqdm.gather(
        *[
            listing_processor.process_listing(listing_id)
            for listing_id in missing_ids
        ]
    )
    return [listing for listing in processed_listings if listing is not None]