"""Query splitting service for handling Rightmove's result cap. This module provides intelligent query splitting to work around Rightmove's ~1,500 listing cap per search. It adaptively splits queries by price bands based on actual result counts. """ from __future__ import annotations import asyncio import logging from dataclasses import dataclass, replace from typing import Any import aiohttp from config.scraper_config import ScraperConfig from models.listing import ListingType, QueryParameters from rec.districts import get_districts logger = logging.getLogger("uvicorn.error") @dataclass class SubQuery: """Represents a single query subdivision. Attributes: district: District identifier string. min_bedrooms: Minimum number of bedrooms. max_bedrooms: Maximum number of bedrooms. min_price: Minimum price in currency units. max_price: Maximum price in currency units. estimated_results: Cached result count from probing (None if not probed). """ district: str min_bedrooms: int max_bedrooms: int min_price: int max_price: int estimated_results: int | None = None @property def price_range(self) -> int: """Returns the width of the price band.""" return self.max_price - self.min_price class QuerySplitter: """Splits large queries into smaller subqueries to avoid result caps. Uses adaptive binary search on price ranges to find optimal subdivisions that keep each subquery under the result threshold. """ def __init__(self, config: ScraperConfig | None = None) -> None: """Initialize the splitter with configuration. Args: config: Scraper configuration. Loads from environment if not provided. """ self.config = config or ScraperConfig.from_env() def create_initial_subqueries( self, parameters: QueryParameters, districts: dict[str, str], ) -> list[SubQuery]: """Create initial subqueries by splitting on district and bedrooms. This creates the initial split before probing for result counts. Each bedroom count gets its own subquery to enable finer-grained splitting. Args: parameters: Original query parameters. districts: Dictionary of district name to location ID. Returns: List of initial SubQuery objects. """ subqueries: list[SubQuery] = [] for district in districts.keys(): for num_bedrooms in range( parameters.min_bedrooms, parameters.max_bedrooms + 1 ): subqueries.append( SubQuery( district=district, min_bedrooms=num_bedrooms, max_bedrooms=num_bedrooms, min_price=parameters.min_price, max_price=parameters.max_price, ) ) return subqueries async def probe_result_count( self, subquery: SubQuery, session: aiohttp.ClientSession, parameters: QueryParameters, ) -> int: """Probe the API to get the total result count for a subquery. Makes a minimal request (page_size=1) to get totalAvailableResults. Args: subquery: The subquery to probe. session: aiohttp session for making requests. parameters: Original query parameters for additional settings. Returns: Total available results for this subquery. """ from rec.query import probe_query try: result = await probe_query( session=session, channel=parameters.listing_type, min_bedrooms=subquery.min_bedrooms, max_bedrooms=subquery.max_bedrooms, radius=parameters.radius, min_price=subquery.min_price, max_price=subquery.max_price, district=subquery.district, max_days_since_added=parameters.max_days_since_added, furnish_types=parameters.furnish_types or [], ) return result.get("totalAvailableResults", 0) except Exception as e: logger.warning(f"Failed to probe subquery {subquery}: {e}") return 0 def split_by_price(self, subquery: SubQuery) -> list[SubQuery]: """Split a subquery into two by halving the price range. Args: subquery: The subquery to split. Returns: List of two subqueries covering the same price range. """ mid_price = (subquery.min_price + subquery.max_price) // 2 return [ replace( subquery, max_price=mid_price, estimated_results=None, ), replace( subquery, min_price=mid_price, estimated_results=None, ), ] async def adaptive_split( self, subquery: SubQuery, session: aiohttp.ClientSession, parameters: QueryParameters, semaphore: asyncio.Semaphore, ) -> list[SubQuery]: """Recursively split a subquery until all parts are under threshold. Uses binary search on price range to find optimal splits. Args: subquery: The subquery to split. session: aiohttp session for making requests. parameters: Original query parameters. semaphore: Semaphore for rate limiting. Returns: List of subqueries that are all under the split threshold. """ # Check if we can split further if subquery.price_range <= self.config.min_price_band: logger.warning( f"Cannot split further, price band at minimum: {subquery}" ) return [subquery] # Split into two halves halves = self.split_by_price(subquery) result: list[SubQuery] = [] for half in halves: async with semaphore: await asyncio.sleep(self.config.request_delay_ms / 1000) count = await self.probe_result_count(half, session, parameters) half = replace(half, estimated_results=count) if count > self.config.split_threshold: # Need to split further result.extend( await self.adaptive_split( half, session, parameters, semaphore ) ) else: result.append(half) return result async def split( self, parameters: QueryParameters, session: aiohttp.ClientSession, on_progress: Any = None, ) -> list[SubQuery]: """Split query parameters into optimized subqueries. Performs the full splitting algorithm: 1. Create initial splits by district and bedroom count 2. Probe each to get result counts 3. Adaptively split any that exceed the threshold Args: parameters: Original query parameters to split. session: aiohttp session for making requests. on_progress: Optional callback for progress updates. Returns: List of SubQuery objects, each under the result threshold. """ # Get valid districts if parameters.district_names: districts = { district: locid for district, locid in get_districts().items() if district in parameters.district_names } else: districts = get_districts() # Phase 1: Create initial subqueries initial_subqueries = self.create_initial_subqueries(parameters, districts) logger.info(f"Created {len(initial_subqueries)} initial subqueries") if on_progress: on_progress( phase="splitting", message=f"Created {len(initial_subqueries)} initial subqueries", ) # Phase 2: Probe and adaptively split semaphore = asyncio.Semaphore(self.config.max_concurrent_requests) refined_subqueries: list[SubQuery] = [] # Probe all initial subqueries in parallel async def probe_and_split(sq: SubQuery) -> list[SubQuery]: async with semaphore: await asyncio.sleep(self.config.request_delay_ms / 1000) count = await self.probe_result_count(sq, session, parameters) sq = replace(sq, estimated_results=count) if count > self.config.split_threshold: logger.info( f"Subquery {sq.district}/{sq.min_bedrooms}BR " f"has {count} results, splitting..." ) return await self.adaptive_split( sq, session, parameters, semaphore ) return [sq] tasks = [probe_and_split(sq) for sq in initial_subqueries] results = await asyncio.gather(*tasks) for subquery_list in results: refined_subqueries.extend(subquery_list) logger.info( f"Refined to {len(refined_subqueries)} subqueries after splitting" ) if on_progress: on_progress( phase="splitting_complete", message=f"Refined to {len(refined_subqueries)} subqueries", ) return refined_subqueries def calculate_total_estimated_results( self, subqueries: list[SubQuery] ) -> int: """Calculate total estimated results across all subqueries. Args: subqueries: List of subqueries with estimated_results set. Returns: Sum of all estimated results. """ return sum(sq.estimated_results or 0 for sq in subqueries)