Add intelligent query splitting to maximize Rightmove data extraction

Viktor Barzin 2026-02-02 21:57:45 +00:00
parent 29ba739063
commit e8293c6042
11 changed files with 1970 additions and 113 deletions

@@ -0,0 +1,146 @@
"""Listing fetcher service - fetches listing data from Rightmove API."""
import asyncio
import logging
from typing import Any
from config.scraper_config import ScraperConfig
from listing_processor import ListingProcessor
from rec.query import create_session, listing_query
from models.listing import QueryParameters
from repositories import ListingRepository
from tqdm.asyncio import tqdm
from models import Listing as modelListing
from services.query_splitter import QuerySplitter, SubQuery
logger = logging.getLogger("uvicorn.error")
async def dump_listings_full(
parameters: QueryParameters,
repository: ListingRepository,
) -> list[modelListing]:
"""Fetches all listings, images as well as detects floorplans."""
new_listings = await dump_listings(parameters, repository)
logger.debug(f"Upserted {len(new_listings)} new listings")
# refresh listings
listings = await repository.get_listings(parameters) # this can be better
new_listings = [x for x in listings if x.id in new_listings]
return new_listings
async def dump_listings(
parameters: QueryParameters,
repository: ListingRepository,
) -> list[modelListing]:
"""Fetch listings from Rightmove API and process them.
Uses intelligent query splitting to maximize data extraction
while respecting Rightmove's result caps.
"""
config = ScraperConfig.from_env()
splitter = QuerySplitter(config)
async with create_session(config) as session:
# Phase 1 & 2: Split and probe queries
logger.info("Splitting query and probing result counts...")
subqueries = await splitter.split(parameters, session)
total_estimated = splitter.calculate_total_estimated_results(subqueries)
logger.info(
f"Split into {len(subqueries)} subqueries, "
f"estimated {total_estimated} total results"
)
# Phase 3: Fetch all pages for each subquery
semaphore = asyncio.Semaphore(config.max_concurrent_requests)
async def fetch_subquery(sq: SubQuery) -> list[dict[str, Any]]:
"""Fetch all pages for a single subquery."""
results: list[dict[str, Any]] = []
estimated = sq.estimated_results or 0
if estimated == 0:
return results
page_size = parameters.page_size
max_pages = min(
config.max_pages_per_query,
(estimated // page_size) + 1,
)
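            # Illustrative arithmetic (hypothetical numbers): with estimated=230
            # and page_size=24, this requests min(max_pages_per_query, 230 // 24 + 1)
            # = min(cap, 10) pages for the subquery.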
            for page_id in range(1, max_pages + 1):
                async with semaphore:
                    await asyncio.sleep(config.request_delay_ms / 1000)
                    try:
                        result = await listing_query(
                            page=page_id,
                            channel=parameters.listing_type,
                            min_bedrooms=sq.min_bedrooms,
                            max_bedrooms=sq.max_bedrooms,
                            radius=parameters.radius,
                            min_price=sq.min_price,
                            max_price=sq.max_price,
                            district=sq.district,
                            page_size=page_size,
                            max_days_since_added=parameters.max_days_since_added,
                            furnish_types=parameters.furnish_types or [],
                            session=session,
                        )
                        results.append(result)
                        properties = result.get("properties", [])
                        if len(properties) < page_size:
                            break
                    except Exception as e:
                        if "GENERIC_ERROR" in str(e):
                            logger.debug(
                                f"Max page for {sq.district}: {page_id - 1}"
                            )
                            break
                        logger.warning(
                            f"Error fetching page {page_id} for {sq.district}: {e}"
                        )
                        break
            return results

        # Fetch all subqueries with progress bar
        all_results = await tqdm.gather(
            *[fetch_subquery(sq) for sq in subqueries],
            desc="Fetching listings",
        )

        # Extract listing identifiers from results
        listing_ids: list[int] = []
        for subquery_results in all_results:
            for response_json in subquery_results:
                if not response_json:
                    continue
                if response_json.get("totalAvailableResults", 0) == 0:
                    continue
                for property_data in response_json.get("properties", []):
                    identifier = property_data.get("identifier")
                    if identifier:
                        listing_ids.append(identifier)
        logger.info(f"Found {len(listing_ids)} total listings")

        # Deduplicate identifiers collected across overlapping subqueries
        unique_ids = list(set(listing_ids))
        logger.info(f"After deduplication: {len(unique_ids)} unique listings")

        # Filter out listings already in the database
        all_listing_ids = {x.id for x in await repository.get_listings()}
        missing_ids = [
            listing_id for listing_id in unique_ids if listing_id not in all_listing_ids
        ]

        listing_processor = ListingProcessor(repository)
        logger.info(f"Processing {len(missing_ids)} new listings")
        processed_listings = await tqdm.gather(
            *[listing_processor.process_listing(listing_id) for listing_id in missing_ids]
        )
        filtered_listings = [x for x in processed_listings if x is not None]
        return filtered_listings
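
The extraction loop above relies on only two fields of each page response. A minimal sketch of the response shape it can consume, with the field names taken from the code and the values made up for illustration:

# Minimal page response consumed by the extraction loop; values are examples.
example_page: dict = {
    "totalAvailableResults": 2,
    "properties": [
        {"identifier": 123456789},
        {"identifier": 987654321},
    ],
}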

@@ -0,0 +1,303 @@
"""Query splitting service for handling Rightmove's result cap.
This module provides intelligent query splitting to work around Rightmove's
~1,500 listing cap per search. It adaptively splits queries by price bands
based on actual result counts.
"""
from __future__ import annotations
import asyncio
import logging
from dataclasses import dataclass, replace
from typing import Any
import aiohttp
from config.scraper_config import ScraperConfig
from models.listing import ListingType, QueryParameters
from rec.districts import get_districts
logger = logging.getLogger("uvicorn.error")
@dataclass
class SubQuery:
"""Represents a single query subdivision.
Attributes:
district: District identifier string.
min_bedrooms: Minimum number of bedrooms.
max_bedrooms: Maximum number of bedrooms.
min_price: Minimum price in currency units.
max_price: Maximum price in currency units.
estimated_results: Cached result count from probing (None if not probed).
"""
district: str
min_bedrooms: int
max_bedrooms: int
min_price: int
max_price: int
estimated_results: int | None = None
@property
def price_range(self) -> int:
"""Returns the width of the price band."""
return self.max_price - self.min_price
class QuerySplitter:
"""Splits large queries into smaller subqueries to avoid result caps.
Uses adaptive binary search on price ranges to find optimal subdivisions
that keep each subquery under the result threshold.
"""
def __init__(self, config: ScraperConfig | None = None) -> None:
"""Initialize the splitter with configuration.
Args:
config: Scraper configuration. Loads from environment if not provided.
"""
self.config = config or ScraperConfig.from_env()
def create_initial_subqueries(
self,
parameters: QueryParameters,
districts: dict[str, str],
) -> list[SubQuery]:
"""Create initial subqueries by splitting on district and bedrooms.
This creates the initial split before probing for result counts.
Each bedroom count gets its own subquery to enable finer-grained splitting.
Args:
parameters: Original query parameters.
districts: Dictionary of district name to location ID.
Returns:
List of initial SubQuery objects.
"""
subqueries: list[SubQuery] = []
for district in districts.keys():
for num_bedrooms in range(
parameters.min_bedrooms, parameters.max_bedrooms + 1
):
subqueries.append(
SubQuery(
district=district,
min_bedrooms=num_bedrooms,
max_bedrooms=num_bedrooms,
min_price=parameters.min_price,
max_price=parameters.max_price,
)
)
return subqueries
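
    # Illustrative fan-out (hypothetical numbers): 20 districts combined with a
    # 1-3 bedroom range yield 20 * 3 = 60 initial subqueries before any probing.
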
    async def probe_result_count(
        self,
        subquery: SubQuery,
        session: aiohttp.ClientSession,
        parameters: QueryParameters,
    ) -> int:
        """Probe the API to get the total result count for a subquery.

        Makes a minimal request (page_size=1) to get totalAvailableResults.

        Args:
            subquery: The subquery to probe.
            session: aiohttp session for making requests.
            parameters: Original query parameters for additional settings.

        Returns:
            Total available results for this subquery.
        """
        from rec.query import probe_query

        try:
            result = await probe_query(
                session=session,
                channel=parameters.listing_type,
                min_bedrooms=subquery.min_bedrooms,
                max_bedrooms=subquery.max_bedrooms,
                radius=parameters.radius,
                min_price=subquery.min_price,
                max_price=subquery.max_price,
                district=subquery.district,
                max_days_since_added=parameters.max_days_since_added,
                furnish_types=parameters.furnish_types or [],
            )
            return result.get("totalAvailableResults", 0)
        except Exception as e:
            logger.warning(f"Failed to probe subquery {subquery}: {e}")
            return 0

    def split_by_price(self, subquery: SubQuery) -> list[SubQuery]:
        """Split a subquery into two by halving the price range.

        Args:
            subquery: The subquery to split.

        Returns:
            List of two subqueries covering the same price range.
        """
        mid_price = (subquery.min_price + subquery.max_price) // 2
        return [
            replace(
                subquery,
                max_price=mid_price,
                estimated_results=None,
            ),
            replace(
                subquery,
                min_price=mid_price,
                estimated_results=None,
            ),
        ]
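
    # Note: both halves share mid_price as a boundary, so a listing priced exactly
    # at the midpoint can appear in both subqueries (if the API treats both bounds
    # as inclusive); the identifier deduplication in the fetcher absorbs the overlap.
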
    async def adaptive_split(
        self,
        subquery: SubQuery,
        session: aiohttp.ClientSession,
        parameters: QueryParameters,
        semaphore: asyncio.Semaphore,
    ) -> list[SubQuery]:
        """Recursively split a subquery until all parts are under threshold.

        Uses binary search on price range to find optimal splits.

        Args:
            subquery: The subquery to split.
            session: aiohttp session for making requests.
            parameters: Original query parameters.
            semaphore: Semaphore for rate limiting.

        Returns:
            List of subqueries that are all under the split threshold.
        """
        # Check if we can split further
        if subquery.price_range <= self.config.min_price_band:
            logger.warning(
                f"Cannot split further, price band at minimum: {subquery}"
            )
            return [subquery]

        # Split into two halves
        halves = self.split_by_price(subquery)
        result: list[SubQuery] = []
        for half in halves:
            async with semaphore:
                await asyncio.sleep(self.config.request_delay_ms / 1000)
                count = await self.probe_result_count(half, session, parameters)
            half = replace(half, estimated_results=count)
            if count > self.config.split_threshold:
                # Need to split further
                result.extend(
                    await self.adaptive_split(
                        half, session, parameters, semaphore
                    )
                )
            else:
                result.append(half)
        return result
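
    # The recursion terminates because every split halves the price band, so its
    # depth is bounded by roughly log2(price_range / min_price_band) even for a
    # band that stays above split_threshold all the way down.
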
    async def split(
        self,
        parameters: QueryParameters,
        session: aiohttp.ClientSession,
        on_progress: Any = None,
    ) -> list[SubQuery]:
        """Split query parameters into optimized subqueries.

        Performs the full splitting algorithm:
        1. Create initial splits by district and bedroom count
        2. Probe each to get result counts
        3. Adaptively split any that exceed the threshold

        Args:
            parameters: Original query parameters to split.
            session: aiohttp session for making requests.
            on_progress: Optional callback for progress updates.

        Returns:
            List of SubQuery objects, each under the result threshold.
        """
        # Get valid districts
        if parameters.district_names:
            districts = {
                district: locid
                for district, locid in get_districts().items()
                if district in parameters.district_names
            }
        else:
            districts = get_districts()

        # Phase 1: Create initial subqueries
        initial_subqueries = self.create_initial_subqueries(parameters, districts)
        logger.info(f"Created {len(initial_subqueries)} initial subqueries")
        if on_progress:
            on_progress(
                phase="splitting",
                message=f"Created {len(initial_subqueries)} initial subqueries",
            )

        # Phase 2: Probe and adaptively split
        semaphore = asyncio.Semaphore(self.config.max_concurrent_requests)
        refined_subqueries: list[SubQuery] = []

        # Probe all initial subqueries in parallel
        async def probe_and_split(sq: SubQuery) -> list[SubQuery]:
            async with semaphore:
                await asyncio.sleep(self.config.request_delay_ms / 1000)
                count = await self.probe_result_count(sq, session, parameters)
            sq = replace(sq, estimated_results=count)
            if count > self.config.split_threshold:
                logger.info(
                    f"Subquery {sq.district}/{sq.min_bedrooms}BR "
                    f"has {count} results, splitting..."
                )
                return await self.adaptive_split(
                    sq, session, parameters, semaphore
                )
            return [sq]

        tasks = [probe_and_split(sq) for sq in initial_subqueries]
        results = await asyncio.gather(*tasks)
        for subquery_list in results:
            refined_subqueries.extend(subquery_list)

        logger.info(
            f"Refined to {len(refined_subqueries)} subqueries after splitting"
        )
        if on_progress:
            on_progress(
                phase="splitting_complete",
                message=f"Refined to {len(refined_subqueries)} subqueries",
            )
        return refined_subqueries

    def calculate_total_estimated_results(
        self, subqueries: list[SubQuery]
    ) -> int:
        """Calculate total estimated results across all subqueries.

        Args:
            subqueries: List of subqueries with estimated_results set.

        Returns:
            Sum of all estimated results.
        """
        return sum(sq.estimated_results or 0 for sq in subqueries)
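
A minimal sketch of previewing the split on its own, without fetching any listing pages. It only uses names that appear in this diff, but the QueryParameters constructor arguments and the ListingType.RENT member are assumptions, and the field values are made up:

# Hypothetical preview of the splitting phase alone; parameter values are
# examples and the QueryParameters constructor signature is assumed.
import asyncio

from config.scraper_config import ScraperConfig
from models.listing import ListingType, QueryParameters
from rec.query import create_session
from services.query_splitter import QuerySplitter


async def preview_split() -> None:
    config = ScraperConfig.from_env()
    parameters = QueryParameters(
        listing_type=ListingType.RENT,  # assumed enum member
        min_bedrooms=1,
        max_bedrooms=2,
        min_price=1000,
        max_price=2500,
        page_size=24,
        radius=0.5,
        max_days_since_added=7,
        furnish_types=None,
        district_names=None,
    )
    splitter = QuerySplitter(config)
    async with create_session(config) as session:
        subqueries = await splitter.split(parameters, session)
    total = splitter.calculate_total_estimated_results(subqueries)
    print(f"{len(subqueries)} subqueries covering an estimated {total} listings")


if __name__ == "__main__":
    asyncio.run(preview_split())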