Add intelligent query splitting to maximize Rightmove data extraction

2026-02-02 21:57:45 +00:00 · 2026-02-02 21:57:45 +00:00 · e8293c6042
commit e8293c6042
parent 29ba739063
11 changed files with 1970 additions and 113 deletions
--- a/crawler/services/listing_fetcher.py
+++ b/crawler/services/listing_fetcher.py
@ -0,0 +1,146 @@
+"""Listing fetcher service - fetches listing data from Rightmove API."""
+import asyncio
+import logging
+from typing import Any
+
+from config.scraper_config import ScraperConfig
+from listing_processor import ListingProcessor
+from rec.query import create_session, listing_query
+from models.listing import QueryParameters
+from repositories import ListingRepository
+from tqdm.asyncio import tqdm
+from models import Listing as modelListing
+from services.query_splitter import QuerySplitter, SubQuery
+
+logger = logging.getLogger("uvicorn.error")
+
+
+async def dump_listings_full(
+    parameters: QueryParameters,
+    repository: ListingRepository,
+) -> list[modelListing]:
+    """Fetch new listings and return their freshly reloaded repository rows.
+
+    Runs ``dump_listings`` for ``parameters``, then re-reads the listings
+    matching ``parameters`` from ``repository`` and keeps only the rows whose
+    ``id`` belongs to a listing that ``dump_listings`` just processed.
+
+    Args:
+        parameters: Query parameters used for both the fetch and the reload.
+        repository: Repository handed to ``dump_listings`` and queried again
+            afterwards.
+
+    Returns:
+        The reloaded listings that were newly processed by this call.
+    """
+    new_listings = await dump_listings(parameters, repository)
+    logger.debug(f"Upserted {len(new_listings)} new listings")
+    # ``dump_listings`` is annotated as returning listing objects, so match on
+    # their ``id``: testing ``x.id in new_listings`` against the objects
+    # themselves would not match. Bare identifiers are accepted as well.
+    # NOTE(review): confirm what ``ListingProcessor.process_listing`` returns.
+    new_ids = {getattr(item, "id", item) for item in new_listings}
+    # refresh listings
+    listings = await repository.get_listings(parameters)  # this can be better
+    return [x for x in listings if x.id in new_ids]
+
+
+async def dump_listings(
+    parameters: QueryParameters,
+    repository: ListingRepository,
+) -> list[modelListing]:
+    """Fetch listings from Rightmove API and process them.
+
+    Uses intelligent query splitting to maximize data extraction
+    while respecting Rightmove's result caps.
+
+    Steps:
+        1. Split ``parameters`` into subqueries with ``QuerySplitter``.
+        2. Page through every subquery concurrently, bounded by a semaphore
+           sized from ``config.max_concurrent_requests``.
+        3. Collect the ``identifier`` of every returned property, drop
+           duplicates and drop identifiers already present in ``repository``.
+        4. Run ``ListingProcessor.process_listing`` on each remaining id.
+
+    Args:
+        parameters: Search parameters (listing type, radius, page size, ...).
+        repository: Repository used to look up already stored listings and
+            passed to the ``ListingProcessor``.
+
+    Returns:
+        The non-``None`` results of ``process_listing`` for the new listings.
+    """
+    config = ScraperConfig.from_env()
+    splitter = QuerySplitter(config)
+
+    async with create_session(config) as session:
+        # Phase 1 & 2: Split and probe queries
+        logger.info("Splitting query and probing result counts...")
+        subqueries = await splitter.split(parameters, session)
+
+        total_estimated = splitter.calculate_total_estimated_results(subqueries)
+        logger.info(
+            f"Split into {len(subqueries)} subqueries, "
+            f"estimated {total_estimated} total results"
+        )
+
+        # Phase 3: Fetch all pages for each subquery
+        semaphore = asyncio.Semaphore(config.max_concurrent_requests)
+
+        async def fetch_subquery(sq: SubQuery) -> list[dict[str, Any]]:
+            """Fetch the pages of a single subquery, stopping on the first
+            short page or the first error."""
+            results: list[dict[str, Any]] = []
+
+            estimated = sq.estimated_results or 0
+            if estimated == 0:
+                return results
+
+            page_size = parameters.page_size
+            # "+ 1" covers a partial final page; when the estimate is an exact
+            # multiple of the page size it allows one spare page, which the
+            # short-page check below cuts off.
+            max_pages = min(
+                config.max_pages_per_query,
+                (estimated // page_size) + 1,
+            )
+
+            for page_id in range(1, max_pages + 1):
+                # The delay is taken while holding a semaphore slot, so it
+                # counts against the concurrency limit.
+                async with semaphore:
+                    await asyncio.sleep(config.request_delay_ms / 1000)
+                    try:
+                        result = await listing_query(
+                            page=page_id,
+                            channel=parameters.listing_type,
+                            min_bedrooms=sq.min_bedrooms,
+                            max_bedrooms=sq.max_bedrooms,
+                            radius=parameters.radius,
+                            min_price=sq.min_price,
+                            max_price=sq.max_price,
+                            district=sq.district,
+                            page_size=page_size,
+                            max_days_since_added=parameters.max_days_since_added,
+                            furnish_types=parameters.furnish_types or [],
+                            session=session,
+                        )
+                        results.append(result)
+
+                        # A page shorter than ``page_size`` is the last one.
+                        properties = result.get("properties", [])
+                        if len(properties) < page_size:
+                            break
+
+                    except Exception as e:
+                        # Best effort: any failure ends paging for this
+                        # subquery but keeps the pages fetched so far.
+                        if "GENERIC_ERROR" in str(e):
+                            logger.debug(
+                                f"Max page for {sq.district}: {page_id - 1}"
+                            )
+                        else:
+                            logger.warning(
+                                f"Error fetching page {page_id} for {sq.district}: {e}"
+                            )
+                        break
+
+            return results
+
+        # Fetch all subqueries with progress bar
+        all_results = await tqdm.gather(
+            *[fetch_subquery(sq) for sq in subqueries],
+            desc="Fetching listings",
+        )
+
+    # Extract listing identifiers from results
+    listing_ids: list[int] = []
+    for subquery_results in all_results:
+        for response_json in subquery_results:
+            if not response_json:
+                continue
+            if response_json.get("totalAvailableResults", 0) == 0:
+                continue
+            for property_data in response_json.get("properties", []):
+                identifier = property_data.get("identifier")
+                if identifier:
+                    listing_ids.append(identifier)
+
+    logger.info(f"Found {len(listing_ids)} total listings")
+
+    # Deduplicate while keeping first-seen order, so the processing order is
+    # deterministic for a given set of responses.
+    unique_ids = list(dict.fromkeys(listing_ids))
+    logger.info(f"After deduplication: {len(unique_ids)} unique listings")
+
+    # Filter out listings already in database; a set keeps each membership
+    # test O(1) instead of scanning the whole list per candidate id.
+    stored_listings = await repository.get_listings()
+    known_ids = {x.id for x in stored_listings}
+    missing_ids = [
+        listing_id for listing_id in unique_ids if listing_id not in known_ids
+    ]
+
+    listing_processor = ListingProcessor(repository)
+    logger.info(f"Starting processing {len(missing_ids)} new listings")
+    processed_listings = await tqdm.gather(
+        *[listing_processor.process_listing(listing_id) for listing_id in missing_ids]
+    )
+    # ``process_listing`` results that are ``None`` are dropped.
+    filtered_listings = [x for x in processed_listings if x is not None]
+
+    return filtered_listings