Add intelligent query splitting to maximize Rightmove data extraction
parent 29ba739063
commit e8293c6042

11 changed files with 1970 additions and 113 deletions
@@ -1,18 +1,17 @@
 import asyncio
-import itertools
 import logging
 from typing import Any
 from celery import Task
 from celery.schedules import crontab
 from celery_app import app
 from config.schedule_config import SchedulesConfig
+from config.scraper_config import ScraperConfig
 from listing_processor import ListingProcessor
 from models.listing import Listing, QueryParameters
 from rec.districts import get_districts
-from rec.query import listing_query
+from rec.query import create_session, listing_query
 from repositories.listing_repository import ListingRepository
 from database import engine
 from services import image_fetcher, floorplan_detector
+from services.query_splitter import QuerySplitter, SubQuery
 from utils.redis_lock import redis_lock

 logger = logging.getLogger("uvicorn.error")
@@ -134,106 +133,138 @@ async def get_ids_to_process(
     repository: ListingRepository,
     task: Task,
 ) -> set[int]:
-    semaphore = asyncio.Semaphore(5)  # if too high, rightmove drops connections
-    districts = await get_valid_districts_to_scrape(parameters.district_names)
-    task.update_state(state="Fetching listings to scrape", meta={"progress": 0})
-    json_responses: list[list[dict[str, Any]]] = await asyncio.gather(
-        *[
-            _fetch_listings_with_semaphore(
-                task=task, semaphore=semaphore, parameters=parameters, district=district
-            )
-            for district in districts.keys()
-        ],
-    )
-    json_responses_flat = list(itertools.chain.from_iterable(json_responses))
-    logger.debug(f"Total listings fetched {len(json_responses_flat)}")
-
-    identifiers: set[int] = set()
-    for response_json in json_responses_flat:
-        if response_json == {}:
-            continue
-        if response_json["totalAvailableResults"] == 0:
-            continue
-        for property in response_json["properties"]:
-            identifier = property["identifier"]
-            identifiers.add(identifier)
-
-    # if listing is already in db, do not fetch details again
-    all_listing_ids = {l.id for l in await repository.get_listings()}
-    new_ids = identifiers - all_listing_ids
-    return new_ids
+    """Fetch all listing IDs using intelligent query splitting.
+
+    Uses the QuerySplitter to adaptively split large queries and maximize
+    data extraction while respecting Rightmove's result caps.
+
+    Args:
+        parameters: Query parameters for the search.
+        repository: Repository for checking existing listings.
+        task: Celery task for progress updates.
+
+    Returns:
+        Set of new listing IDs that need to be processed.
+    """
+    config = ScraperConfig.from_env()
+    splitter = QuerySplitter(config)
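
The QuerySplitter itself lives in services/query_splitter.py, which is not part of this hunk. As a rough sketch of the idea the new docstring describes (every name below is an illustrative assumption, not the real implementation): probe each candidate price range for its totalAvailableResults, and bisect any range whose estimate exceeds the result cap.

# Illustrative sketch only; `probe` is an assumed async callable that
# returns totalAvailableResults for a given price range.
from dataclasses import dataclass
from typing import Awaitable, Callable


@dataclass
class Band:
    min_price: int
    max_price: int
    estimated_results: int


async def split_by_price(
    probe: Callable[[int, int], Awaitable[int]],
    min_price: int,
    max_price: int,
    cap: int = 1500,
) -> list[Band]:
    total = await probe(min_price, max_price)
    # Stop when the band fits under the cap or cannot be narrowed further.
    if total <= cap or max_price - min_price <= 1:
        return [Band(min_price, max_price, total)]
    mid = (min_price + max_price) // 2
    return (
        await split_by_price(probe, min_price, mid, cap)
        + await split_by_price(probe, mid + 1, max_price, cap)
    )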
-
-
-async def get_valid_districts_to_scrape(
-    district_names: set[str] | None,
-) -> dict[str, str]:
-    if district_names:
-        districts = {
-            district: locid
-            for district, locid in get_districts().items()
-            if district in district_names
-        }
-    else:
-        districts = get_districts()
-    return districts
+
+    def on_progress(phase: str, message: str) -> None:
+        task.update_state(state=message, meta={"phase": phase})
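
Progress reporting goes through a plain (phase, message) callable rather than the splitter calling Celery directly, which presumably keeps services/query_splitter.py free of any Celery dependency. A hypothetical harness showing that seam:

# Anything matching the (phase, message) signature works, so tests can
# capture progress events in a plain list instead of a Celery task.
from typing import Callable

ProgressCallback = Callable[[str, str], None]


def report_split_progress(on_progress: ProgressCallback) -> None:
    on_progress("splitting", "Analyzing query and splitting by price bands...")


events: list[tuple[str, str]] = []
report_split_progress(lambda phase, message: events.append((phase, message)))
assert events[0][0] == "splitting"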
-
-
-async def _fetch_listings_with_semaphore(
-    *,
-    task: Task,
-    semaphore: asyncio.Semaphore,
-    parameters: QueryParameters,
-    district: str,
-) -> list[dict[str, Any]]:
-    result = []
-    # Split the price range into N bands to avoid Rightmove's 1.5k result cap:
-    # instead of one query with a price between 1k and 5k, which is capped at
-    # 1,500 results, we send ten queries, each covering a narrower price band,
-    # so each one is less likely to return more than 1.5k results.
-
-    number_of_steps = 10
-    price_step = parameters.max_price // number_of_steps
-
-    for step in range(number_of_steps):
-        task.update_state(
-            state=f"Fetching listings ({step} out of {number_of_steps})",
-            meta={"progress": step / number_of_steps},
-        )
-        min_price = step * price_step
-        max_price = (step + 1) * price_step
-        logger.debug(
-            f"Step {step} of {number_of_steps} with {min_price=} and {max_price=}"
-        )
+
+    async with create_session(config) as session:
+        # Phases 1 & 2: split and probe queries
+        task.update_state(
+            state="Analyzing query and splitting by price bands...",
+            meta={"phase": "splitting", "progress": 0},
+        )
+        subqueries = await splitter.split(parameters, session, on_progress)
+
+        total_estimated = splitter.calculate_total_estimated_results(subqueries)
+        logger.info(
+            f"Split into {len(subqueries)} subqueries, "
+            f"estimated {total_estimated} total results"
+        )
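
For contrast, the removed code banded the price range into fixed, equal-width steps. A quick worked example of that arithmetic, and of why it is blunt: with max_price = 5000 and ten steps, every band is 500 wide no matter where listings cluster, so a popular band can still blow past the 1,500-result cap while the tail bands return almost nothing.

# The removed fixed-banding arithmetic, isolated for illustration.
max_price = 5_000
number_of_steps = 10
price_step = max_price // number_of_steps  # 500

bands = [
    (step * price_step, (step + 1) * price_step)
    for step in range(number_of_steps)
]
# [(0, 500), (500, 1000), ..., (4500, 5000)]
# A dense band (say 1500-2000 in a big city) can still exceed the
# 1,500-result cap; the adaptive splitter narrows exactly those bands.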
-
-        for num_bedrooms in range(parameters.min_bedrooms, parameters.max_bedrooms + 1):
-            for page_id in range(
-                1,
-                3,  # seems like all searches stop at 1500 entries (page_id * page_size)
-            ):
-                logger.debug(f"Processing {page_id=} for {district=}")
-                async with semaphore:
-                    try:
-                        listing_query_result = await listing_query(
-                            page=page_id,
-                            channel=parameters.listing_type,
-                            # min_bedrooms=parameters.min_bedrooms,
-                            # max_bedrooms=parameters.max_bedrooms,
-                            min_bedrooms=num_bedrooms,
-                            max_bedrooms=num_bedrooms,
-                            radius=parameters.radius,
-                            min_price=min_price,
-                            max_price=max_price,
-                            district=district,
-                            page_size=parameters.page_size,
-                            max_days_since_added=parameters.max_days_since_added,
-                            furnish_types=parameters.furnish_types or [],
-                        )
+
+        # Phase 3: fetch all pages for each subquery
+        task.update_state(
+            state=f"Fetching listings from {len(subqueries)} subqueries...",
+            meta={
+                "phase": "fetching",
+                "subqueries": len(subqueries),
+                "estimated_results": total_estimated,
+            },
+        )
+
+        semaphore = asyncio.Semaphore(config.max_concurrent_requests)
+        identifiers: set[int] = set()
+
+        async def fetch_subquery(sq: SubQuery) -> list[dict[str, Any]]:
+            """Fetch all pages for a single subquery."""
+            results: list[dict[str, Any]] = []
+
+            # Calculate how many pages we need based on estimated results
+            estimated = sq.estimated_results or 0
+            if estimated == 0:
+                return results
+
+            # Fetch pages up to max_pages_per_query or until no more results
+            page_size = parameters.page_size
+            max_pages = min(
+                config.max_pages_per_query,
+                (estimated // page_size) + 1,
+            )
+
+            for page_id in range(1, max_pages + 1):
+                async with semaphore:
+                    await asyncio.sleep(config.request_delay_ms / 1000)
+                    try:
+                        result = await listing_query(
+                            page=page_id,
+                            channel=parameters.listing_type,
+                            min_bedrooms=sq.min_bedrooms,
+                            max_bedrooms=sq.max_bedrooms,
+                            radius=parameters.radius,
+                            min_price=sq.min_price,
+                            max_price=sq.max_price,
+                            district=sq.district,
+                            page_size=page_size,
+                            max_days_since_added=parameters.max_days_since_added,
+                            furnish_types=parameters.furnish_types or [],
+                            session=session,
+                        )
-                except Exception as e:
-                    if "GENERIC_ERROR" in str(e):  # Too big page id
-                        logger.debug(f"Max page id for {district=}: {page_id - 1}")
-                        break
-                    raise e
-                result.append(listing_query_result)
-    return result
+                        results.append(result)
+
+                        # Check if we've received all results
+                        properties = result.get("properties", [])
+                        if len(properties) < page_size:
+                            # No more results on the next page
+                            break
+
+                    except Exception as e:
+                        if "GENERIC_ERROR" in str(e):
+                            # Reached end of results
+                            logger.debug(
+                                f"Max page for {sq.district}: {page_id - 1}"
+                            )
+                            break
+                        logger.warning(
+                            f"Error fetching page {page_id} for {sq.district}: {e}"
+                        )
+                        break
+
+            return results
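
The new fetch loop combines three throttles: a semaphore bounding in-flight requests, a fixed delay pacing them, and a page budget derived from the probe estimate, with an early exit on a short page. The same pattern in isolation (the fetch_page callable and parameter values are stand-ins, not the project's API):

import asyncio
from typing import Any, Awaitable, Callable


async def fetch_pages(
    fetch_page: Callable[[int], Awaitable[dict[str, Any]]],  # stand-in
    estimated: int,
    page_size: int,
    max_pages_cap: int,
    semaphore: asyncio.Semaphore,
    delay_ms: int,
) -> list[dict[str, Any]]:
    results: list[dict[str, Any]] = []
    # Page budget: enough pages to cover the estimate, never past the cap,
    # e.g. estimated=130, page_size=24 -> 130 // 24 + 1 = 6 pages.
    max_pages = min(max_pages_cap, estimated // page_size + 1)
    for page_id in range(1, max_pages + 1):
        async with semaphore:                     # bound concurrent requests
            await asyncio.sleep(delay_ms / 1000)  # pace each request
            page = await fetch_page(page_id)
        results.append(page)
        # A short page means there is nothing left to fetch.
        if len(page.get("properties", [])) < page_size:
            break
    return results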
+
+        # Fetch all subqueries concurrently
+        all_results = await asyncio.gather(
+            *[fetch_subquery(sq) for sq in subqueries]
+        )
+
+        # Extract identifiers from all results
+        for subquery_results in all_results:
+            for response_json in subquery_results:
+                if not response_json:
+                    continue
+                if response_json.get("totalAvailableResults", 0) == 0:
+                    continue
+                for property_data in response_json.get("properties", []):
+                    identifier = property_data.get("identifier")
+                    if identifier:
+                        identifiers.add(identifier)
+
+        logger.info(f"Found {len(identifiers)} unique listings")
+
+        # Filter out listings already in the database
+        all_listing_ids = {l.id for l in await repository.get_listings()}
+        new_ids = identifiers - all_listing_ids
+
+        task.update_state(
+            state=f"Found {len(new_ids)} new listings to process",
+            meta={
+                "phase": "filtering",
+                "total_found": len(identifiers),
+                "new_listings": len(new_ids),
+            },
+        )
+
+        return new_ids
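
The extraction tail parses responses defensively (.get everywhere, skipping empty payloads), dedups via a set, and filters against the database with a single set difference. Distilled into a standalone helper (the name is hypothetical):

from typing import Any


def extract_new_ids(
    responses: list[dict[str, Any]],
    known_ids: set[int],
) -> set[int]:
    """Collect property identifiers, dropping IDs already in the database."""
    identifiers: set[int] = set()
    for response_json in responses:
        # Skip empty payloads and empty result sets.
        if not response_json or response_json.get("totalAvailableResults", 0) == 0:
            continue
        for property_data in response_json.get("properties", []):
            identifier = property_data.get("identifier")
            if identifier:
                identifiers.add(identifier)
    return identifiers - known_ids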