Add intelligent query splitting to maximize Rightmove data extraction

Viktor Barzin 2026-02-02 21:57:45 +00:00
parent 29ba739063
commit e8293c6042
11 changed files with 1970 additions and 113 deletions

@@ -0,0 +1,146 @@
"""Listing fetcher service - fetches listing data from Rightmove API."""
import asyncio
import logging
from typing import Any
from config.scraper_config import ScraperConfig
from listing_processor import ListingProcessor
from rec.query import create_session, listing_query
from models.listing import QueryParameters
from repositories import ListingRepository
from tqdm.asyncio import tqdm
from models import Listing as modelListing
from services.query_splitter import QuerySplitter, SubQuery
logger = logging.getLogger("uvicorn.error")
async def dump_listings_full(
parameters: QueryParameters,
repository: ListingRepository,
) -> list[modelListing]:
"""Fetches all listings, images as well as detects floorplans."""
new_listings = await dump_listings(parameters, repository)
logger.debug(f"Upserted {len(new_listings)} new listings")
# refresh listings
listings = await repository.get_listings(parameters) # this can be better
new_listings = [x for x in listings if x.id in new_listings]
return new_listings
async def dump_listings(
parameters: QueryParameters,
repository: ListingRepository,
) -> list[modelListing]:
"""Fetch listings from Rightmove API and process them.
Uses intelligent query splitting to maximize data extraction
while respecting Rightmove's result caps.
"""
config = ScraperConfig.from_env()
splitter = QuerySplitter(config)
async with create_session(config) as session:
# Phase 1 & 2: Split and probe queries
logger.info("Splitting query and probing result counts...")
subqueries = await splitter.split(parameters, session)
total_estimated = splitter.calculate_total_estimated_results(subqueries)
logger.info(
f"Split into {len(subqueries)} subqueries, "
f"estimated {total_estimated} total results"
)
# Phase 3: Fetch all pages for each subquery
semaphore = asyncio.Semaphore(config.max_concurrent_requests)
async def fetch_subquery(sq: SubQuery) -> list[dict[str, Any]]:
"""Fetch all pages for a single subquery."""
results: list[dict[str, Any]] = []
estimated = sq.estimated_results or 0
if estimated == 0:
return results
page_size = parameters.page_size
max_pages = min(
config.max_pages_per_query,
(estimated // page_size) + 1,
)
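            # Illustrative arithmetic (hypothetical numbers): with estimated=230
            # and page_size=24, this requests min(max_pages_per_query, 230 // 24 + 1)
            # = min(cap, 10) pages for the subquery.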
            for page_id in range(1, max_pages + 1):
                async with semaphore:
                    await asyncio.sleep(config.request_delay_ms / 1000)
                    try:
                        result = await listing_query(
                            page=page_id,
                            channel=parameters.listing_type,
                            min_bedrooms=sq.min_bedrooms,
                            max_bedrooms=sq.max_bedrooms,
                            radius=parameters.radius,
                            min_price=sq.min_price,
                            max_price=sq.max_price,
                            district=sq.district,
                            page_size=page_size,
                            max_days_since_added=parameters.max_days_since_added,
                            furnish_types=parameters.furnish_types or [],
                            session=session,
                        )
                        results.append(result)
                        properties = result.get("properties", [])
                        if len(properties) < page_size:
                            break
                    except Exception as e:
                        if "GENERIC_ERROR" in str(e):
                            logger.debug(
                                f"Max page for {sq.district}: {page_id - 1}"
                            )
                            break
                        logger.warning(
                            f"Error fetching page {page_id} for {sq.district}: {e}"
                        )
                        break
            return results

        # Fetch all subqueries with progress bar
        all_results = await tqdm.gather(
            *[fetch_subquery(sq) for sq in subqueries],
            desc="Fetching listings",
        )

        # Extract listing identifiers from results
        listing_ids: list[int] = []
        for subquery_results in all_results:
            for response_json in subquery_results:
                if not response_json:
                    continue
                if response_json.get("totalAvailableResults", 0) == 0:
                    continue
                for property_data in response_json.get("properties", []):
                    identifier = property_data.get("identifier")
                    if identifier:
                        listing_ids.append(identifier)
        logger.info(f"Found {len(listing_ids)} total listings")

        # Deduplicate identifiers collected across overlapping subqueries
        unique_ids = list(set(listing_ids))
        logger.info(f"After deduplication: {len(unique_ids)} unique listings")

        # Filter out listings already in the database
        all_listing_ids = {x.id for x in await repository.get_listings()}
        missing_ids = [
            listing_id for listing_id in unique_ids if listing_id not in all_listing_ids
        ]

        listing_processor = ListingProcessor(repository)
        logger.info(f"Processing {len(missing_ids)} new listings")
        processed_listings = await tqdm.gather(
            *[listing_processor.process_listing(listing_id) for listing_id in missing_ids]
        )
        filtered_listings = [x for x in processed_listings if x is not None]
        return filtered_listings
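
The extraction loop above relies on only two fields of each page response. A minimal sketch of the response shape it can consume, with the field names taken from the code and the values made up for illustration:

# Minimal page response consumed by the extraction loop; values are examples.
example_page: dict = {
    "totalAvailableResults": 2,
    "properties": [
        {"identifier": 123456789},
        {"identifier": 987654321},
    ],
}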

@@ -0,0 +1,303 @@
"""Query splitting service for handling Rightmove's result cap.
This module provides intelligent query splitting to work around Rightmove's
~1,500 listing cap per search. It adaptively splits queries by price bands
based on actual result counts.
"""
from __future__ import annotations
import asyncio
import logging
from dataclasses import dataclass, replace
from typing import Any
import aiohttp
from config.scraper_config import ScraperConfig
from models.listing import ListingType, QueryParameters
from rec.districts import get_districts
logger = logging.getLogger("uvicorn.error")
@dataclass
class SubQuery:
"""Represents a single query subdivision.
Attributes:
district: District identifier string.
min_bedrooms: Minimum number of bedrooms.
max_bedrooms: Maximum number of bedrooms.
min_price: Minimum price in currency units.
max_price: Maximum price in currency units.
estimated_results: Cached result count from probing (None if not probed).
"""
district: str
min_bedrooms: int
max_bedrooms: int
min_price: int
max_price: int
estimated_results: int | None = None
@property
def price_range(self) -> int:
"""Returns the width of the price band."""
return self.max_price - self.min_price
class QuerySplitter:
"""Splits large queries into smaller subqueries to avoid result caps.
Uses adaptive binary search on price ranges to find optimal subdivisions
that keep each subquery under the result threshold.
"""
def __init__(self, config: ScraperConfig | None = None) -> None:
"""Initialize the splitter with configuration.
Args:
config: Scraper configuration. Loads from environment if not provided.
"""
self.config = config or ScraperConfig.from_env()
def create_initial_subqueries(
self,
parameters: QueryParameters,
districts: dict[str, str],
) -> list[SubQuery]:
"""Create initial subqueries by splitting on district and bedrooms.
This creates the initial split before probing for result counts.
Each bedroom count gets its own subquery to enable finer-grained splitting.
Args:
parameters: Original query parameters.
districts: Dictionary of district name to location ID.
Returns:
List of initial SubQuery objects.
"""
subqueries: list[SubQuery] = []
for district in districts.keys():
for num_bedrooms in range(
parameters.min_bedrooms, parameters.max_bedrooms + 1
):
subqueries.append(
SubQuery(
district=district,
min_bedrooms=num_bedrooms,
max_bedrooms=num_bedrooms,
min_price=parameters.min_price,
max_price=parameters.max_price,
)
)
return subqueries
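
    # Illustrative fan-out (hypothetical numbers): 20 districts combined with a
    # 1-3 bedroom range yield 20 * 3 = 60 initial subqueries before any probing.
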
    async def probe_result_count(
        self,
        subquery: SubQuery,
        session: aiohttp.ClientSession,
        parameters: QueryParameters,
    ) -> int:
        """Probe the API to get the total result count for a subquery.

        Makes a minimal request (page_size=1) to get totalAvailableResults.

        Args:
            subquery: The subquery to probe.
            session: aiohttp session for making requests.
            parameters: Original query parameters for additional settings.

        Returns:
            Total available results for this subquery.
        """
        from rec.query import probe_query

        try:
            result = await probe_query(
                session=session,
                channel=parameters.listing_type,
                min_bedrooms=subquery.min_bedrooms,
                max_bedrooms=subquery.max_bedrooms,
                radius=parameters.radius,
                min_price=subquery.min_price,
                max_price=subquery.max_price,
                district=subquery.district,
                max_days_since_added=parameters.max_days_since_added,
                furnish_types=parameters.furnish_types or [],
            )
            return result.get("totalAvailableResults", 0)
        except Exception as e:
            logger.warning(f"Failed to probe subquery {subquery}: {e}")
            return 0

    def split_by_price(self, subquery: SubQuery) -> list[SubQuery]:
        """Split a subquery into two by halving the price range.

        Args:
            subquery: The subquery to split.

        Returns:
            List of two subqueries covering the same price range.
        """
        mid_price = (subquery.min_price + subquery.max_price) // 2
        return [
            replace(
                subquery,
                max_price=mid_price,
                estimated_results=None,
            ),
            replace(
                subquery,
                min_price=mid_price,
                estimated_results=None,
            ),
        ]
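
    # Note: both halves share mid_price as a boundary, so a listing priced exactly
    # at the midpoint can appear in both subqueries (if the API treats both bounds
    # as inclusive); the identifier deduplication in the fetcher absorbs the overlap.
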
    async def adaptive_split(
        self,
        subquery: SubQuery,
        session: aiohttp.ClientSession,
        parameters: QueryParameters,
        semaphore: asyncio.Semaphore,
    ) -> list[SubQuery]:
        """Recursively split a subquery until all parts are under threshold.

        Uses binary search on price range to find optimal splits.

        Args:
            subquery: The subquery to split.
            session: aiohttp session for making requests.
            parameters: Original query parameters.
            semaphore: Semaphore for rate limiting.

        Returns:
            List of subqueries that are all under the split threshold.
        """
        # Check if we can split further
        if subquery.price_range <= self.config.min_price_band:
            logger.warning(
                f"Cannot split further, price band at minimum: {subquery}"
            )
            return [subquery]

        # Split into two halves
        halves = self.split_by_price(subquery)
        result: list[SubQuery] = []
        for half in halves:
            async with semaphore:
                await asyncio.sleep(self.config.request_delay_ms / 1000)
                count = await self.probe_result_count(half, session, parameters)
            half = replace(half, estimated_results=count)
            if count > self.config.split_threshold:
                # Need to split further
                result.extend(
                    await self.adaptive_split(
                        half, session, parameters, semaphore
                    )
                )
            else:
                result.append(half)
        return result
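
    # The recursion terminates because every split halves the price band, so its
    # depth is bounded by roughly log2(price_range / min_price_band) even for a
    # band that stays above split_threshold all the way down.
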
    async def split(
        self,
        parameters: QueryParameters,
        session: aiohttp.ClientSession,
        on_progress: Any = None,
    ) -> list[SubQuery]:
        """Split query parameters into optimized subqueries.

        Performs the full splitting algorithm:
        1. Create initial splits by district and bedroom count
        2. Probe each to get result counts
        3. Adaptively split any that exceed the threshold

        Args:
            parameters: Original query parameters to split.
            session: aiohttp session for making requests.
            on_progress: Optional callback for progress updates.

        Returns:
            List of SubQuery objects, each under the result threshold.
        """
        # Get valid districts
        if parameters.district_names:
            districts = {
                district: locid
                for district, locid in get_districts().items()
                if district in parameters.district_names
            }
        else:
            districts = get_districts()

        # Phase 1: Create initial subqueries
        initial_subqueries = self.create_initial_subqueries(parameters, districts)
        logger.info(f"Created {len(initial_subqueries)} initial subqueries")
        if on_progress:
            on_progress(
                phase="splitting",
                message=f"Created {len(initial_subqueries)} initial subqueries",
            )

        # Phase 2: Probe and adaptively split
        semaphore = asyncio.Semaphore(self.config.max_concurrent_requests)
        refined_subqueries: list[SubQuery] = []

        # Probe all initial subqueries in parallel
        async def probe_and_split(sq: SubQuery) -> list[SubQuery]:
            async with semaphore:
                await asyncio.sleep(self.config.request_delay_ms / 1000)
                count = await self.probe_result_count(sq, session, parameters)
            sq = replace(sq, estimated_results=count)
            if count > self.config.split_threshold:
                logger.info(
                    f"Subquery {sq.district}/{sq.min_bedrooms}BR "
                    f"has {count} results, splitting..."
                )
                return await self.adaptive_split(
                    sq, session, parameters, semaphore
                )
            return [sq]

        tasks = [probe_and_split(sq) for sq in initial_subqueries]
        results = await asyncio.gather(*tasks)
        for subquery_list in results:
            refined_subqueries.extend(subquery_list)

        logger.info(
            f"Refined to {len(refined_subqueries)} subqueries after splitting"
        )
        if on_progress:
            on_progress(
                phase="splitting_complete",
                message=f"Refined to {len(refined_subqueries)} subqueries",
            )
        return refined_subqueries

    def calculate_total_estimated_results(
        self, subqueries: list[SubQuery]
    ) -> int:
        """Calculate total estimated results across all subqueries.

        Args:
            subqueries: List of subqueries with estimated_results set.

        Returns:
            Sum of all estimated results.
        """
        return sum(sq.estimated_results or 0 for sq in subqueries)
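
A minimal sketch of previewing the split on its own, without fetching any listing pages. It only uses names that appear in this diff, but the QueryParameters constructor arguments and the ListingType.RENT member are assumptions, and the field values are made up:

# Hypothetical preview of the splitting phase alone; parameter values are
# examples and the QueryParameters constructor signature is assumed.
import asyncio

from config.scraper_config import ScraperConfig
from models.listing import ListingType, QueryParameters
from rec.query import create_session
from services.query_splitter import QuerySplitter


async def preview_split() -> None:
    config = ScraperConfig.from_env()
    parameters = QueryParameters(
        listing_type=ListingType.RENT,  # assumed enum member
        min_bedrooms=1,
        max_bedrooms=2,
        min_price=1000,
        max_price=2500,
        page_size=24,
        radius=0.5,
        max_days_since_added=7,
        furnish_types=None,
        district_names=None,
    )
    splitter = QuerySplitter(config)
    async with create_session(config) as session:
        subqueries = await splitter.split(parameters, session)
    total = splitter.calculate_total_estimated_results(subqueries)
    print(f"{len(subqueries)} subqueries covering an estimated {total} listings")


if __name__ == "__main__":
    asyncio.run(preview_split())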