wrongmove/crawler/services/query_splitter.py

303 lines
9.8 KiB
Python

"""Query splitting service for handling Rightmove's result cap.
This module provides intelligent query splitting to work around Rightmove's
~1,500 listing cap per search. It adaptively splits queries by price bands
based on actual result counts.
"""
from __future__ import annotations
import asyncio
import logging
from dataclasses import dataclass, replace
from typing import Any
import aiohttp
from config.scraper_config import ScraperConfig
from models.listing import ListingType, QueryParameters
from rec.districts import get_districts
logger = logging.getLogger("uvicorn.error")
@dataclass
class SubQuery:
    """One subdivision of a larger search query.

    A subquery pins a single district, a bedroom-count window, and a price
    band. The optional ``estimated_results`` field caches the total result
    count obtained from a probe request (``None`` until probed).
    """

    # District identifier string.
    district: str
    # Inclusive bedroom-count bounds for this slice.
    min_bedrooms: int
    max_bedrooms: int
    # Inclusive price bounds (currency units) for this slice.
    min_price: int
    max_price: int
    # Result count cached from probing; None means "not probed yet".
    estimated_results: int | None = None

    @property
    def price_range(self) -> int:
        """Width of the price band covered by this subquery."""
        width = self.max_price - self.min_price
        return width
class QuerySplitter:
    """Splits large queries into smaller subqueries to avoid result caps.

    Uses adaptive binary search on price ranges to find subdivisions that
    keep each subquery under the configured result threshold
    (``config.split_threshold``).
    """

    def __init__(self, config: ScraperConfig | None = None) -> None:
        """Initialize the splitter with configuration.

        Args:
            config: Scraper configuration. Loads from environment if not
                provided.
        """
        self.config = config or ScraperConfig.from_env()

    def create_initial_subqueries(
        self,
        parameters: QueryParameters,
        districts: dict[str, str],
    ) -> list[SubQuery]:
        """Create initial subqueries by splitting on district and bedrooms.

        This creates the initial split before probing for result counts.
        Each bedroom count gets its own subquery to enable finer-grained
        splitting later; the full price range is kept intact here and is
        only narrowed by :meth:`adaptive_split`.

        Args:
            parameters: Original query parameters.
            districts: Dictionary of district name to location ID.

        Returns:
            List of initial SubQuery objects, one per
            (district, bedroom count) pair.
        """
        # Comprehension instead of nested append loop; iterate the dict
        # directly (keys) rather than calling .keys().
        return [
            SubQuery(
                district=district,
                min_bedrooms=num_bedrooms,
                max_bedrooms=num_bedrooms,
                min_price=parameters.min_price,
                max_price=parameters.max_price,
            )
            for district in districts
            for num_bedrooms in range(
                parameters.min_bedrooms, parameters.max_bedrooms + 1
            )
        ]

    async def probe_result_count(
        self,
        subquery: SubQuery,
        session: aiohttp.ClientSession,
        parameters: QueryParameters,
    ) -> int:
        """Probe the API to get the total result count for a subquery.

        Makes a minimal request to read ``totalAvailableResults`` without
        fetching full pages of listings.

        Args:
            subquery: The subquery to probe.
            session: aiohttp session for making requests.
            parameters: Original query parameters for additional settings.

        Returns:
            Total available results for this subquery, or 0 when the probe
            fails (best-effort: one bad request must not abort the split).
        """
        # Imported lazily inside the method — presumably to avoid a
        # circular import with rec.query; confirm before hoisting.
        from rec.query import probe_query

        try:
            result = await probe_query(
                session=session,
                channel=parameters.listing_type,
                min_bedrooms=subquery.min_bedrooms,
                max_bedrooms=subquery.max_bedrooms,
                radius=parameters.radius,
                min_price=subquery.min_price,
                max_price=subquery.max_price,
                district=subquery.district,
                max_days_since_added=parameters.max_days_since_added,
                furnish_types=parameters.furnish_types or [],
            )
            return result.get("totalAvailableResults", 0)
        except Exception:
            # Deliberately broad: any probe failure degrades to "0 results"
            # rather than crashing the whole splitting run. Lazy %-style
            # args avoid building the message when the level is disabled.
            logger.warning("Failed to probe subquery %s", subquery, exc_info=True)
            return 0

    def split_by_price(self, subquery: SubQuery) -> list[SubQuery]:
        """Split a subquery into two by halving the price range.

        Args:
            subquery: The subquery to split.

        Returns:
            List of two subqueries covering the same overall price range,
            with any cached result counts cleared.
        """
        mid_price = (subquery.min_price + subquery.max_price) // 2
        # NOTE(review): both halves include mid_price. If the remote price
        # filters are inclusive at both ends, listings priced exactly at
        # the midpoint can appear in both halves — confirm the API
        # semantics or rely on downstream deduplication.
        return [
            replace(subquery, max_price=mid_price, estimated_results=None),
            replace(subquery, min_price=mid_price, estimated_results=None),
        ]

    async def adaptive_split(
        self,
        subquery: SubQuery,
        session: aiohttp.ClientSession,
        parameters: QueryParameters,
        semaphore: asyncio.Semaphore,
    ) -> list[SubQuery]:
        """Recursively split a subquery until all parts are under threshold.

        Uses binary search on the price range to find splits.

        Args:
            subquery: The subquery to split.
            session: aiohttp session for making requests.
            parameters: Original query parameters.
            semaphore: Semaphore for rate limiting.

        Returns:
            List of subqueries that are all under the split threshold
            (except bands already at the minimum width, which are returned
            as-is with a warning).
        """
        # Stop when the price band cannot shrink any further; the caller
        # gets the over-threshold band back rather than an infinite recursion.
        if subquery.price_range <= self.config.min_price_band:
            logger.warning(
                "Cannot split further, price band at minimum: %s", subquery
            )
            return [subquery]

        halves = self.split_by_price(subquery)
        result: list[SubQuery] = []
        for half in halves:
            # Hold the semaphore only around the network probe (plus the
            # politeness delay) — NOT around the recursive call, which
            # re-acquires it and would deadlock at low concurrency.
            async with semaphore:
                await asyncio.sleep(self.config.request_delay_ms / 1000)
                count = await self.probe_result_count(half, session, parameters)
            half = replace(half, estimated_results=count)
            if count > self.config.split_threshold:
                # Still too big: recurse on this half.
                result.extend(
                    await self.adaptive_split(half, session, parameters, semaphore)
                )
            else:
                result.append(half)
        return result

    async def split(
        self,
        parameters: QueryParameters,
        session: aiohttp.ClientSession,
        on_progress: Any = None,
    ) -> list[SubQuery]:
        """Split query parameters into optimized subqueries.

        Performs the full splitting algorithm:
        1. Create initial splits by district and bedroom count.
        2. Probe each to get result counts.
        3. Adaptively split any that exceed the threshold.

        Args:
            parameters: Original query parameters to split.
            session: aiohttp session for making requests.
            on_progress: Optional callback for progress updates; called with
                keyword arguments ``phase`` and ``message``.

        Returns:
            List of SubQuery objects, each under the result threshold.
        """
        # Restrict to the requested districts when given; a set makes the
        # membership test O(1) per district instead of O(n) list scans.
        if parameters.district_names:
            wanted = set(parameters.district_names)
            districts = {
                name: location_id
                for name, location_id in get_districts().items()
                if name in wanted
            }
        else:
            districts = get_districts()

        # Phase 1: initial (district, bedroom) subqueries.
        initial_subqueries = self.create_initial_subqueries(parameters, districts)
        logger.info("Created %d initial subqueries", len(initial_subqueries))
        if on_progress:
            on_progress(
                phase="splitting",
                message=f"Created {len(initial_subqueries)} initial subqueries",
            )

        # Phase 2: probe all initial subqueries concurrently, splitting any
        # that exceed the threshold. The semaphore bounds concurrent probes.
        semaphore = asyncio.Semaphore(self.config.max_concurrent_requests)

        async def probe_and_split(sq: SubQuery) -> list[SubQuery]:
            async with semaphore:
                await asyncio.sleep(self.config.request_delay_ms / 1000)
                count = await self.probe_result_count(sq, session, parameters)
            sq = replace(sq, estimated_results=count)
            if count > self.config.split_threshold:
                logger.info(
                    "Subquery %s/%sBR has %d results, splitting...",
                    sq.district,
                    sq.min_bedrooms,
                    count,
                )
                return await self.adaptive_split(sq, session, parameters, semaphore)
            return [sq]

        tasks = [probe_and_split(sq) for sq in initial_subqueries]
        results = await asyncio.gather(*tasks)

        refined_subqueries: list[SubQuery] = []
        for subquery_list in results:
            refined_subqueries.extend(subquery_list)

        logger.info(
            "Refined to %d subqueries after splitting", len(refined_subqueries)
        )
        if on_progress:
            on_progress(
                phase="splitting_complete",
                message=f"Refined to {len(refined_subqueries)} subqueries",
            )
        return refined_subqueries

    def calculate_total_estimated_results(
        self, subqueries: list[SubQuery]
    ) -> int:
        """Calculate total estimated results across all subqueries.

        Args:
            subqueries: List of subqueries with ``estimated_results`` set.

        Returns:
            Sum of all estimated results; unprobed entries (None) count as 0.
        """
        return sum(sq.estimated_results or 0 for sq in subqueries)