Add intelligent query splitting to maximize Rightmove data extraction
parent 29ba739063
commit e8293c6042

11 changed files with 1970 additions and 113 deletions
@@ -1,18 +1,17 @@
 import asyncio
-import itertools
 import logging
 from typing import Any
 from celery import Task
 from celery.schedules import crontab
 from celery_app import app
 from config.schedule_config import SchedulesConfig
+from config.scraper_config import ScraperConfig
 from listing_processor import ListingProcessor
 from models.listing import Listing, QueryParameters
 from rec.districts import get_districts
-from rec.query import listing_query
+from rec.query import create_session, listing_query
 from repositories.listing_repository import ListingRepository
 from database import engine
 from services import image_fetcher, floorplan_detector
+from services.query_splitter import QuerySplitter, SubQuery
 from utils.redis_lock import redis_lock

 logger = logging.getLogger("uvicorn.error")
@@ -134,106 +133,138 @@ async def get_ids_to_process(
     repository: ListingRepository,
     task: Task,
 ) -> set[int]:
-    semaphore = asyncio.Semaphore(5)  # if too high, rightmove drops connections
-    districts = await get_valid_districts_to_scrape(parameters.district_names)
-    task.update_state(state="Fetching listings to scrape", meta={"progress": 0})
-    json_responses: list[list[dict[str, Any]]] = await asyncio.gather(
-        *[
-            _fetch_listings_with_semaphore(
-                task=task, semaphore=semaphore, parameters=parameters, district=district
-            )
-            for district in districts.keys()
-        ],
-    )
-    json_responses_flat = list(itertools.chain.from_iterable(json_responses))
-    logger.debug(f"Total listings fetched {len(json_responses_flat)}")
-
-    identifiers: set[int] = set()
-    for response_json in json_responses_flat:
-        if response_json == {}:
-            continue
-        if response_json["totalAvailableResults"] == 0:
-            continue
-        for property in response_json["properties"]:
-            identifier = property["identifier"]
-            identifiers.add(identifier)
-
-    # if listing is already in db, do not fetch details again
-    all_listing_ids = {l.id for l in await repository.get_listings()}
-    new_ids = identifiers - all_listing_ids
-    return new_ids
+    """Fetch all listing IDs using intelligent query splitting.
+
+    Uses the QuerySplitter to adaptively split large queries and maximize
+    data extraction while respecting Rightmove's result caps.
+
+    Args:
+        parameters: Query parameters for the search.
+        repository: Repository for checking existing listings.
+        task: Celery task for progress updates.
+
+    Returns:
+        Set of new listing IDs that need to be processed.
+    """
+    config = ScraperConfig.from_env()
+    splitter = QuerySplitter(config)
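
The QuerySplitter itself lives in services/query_splitter.py, which is not part of this hunk. As a rough sketch of the idea the new docstring describes (every name below is an illustrative assumption, not the real implementation): probe each candidate price range for its totalAvailableResults, and bisect any range whose estimate exceeds the result cap.

# Illustrative sketch only; `probe` is an assumed async callable that
# returns totalAvailableResults for a given price range.
from dataclasses import dataclass
from typing import Awaitable, Callable


@dataclass
class Band:
    min_price: int
    max_price: int
    estimated_results: int


async def split_by_price(
    probe: Callable[[int, int], Awaitable[int]],
    min_price: int,
    max_price: int,
    cap: int = 1500,
) -> list[Band]:
    total = await probe(min_price, max_price)
    # Stop when the band fits under the cap or cannot be narrowed further.
    if total <= cap or max_price - min_price <= 1:
        return [Band(min_price, max_price, total)]
    mid = (min_price + max_price) // 2
    return (
        await split_by_price(probe, min_price, mid, cap)
        + await split_by_price(probe, mid + 1, max_price, cap)
    )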
-
-
-async def get_valid_districts_to_scrape(
-    district_names: set[str] | None,
-) -> dict[str, str]:
-    if district_names:
-        districts = {
-            district: locid
-            for district, locid in get_districts().items()
-            if district in district_names
-        }
-    else:
-        districts = get_districts()
-    return districts
+
+    def on_progress(phase: str, message: str) -> None:
+        task.update_state(state=message, meta={"phase": phase})
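
Progress reporting goes through a plain (phase, message) callable rather than the splitter calling Celery directly, which presumably keeps services/query_splitter.py free of any Celery dependency. A hypothetical harness showing that seam:

# Anything matching the (phase, message) signature works, so tests can
# capture progress events in a plain list instead of a Celery task.
from typing import Callable

ProgressCallback = Callable[[str, str], None]


def report_split_progress(on_progress: ProgressCallback) -> None:
    on_progress("splitting", "Analyzing query and splitting by price bands...")


events: list[tuple[str, str]] = []
report_split_progress(lambda phase, message: events.append((phase, message)))
assert events[0][0] == "splitting"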
-
-
-async def _fetch_listings_with_semaphore(
-    *,
-    task: Task,
-    semaphore: asyncio.Semaphore,
-    parameters: QueryParameters,
-    district: str,
-) -> list[dict[str, Any]]:
-    result = []
-    # Split the price range into N bands to avoid Rightmove's 1.5k result cap:
-    # instead of one query with a price between 1k and 5k, which is capped at
-    # 1,500 results, we send ten queries, each covering a narrower price band,
-    # so each one is less likely to return more than 1.5k results.
-
-    number_of_steps = 10
-    price_step = parameters.max_price // number_of_steps
-
-    for step in range(number_of_steps):
-        task.update_state(
-            state=f"Fetching listings ({step} out of {number_of_steps})",
-            meta={"progress": step / number_of_steps},
-        )
-        min_price = step * price_step
-        max_price = (step + 1) * price_step
-        logger.debug(
-            f"Step {step} of {number_of_steps} with {min_price=} and {max_price=}"
-        )
+
+    async with create_session(config) as session:
+        # Phases 1 & 2: split and probe queries
+        task.update_state(
+            state="Analyzing query and splitting by price bands...",
+            meta={"phase": "splitting", "progress": 0},
+        )
+        subqueries = await splitter.split(parameters, session, on_progress)
+
+        total_estimated = splitter.calculate_total_estimated_results(subqueries)
+        logger.info(
+            f"Split into {len(subqueries)} subqueries, "
+            f"estimated {total_estimated} total results"
+        )
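
For contrast, the removed code banded the price range into fixed, equal-width steps. A quick worked example of that arithmetic, and of why it is blunt: with max_price = 5000 and ten steps, every band is 500 wide no matter where listings cluster, so a popular band can still blow past the 1,500-result cap while the tail bands return almost nothing.

# The removed fixed-banding arithmetic, isolated for illustration.
max_price = 5_000
number_of_steps = 10
price_step = max_price // number_of_steps  # 500

bands = [
    (step * price_step, (step + 1) * price_step)
    for step in range(number_of_steps)
]
# [(0, 500), (500, 1000), ..., (4500, 5000)]
# A dense band (say 1500-2000 in a big city) can still exceed the
# 1,500-result cap; the adaptive splitter narrows exactly those bands.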
-
-        for num_bedrooms in range(parameters.min_bedrooms, parameters.max_bedrooms + 1):
-            for page_id in range(
-                1,
-                3,  # seems like all searches stop at 1500 entries (page_id * page_size)
-            ):
-                logger.debug(f"Processing {page_id=} for {district=}")
-                async with semaphore:
-                    try:
-                        listing_query_result = await listing_query(
-                            page=page_id,
-                            channel=parameters.listing_type,
-                            # min_bedrooms=parameters.min_bedrooms,
-                            # max_bedrooms=parameters.max_bedrooms,
-                            min_bedrooms=num_bedrooms,
-                            max_bedrooms=num_bedrooms,
-                            radius=parameters.radius,
-                            min_price=min_price,
-                            max_price=max_price,
-                            district=district,
-                            page_size=parameters.page_size,
-                            max_days_since_added=parameters.max_days_since_added,
-                            furnish_types=parameters.furnish_types or [],
-                        )
+
+        # Phase 3: fetch all pages for each subquery
+        task.update_state(
+            state=f"Fetching listings from {len(subqueries)} subqueries...",
+            meta={
+                "phase": "fetching",
+                "subqueries": len(subqueries),
+                "estimated_results": total_estimated,
+            },
+        )
+
+        semaphore = asyncio.Semaphore(config.max_concurrent_requests)
+        identifiers: set[int] = set()
+
+        async def fetch_subquery(sq: SubQuery) -> list[dict[str, Any]]:
+            """Fetch all pages for a single subquery."""
+            results: list[dict[str, Any]] = []
+
+            # Calculate how many pages we need based on estimated results
+            estimated = sq.estimated_results or 0
+            if estimated == 0:
+                return results
+
+            # Fetch pages up to max_pages_per_query or until no more results
+            page_size = parameters.page_size
+            max_pages = min(
+                config.max_pages_per_query,
+                (estimated // page_size) + 1,
+            )
+
+            for page_id in range(1, max_pages + 1):
+                async with semaphore:
+                    await asyncio.sleep(config.request_delay_ms / 1000)
+                    try:
+                        result = await listing_query(
+                            page=page_id,
+                            channel=parameters.listing_type,
+                            min_bedrooms=sq.min_bedrooms,
+                            max_bedrooms=sq.max_bedrooms,
+                            radius=parameters.radius,
+                            min_price=sq.min_price,
+                            max_price=sq.max_price,
+                            district=sq.district,
+                            page_size=page_size,
+                            max_days_since_added=parameters.max_days_since_added,
+                            furnish_types=parameters.furnish_types or [],
+                            session=session,
+                        )
-                except Exception as e:
-                    if "GENERIC_ERROR" in str(e):  # Too big page id
-                        logger.debug(f"Max page id for {district=}: {page_id - 1}")
-                        break
-                    raise e
-                result.append(listing_query_result)
-    return result
+                        results.append(result)
+
+                        # Check if we've received all results
+                        properties = result.get("properties", [])
+                        if len(properties) < page_size:
+                            # No more results on the next page
+                            break
+
+                    except Exception as e:
+                        if "GENERIC_ERROR" in str(e):
+                            # Reached end of results
+                            logger.debug(
+                                f"Max page for {sq.district}: {page_id - 1}"
+                            )
+                            break
+                        logger.warning(
+                            f"Error fetching page {page_id} for {sq.district}: {e}"
+                        )
+                        break
+
+            return results
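
The new fetch loop combines three throttles: a semaphore bounding in-flight requests, a fixed delay pacing them, and a page budget derived from the probe estimate, with an early exit on a short page. The same pattern in isolation (the fetch_page callable and parameter values are stand-ins, not the project's API):

import asyncio
from typing import Any, Awaitable, Callable


async def fetch_pages(
    fetch_page: Callable[[int], Awaitable[dict[str, Any]]],  # stand-in
    estimated: int,
    page_size: int,
    max_pages_cap: int,
    semaphore: asyncio.Semaphore,
    delay_ms: int,
) -> list[dict[str, Any]]:
    results: list[dict[str, Any]] = []
    # Page budget: enough pages to cover the estimate, never past the cap,
    # e.g. estimated=130, page_size=24 -> 130 // 24 + 1 = 6 pages.
    max_pages = min(max_pages_cap, estimated // page_size + 1)
    for page_id in range(1, max_pages + 1):
        async with semaphore:                     # bound concurrent requests
            await asyncio.sleep(delay_ms / 1000)  # pace each request
            page = await fetch_page(page_id)
        results.append(page)
        # A short page means there is nothing left to fetch.
        if len(page.get("properties", [])) < page_size:
            break
    return results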
+
+        # Fetch all subqueries concurrently
+        all_results = await asyncio.gather(
+            *[fetch_subquery(sq) for sq in subqueries]
+        )
+
+        # Extract identifiers from all results
+        for subquery_results in all_results:
+            for response_json in subquery_results:
+                if not response_json:
+                    continue
+                if response_json.get("totalAvailableResults", 0) == 0:
+                    continue
+                for property_data in response_json.get("properties", []):
+                    identifier = property_data.get("identifier")
+                    if identifier:
+                        identifiers.add(identifier)
+
+        logger.info(f"Found {len(identifiers)} unique listings")
+
+        # Filter out listings already in the database
+        all_listing_ids = {l.id for l in await repository.get_listings()}
+        new_ids = identifiers - all_listing_ids
+
+        task.update_state(
+            state=f"Found {len(new_ids)} new listings to process",
+            meta={
+                "phase": "filtering",
+                "total_found": len(identifiers),
+                "new_listings": len(new_ids),
+            },
+        )
+
+        return new_ids
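
The extraction tail parses responses defensively (.get everywhere, skipping empty payloads), dedups via a set, and filters against the database with a single set difference. Distilled into a standalone helper (the name is hypothetical):

from typing import Any


def extract_new_ids(
    responses: list[dict[str, Any]],
    known_ids: set[int],
) -> set[int]:
    """Collect property identifiers, dropping IDs already in the database."""
    identifiers: set[int] = set()
    for response_json in responses:
        # Skip empty payloads and empty result sets.
        if not response_json or response_json.get("totalAvailableResults", 0) == 0:
            continue
        for property_data in response_json.get("properties", []):
            identifier = property_data.get("identifier")
            if identifier:
                identifiers.add(identifier)
    return identifiers - known_ids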