"""Listing fetcher service - fetches listing data from Rightmove API."""

import asyncio
import logging
from typing import Any

from config.scraper_config import ScraperConfig
from listing_processor import ListingProcessor
from rec.query import create_session, listing_query
from models.listing import QueryParameters
from repositories import ListingRepository
from tqdm.asyncio import tqdm
from models import Listing as modelListing
from services.query_splitter import QuerySplitter, SubQuery

# Attach to uvicorn's error logger so messages surface in the server's log output.
logger = logging.getLogger("uvicorn.error")
async def dump_listings_full(
    parameters: QueryParameters,
    repository: ListingRepository,
) -> list[modelListing]:
    """Fetches all listings, images as well as detects floorplans.

    Delegates fetching/processing to :func:`dump_listings`, then re-reads
    the listings from the repository so the returned objects reflect the
    persisted state.

    Args:
        parameters: Search parameters forwarded to the fetch pipeline.
        repository: Persistence layer the listings are read back from.

    Returns:
        The freshly upserted listings, as re-read from the repository.
    """
    new_listings = await dump_listings(parameters, repository)
    logger.debug(f"Upserted {len(new_listings)} new listings")

    # BUG FIX: the previous code tested `x.id in new_listings`, i.e. an int
    # id against a list of Listing objects — that membership check could
    # never match. Compare against the set of new ids instead (a set also
    # makes each lookup O(1)).
    new_ids = {listing.id for listing in new_listings}

    # refresh listings
    listings = await repository.get_listings(parameters)  # this can be better
    return [x for x in listings if x.id in new_ids]
async def dump_listings(
    parameters: QueryParameters,
    repository: ListingRepository,
) -> list[modelListing]:
    """Fetch listings from Rightmove API and process them.

    Uses intelligent query splitting to maximize data extraction
    while respecting Rightmove's result caps.

    Args:
        parameters: Search parameters (location, price, bedrooms, ...).
        repository: Persistence layer used to skip already-known listings.

    Returns:
        The newly processed listings; listings already in the repository
        are skipped and failed processing attempts are filtered out.
    """
    config = ScraperConfig.from_env()
    splitter = QuerySplitter(config)

    async with create_session(config) as session:
        # Phase 1 & 2: Split and probe queries
        logger.info("Splitting query and probing result counts...")
        subqueries = await splitter.split(parameters, session)

        total_estimated = splitter.calculate_total_estimated_results(subqueries)
        logger.info(
            f"Split into {len(subqueries)} subqueries, "
            f"estimated {total_estimated} total results"
        )

        # Phase 3: Fetch all pages for each subquery.
        # One semaphore bounds in-flight requests across ALL subqueries.
        semaphore = asyncio.Semaphore(config.max_concurrent_requests)

        async def fetch_subquery(sq: SubQuery) -> list[dict[str, Any]]:
            """Fetch all result pages for a single subquery."""
            results: list[dict[str, Any]] = []

            estimated = sq.estimated_results or 0
            if estimated == 0:
                return results

            page_size = parameters.page_size
            # Never exceed the configured page cap, and don't request more
            # pages than the probe estimate implies exist.
            max_pages = min(
                config.max_pages_per_query,
                (estimated // page_size) + 1,
            )

            for page_id in range(1, max_pages + 1):
                async with semaphore:
                    # Simple rate limiting between requests.
                    await asyncio.sleep(config.request_delay_ms / 1000)
                    try:
                        result = await listing_query(
                            page=page_id,
                            channel=parameters.listing_type,
                            min_bedrooms=sq.min_bedrooms,
                            max_bedrooms=sq.max_bedrooms,
                            radius=parameters.radius,
                            min_price=sq.min_price,
                            max_price=sq.max_price,
                            district=sq.district,
                            page_size=page_size,
                            max_days_since_added=parameters.max_days_since_added,
                            furnish_types=parameters.furnish_types or [],
                            session=session,
                        )
                        results.append(result)

                        # A short page means we've reached the last page.
                        properties = result.get("properties", [])
                        if len(properties) < page_size:
                            break

                    except Exception as e:
                        # The API signals "page out of range" with a
                        # GENERIC_ERROR; treat that as end-of-results.
                        if "GENERIC_ERROR" in str(e):
                            logger.debug(
                                f"Max page for {sq.district}: {page_id - 1}"
                            )
                            break
                        logger.warning(
                            f"Error fetching page {page_id} for {sq.district}: {e}"
                        )
                        break

            return results

        # Fetch all subqueries concurrently with a progress bar.
        all_results = await tqdm.gather(
            *[fetch_subquery(sq) for sq in subqueries],
            desc="Fetching listings",
        )

    # Nothing below needs the HTTP session, so the `async with` is closed
    # here and the connection pool is released before processing begins.

    # Extract listing identifiers from results
    listing_ids: list[int] = []
    for subquery_results in all_results:
        for response_json in subquery_results:
            if not response_json:
                continue
            if response_json.get("totalAvailableResults", 0) == 0:
                continue
            for property_data in response_json.get("properties", []):
                identifier = property_data.get("identifier")
                if identifier:
                    listing_ids.append(identifier)

    logger.info(f"Found {len(listing_ids)} total listings")

    # Deduplicate (overlapping subqueries can return the same listing).
    unique_ids = list(set(listing_ids))
    logger.info(f"After deduplication: {len(unique_ids)} unique listings")

    # Filter out listings already in the database. A set makes each
    # membership test O(1) instead of scanning a list for every id.
    known_ids = {x.id for x in await repository.get_listings()}
    missing_ids = [
        listing_id for listing_id in unique_ids if listing_id not in known_ids
    ]

    listing_processor = ListingProcessor(repository)
    logger.info(f"Starting processing {len(missing_ids)} new listings")
    processed_listings = await tqdm.gather(
        *[listing_processor.process_listing(listing_id) for listing_id in missing_ids]
    )
    # process_listing returns None on failure; drop those results.
    filtered_listings = [x for x in processed_listings if x is not None]

    return filtered_listings
|