The listing processor was hardcoded to create RentListing objects and query only the rentlisting table. Buy listings fetched from Rightmove were stored in the wrong table with missing fields. This threads ListingType through ListingProcessor and all Step subclasses so the correct model (RentListing/BuyListing) is created, the correct table is queried, and buy-specific fields (service_charge, lease_left) are parsed from the API response and included in GeoJSON streaming output.
211 lines
7.8 KiB
Python
211 lines
7.8 KiB
Python
"""Listing fetcher service - fetches listing data from Rightmove API."""
|
|
import asyncio
|
|
import logging
|
|
|
|
from config.scraper_config import ScraperConfig
|
|
from listing_processor import ListingProcessor
|
|
from rec.query import create_session, listing_query
|
|
from rec.exceptions import CircuitBreakerOpenError, ThrottlingError
|
|
from rec.throttle_detector import get_throttle_metrics, reset_throttle_metrics
|
|
from models.listing import Listing, QueryParameters
|
|
from repositories import ListingRepository
|
|
from services.query_splitter import QuerySplitter, SubQuery
|
|
|
|
# Module-wide logger, bound to the "uvicorn.error" logger name rather than
# to this module's own __name__.
logger = logging.getLogger("uvicorn.error")
|
|
|
|
# Number of concurrent workers that process listing details (fetch details,
|
|
# download images, run OCR) from the streaming queue in parallel.
|
|
# The producer in ``dump_listings`` enqueues one ``None`` sentinel per worker,
# so this value also fixes how many sentinels are sent to shut the workers down.
NUM_WORKERS = 20
|
|
|
|
|
|
async def dump_listings_full(
    parameters: QueryParameters,
    repository: ListingRepository,
) -> list[Listing]:
    """Run a listing dump, then re-read the new listings from the repository.

    The fetch/process pipeline itself is delegated to ``dump_listings``; this
    wrapper collects the IDs of the listings it returned and asks the
    repository for the records with those IDs.

    Args:
        parameters: Query parameters, forwarded unchanged to ``dump_listings``.
        repository: Repository handed to ``dump_listings`` and then queried
            by ID for the final result.

    Returns:
        The result of ``repository.get_listings`` restricted to the IDs of
        the listings produced by this run.
    """
    new_listings = await dump_listings(parameters, repository)
    logger.debug(f"Upserted {len(new_listings)} new listings")

    # Look the records up again by ID instead of returning the in-memory
    # objects produced by the pipeline.
    ids_to_reload = []
    for entry in new_listings:
        ids_to_reload.append(entry.id)

    reloaded = await repository.get_listings(only_ids=ids_to_reload)
    return reloaded
|
|
|
|
|
|
async def _fetch_subquery(
    sq: SubQuery,
    parameters: QueryParameters,
    session: object,
    config: ScraperConfig,
    semaphore: asyncio.Semaphore,
    existing_ids: set[int],
    queue: asyncio.Queue[int | None],
) -> int:
    """Page through one subquery and enqueue every listing ID not seen before.

    Pages are requested one at a time, starting at page 1. Paging stops at
    the first short page (fewer than ``page_size`` properties), at the page
    limit, or at the first error of any kind; errors are logged, never
    raised from the request/parse section.

    Args:
        sq: Subquery supplying the bedroom/price/district filters and the
            estimated result count.
        parameters: Original query parameters (page size, listing type,
            radius, age and furnish filters).
        session: Session object passed through to ``listing_query``.
        config: Scraper configuration (page cap, per-request delay).
        semaphore: Limits how many requests run at once; held for the delay,
            the request and the enqueueing of that page's IDs.
        existing_ids: IDs already known. Mutated in place: each newly found
            ID is added before it is enqueued.
        queue: Receives each newly discovered listing ID.

    Returns:
        How many new IDs this subquery added to ``queue``.
    """
    expected_total = sq.estimated_results
    if not expected_total:
        # No estimate (None) or an estimate of zero: issue no requests.
        return 0

    enqueued = 0
    page_size = parameters.page_size
    # Whole pages implied by the estimate plus one, capped by configuration.
    last_page = min(
        config.max_pages_per_query,
        expected_total // page_size + 1,
    )

    for page_id in range(1, last_page + 1):
        async with semaphore:
            # The delay is served while the semaphore slot is held.
            await asyncio.sleep(config.request_delay_ms / 1000)
            try:
                response = await listing_query(
                    page=page_id,
                    channel=parameters.listing_type,
                    min_bedrooms=sq.min_bedrooms,
                    max_bedrooms=sq.max_bedrooms,
                    radius=parameters.radius,
                    min_price=sq.min_price,
                    max_price=sq.max_price,
                    district=sq.district,
                    page_size=page_size,
                    max_days_since_added=parameters.max_days_since_added,
                    furnish_types=parameters.furnish_types or [],
                    session=session,
                    config=config,
                )

                page_items = response.get("properties", [])
                for item in page_items:
                    listing_id = item.get("identifier")
                    if not listing_id or listing_id in existing_ids:
                        continue
                    # Record the ID as known before handing it to the queue.
                    existing_ids.add(listing_id)
                    enqueued += 1
                    await queue.put(listing_id)

                # A short page means there is nothing further to request.
                if len(page_items) < page_size:
                    break

            except CircuitBreakerOpenError as e:
                logger.error(f"Circuit breaker open: {e}")
                break
            except ThrottlingError as e:
                logger.warning(
                    f"Throttling error on page {page_id} for "
                    f"{sq.district}: {e}"
                )
                break
            except Exception as e:
                # Rightmove answers with GENERIC_ERROR for pages past the
                # last one, so that case is an expected end-of-results
                # signal and only logged at debug level. Any other failure
                # is logged as a warning. Both end paging for this subquery.
                if "GENERIC_ERROR" in str(e):
                    logger.debug(
                        f"Max page for {sq.district}: {page_id - 1}"
                    )
                else:
                    logger.warning(
                        f"Error fetching page {page_id} for "
                        f"{sq.district}: {e}"
                    )
                break

    return enqueued
|
|
|
|
|
|
async def _gather_cancelling_on_failure(*coros) -> list:
    """Run ``coros`` concurrently without leaving any running after a failure.

    ``asyncio.gather`` hands the first exception to its awaiter but lets the
    other awaitables keep running. This wrapper schedules each coroutine as a
    task and, if the gather ends early (exception or cancellation), cancels
    the tasks that are still pending and waits for them to finish before the
    original exception continues to propagate.

    Args:
        *coros: Coroutine objects to schedule, in order.

    Returns:
        The coroutines' results, in the order the coroutines were given.
    """
    tasks = [asyncio.create_task(coro) for coro in coros]
    try:
        return await asyncio.gather(*tasks)
    finally:
        # On the success path every task is done and this is a no-op.
        unfinished = [task for task in tasks if not task.done()]
        for task in unfinished:
            task.cancel()
        if unfinished:
            # return_exceptions=True so the cancellations are absorbed here
            # and do not mask the exception that is already propagating.
            await asyncio.gather(*unfinished, return_exceptions=True)


async def dump_listings(
    parameters: QueryParameters,
    repository: ListingRepository,
) -> list[Listing]:
    """Fetch listings from Rightmove API and process them.

    Uses query splitting and a streaming pipeline so that listing processing
    starts as soon as IDs become available: one producer fetches listing IDs
    for every subquery and pushes them onto a queue, while ``NUM_WORKERS``
    workers pull IDs off the queue and run them through ``ListingProcessor``.

    If any part of the pipeline raises, the remaining producer/worker tasks
    are cancelled and awaited before the exception leaves this function, so
    nothing keeps running (or stays blocked on the queue) in the background.

    Args:
        parameters: Query parameters; ``listing_type`` selects both the
            existing-ID lookup and the processor's listing type.
        repository: Repository used to load existing listing IDs and passed
            to each ``ListingProcessor``.

    Returns:
        The listings for which ``process_listing`` returned a non-``None``
        value, or an empty list if a ``CircuitBreakerOpenError`` was raised.

    Raises:
        Any exception other than ``CircuitBreakerOpenError`` raised while
        splitting, fetching or processing propagates to the caller.
    """
    config = ScraperConfig.from_env()
    splitter = QuerySplitter(config)

    # Reset throttle metrics at start
    reset_throttle_metrics()

    try:
        async with create_session(config) as session:
            # Phase 1: Split and probe queries
            logger.info("Splitting query and probing result counts...")
            subqueries = await splitter.split(parameters, session)

            total_estimated = splitter.calculate_total_estimated_results(subqueries)
            logger.info(
                f"Split into {len(subqueries)} subqueries, "
                f"estimated {total_estimated} total results"
            )

            # Load existing IDs (fast, ID-only projection)
            existing_ids = repository.get_listing_ids(parameters.listing_type)
            logger.info(f"Found {len(existing_ids)} existing listings in DB")

            # Phase 2: Streaming fetch & process
            queue: asyncio.Queue[int | None] = asyncio.Queue()
            semaphore = asyncio.Semaphore(config.max_concurrent_requests)
            processed_listings: list[Listing] = []

            async def producer() -> int:
                """Fetch all subqueries and send sentinel values to workers."""
                counts = await _gather_cancelling_on_failure(
                    *(
                        _fetch_subquery(
                            sq, parameters, session, config,
                            semaphore, existing_ids, queue,
                        )
                        for sq in subqueries
                    )
                )
                ids_collected = sum(counts)
                logger.info(f"Fetch complete: {ids_collected} new IDs found")
                # One sentinel per worker so that every worker loop terminates.
                for _ in range(NUM_WORKERS):
                    await queue.put(None)
                return ids_collected

            async def worker() -> None:
                """Process queued listing IDs until a ``None`` sentinel arrives."""
                while True:
                    listing_id = await queue.get()
                    if listing_id is None:
                        break
                    listing_processor = ListingProcessor(repository, parameters.listing_type)
                    listing = await listing_processor.process_listing(listing_id)
                    if listing is not None:
                        processed_listings.append(listing)

            # The producer is first, so results[0] is its ID count.
            results = await _gather_cancelling_on_failure(
                producer(),
                *(worker() for _ in range(NUM_WORKERS)),
            )
            ids_collected = results[0]

    except CircuitBreakerOpenError as e:
        logger.error(f"Circuit breaker prevented listing fetch: {e}")
        logger.info(get_throttle_metrics().summary())
        return []
    finally:
        # Log throttle metrics at end
        metrics = get_throttle_metrics()
        if metrics.total_requests > 0:
            logger.info("\n" + metrics.summary())

    logger.info(
        f"Processed {len(processed_listings)} new listings "
        f"({ids_collected} total found)"
    )

    return processed_listings
|