wrongmove/services/listing_fetcher.py
Viktor Barzin e5ce8c1201
Fix buy listing support: thread ListingType through processing pipeline
The listing processor was hardcoded to create RentListing objects and
query only the rentlisting table. Buy listings fetched from Rightmove
were stored in the wrong table with missing fields. This threads
ListingType through ListingProcessor and all Step subclasses so the
correct model (RentListing/BuyListing) is created, the correct table
is queried, and buy-specific fields (service_charge, lease_left) are
parsed from the API response and included in GeoJSON streaming output.
2026-02-07 23:34:08 +00:00

211 lines
7.8 KiB
Python

"""Listing fetcher service - fetches listing data from Rightmove API."""
import asyncio
import logging
from config.scraper_config import ScraperConfig
from listing_processor import ListingProcessor
from rec.query import create_session, listing_query
from rec.exceptions import CircuitBreakerOpenError, ThrottlingError
from rec.throttle_detector import get_throttle_metrics, reset_throttle_metrics
from models.listing import Listing, QueryParameters
from repositories import ListingRepository
from services.query_splitter import QuerySplitter, SubQuery
# Module-level logger; records go to the logger named "uvicorn.error"
# (presumably so they show up alongside the server's own log output).
logger = logging.getLogger("uvicorn.error")
# Number of concurrent workers that process listing details (fetch details,
# download images, run OCR) from the streaming queue in parallel.
# dump_listings also enqueues this many None sentinels, one per worker, to
# tell each worker to stop once fetching is finished.
NUM_WORKERS = 20
async def dump_listings_full(
    parameters: QueryParameters,
    repository: ListingRepository,
) -> list[Listing]:
    """Run a listing dump and return the new listings re-read from the repository.

    Delegates fetching and processing to ``dump_listings`` and then reloads
    the resulting listings by ID through ``repository.get_listings``.

    Args:
        parameters: Query parameters forwarded unchanged to ``dump_listings``.
        repository: Repository forwarded to ``dump_listings`` and used to
            reload the newly processed listings.

    Returns:
        Whatever ``repository.get_listings`` returns for the IDs of the newly
        processed listings, or an empty list when nothing new was processed.
    """
    new_listings = await dump_listings(parameters, repository)
    logger.debug(f"Upserted {len(new_listings)} new listings")
    if not new_listings:
        # Nothing new: skip the lookup instead of passing an empty ID filter,
        # which a repository could interpret as "no filter at all".
        return []
    new_listing_ids = [listing.id for listing in new_listings]
    return await repository.get_listings(only_ids=new_listing_ids)
async def _fetch_subquery(
    sq: SubQuery,
    parameters: QueryParameters,
    session: object,
    config: ScraperConfig,
    semaphore: asyncio.Semaphore,
    existing_ids: set[int],
    queue: asyncio.Queue[int | None],
) -> int:
    """Page through one subquery and queue every listing ID not seen before.

    Result pages are requested one at a time via ``listing_query``. Paging
    stops at the first short page, at the page cap, or at the first error
    (errors are logged here and never re-raised).

    Args:
        sq: Subquery supplying the bedroom range, price range, district and
            the estimated result count.
        parameters: Original query parameters (page size, listing type,
            radius, recency and furnishing filters).
        session: HTTP session handed through to ``listing_query``.
        config: Scraper configuration (page cap, request delay).
        semaphore: Limits how many requests run at the same time.
        existing_ids: Known listing IDs; newly found IDs are added in place.
        queue: Receives each newly discovered listing ID.

    Returns:
        How many new IDs were put on the queue.
    """
    expected_total = sq.estimated_results or 0
    if expected_total == 0:
        return 0

    per_page = parameters.page_size
    # Never exceed the configured cap; otherwise allow one page more than
    # the estimate divided by the page size.
    page_limit = min(
        config.max_pages_per_query,
        (expected_total // per_page) + 1,
    )
    enqueued = 0

    for page_number in range(1, page_limit + 1):
        async with semaphore:
            # The delay is taken while holding the semaphore slot.
            await asyncio.sleep(config.request_delay_ms / 1000)
            try:
                response = await listing_query(
                    page=page_number,
                    channel=parameters.listing_type,
                    min_bedrooms=sq.min_bedrooms,
                    max_bedrooms=sq.max_bedrooms,
                    radius=parameters.radius,
                    min_price=sq.min_price,
                    max_price=sq.max_price,
                    district=sq.district,
                    page_size=per_page,
                    max_days_since_added=parameters.max_days_since_added,
                    furnish_types=parameters.furnish_types or [],
                    session=session,
                    config=config,
                )

                page_items = response.get("properties", [])
                for item in page_items:
                    listing_id = item.get("identifier")
                    if not listing_id or listing_id in existing_ids:
                        continue
                    # Record the ID before queueing so concurrent subqueries
                    # sharing this set do not enqueue it a second time.
                    existing_ids.add(listing_id)
                    enqueued += 1
                    await queue.put(listing_id)

                # A short page means there is nothing further to request.
                if len(page_items) < per_page:
                    break
            except CircuitBreakerOpenError as e:
                logger.error(f"Circuit breaker open: {e}")
                break
            except ThrottlingError as e:
                logger.warning(
                    f"Throttling error on page {page_number} for "
                    f"{sq.district}: {e}"
                )
                break
            except Exception as e:
                # Rightmove answers GENERIC_ERROR for pages past the last
                # page of results; that is the expected end-of-results
                # signal, so it is only logged at debug level.
                if "GENERIC_ERROR" in str(e):
                    logger.debug(
                        f"Max page for {sq.district}: {page_number - 1}"
                    )
                else:
                    logger.warning(
                        f"Error fetching page {page_number} for "
                        f"{sq.district}: {e}"
                    )
                break

    return enqueued
async def dump_listings(
    parameters: QueryParameters,
    repository: ListingRepository,
) -> list[Listing]:
    """Fetch listings from Rightmove API and process them.

    Uses intelligent query splitting and a streaming pipeline so that
    listing processing starts as soon as IDs become available: one producer
    fetches listing IDs for every subquery while ``NUM_WORKERS`` workers
    take IDs off a shared queue and process them with ``ListingProcessor``.

    Args:
        parameters: Query parameters; also select the listing type used for
            the existing-ID lookup and for ``ListingProcessor``.
        repository: Repository used to load existing listing IDs and passed
            to each ``ListingProcessor``.

    Returns:
        The listings for which ``process_listing`` returned a value, or an
        empty list if a ``CircuitBreakerOpenError`` aborted the run.

    Raises:
        Exception: Any error other than ``CircuitBreakerOpenError`` raised
            by the split phase or by a pipeline task propagates to the
            caller, after the remaining pipeline tasks have been cancelled.
    """
    config = ScraperConfig.from_env()
    splitter = QuerySplitter(config)

    # Reset throttle metrics at start
    reset_throttle_metrics()

    try:
        async with create_session(config) as session:
            # Phase 1: Split and probe queries
            logger.info("Splitting query and probing result counts...")
            subqueries = await splitter.split(parameters, session)

            total_estimated = splitter.calculate_total_estimated_results(subqueries)
            logger.info(
                f"Split into {len(subqueries)} subqueries, "
                f"estimated {total_estimated} total results"
            )

            # Load existing IDs (fast, ID-only projection)
            existing_ids = repository.get_listing_ids(parameters.listing_type)
            logger.info(f"Found {len(existing_ids)} existing listings in DB")

            # Phase 2: Streaming fetch & process
            queue: asyncio.Queue[int | None] = asyncio.Queue()
            semaphore = asyncio.Semaphore(config.max_concurrent_requests)
            processed_listings: list[Listing] = []

            async def producer() -> int:
                """Fetch all subqueries and send sentinel values to workers."""
                try:
                    tasks = [
                        _fetch_subquery(
                            sq, parameters, session, config,
                            semaphore, existing_ids, queue,
                        )
                        for sq in subqueries
                    ]
                    counts = await asyncio.gather(*tasks)
                    found = sum(counts)
                    logger.info(f"Fetch complete: {found} new IDs found")
                    return found
                finally:
                    # Always release the workers, even if fetching failed or
                    # was cancelled; otherwise they would wait on the queue
                    # forever. The queue is unbounded, so put_nowait cannot
                    # raise QueueFull.
                    for _ in range(NUM_WORKERS):
                        queue.put_nowait(None)

            async def worker() -> None:
                """Process queued listing IDs until a None sentinel arrives."""
                while True:
                    listing_id = await queue.get()
                    if listing_id is None:
                        break
                    listing_processor = ListingProcessor(repository, parameters.listing_type)
                    listing = await listing_processor.process_listing(listing_id)
                    if listing is not None:
                        processed_listings.append(listing)

            # Producer first so that results[0] is its return value.
            pipeline = [asyncio.create_task(producer())]
            pipeline.extend(
                asyncio.create_task(worker()) for _ in range(NUM_WORKERS)
            )
            try:
                results = await asyncio.gather(*pipeline)
            except Exception:
                # gather() does not cancel sibling tasks when one fails.
                # Stop them and wait for them to finish before the session
                # is closed by the enclosing ``async with``.
                for task in pipeline:
                    task.cancel()
                await asyncio.gather(*pipeline, return_exceptions=True)
                raise
            ids_collected = results[0]

    except CircuitBreakerOpenError as e:
        logger.error(f"Circuit breaker prevented listing fetch: {e}")
        logger.info(get_throttle_metrics().summary())
        return []
    finally:
        # Log throttle metrics at end
        metrics = get_throttle_metrics()
        if metrics.total_requests > 0:
            logger.info("\n" + metrics.summary())

    logger.info(
        f"Processed {len(processed_listings)} new listings "
        f"({ids_collected} total found)"
    )
    return processed_listings