wrongmove/crawler/services/listing_fetcher.py
Viktor Barzin b9f576ae2b
Stream-process listings as IDs arrive via asyncio.Queue
Replace the sequential fetch-all-then-process pipeline with a streaming
architecture where listing processing starts as soon as IDs become
available from each subquery. A producer task fetches pages and enqueues
new IDs (filtered inline against DB), while 20 consumer workers process
listings concurrently from the queue.

- Add ListingRepository.get_listing_ids() for fast ID-only projection
- Refactor listing_tasks.py: remove get_ids_to_process/dump_listings_and_monitor,
  replace with unified producer/worker/monitor pipeline
- Apply same pattern to CLI path in listing_fetcher.py
- Remove 'filtering' phase from frontend, show combined fetch+process metrics
- Add fetching_done flag to TaskResult for phase transition tracking
2026-02-06 23:43:54 +00:00

173 lines
6.9 KiB
Python

"""Listing fetcher service - fetches listing data from Rightmove API."""
import asyncio
import logging
from config.scraper_config import ScraperConfig
from listing_processor import ListingProcessor
from rec.query import create_session, listing_query
from rec.exceptions import CircuitBreakerOpenError, ThrottlingError
from rec.throttle_detector import get_throttle_metrics, reset_throttle_metrics
from models.listing import Listing, QueryParameters
from repositories import ListingRepository
from services.query_splitter import QuerySplitter, SubQuery
# Module logger; uses the "uvicorn.error" logger name rather than __name__
# (presumably so messages surface alongside the server's own logs — confirm).
logger = logging.getLogger("uvicorn.error")
# Number of consumer workers that dump_listings runs concurrently to process
# listing IDs taken from its queue.
NUM_WORKERS = 20
async def dump_listings_full(
    parameters: QueryParameters,
    repository: ListingRepository,
) -> list[Listing]:
    """Fetch and process new listings, then return them as stored in the repository.

    Runs ``dump_listings`` for ``parameters`` and afterwards re-reads the
    listings matching ``parameters`` from ``repository``, keeping only those
    whose ID belongs to a listing processed in this run.

    NOTE(review): the original summary says this fetches images and detects
    floorplans as well; that work is not visible here and presumably happens
    during listing processing — confirm.

    Args:
        parameters: Query parameters forwarded to ``dump_listings`` and to
            ``repository.get_listings``.
        repository: Repository used for processing and for the final re-read.

    Returns:
        The refreshed listings that were newly processed by this call.
    """
    new_listings = await dump_listings(parameters, repository)
    logger.debug(f"Upserted {len(new_listings)} new listings")

    # dump_listings returns Listing objects, so membership has to be tested
    # against their IDs; a set keeps each lookup O(1).
    new_ids = {listing.id for listing in new_listings}

    # refresh listings
    listings = await repository.get_listings(parameters)  # this can be better
    return [listing for listing in listings if listing.id in new_ids]
async def dump_listings(
    parameters: QueryParameters,
    repository: ListingRepository,
) -> list[Listing]:
    """Fetch listings from Rightmove API and process them.

    Uses intelligent query splitting and a streaming pipeline so that
    listing processing starts as soon as IDs become available: a producer
    pages through every subquery and enqueues IDs that are not already known,
    while ``NUM_WORKERS`` consumers process IDs from the queue concurrently.

    Args:
        parameters: Search parameters; they also supply the listing type,
            page size, radius and filters passed to each page request.
        repository: Provides the IDs of listings already stored and is handed
            to each ``ListingProcessor``.

    Returns:
        The listings for which ``process_listing`` returned a non-``None``
        result, in completion order. An empty list is returned when a
        ``CircuitBreakerOpenError`` escapes the pipeline.

    Raises:
        Exception: Any other error raised while splitting the query or
            processing a listing propagates to the caller, after the
            remaining pipeline tasks have been cancelled.
    """
    config = ScraperConfig.from_env()
    splitter = QuerySplitter(config)
    ids_collected = 0
    processed_listings: list[Listing] = []

    # Reset throttle metrics at start
    reset_throttle_metrics()

    try:
        async with create_session(config) as session:
            # Phase 1: Split and probe queries
            logger.info("Splitting query and probing result counts...")
            subqueries = await splitter.split(parameters, session)
            total_estimated = splitter.calculate_total_estimated_results(subqueries)
            logger.info(
                f"Split into {len(subqueries)} subqueries, "
                f"estimated {total_estimated} total results"
            )

            # Load existing IDs (fast, ID-only projection)
            existing_ids = repository.get_listing_ids(parameters.listing_type)
            logger.info(f"Found {len(existing_ids)} existing listings in DB")

            # Phase 2: Streaming fetch & process
            # The queue is unbounded, so enqueueing never blocks the fetchers.
            queue: asyncio.Queue[int | None] = asyncio.Queue()
            semaphore = asyncio.Semaphore(config.max_concurrent_requests)

            async def fetch_subquery(sq: SubQuery) -> None:
                """Page through one subquery and enqueue every unseen listing ID."""
                nonlocal ids_collected
                estimated = sq.estimated_results or 0
                if estimated == 0:
                    return
                page_size = parameters.page_size
                max_pages = min(
                    config.max_pages_per_query,
                    (estimated // page_size) + 1,
                )
                for page_id in range(1, max_pages + 1):
                    # The semaphore caps concurrent requests across subqueries.
                    async with semaphore:
                        await asyncio.sleep(config.request_delay_ms / 1000)
                        try:
                            result = await listing_query(
                                page=page_id,
                                channel=parameters.listing_type,
                                min_bedrooms=sq.min_bedrooms,
                                max_bedrooms=sq.max_bedrooms,
                                radius=parameters.radius,
                                min_price=sq.min_price,
                                max_price=sq.max_price,
                                district=sq.district,
                                page_size=page_size,
                                max_days_since_added=parameters.max_days_since_added,
                                furnish_types=parameters.furnish_types or [],
                                session=session,
                                config=config,
                            )
                            # Extract and enqueue new IDs inline
                            properties = result.get("properties", [])
                            for prop in properties:
                                identifier = prop.get("identifier")
                                if identifier and identifier not in existing_ids:
                                    # Record the ID immediately so another
                                    # subquery does not enqueue it again.
                                    existing_ids.add(identifier)
                                    ids_collected += 1
                                    await queue.put(identifier)
                            # A short page means this was the last one.
                            if len(properties) < page_size:
                                break
                        except CircuitBreakerOpenError as e:
                            logger.error(f"Circuit breaker open: {e}")
                            break
                        except ThrottlingError as e:
                            logger.warning(
                                f"Throttling error on page {page_id} for "
                                f"{sq.district}: {e}"
                            )
                            break
                        except Exception as e:
                            # Any failure ends paging for this subquery; only
                            # the log level differs between the two cases.
                            if "GENERIC_ERROR" in str(e):
                                logger.debug(
                                    f"Max page for {sq.district}: {page_id - 1}"
                                )
                                break
                            logger.warning(
                                f"Error fetching page {page_id} for "
                                f"{sq.district}: {e}"
                            )
                            break

            async def producer() -> None:
                """Fetch all subqueries, then tell every worker to stop."""
                try:
                    await asyncio.gather(
                        *(fetch_subquery(sq) for sq in subqueries)
                    )
                    logger.info(f"Fetch complete: {ids_collected} new IDs found")
                finally:
                    # Always release the workers, even when fetching failed or
                    # was cancelled; otherwise they would wait on the queue
                    # forever. put_nowait cannot block on an unbounded queue.
                    for _ in range(NUM_WORKERS):
                        queue.put_nowait(None)

            async def worker() -> None:
                """Process queued listing IDs until the stop sentinel arrives."""
                while True:
                    listing_id = await queue.get()
                    if listing_id is None:
                        break
                    listing_processor = ListingProcessor(repository)
                    listing = await listing_processor.process_listing(listing_id)
                    if listing is not None:
                        processed_listings.append(listing)

            tasks = [
                asyncio.create_task(producer()),
                *(asyncio.create_task(worker()) for _ in range(NUM_WORKERS)),
            ]
            try:
                await asyncio.gather(*tasks)
            except BaseException:
                # gather() does not cancel siblings when one task fails. Stop
                # and drain them here so nothing keeps using the session after
                # it is closed, then surface the original error.
                for task in tasks:
                    task.cancel()
                await asyncio.gather(*tasks, return_exceptions=True)
                raise
    except CircuitBreakerOpenError as e:
        logger.error(f"Circuit breaker prevented listing fetch: {e}")
        logger.info(get_throttle_metrics().summary())
        return []
    finally:
        # Log throttle metrics at end
        metrics = get_throttle_metrics()
        if metrics.total_requests > 0:
            logger.info("\n" + metrics.summary())

    logger.info(
        f"Processed {len(processed_listings)} new listings "
        f"({ids_collected} total found)"
    )
    return processed_listings