Stream-process listings as IDs arrive via asyncio.Queue
Replace the sequential fetch-all-then-process pipeline with a streaming architecture in which listing processing starts as soon as IDs become available from each subquery. A producer task fetches pages and enqueues new IDs (filtered inline against the DB), while 20 consumer workers process listings concurrently from the queue.

- Add ListingRepository.get_listing_ids() for fast ID-only projection
- Refactor listing_tasks.py: remove get_ids_to_process/dump_listings_and_monitor and replace them with a unified producer/worker/monitor pipeline
- Apply the same pattern to the CLI path in listing_fetcher.py
- Remove the 'filtering' phase from the frontend and show combined fetch+process metrics
- Add a fetching_done flag to TaskResult for phase-transition tracking
This commit is contained in:
parent
7e8f1f0339
commit
b9f576ae2b
6 changed files with 372 additions and 420 deletions
|
|
@ -1,26 +1,25 @@
|
|||
"""Listing fetcher service - fetches listing data from Rightmove API."""
|
||||
import asyncio
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
from config.scraper_config import ScraperConfig
|
||||
from listing_processor import ListingProcessor
|
||||
from rec.query import create_session, listing_query
|
||||
from rec.exceptions import CircuitBreakerOpenError, ThrottlingError
|
||||
from rec.throttle_detector import get_throttle_metrics, reset_throttle_metrics
|
||||
from models.listing import QueryParameters
|
||||
from models.listing import Listing, QueryParameters
|
||||
from repositories import ListingRepository
|
||||
from tqdm.asyncio import tqdm
|
||||
from models import Listing as modelListing
|
||||
from services.query_splitter import QuerySplitter, SubQuery
|
||||
|
||||
logger = logging.getLogger("uvicorn.error")
|
||||
|
||||
NUM_WORKERS = 20
|
||||
|
||||
|
||||
async def dump_listings_full(
|
||||
parameters: QueryParameters,
|
||||
repository: ListingRepository,
|
||||
) -> list[modelListing]:
|
||||
) -> list[Listing]:
|
||||
"""Fetches all listings, images as well as detects floorplans."""
|
||||
new_listings = await dump_listings(parameters, repository)
|
||||
logger.debug(f"Upserted {len(new_listings)} new listings")
|
||||
|
|
@ -33,11 +32,11 @@ async def dump_listings_full(
|
|||
async def dump_listings(
|
||||
parameters: QueryParameters,
|
||||
repository: ListingRepository,
|
||||
) -> list[modelListing]:
|
||||
) -> list[Listing]:
|
||||
"""Fetch listings from Rightmove API and process them.
|
||||
|
||||
Uses intelligent query splitting to maximize data extraction
|
||||
while respecting Rightmove's result caps.
|
||||
Uses intelligent query splitting and a streaming pipeline so that
|
||||
listing processing starts as soon as IDs become available.
|
||||
"""
|
||||
config = ScraperConfig.from_env()
|
||||
splitter = QuerySplitter(config)
|
||||
|
|
@ -47,7 +46,7 @@ async def dump_listings(
|
|||
|
||||
try:
|
||||
async with create_session(config) as session:
|
||||
# Phase 1 & 2: Split and probe queries
|
||||
# Phase 1: Split and probe queries
|
||||
logger.info("Splitting query and probing result counts...")
|
||||
subqueries = await splitter.split(parameters, session)
|
||||
|
||||
|
|
@ -57,16 +56,22 @@ async def dump_listings(
|
|||
f"estimated {total_estimated} total results"
|
||||
)
|
||||
|
||||
# Phase 3: Fetch all pages for each subquery
|
||||
semaphore = asyncio.Semaphore(config.max_concurrent_requests)
|
||||
# Load existing IDs (fast, ID-only projection)
|
||||
existing_ids = repository.get_listing_ids(parameters.listing_type)
|
||||
logger.info(f"Found {len(existing_ids)} existing listings in DB")
|
||||
|
||||
async def fetch_subquery(sq: SubQuery) -> list[dict[str, Any]]:
|
||||
"""Fetch all pages for a single subquery."""
|
||||
results: list[dict[str, Any]] = []
|
||||
# Phase 2: Streaming fetch & process
|
||||
queue: asyncio.Queue[int | None] = asyncio.Queue()
|
||||
semaphore = asyncio.Semaphore(config.max_concurrent_requests)
|
||||
ids_collected = 0
|
||||
processed_listings: list[Listing] = []
|
||||
|
||||
async def fetch_subquery(sq: SubQuery) -> None:
|
||||
nonlocal ids_collected
|
||||
|
||||
estimated = sq.estimated_results or 0
|
||||
if estimated == 0:
|
||||
return results
|
||||
return
|
||||
|
||||
page_size = parameters.page_size
|
||||
max_pages = min(
|
||||
|
|
@ -93,9 +98,16 @@ async def dump_listings(
|
|||
session=session,
|
||||
config=config,
|
||||
)
|
||||
results.append(result)
|
||||
|
||||
# Extract and enqueue new IDs inline
|
||||
properties = result.get("properties", [])
|
||||
for prop in properties:
|
||||
identifier = prop.get("identifier")
|
||||
if identifier and identifier not in existing_ids:
|
||||
existing_ids.add(identifier)
|
||||
ids_collected += 1
|
||||
await queue.put(identifier)
|
||||
|
||||
if len(properties) < page_size:
|
||||
break
|
||||
|
||||
|
|
@ -104,7 +116,8 @@ async def dump_listings(
|
|||
break
|
||||
except ThrottlingError as e:
|
||||
logger.warning(
|
||||
f"Throttling error on page {page_id} for {sq.district}: {e}"
|
||||
f"Throttling error on page {page_id} for "
|
||||
f"{sq.district}: {e}"
|
||||
)
|
||||
break
|
||||
except Exception as e:
|
||||
|
|
@ -114,17 +127,34 @@ async def dump_listings(
|
|||
)
|
||||
break
|
||||
logger.warning(
|
||||
f"Error fetching page {page_id} for {sq.district}: {e}"
|
||||
f"Error fetching page {page_id} for "
|
||||
f"{sq.district}: {e}"
|
||||
)
|
||||
break
|
||||
|
||||
return results
|
||||
async def producer() -> None:
|
||||
await asyncio.gather(
|
||||
*[fetch_subquery(sq) for sq in subqueries]
|
||||
)
|
||||
logger.info(f"Fetch complete: {ids_collected} new IDs found")
|
||||
for _ in range(NUM_WORKERS):
|
||||
await queue.put(None)
|
||||
|
||||
# Fetch all subqueries with progress bar
|
||||
all_results = await tqdm.gather(
|
||||
*[fetch_subquery(sq) for sq in subqueries],
|
||||
desc="Fetching listings",
|
||||
async def worker() -> None:
|
||||
while True:
|
||||
listing_id = await queue.get()
|
||||
if listing_id is None:
|
||||
break
|
||||
listing_processor = ListingProcessor(repository)
|
||||
listing = await listing_processor.process_listing(listing_id)
|
||||
if listing is not None:
|
||||
processed_listings.append(listing)
|
||||
|
||||
await asyncio.gather(
|
||||
producer(),
|
||||
*[worker() for _ in range(NUM_WORKERS)],
|
||||
)
|
||||
|
||||
except CircuitBreakerOpenError as e:
|
||||
logger.error(f"Circuit breaker prevented listing fetch: {e}")
|
||||
logger.info(get_throttle_metrics().summary())
|
||||
|
|
@ -135,36 +165,9 @@ async def dump_listings(
|
|||
if metrics.total_requests > 0:
|
||||
logger.info("\n" + metrics.summary())
|
||||
|
||||
# Extract listing identifiers from results
|
||||
listing_ids: list[int] = []
|
||||
for subquery_results in all_results:
|
||||
for response_json in subquery_results:
|
||||
if not response_json:
|
||||
continue
|
||||
if response_json.get("totalAvailableResults", 0) == 0:
|
||||
continue
|
||||
for property_data in response_json.get("properties", []):
|
||||
identifier = property_data.get("identifier")
|
||||
if identifier:
|
||||
listing_ids.append(identifier)
|
||||
|
||||
logger.info(f"Found {len(listing_ids)} total listings")
|
||||
|
||||
# Deduplicate
|
||||
unique_ids = list(set(listing_ids))
|
||||
logger.info(f"After deduplication: {len(unique_ids)} unique listings")
|
||||
|
||||
# Filter out listings already in database
|
||||
all_listing_ids = [x.id for x in await repository.get_listings()]
|
||||
missing_ids = [
|
||||
listing_id for listing_id in unique_ids if listing_id not in all_listing_ids
|
||||
]
|
||||
|
||||
listing_processor = ListingProcessor(repository)
|
||||
logger.info(f"Starting processing {len(missing_ids)} new listings")
|
||||
processed_listings = await tqdm.gather(
|
||||
*[listing_processor.process_listing(id) for id in missing_ids]
|
||||
logger.info(
|
||||
f"Processed {len(processed_listings)} new listings "
|
||||
f"({ids_collected} total found)"
|
||||
)
|
||||
filtered_listings = [x for x in processed_listings if x is not None]
|
||||
|
||||
return filtered_listings
|
||||
return processed_listings
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue