Stream-process listings as IDs arrive via asyncio.Queue

Replace the sequential fetch-all-then-process pipeline with a streaming architecture in which listing processing starts as soon as IDs become available from each subquery. A producer task fetches pages and enqueues new IDs (filtered inline against the DB), while 20 consumer workers process listings concurrently from the queue.

- Add ListingRepository.get_listing_ids() for fast ID-only projection
- Refactor listing_tasks.py: remove get_ids_to_process/dump_listings_and_monitor and replace them with a unified producer/worker/monitor pipeline
- Apply the same pattern to the CLI path in listing_fetcher.py
- Remove the 'filtering' phase from the frontend and show combined fetch+process metrics
- Add a fetching_done flag to TaskResult for phase-transition tracking
2026-02-06 23:43:54 +00:00 · 2026-02-06 23:43:54 +00:00 · b9f576ae2b
commit b9f576ae2b
parent 7e8f1f0339
6 changed files with 372 additions and 420 deletions
--- a/crawler/services/listing_fetcher.py
+++ b/crawler/services/listing_fetcher.py
@ -1,26 +1,25 @@
 """Listing fetcher service - fetches listing data from Rightmove API."""
 import asyncio
 import logging
-from typing import Any

 from config.scraper_config import ScraperConfig
 from listing_processor import ListingProcessor
 from rec.query import create_session, listing_query
 from rec.exceptions import CircuitBreakerOpenError, ThrottlingError
 from rec.throttle_detector import get_throttle_metrics, reset_throttle_metrics
-from models.listing import QueryParameters
+from models.listing import Listing, QueryParameters
 from repositories import ListingRepository
-from tqdm.asyncio import tqdm
-from models import Listing as modelListing
 from services.query_splitter import QuerySplitter, SubQuery

 logger = logging.getLogger("uvicorn.error")

+NUM_WORKERS = 20
+

 async def dump_listings_full(
    parameters: QueryParameters,
    repository: ListingRepository,
-) -> list[modelListing]:
+) -> list[Listing]:
    """Fetches all listings, images as well as detects floorplans."""
    new_listings = await dump_listings(parameters, repository)
    logger.debug(f"Upserted {len(new_listings)} new listings")
@ -33,11 +32,11 @@ async def dump_listings_full(
 async def dump_listings(
    parameters: QueryParameters,
    repository: ListingRepository,
-) -> list[modelListing]:
+) -> list[Listing]:
    """Fetch listings from Rightmove API and process them.

-    Uses intelligent query splitting to maximize data extraction
-    while respecting Rightmove's result caps.
+    Uses intelligent query splitting and a streaming pipeline so that
+    listing processing starts as soon as IDs become available.
    """
    config = ScraperConfig.from_env()
    splitter = QuerySplitter(config)
@ -47,7 +46,7 @@ async def dump_listings(

    try:
        async with create_session(config) as session:
-            # Phase 1 & 2: Split and probe queries
+            # Phase 1: Split and probe queries
            logger.info("Splitting query and probing result counts...")
            subqueries = await splitter.split(parameters, session)

@ -57,16 +56,22 @@ async def dump_listings(
                f"estimated {total_estimated} total results"
            )

-            # Phase 3: Fetch all pages for each subquery
-            semaphore = asyncio.Semaphore(config.max_concurrent_requests)
+            # Load existing IDs (fast, ID-only projection)
+            existing_ids = repository.get_listing_ids(parameters.listing_type)
+            logger.info(f"Found {len(existing_ids)} existing listings in DB")

-            async def fetch_subquery(sq: SubQuery) -> list[dict[str, Any]]:
-                """Fetch all pages for a single subquery."""
-                results: list[dict[str, Any]] = []
+            # Phase 2: Streaming fetch & process
+            queue: asyncio.Queue[int | None] = asyncio.Queue()
+            semaphore = asyncio.Semaphore(config.max_concurrent_requests)
+            ids_collected = 0
+            processed_listings: list[Listing] = []
+
+            async def fetch_subquery(sq: SubQuery) -> None:
+                nonlocal ids_collected

                estimated = sq.estimated_results or 0
                if estimated == 0:
-                    return results
+                    return

                page_size = parameters.page_size
                max_pages = min(
@ -93,9 +98,16 @@ async def dump_listings(
                                session=session,
                                config=config,
                            )
-                            results.append(result)

+                            # Extract and enqueue new IDs inline
                            properties = result.get("properties", [])
+                            for prop in properties:
+                                identifier = prop.get("identifier")
+                                if identifier and identifier not in existing_ids:
+                                    existing_ids.add(identifier)
+                                    ids_collected += 1
+                                    await queue.put(identifier)
+
                            if len(properties) < page_size:
                                break

@ -104,7 +116,8 @@ async def dump_listings(
                            break
                        except ThrottlingError as e:
                            logger.warning(
-                                f"Throttling error on page {page_id} for {sq.district}: {e}"
+                                f"Throttling error on page {page_id} for "
+                                f"{sq.district}: {e}"
                            )
                            break
                        except Exception as e:
@ -114,17 +127,34 @@ async def dump_listings(
                                )
                                break
                            logger.warning(
-                                f"Error fetching page {page_id} for {sq.district}: {e}"
+                                f"Error fetching page {page_id} for "
+                                f"{sq.district}: {e}"
                            )
                            break

-                return results
+            async def producer() -> None:
+                await asyncio.gather(
+                    *[fetch_subquery(sq) for sq in subqueries]
+                )
+                logger.info(f"Fetch complete: {ids_collected} new IDs found")
+                for _ in range(NUM_WORKERS):
+                    await queue.put(None)

-            # Fetch all subqueries with progress bar
-            all_results = await tqdm.gather(
-                *[fetch_subquery(sq) for sq in subqueries],
-                desc="Fetching listings",
+            async def worker() -> None:
+                while True:
+                    listing_id = await queue.get()
+                    if listing_id is None:
+                        break
+                    listing_processor = ListingProcessor(repository)
+                    listing = await listing_processor.process_listing(listing_id)
+                    if listing is not None:
+                        processed_listings.append(listing)
+
+            await asyncio.gather(
+                producer(),
+                *[worker() for _ in range(NUM_WORKERS)],
            )
+
    except CircuitBreakerOpenError as e:
        logger.error(f"Circuit breaker prevented listing fetch: {e}")
        logger.info(get_throttle_metrics().summary())
@ -135,36 +165,9 @@ async def dump_listings(
        if metrics.total_requests > 0:
            logger.info("\n" + metrics.summary())

-    # Extract listing identifiers from results
-    listing_ids: list[int] = []
-    for subquery_results in all_results:
-        for response_json in subquery_results:
-            if not response_json:
-                continue
-            if response_json.get("totalAvailableResults", 0) == 0:
-                continue
-            for property_data in response_json.get("properties", []):
-                identifier = property_data.get("identifier")
-                if identifier:
-                    listing_ids.append(identifier)
-
-    logger.info(f"Found {len(listing_ids)} total listings")
-
-    # Deduplicate
-    unique_ids = list(set(listing_ids))
-    logger.info(f"After deduplication: {len(unique_ids)} unique listings")
-
-    # Filter out listings already in database
-    all_listing_ids = [x.id for x in await repository.get_listings()]
-    missing_ids = [
-        listing_id for listing_id in unique_ids if listing_id not in all_listing_ids
-    ]
-
-    listing_processor = ListingProcessor(repository)
-    logger.info(f"Starting processing {len(missing_ids)} new listings")
-    processed_listings = await tqdm.gather(
-        *[listing_processor.process_listing(id) for id in missing_ids]
+    logger.info(
+        f"Processed {len(processed_listings)} new listings "
+        f"({ids_collected} total found)"
    )
-    filtered_listings = [x for x in processed_listings if x is not None]

-    return filtered_listings
+    return processed_listings