Stream-process listings as IDs arrive via asyncio.Queue

Replace the sequential fetch-all-then-process pipeline with a streaming architecture where listing processing starts as soon as IDs become available from each subquery. A producer task fetches pages and enqueues new IDs (filtered inline against DB), while 20 consumer workers process listings concurrently from the queue. - Add ListingRepository.get_listing_ids() for fast ID-only projection - Refactor listing_tasks.py: remove get_ids_to_process/dump_listings_and_monitor, replace with unified producer/worker/monitor pipeline - Apply same pattern to CLI path in listing_fetcher.py - Remove 'filtering' phase from frontend, show combined fetch+process metrics - Add fetching_done flag to TaskResult for phase transition tracking
2026-02-06 23:43:54 +00:00 · 2026-02-06 23:43:54 +00:00 · b9f576ae2b
commit b9f576ae2b
parent 7e8f1f0339
6 changed files with 372 additions and 420 deletions
--- a/crawler/frontend/src/components/TaskIndicator.tsx
+++ b/crawler/frontend/src/components/TaskIndicator.tsx
@ -180,7 +180,6 @@ export function TaskIndicator({ taskID, onTaskCancelled }: TaskIndicatorProps) {
                splitting: 'Splitting',
                splitting_complete: 'Split done',
                fetching: 'Fetching',
                filtering: 'Filtering',
            };
            return phaseLabels[taskResult.phase] ?? `${Math.round(progressPercentage)}%`;
        }
--- a/crawler/frontend/src/components/TaskProgressDrawer.tsx
+++ b/crawler/frontend/src/components/TaskProgressDrawer.tsx
@ -23,9 +23,8 @@ interface TaskProgressDrawerProps {
 const PHASES: { key: TaskPhase; label: string }[] = [
    { key: 'splitting', label: 'Splitting queries' },
-    { key: 'fetching', label: 'Fetching listings' },
+    { key: 'fetching', label: 'Fetching & processing' },
-    { key: 'filtering', label: 'Filtering results' },
+    { key: 'processing', label: 'Processing remaining' },
    { key: 'processing', label: 'Processing listings' },
 ];
 function getPhaseIndex(phase: TaskPhase | undefined): number {
@ -175,7 +174,7 @@ function PhaseDetails({ result }: { result: TaskResult }) {
        return (
            <div className="rounded-md border p-3 space-y-1">
                <p className="text-xs font-medium text-muted-foreground uppercase tracking-wide mb-2">
-                    Fetching
+                    {result.fetching_done ? 'Fetching complete' : 'Fetching & processing'}
                </p>
                <CounterRow
                    label="Subqueries completed"
@ -184,19 +183,24 @@ function PhaseDetails({ result }: { result: TaskResult }) {
                />
                <CounterRow label="IDs collected" value={result.ids_collected} />
                <CounterRow label="Pages fetched" value={result.pages_fetched} />
-            </div>
+                {(result.details_fetched !== undefined && result.details_fetched > 0) && (
-        );
+                    <>
-    }
+                        <div className="border-t my-2" />
-
+                        <CounterRow
-    if (phase === 'filtering') {
+                            label="Details fetched"
-        return (
+                            value={result.details_fetched}
-            <div className="rounded-md border p-3 space-y-1">
+                            total={result.total}
-                <p className="text-xs font-medium text-muted-foreground uppercase tracking-wide mb-2">
+                        />
-                    Filtering
+                        <CounterRow label="Images downloaded" value={result.images_downloaded} />
-                </p>
+                        <CounterRow label="OCR completed" value={result.ocr_completed} />
-                <CounterRow label="Total from API" value={result.total_found} />
+                        {(result.failed ?? 0) > 0 && (
-                <CounterRow label="Already in DB" value={result.existing_in_db} />
+                            <div className="flex justify-between text-sm">
-                <CounterRow label="New to process" value={result.new_listings} />
+                                <span className="text-red-500">Failed</span>
                                <span className="font-mono tabular-nums text-red-500">{result.failed}</span>
                            </div>
                        )}
                    </>
                )}
            </div>
        );
    }
@ -306,7 +310,7 @@ export function TaskProgressDrawer({
                    {taskResult && <PhaseDetails result={taskResult} />}
-                    {taskResult && taskResult.phase === 'processing' && (
+                    {taskResult && (taskResult.phase === 'processing' || taskResult.phase === 'fetching') && (taskResult.total ?? 0) > 0 && (
                        <div className="space-y-1">
                            <div className="w-full h-2 bg-primary/20 rounded-full overflow-hidden">
                                <div
--- a/crawler/frontend/src/types/index.ts
+++ b/crawler/frontend/src/types/index.ts
@ -52,7 +52,7 @@ export interface TaskStatusResponse {
  message?: string;
 }
-export type TaskPhase = 'splitting' | 'splitting_complete' | 'fetching' | 'filtering' | 'processing' | 'completed';
+export type TaskPhase = 'splitting' | 'splitting_complete' | 'fetching' | 'processing' | 'completed';
 export interface TaskResult {
  progress: number;
@ -69,10 +69,7 @@ export interface TaskResult {
  subqueries_completed?: number;
  ids_collected?: number;
  pages_fetched?: number;
-  // Filtering phase
+  fetching_done?: boolean;
  total_found?: number;
  existing_in_db?: number;
  new_listings?: number;
  // Processing phase
  details_fetched?: number;
  images_downloaded?: number;
--- a/crawler/repositories/listing_repository.py
+++ b/crawler/repositories/listing_repository.py
@ -343,6 +343,20 @@ class ListingRepository:
        return model_listing
    def get_listing_ids(
        self,
        listing_type: ListingType = ListingType.RENT,
    ) -> set[int]:
        """Get all listing IDs from the database (ID-only projection).
        Much faster than get_listings() when only IDs are needed for
        filtering against API results.
        """
        model = RentListing if listing_type == ListingType.RENT else BuyListing
        with Session(self.engine) as session:
            result = session.execute(sa_select(model.id))
            return {row[0] for row in result.fetchall()}
    async def mark_seen(self, listing_id: int) -> None:
        listings = await self.get_listings(only_ids=[listing_id])
        if len(listings) == 0:
--- a/crawler/services/listing_fetcher.py
+++ b/crawler/services/listing_fetcher.py
@ -1,26 +1,25 @@
 """Listing fetcher service - fetches listing data from Rightmove API."""
 import asyncio
 import logging
 from typing import Any
 from config.scraper_config import ScraperConfig
 from listing_processor import ListingProcessor
 from rec.query import create_session, listing_query
 from rec.exceptions import CircuitBreakerOpenError, ThrottlingError
 from rec.throttle_detector import get_throttle_metrics, reset_throttle_metrics
-from models.listing import QueryParameters
+from models.listing import Listing, QueryParameters
 from repositories import ListingRepository
 from tqdm.asyncio import tqdm
 from models import Listing as modelListing
 from services.query_splitter import QuerySplitter, SubQuery
 logger = logging.getLogger("uvicorn.error")
 NUM_WORKERS = 20
 async def dump_listings_full(
    parameters: QueryParameters,
    repository: ListingRepository,
-) -> list[modelListing]:
+) -> list[Listing]:
    """Fetches all listings, images as well as detects floorplans."""
    new_listings = await dump_listings(parameters, repository)
    logger.debug(f"Upserted {len(new_listings)} new listings")
@ -33,11 +32,11 @@ async def dump_listings_full(
 async def dump_listings(
    parameters: QueryParameters,
    repository: ListingRepository,
-) -> list[modelListing]:
+) -> list[Listing]:
    """Fetch listings from Rightmove API and process them.
-    Uses intelligent query splitting to maximize data extraction
+    Uses intelligent query splitting and a streaming pipeline so that
-    while respecting Rightmove's result caps.
+    listing processing starts as soon as IDs become available.
    """
    config = ScraperConfig.from_env()
    splitter = QuerySplitter(config)
@ -47,7 +46,7 @@ async def dump_listings(
    try:
        async with create_session(config) as session:
-            # Phase 1 & 2: Split and probe queries
+            # Phase 1: Split and probe queries
            logger.info("Splitting query and probing result counts...")
            subqueries = await splitter.split(parameters, session)
@ -57,16 +56,22 @@ async def dump_listings(
                f"estimated {total_estimated} total results"
            )
-            # Phase 3: Fetch all pages for each subquery
+            # Load existing IDs (fast, ID-only projection)
-            semaphore = asyncio.Semaphore(config.max_concurrent_requests)
+            existing_ids = repository.get_listing_ids(parameters.listing_type)
            logger.info(f"Found {len(existing_ids)} existing listings in DB")
-            async def fetch_subquery(sq: SubQuery) -> list[dict[str, Any]]:
+            # Phase 2: Streaming fetch & process
-                """Fetch all pages for a single subquery."""
+            queue: asyncio.Queue[int | None] = asyncio.Queue()
-                results: list[dict[str, Any]] = []
+            semaphore = asyncio.Semaphore(config.max_concurrent_requests)
            ids_collected = 0
            processed_listings: list[Listing] = []
            async def fetch_subquery(sq: SubQuery) -> None:
                nonlocal ids_collected
                estimated = sq.estimated_results or 0
                if estimated == 0:
-                    return results
+                    return
                page_size = parameters.page_size
                max_pages = min(
@ -93,9 +98,16 @@ async def dump_listings(
                                session=session,
                                config=config,
                            )
                            results.append(result)
                            # Extract and enqueue new IDs inline
                            properties = result.get("properties", [])
                            for prop in properties:
                                identifier = prop.get("identifier")
                                if identifier and identifier not in existing_ids:
                                    existing_ids.add(identifier)
                                    ids_collected += 1
                                    await queue.put(identifier)
                            if len(properties) < page_size:
                                break
@ -104,7 +116,8 @@ async def dump_listings(
                            break
                        except ThrottlingError as e:
                            logger.warning(
-                                f"Throttling error on page {page_id} for {sq.district}: {e}"
+                                f"Throttling error on page {page_id} for "
                                f"{sq.district}: {e}"
                            )
                            break
                        except Exception as e:
@ -114,17 +127,34 @@ async def dump_listings(
                                )
                                break
                            logger.warning(
-                                f"Error fetching page {page_id} for {sq.district}: {e}"
+                                f"Error fetching page {page_id} for "
                                f"{sq.district}: {e}"
                            )
                            break
-                return results
+            async def producer() -> None:
                await asyncio.gather(
                    *[fetch_subquery(sq) for sq in subqueries]
                )
                logger.info(f"Fetch complete: {ids_collected} new IDs found")
                for _ in range(NUM_WORKERS):
                    await queue.put(None)
-            # Fetch all subqueries with progress bar
+            async def worker() -> None:
-            all_results = await tqdm.gather(
+                while True:
-                *[fetch_subquery(sq) for sq in subqueries],
+                    listing_id = await queue.get()
-                desc="Fetching listings",
+                    if listing_id is None:
                        break
                    listing_processor = ListingProcessor(repository)
                    listing = await listing_processor.process_listing(listing_id)
                    if listing is not None:
                        processed_listings.append(listing)
            await asyncio.gather(
                producer(),
                *[worker() for _ in range(NUM_WORKERS)],
            )
    except CircuitBreakerOpenError as e:
        logger.error(f"Circuit breaker prevented listing fetch: {e}")
        logger.info(get_throttle_metrics().summary())
@ -135,36 +165,9 @@ async def dump_listings(
        if metrics.total_requests > 0:
            logger.info("\n" + metrics.summary())
-    # Extract listing identifiers from results
+    logger.info(
-    listing_ids: list[int] = []
+        f"Processed {len(processed_listings)} new listings "
-    for subquery_results in all_results:
+        f"({ids_collected} total found)"
        for response_json in subquery_results:
            if not response_json:
                continue
            if response_json.get("totalAvailableResults", 0) == 0:
                continue
            for property_data in response_json.get("properties", []):
                identifier = property_data.get("identifier")
                if identifier:
                    listing_ids.append(identifier)
    logger.info(f"Found {len(listing_ids)} total listings")
    # Deduplicate
    unique_ids = list(set(listing_ids))
    logger.info(f"After deduplication: {len(unique_ids)} unique listings")
    # Filter out listings already in database
    all_listing_ids = [x.id for x in await repository.get_listings()]
    missing_ids = [
        listing_id for listing_id in unique_ids if listing_id not in all_listing_ids
    ]
    listing_processor = ListingProcessor(repository)
    logger.info(f"Starting processing {len(missing_ids)} new listings")
    processed_listings = await tqdm.gather(
        *[listing_processor.process_listing(id) for id in missing_ids]
    )
    filtered_listings = [x for x in processed_listings if x is not None]
-    return filtered_listings
+    return processed_listings
--- a/crawler/tasks/listing_tasks.py
+++ b/crawler/tasks/listing_tasks.py
@ -134,144 +134,303 @@ async def dump_listings_full(
 async def _dump_listings_full_inner(
    *, task: Task, parameters: QueryParameters
 ) -> list[Listing]:
-    """Inner implementation with log capture active."""
+    """Inner implementation with log capture active.
    Uses a streaming pipeline: an asyncio.Queue bridges the fetcher (producer)
    and processor workers (consumers) so that listing processing starts as
    soon as IDs become available from each subquery.
    """
    start_time = time.time()
    NUM_WORKERS = 20
    celery_logger.info("=" * 60)
-    celery_logger.info("PHASE 1: Initializing listing fetch")
+    celery_logger.info("PHASE 1: Splitting queries")
    celery_logger.info("=" * 60)
    repository = ListingRepository(engine)
    config = ScraperConfig.from_env()
    splitter = QuerySplitter(config)
-    _update_task_state(task, "Identifying missing listings", {"phase": "splitting", "progress": 0})
+    # Reset throttle metrics
-    celery_logger.info("Querying Rightmove API to identify new listings...")
+    reset_throttle_metrics()
    ids_to_process = await get_ids_to_process(
        parameters=parameters, repository=repository, task=task
    )
-    celery_logger.info(f"Found {len(ids_to_process)} new listings to process")
+    def on_progress(phase: str, message: str, **kwargs: Any) -> None:
-    logger.info(f"Found {len(ids_to_process)} listings to process")
+        meta: dict[str, Any] = {"phase": phase, "message": message}
        meta.update(kwargs)
        _update_task_state(task, message, meta)
        celery_logger.info(f"[{phase}] {message}")
-    if len(ids_to_process) == 0:
+    _update_task_state(task, "Analyzing query and splitting by price bands...", {
-        elapsed = time.time() - start_time
+        "phase": "splitting", "progress": 0,
-        celery_logger.info(f"No new listings found. Completed in {elapsed:.1f}s")
+    })
-        invalidate_cache()
+    celery_logger.info("Starting query splitting and probing...")
-        _update_task_state(task, "No new listings found", {
+
-            "phase": "completed", "progress": 1, "processed": 0, "total": 0,
+    try:
-            "message": "All listings are up to date",
+        async with create_session(config) as session:
-        })
+            subqueries = await splitter.split(parameters, session, on_progress)
            total_estimated = splitter.calculate_total_estimated_results(subqueries)
            celery_logger.info(
                f"Query split complete: {len(subqueries)} subqueries, "
                f"~{total_estimated} estimated total results"
            )
            # Load existing IDs (fast, ID-only projection)
            celery_logger.info("Loading existing listing IDs from database...")
            existing_ids = repository.get_listing_ids(parameters.listing_type)
            celery_logger.info(f"Found {len(existing_ids)} existing listings in DB")
            celery_logger.info("=" * 60)
            celery_logger.info("PHASE 2: Streaming fetch & process")
            celery_logger.info("=" * 60)
            # Shared state for the streaming pipeline
            queue: asyncio.Queue[int | None] = asyncio.Queue()
            ids_collected = 0
            completed_subqueries = 0
            total_pages_fetched = 0
            fetching_done = False
            processed_count = 0
            failed_count = 0
            details_fetched = 0
            images_downloaded = 0
            ocr_completed = 0
            processed_listings: list[Listing] = []
            semaphore = asyncio.Semaphore(config.max_concurrent_requests)
            _update_task_state(
                task,
                f"Fetching listings from {len(subqueries)} subqueries...",
                {
                    "phase": "fetching",
                    "subqueries_completed": 0,
                    "subqueries_total": len(subqueries),
                    "ids_collected": 0,
                    "pages_fetched": 0,
                    "estimated_results": total_estimated,
                    "fetching_done": False,
                },
            )
            listing_processor = ListingProcessor(repository)
            # --- Producer: fetch subquery pages and enqueue new IDs ---
            async def producer() -> None:
                nonlocal ids_collected, completed_subqueries, total_pages_fetched
                nonlocal fetching_done
                async def fetch_subquery(sq: SubQuery) -> None:
                    nonlocal ids_collected, completed_subqueries, total_pages_fetched
                    estimated = sq.estimated_results or 0
                    if estimated == 0:
                        completed_subqueries += 1
                        return
                    page_size = parameters.page_size
                    max_pages = min(
                        config.max_pages_per_query,
                        (estimated // page_size) + 1,
                    )
                    for page_id in range(1, max_pages + 1):
                        async with semaphore:
                            await asyncio.sleep(config.request_delay_ms / 1000)
                            try:
                                result = await listing_query(
                                    page=page_id,
                                    channel=parameters.listing_type,
                                    min_bedrooms=sq.min_bedrooms,
                                    max_bedrooms=sq.max_bedrooms,
                                    radius=parameters.radius,
                                    min_price=sq.min_price,
                                    max_price=sq.max_price,
                                    district=sq.district,
                                    page_size=page_size,
                                    max_days_since_added=parameters.max_days_since_added,
                                    furnish_types=parameters.furnish_types or [],
                                    session=session,
                                    config=config,
                                )
                                total_pages_fetched += 1
                                # Extract and enqueue new IDs inline
                                properties = result.get("properties", [])
                                for prop in properties:
                                    identifier = prop.get("identifier")
                                    if identifier and identifier not in existing_ids:
                                        existing_ids.add(identifier)
                                        ids_collected += 1
                                        await queue.put(identifier)
                                if len(properties) < page_size:
                                    break
                            except CircuitBreakerOpenError as e:
                                celery_logger.error(f"Circuit breaker open: {e}")
                                break
                            except ThrottlingError as e:
                                celery_logger.warning(
                                    f"Throttling on {sq.district} page {page_id}: {e}"
                                )
                                break
                            except Exception as e:
                                if "GENERIC_ERROR" in str(e):
                                    logger.debug(
                                        f"Max page for {sq.district}: {page_id - 1}"
                                    )
                                    break
                                logger.warning(
                                    f"Error fetching page {page_id} for "
                                    f"{sq.district}: {e}"
                                )
                                break
                    completed_subqueries += 1
                # Fetch all subqueries concurrently
                await asyncio.gather(
                    *[fetch_subquery(sq) for sq in subqueries]
                )
                celery_logger.info(
                    f"Fetch complete: {total_pages_fetched} pages from "
                    f"{completed_subqueries} subqueries, "
                    f"{ids_collected} new IDs"
                )
                fetching_done = True
                # Send sentinel values to stop workers
                for _ in range(NUM_WORKERS):
                    await queue.put(None)
            # --- Consumer workers: process listings from queue ---
            async def worker() -> None:
                nonlocal processed_count, failed_count
                nonlocal details_fetched, images_downloaded, ocr_completed
                while True:
                    listing_id = await queue.get()
                    if listing_id is None:
                        break
                    def step_callback(step_name: str) -> None:
                        nonlocal details_fetched, images_downloaded, ocr_completed
                        if step_name == "details":
                            details_fetched += 1
                        elif step_name == "images":
                            images_downloaded += 1
                        elif step_name == "ocr":
                            ocr_completed += 1
                    listing = await listing_processor.process_listing(
                        listing_id, on_step_complete=step_callback
                    )
                    if listing is not None:
                        processed_count += 1
                        processed_listings.append(listing)
                    else:
                        failed_count += 1
            # --- Monitor: reports combined progress ---
            async def monitor() -> None:
                last_progress = 0.0
                while True:
                    total = ids_collected
                    done = processed_count + failed_count
                    if fetching_done and done >= total and total > 0:
                        break
                    if fetching_done and total == 0:
                        break
                    # Determine phase label
                    phase = "processing" if fetching_done else "fetching"
                    if total > 0:
                        progress_ratio = round(done / total, 2)
                    else:
                        progress_ratio = 0.0
                    elapsed = time.time() - start_time
                    rate = done / elapsed if elapsed > 0 else 0
                    remaining = (total - done) if total > 0 else 0
                    eta = remaining / rate if rate > 0 else 0
                    if progress_ratio >= last_progress + 0.1 or done == 1:
                        celery_logger.info(
                            f"Progress: {progress_ratio * 100:.0f}% "
                            f"({done}/{total}) "
                            f"| Elapsed: {elapsed:.0f}s "
                            f"| Rate: {rate:.1f}/s "
                            f"| ETA: {eta:.0f}s"
                        )
                        last_progress = progress_ratio
                    _update_task_state(
                        task,
                        f"{'Processing' if fetching_done else 'Fetching & processing'}: "
                        f"{done}/{total}",
                        {
                            "phase": phase,
                            "progress": progress_ratio,
                            "processed": done,
                            "total": total,
                            "subqueries_completed": completed_subqueries,
                            "subqueries_total": len(subqueries),
                            "ids_collected": ids_collected,
                            "pages_fetched": total_pages_fetched,
                            "fetching_done": fetching_done,
                            "details_fetched": details_fetched,
                            "images_downloaded": images_downloaded,
                            "ocr_completed": ocr_completed,
                            "failed": failed_count,
                            "elapsed_seconds": round(elapsed, 1),
                            "rate_per_second": round(rate, 2),
                            "eta_seconds": round(eta, 1),
                        },
                    )
                    await asyncio.sleep(1)
            # Run producer, workers, and monitor concurrently
            await asyncio.gather(
                producer(),
                *[worker() for _ in range(NUM_WORKERS)],
                monitor(),
            )
    except CircuitBreakerOpenError as e:
        celery_logger.error(f"Circuit breaker prevented query: {e}")
        metrics = get_throttle_metrics()
        if metrics.total_requests > 0:
            celery_logger.info(metrics.summary())
        return []
-
+    finally:
-    celery_logger.info("=" * 60)
+        metrics = get_throttle_metrics()
-    celery_logger.info("PHASE 2: Processing listings (fetch details, images, OCR)")
+        if metrics.total_requests > 0:
-    celery_logger.info("=" * 60)
+            celery_logger.info(
-
+                f"API Stats: {metrics.total_requests} requests, "
-    listing_processor = ListingProcessor(repository)
+                f"avg {metrics.average_response_time:.2f}s, "
-    celery_logger.info(f"Starting processing {len(ids_to_process)} listings")
+                f"{metrics.total_throttling_events} throttled"
-    logger.info(f"Starting processing {len(ids_to_process)} listings")
+            )
    result = await dump_listings_and_monitor(
        task=task, listing_processor=listing_processor, missing_ids=ids_to_process
    )
    elapsed = time.time() - start_time
    celery_logger.info("=" * 60)
-    celery_logger.info(f"COMPLETED: Processed {len(result)} listings in {elapsed:.1f}s")
+    celery_logger.info(
        f"COMPLETED: Processed {len(processed_listings)} listings in {elapsed:.1f}s"
    )
    celery_logger.info("=" * 60)
    invalidate_cache()
    # Send final state so the frontend has rich data even after task completes
    _update_task_state(task, "Completed", {
        "phase": "completed", "progress": 1,
-        "processed": len(result), "total": len(ids_to_process),
+        "processed": len(processed_listings), "total": ids_collected,
-        "message": f"Processed {len(result)} listings in {elapsed:.0f}s",
+        "message": f"Processed {len(processed_listings)} listings in {elapsed:.0f}s",
    })
-    return result
+    return processed_listings
 async def dump_listings_and_monitor(
    *, task: Task, listing_processor: ListingProcessor, missing_ids: set[int]
 ) -> list[Listing]:
    task_progress = {missing_id: 0 for missing_id in missing_ids}
    processed_count = 0
    failed_count = 0
    details_fetched = 0
    images_downloaded = 0
    ocr_completed = 0
    start_time = time.time()
    async def process(missing_id: int) -> Listing | None:
        nonlocal processed_count, failed_count
        def step_callback(step_name: str) -> None:
            nonlocal details_fetched, images_downloaded, ocr_completed
            if step_name == "details":
                details_fetched += 1
            elif step_name == "images":
                images_downloaded += 1
            elif step_name == "ocr":
                ocr_completed += 1
        listing = await listing_processor.process_listing(
            missing_id, on_step_complete=step_callback
        )
        task_progress[missing_id] = 1
        if listing is not None:
            processed_count += 1
        else:
            failed_count += 1
        return listing
    async def monitor() -> None:
        last_progress = 0
        while (progress := sum(task_progress.values())) < len(missing_ids):
            progress_ratio = round(progress / len(missing_ids), 2)
            elapsed = time.time() - start_time
            rate = progress / elapsed if elapsed > 0 else 0
            eta = (len(missing_ids) - progress) / rate if rate > 0 else 0
            # Log every 10% progress or at least every update
            if progress_ratio >= last_progress + 0.1 or progress == 1:
                celery_logger.info(
                    f"Progress: {progress_ratio * 100:.0f}% "
                    f"({progress}/{len(missing_ids)}) "
                    f"| Elapsed: {elapsed:.0f}s "
                    f"| Rate: {rate:.1f}/s "
                    f"| ETA: {eta:.0f}s"
                )
                last_progress = progress_ratio
            _update_task_state(
                task,
                f"Processing: {progress_ratio * 100:.0f}% ({progress}/{len(missing_ids)})",
                {
                    "phase": "processing",
                    "progress": progress_ratio,
                    "processed": progress,
                    "total": len(missing_ids),
                    "details_fetched": details_fetched,
                    "images_downloaded": images_downloaded,
                    "ocr_completed": ocr_completed,
                    "failed": failed_count,
                    "elapsed_seconds": round(elapsed, 1),
                    "rate_per_second": round(rate, 2),
                    "eta_seconds": round(eta, 1),
                },
            )
            await asyncio.sleep(1)
    processed_listings = await asyncio.gather(
        *[process(id) for id in missing_ids], *[monitor()]
    )
    filtered_listings = [listing for listing in processed_listings if listing is not None]
    celery_logger.info(
        f"Processing complete: {processed_count} successful, {failed_count} failed"
    )
    return filtered_listings
@app.on_after_finalize.connect
@ -297,227 +456,3 @@ def setup_periodic_tasks(sender, **kwargs):
            dump_listings_task.s(schedule.to_query_parameters().model_dump_json()),
            name=schedule.name,
        )
 async def get_ids_to_process(
    *,
    parameters: QueryParameters,
    repository: ListingRepository,
    task: Task,
 ) -> set[int]:
    """Fetch all listing IDs using intelligent query splitting.
    Uses the QuerySplitter to adaptively split large queries and maximize
    data extraction while respecting Rightmove's result caps.
    Args:
        parameters: Query parameters for the search.
        repository: Repository for checking existing listings.
        task: Celery task for progress updates.
    Returns:
        Set of new listing IDs that need to be processed.
    """
    config = ScraperConfig.from_env()
    splitter = QuerySplitter(config)
    # Reset throttle metrics
    reset_throttle_metrics()
    def on_progress(phase: str, message: str, **kwargs: Any) -> None:
        meta: dict[str, Any] = {"phase": phase, "message": message}
        meta.update(kwargs)
        _update_task_state(task, message, meta)
        celery_logger.info(f"[{phase}] {message}")
    celery_logger.info("Starting query splitting and probing...")
    try:
        async with create_session(config) as session:
            # Phase 1 & 2: Split and probe queries
            _update_task_state(task, "Analyzing query and splitting by price bands...", {
                "phase": "splitting", "progress": 0,
            })
            subqueries = await splitter.split(parameters, session, on_progress)
            total_estimated = splitter.calculate_total_estimated_results(subqueries)
            celery_logger.info(
                f"Query split complete: {len(subqueries)} subqueries, "
                f"~{total_estimated} estimated total results"
            )
            logger.info(
                f"Split into {len(subqueries)} subqueries, "
                f"estimated {total_estimated} total results"
            )
            # Phase 3: Fetch all pages for each subquery
            _update_task_state(
                task,
                f"Fetching listings from {len(subqueries)} subqueries...",
                {
                    "phase": "fetching",
                    "subqueries_completed": 0,
                    "subqueries_total": len(subqueries),
                    "ids_collected": 0,
                    "pages_fetched": 0,
                    "estimated_results": total_estimated,
                },
            )
            celery_logger.info(f"Fetching pages from {len(subqueries)} subqueries...")
            semaphore = asyncio.Semaphore(config.max_concurrent_requests)
            identifiers: set[int] = set()
            completed_subqueries = 0
            total_pages_fetched = 0
            async def fetch_subquery(sq: SubQuery) -> list[dict[str, Any]]:
                """Fetch all pages for a single subquery."""
                nonlocal completed_subqueries, total_pages_fetched
                results: list[dict[str, Any]] = []
                # Calculate how many pages we need based on estimated results
                estimated = sq.estimated_results or 0
                if estimated == 0:
                    completed_subqueries += 1
                    _update_task_state(
                        task,
                        f"Fetching: {completed_subqueries}/{len(subqueries)} subqueries",
                        {
                            "phase": "fetching",
                            "subqueries_completed": completed_subqueries,
                            "subqueries_total": len(subqueries),
                            "ids_collected": len(identifiers),
                            "pages_fetched": total_pages_fetched,
                        },
                    )
                    return results
                # Fetch pages up to max_pages_per_query or until no more results
                page_size = parameters.page_size
                max_pages = min(
                    config.max_pages_per_query,
                    (estimated // page_size) + 1,
                )
                for page_id in range(1, max_pages + 1):
                    async with semaphore:
                        await asyncio.sleep(config.request_delay_ms / 1000)
                        try:
                            result = await listing_query(
                                page=page_id,
                                channel=parameters.listing_type,
                                min_bedrooms=sq.min_bedrooms,
                                max_bedrooms=sq.max_bedrooms,
                                radius=parameters.radius,
                                min_price=sq.min_price,
                                max_price=sq.max_price,
                                district=sq.district,
                                page_size=page_size,
                                max_days_since_added=parameters.max_days_since_added,
                                furnish_types=parameters.furnish_types or [],
                                session=session,
                                config=config,
                            )
                            results.append(result)
                            total_pages_fetched += 1
                            # Check if we've received all results
                            properties = result.get("properties", [])
                            if len(properties) < page_size:
                                # No more results on next page
                                break
                        except CircuitBreakerOpenError as e:
                            celery_logger.error(f"Circuit breaker open: {e}")
                            break
                        except ThrottlingError as e:
                            celery_logger.warning(
                                f"Throttling on {sq.district} page {page_id}: {e}"
                            )
                            break
                        except Exception as e:
                            if "GENERIC_ERROR" in str(e):
                                # Reached end of results
                                logger.debug(
                                    f"Max page for {sq.district}: {page_id - 1}"
                                )
                                break
                            logger.warning(
                                f"Error fetching page {page_id} for {sq.district}: {e}"
                            )
                            break
                completed_subqueries += 1
                _update_task_state(
                    task,
                    f"Fetching: {completed_subqueries}/{len(subqueries)} subqueries",
                    {
                        "phase": "fetching",
                        "subqueries_completed": completed_subqueries,
                        "subqueries_total": len(subqueries),
                        "ids_collected": len(identifiers),
                        "pages_fetched": total_pages_fetched,
                    },
                )
                return results
            # Fetch all subqueries concurrently
            all_results = await asyncio.gather(
                *[fetch_subquery(sq) for sq in subqueries]
            )
            celery_logger.info(
                f"Fetch complete: {total_pages_fetched} pages from "
                f"{completed_subqueries} subqueries"
            )
            # Extract identifiers from all results
            for subquery_results in all_results:
                for response_json in subquery_results:
                    if not response_json:
                        continue
                    if response_json.get("totalAvailableResults", 0) == 0:
                        continue
                    for property_data in response_json.get("properties", []):
                        identifier = property_data.get("identifier")
                        if identifier:
                            identifiers.add(identifier)
    except CircuitBreakerOpenError as e:
        celery_logger.error(f"Circuit breaker prevented query: {e}")
        # Log throttle metrics
        metrics = get_throttle_metrics()
        if metrics.total_requests > 0:
            celery_logger.info(metrics.summary())
        return set()
    finally:
        # Log throttle metrics
        metrics = get_throttle_metrics()
        if metrics.total_requests > 0:
            celery_logger.info(f"API Stats: {metrics.total_requests} requests, "
                               f"avg {metrics.average_response_time:.2f}s, "
                               f"{metrics.total_throttling_events} throttled")
    celery_logger.info(f"Found {len(identifiers)} unique listing IDs from API")
    logger.info(f"Found {len(identifiers)} unique listings")
    # Filter out listings already in the database
    celery_logger.info("Checking database for existing listings...")
    all_listing_ids = {listing.id for listing in await repository.get_listings()}
    new_ids = identifiers - all_listing_ids
    celery_logger.info(
        f"Filtering: {len(identifiers)} total, "
        f"{len(all_listing_ids)} existing in DB, "
        f"{len(new_ids)} new to process"
    )
    _update_task_state(task, f"Found {len(new_ids)} new listings to process", {
        "phase": "filtering",
        "total_found": len(identifiers),
        "existing_in_db": len(all_listing_ids),
        "new_listings": len(new_ids),
    })
    return new_ids