Stream-process listings as IDs arrive via asyncio.Queue

Replace the sequential fetch-all-then-process pipeline with a streaming architecture in which listing processing starts as soon as IDs become available from each subquery. A producer task fetches pages and enqueues new IDs (filtered inline against the IDs already in the DB), while 20 consumer workers process listings concurrently from the queue.

- Add ListingRepository.get_listing_ids() for a fast ID-only projection.
- Refactor listing_tasks.py: remove get_ids_to_process/dump_listings_and_monitor and replace them with a unified producer/worker/monitor pipeline.
- Apply the same pattern to the CLI path in listing_fetcher.py.
- Remove the 'filtering' phase from the frontend and show combined fetch+process metrics.
- Add a fetching_done flag to TaskResult for phase-transition tracking.
2026-02-06 23:43:54 +00:00 · 2026-02-06 23:43:54 +00:00 · b9f576ae2b
commit b9f576ae2b
parent 7e8f1f0339
6 changed files with 372 additions and 420 deletions
--- a/crawler/frontend/src/components/TaskIndicator.tsx
+++ b/crawler/frontend/src/components/TaskIndicator.tsx
@ -180,7 +180,6 @@ export function TaskIndicator({ taskID, onTaskCancelled }: TaskIndicatorProps) {
                splitting: 'Splitting',
                splitting_complete: 'Split done',
                fetching: 'Fetching',
-                filtering: 'Filtering',
            };
            return phaseLabels[taskResult.phase] ?? `${Math.round(progressPercentage)}%`;
        }
--- a/crawler/frontend/src/components/TaskProgressDrawer.tsx
+++ b/crawler/frontend/src/components/TaskProgressDrawer.tsx
@ -23,9 +23,8 @@ interface TaskProgressDrawerProps {

 const PHASES: { key: TaskPhase; label: string }[] = [
    { key: 'splitting', label: 'Splitting queries' },
-    { key: 'fetching', label: 'Fetching listings' },
-    { key: 'filtering', label: 'Filtering results' },
-    { key: 'processing', label: 'Processing listings' },
+    { key: 'fetching', label: 'Fetching & processing' },
+    { key: 'processing', label: 'Processing remaining' },
 ];

 function getPhaseIndex(phase: TaskPhase | undefined): number {
@ -175,7 +174,7 @@ function PhaseDetails({ result }: { result: TaskResult }) {
        return (
            <div className="rounded-md border p-3 space-y-1">
                <p className="text-xs font-medium text-muted-foreground uppercase tracking-wide mb-2">
-                    Fetching
+                    {result.fetching_done ? 'Fetching complete' : 'Fetching & processing'}
                </p>
                <CounterRow
                    label="Subqueries completed"
@ -184,19 +183,24 @@ function PhaseDetails({ result }: { result: TaskResult }) {
                />
                <CounterRow label="IDs collected" value={result.ids_collected} />
                <CounterRow label="Pages fetched" value={result.pages_fetched} />
-            </div>
-        );
-    }
-
-    if (phase === 'filtering') {
-        return (
-            <div className="rounded-md border p-3 space-y-1">
-                <p className="text-xs font-medium text-muted-foreground uppercase tracking-wide mb-2">
-                    Filtering
-                </p>
-                <CounterRow label="Total from API" value={result.total_found} />
-                <CounterRow label="Already in DB" value={result.existing_in_db} />
-                <CounterRow label="New to process" value={result.new_listings} />
+                {(result.details_fetched !== undefined && result.details_fetched > 0) && (
+                    <>
+                        <div className="border-t my-2" />
+                        <CounterRow
+                            label="Details fetched"
+                            value={result.details_fetched}
+                            total={result.total}
+                        />
+                        <CounterRow label="Images downloaded" value={result.images_downloaded} />
+                        <CounterRow label="OCR completed" value={result.ocr_completed} />
+                        {(result.failed ?? 0) > 0 && (
+                            <div className="flex justify-between text-sm">
+                                <span className="text-red-500">Failed</span>
+                                <span className="font-mono tabular-nums text-red-500">{result.failed}</span>
+                            </div>
+                        )}
+                    </>
+                )}
            </div>
        );
    }
@ -306,7 +310,7 @@ export function TaskProgressDrawer({

                    {taskResult && <PhaseDetails result={taskResult} />}

-                    {taskResult && taskResult.phase === 'processing' && (
+                    {taskResult && (taskResult.phase === 'processing' || taskResult.phase === 'fetching') && (taskResult.total ?? 0) > 0 && (
                        <div className="space-y-1">
                            <div className="w-full h-2 bg-primary/20 rounded-full overflow-hidden">
                                <div
--- a/crawler/frontend/src/types/index.ts
+++ b/crawler/frontend/src/types/index.ts
@ -52,7 +52,7 @@ export interface TaskStatusResponse {
  message?: string;
 }

-export type TaskPhase = 'splitting' | 'splitting_complete' | 'fetching' | 'filtering' | 'processing' | 'completed';
+export type TaskPhase = 'splitting' | 'splitting_complete' | 'fetching' | 'processing' | 'completed';

 export interface TaskResult {
  progress: number;
@ -69,10 +69,7 @@ export interface TaskResult {
  subqueries_completed?: number;
  ids_collected?: number;
  pages_fetched?: number;
-  // Filtering phase
-  total_found?: number;
-  existing_in_db?: number;
-  new_listings?: number;
+  fetching_done?: boolean;
  // Processing phase
  details_fetched?: number;
  images_downloaded?: number;
--- a/crawler/repositories/listing_repository.py
+++ b/crawler/repositories/listing_repository.py
@ -343,6 +343,20 @@ class ListingRepository:

        return model_listing

+    def get_listing_ids(
+        self,
+        listing_type: ListingType = ListingType.RENT,
+    ) -> set[int]:
+        """Get all listing IDs from the database (ID-only projection).
+
+        Much faster than get_listings() when only IDs are needed for
+        filtering against API results.
+        """
+        model = RentListing if listing_type == ListingType.RENT else BuyListing
+        with Session(self.engine) as session:
+            result = session.execute(sa_select(model.id))
+            return {row[0] for row in result.fetchall()}
+
    async def mark_seen(self, listing_id: int) -> None:
        listings = await self.get_listings(only_ids=[listing_id])
        if len(listings) == 0:
--- a/crawler/services/listing_fetcher.py
+++ b/crawler/services/listing_fetcher.py
@ -1,26 +1,25 @@
 """Listing fetcher service - fetches listing data from Rightmove API."""
 import asyncio
 import logging
-from typing import Any

 from config.scraper_config import ScraperConfig
 from listing_processor import ListingProcessor
 from rec.query import create_session, listing_query
 from rec.exceptions import CircuitBreakerOpenError, ThrottlingError
 from rec.throttle_detector import get_throttle_metrics, reset_throttle_metrics
-from models.listing import QueryParameters
+from models.listing import Listing, QueryParameters
 from repositories import ListingRepository
-from tqdm.asyncio import tqdm
-from models import Listing as modelListing
 from services.query_splitter import QuerySplitter, SubQuery

 logger = logging.getLogger("uvicorn.error")

+NUM_WORKERS = 20
+

 async def dump_listings_full(
    parameters: QueryParameters,
    repository: ListingRepository,
-) -> list[modelListing]:
+) -> list[Listing]:
    """Fetches all listings, images as well as detects floorplans."""
    new_listings = await dump_listings(parameters, repository)
    logger.debug(f"Upserted {len(new_listings)} new listings")
@ -33,11 +32,11 @@ async def dump_listings_full(
 async def dump_listings(
    parameters: QueryParameters,
    repository: ListingRepository,
-) -> list[modelListing]:
+) -> list[Listing]:
    """Fetch listings from Rightmove API and process them.

-    Uses intelligent query splitting to maximize data extraction
-    while respecting Rightmove's result caps.
+    Uses intelligent query splitting and a streaming pipeline so that
+    listing processing starts as soon as IDs become available.
    """
    config = ScraperConfig.from_env()
    splitter = QuerySplitter(config)
@ -47,7 +46,7 @@ async def dump_listings(

    try:
        async with create_session(config) as session:
-            # Phase 1 & 2: Split and probe queries
+            # Phase 1: Split and probe queries
            logger.info("Splitting query and probing result counts...")
            subqueries = await splitter.split(parameters, session)

@ -57,16 +56,22 @@ async def dump_listings(
                f"estimated {total_estimated} total results"
            )

-            # Phase 3: Fetch all pages for each subquery
-            semaphore = asyncio.Semaphore(config.max_concurrent_requests)
+            # Load existing IDs (fast, ID-only projection)
+            existing_ids = repository.get_listing_ids(parameters.listing_type)
+            logger.info(f"Found {len(existing_ids)} existing listings in DB")

-            async def fetch_subquery(sq: SubQuery) -> list[dict[str, Any]]:
-                """Fetch all pages for a single subquery."""
-                results: list[dict[str, Any]] = []
+            # Phase 2: Streaming fetch & process
+            queue: asyncio.Queue[int | None] = asyncio.Queue()
+            semaphore = asyncio.Semaphore(config.max_concurrent_requests)
+            ids_collected = 0
+            processed_listings: list[Listing] = []
+
+            async def fetch_subquery(sq: SubQuery) -> None:
+                nonlocal ids_collected

                estimated = sq.estimated_results or 0
                if estimated == 0:
-                    return results
+                    return

                page_size = parameters.page_size
                max_pages = min(
@ -93,9 +98,16 @@ async def dump_listings(
                                session=session,
                                config=config,
                            )
-                            results.append(result)

+                            # Extract and enqueue new IDs inline
                            properties = result.get("properties", [])
+                            for prop in properties:
+                                identifier = prop.get("identifier")
+                                if identifier and identifier not in existing_ids:
+                                    existing_ids.add(identifier)
+                                    ids_collected += 1
+                                    await queue.put(identifier)
+
                            if len(properties) < page_size:
                                break

@ -104,7 +116,8 @@ async def dump_listings(
                            break
                        except ThrottlingError as e:
                            logger.warning(
-                                f"Throttling error on page {page_id} for {sq.district}: {e}"
+                                f"Throttling error on page {page_id} for "
+                                f"{sq.district}: {e}"
                            )
                            break
                        except Exception as e:
@ -114,17 +127,34 @@ async def dump_listings(
                                )
                                break
                            logger.warning(
-                                f"Error fetching page {page_id} for {sq.district}: {e}"
+                                f"Error fetching page {page_id} for "
+                                f"{sq.district}: {e}"
                            )
                            break

-                return results
+            async def producer() -> None:
+                await asyncio.gather(
+                    *[fetch_subquery(sq) for sq in subqueries]
+                )
+                logger.info(f"Fetch complete: {ids_collected} new IDs found")
+                for _ in range(NUM_WORKERS):
+                    await queue.put(None)

-            # Fetch all subqueries with progress bar
-            all_results = await tqdm.gather(
-                *[fetch_subquery(sq) for sq in subqueries],
-                desc="Fetching listings",
+            async def worker() -> None:
+                while True:
+                    listing_id = await queue.get()
+                    if listing_id is None:
+                        break
+                    listing_processor = ListingProcessor(repository)
+                    listing = await listing_processor.process_listing(listing_id)
+                    if listing is not None:
+                        processed_listings.append(listing)
+
+            await asyncio.gather(
+                producer(),
+                *[worker() for _ in range(NUM_WORKERS)],
            )
+
    except CircuitBreakerOpenError as e:
        logger.error(f"Circuit breaker prevented listing fetch: {e}")
        logger.info(get_throttle_metrics().summary())
@ -135,36 +165,9 @@ async def dump_listings(
        if metrics.total_requests > 0:
            logger.info("\n" + metrics.summary())

-    # Extract listing identifiers from results
-    listing_ids: list[int] = []
-    for subquery_results in all_results:
-        for response_json in subquery_results:
-            if not response_json:
-                continue
-            if response_json.get("totalAvailableResults", 0) == 0:
-                continue
-            for property_data in response_json.get("properties", []):
-                identifier = property_data.get("identifier")
-                if identifier:
-                    listing_ids.append(identifier)
-
-    logger.info(f"Found {len(listing_ids)} total listings")
-
-    # Deduplicate
-    unique_ids = list(set(listing_ids))
-    logger.info(f"After deduplication: {len(unique_ids)} unique listings")
-
-    # Filter out listings already in database
-    all_listing_ids = [x.id for x in await repository.get_listings()]
-    missing_ids = [
-        listing_id for listing_id in unique_ids if listing_id not in all_listing_ids
-    ]
-
-    listing_processor = ListingProcessor(repository)
-    logger.info(f"Starting processing {len(missing_ids)} new listings")
-    processed_listings = await tqdm.gather(
-        *[listing_processor.process_listing(id) for id in missing_ids]
+    logger.info(
+        f"Processed {len(processed_listings)} new listings "
+        f"({ids_collected} total found)"
    )
-    filtered_listings = [x for x in processed_listings if x is not None]

-    return filtered_listings
+    return processed_listings
--- a/crawler/tasks/listing_tasks.py
+++ b/crawler/tasks/listing_tasks.py
@ -134,144 +134,303 @@ async def dump_listings_full(
 async def _dump_listings_full_inner(
    *, task: Task, parameters: QueryParameters
 ) -> list[Listing]:
-    """Inner implementation with log capture active."""
+    """Inner implementation with log capture active.
+
+    Uses a streaming pipeline: an asyncio.Queue bridges the fetcher (producer)
+    and processor workers (consumers) so that listing processing starts as
+    soon as IDs become available from each subquery.
+    """
    start_time = time.time()
+    NUM_WORKERS = 20
+
    celery_logger.info("=" * 60)
-    celery_logger.info("PHASE 1: Initializing listing fetch")
+    celery_logger.info("PHASE 1: Splitting queries")
    celery_logger.info("=" * 60)

    repository = ListingRepository(engine)
+    config = ScraperConfig.from_env()
+    splitter = QuerySplitter(config)

-    _update_task_state(task, "Identifying missing listings", {"phase": "splitting", "progress": 0})
-    celery_logger.info("Querying Rightmove API to identify new listings...")
-    ids_to_process = await get_ids_to_process(
-        parameters=parameters, repository=repository, task=task
-    )
+    # Reset throttle metrics
+    reset_throttle_metrics()

-    celery_logger.info(f"Found {len(ids_to_process)} new listings to process")
-    logger.info(f"Found {len(ids_to_process)} listings to process")
+    def on_progress(phase: str, message: str, **kwargs: Any) -> None:
+        meta: dict[str, Any] = {"phase": phase, "message": message}
+        meta.update(kwargs)
+        _update_task_state(task, message, meta)
+        celery_logger.info(f"[{phase}] {message}")

-    if len(ids_to_process) == 0:
-        elapsed = time.time() - start_time
-        celery_logger.info(f"No new listings found. Completed in {elapsed:.1f}s")
-        invalidate_cache()
-        _update_task_state(task, "No new listings found", {
-            "phase": "completed", "progress": 1, "processed": 0, "total": 0,
-            "message": "All listings are up to date",
-        })
+    _update_task_state(task, "Analyzing query and splitting by price bands...", {
+        "phase": "splitting", "progress": 0,
+    })
+    celery_logger.info("Starting query splitting and probing...")
+
+    try:
+        async with create_session(config) as session:
+            subqueries = await splitter.split(parameters, session, on_progress)
+
+            total_estimated = splitter.calculate_total_estimated_results(subqueries)
+            celery_logger.info(
+                f"Query split complete: {len(subqueries)} subqueries, "
+                f"~{total_estimated} estimated total results"
+            )
+
+            # Load existing IDs (fast, ID-only projection)
+            celery_logger.info("Loading existing listing IDs from database...")
+            existing_ids = repository.get_listing_ids(parameters.listing_type)
+            celery_logger.info(f"Found {len(existing_ids)} existing listings in DB")
+
+            celery_logger.info("=" * 60)
+            celery_logger.info("PHASE 2: Streaming fetch & process")
+            celery_logger.info("=" * 60)
+
+            # Shared state for the streaming pipeline
+            queue: asyncio.Queue[int | None] = asyncio.Queue()
+            ids_collected = 0
+            completed_subqueries = 0
+            total_pages_fetched = 0
+            fetching_done = False
+            processed_count = 0
+            failed_count = 0
+            details_fetched = 0
+            images_downloaded = 0
+            ocr_completed = 0
+            processed_listings: list[Listing] = []
+            semaphore = asyncio.Semaphore(config.max_concurrent_requests)
+
+            _update_task_state(
+                task,
+                f"Fetching listings from {len(subqueries)} subqueries...",
+                {
+                    "phase": "fetching",
+                    "subqueries_completed": 0,
+                    "subqueries_total": len(subqueries),
+                    "ids_collected": 0,
+                    "pages_fetched": 0,
+                    "estimated_results": total_estimated,
+                    "fetching_done": False,
+                },
+            )
+
+            listing_processor = ListingProcessor(repository)
+
+            # --- Producer: fetch subquery pages and enqueue new IDs ---
+            async def producer() -> None:
+                nonlocal ids_collected, completed_subqueries, total_pages_fetched
+                nonlocal fetching_done
+
+                async def fetch_subquery(sq: SubQuery) -> None:
+                    nonlocal ids_collected, completed_subqueries, total_pages_fetched
+
+                    estimated = sq.estimated_results or 0
+                    if estimated == 0:
+                        completed_subqueries += 1
+                        return
+
+                    page_size = parameters.page_size
+                    max_pages = min(
+                        config.max_pages_per_query,
+                        (estimated // page_size) + 1,
+                    )
+
+                    for page_id in range(1, max_pages + 1):
+                        async with semaphore:
+                            await asyncio.sleep(config.request_delay_ms / 1000)
+                            try:
+                                result = await listing_query(
+                                    page=page_id,
+                                    channel=parameters.listing_type,
+                                    min_bedrooms=sq.min_bedrooms,
+                                    max_bedrooms=sq.max_bedrooms,
+                                    radius=parameters.radius,
+                                    min_price=sq.min_price,
+                                    max_price=sq.max_price,
+                                    district=sq.district,
+                                    page_size=page_size,
+                                    max_days_since_added=parameters.max_days_since_added,
+                                    furnish_types=parameters.furnish_types or [],
+                                    session=session,
+                                    config=config,
+                                )
+                                total_pages_fetched += 1
+
+                                # Extract and enqueue new IDs inline
+                                properties = result.get("properties", [])
+                                for prop in properties:
+                                    identifier = prop.get("identifier")
+                                    if identifier and identifier not in existing_ids:
+                                        existing_ids.add(identifier)
+                                        ids_collected += 1
+                                        await queue.put(identifier)
+
+                                if len(properties) < page_size:
+                                    break
+
+                            except CircuitBreakerOpenError as e:
+                                celery_logger.error(f"Circuit breaker open: {e}")
+                                break
+                            except ThrottlingError as e:
+                                celery_logger.warning(
+                                    f"Throttling on {sq.district} page {page_id}: {e}"
+                                )
+                                break
+                            except Exception as e:
+                                if "GENERIC_ERROR" in str(e):
+                                    logger.debug(
+                                        f"Max page for {sq.district}: {page_id - 1}"
+                                    )
+                                    break
+                                logger.warning(
+                                    f"Error fetching page {page_id} for "
+                                    f"{sq.district}: {e}"
+                                )
+                                break
+
+                    completed_subqueries += 1
+
+                # Fetch all subqueries concurrently
+                await asyncio.gather(
+                    *[fetch_subquery(sq) for sq in subqueries]
+                )
+
+                celery_logger.info(
+                    f"Fetch complete: {total_pages_fetched} pages from "
+                    f"{completed_subqueries} subqueries, "
+                    f"{ids_collected} new IDs"
+                )
+                fetching_done = True
+
+                # Send sentinel values to stop workers
+                for _ in range(NUM_WORKERS):
+                    await queue.put(None)
+
+            # --- Consumer workers: process listings from queue ---
+            async def worker() -> None:
+                nonlocal processed_count, failed_count
+                nonlocal details_fetched, images_downloaded, ocr_completed
+
+                while True:
+                    listing_id = await queue.get()
+                    if listing_id is None:
+                        break
+
+                    def step_callback(step_name: str) -> None:
+                        nonlocal details_fetched, images_downloaded, ocr_completed
+                        if step_name == "details":
+                            details_fetched += 1
+                        elif step_name == "images":
+                            images_downloaded += 1
+                        elif step_name == "ocr":
+                            ocr_completed += 1
+
+                    listing = await listing_processor.process_listing(
+                        listing_id, on_step_complete=step_callback
+                    )
+                    if listing is not None:
+                        processed_count += 1
+                        processed_listings.append(listing)
+                    else:
+                        failed_count += 1
+
+            # --- Monitor: reports combined progress ---
+            async def monitor() -> None:
+                last_progress = 0.0
+
+                while True:
+                    total = ids_collected
+                    done = processed_count + failed_count
+
+                    if fetching_done and done >= total and total > 0:
+                        break
+                    if fetching_done and total == 0:
+                        break
+
+                    # Determine phase label
+                    phase = "processing" if fetching_done else "fetching"
+
+                    if total > 0:
+                        progress_ratio = round(done / total, 2)
+                    else:
+                        progress_ratio = 0.0
+
+                    elapsed = time.time() - start_time
+                    rate = done / elapsed if elapsed > 0 else 0
+                    remaining = (total - done) if total > 0 else 0
+                    eta = remaining / rate if rate > 0 else 0
+
+                    if progress_ratio >= last_progress + 0.1 or done == 1:
+                        celery_logger.info(
+                            f"Progress: {progress_ratio * 100:.0f}% "
+                            f"({done}/{total}) "
+                            f"| Elapsed: {elapsed:.0f}s "
+                            f"| Rate: {rate:.1f}/s "
+                            f"| ETA: {eta:.0f}s"
+                        )
+                        last_progress = progress_ratio
+
+                    _update_task_state(
+                        task,
+                        f"{'Processing' if fetching_done else 'Fetching & processing'}: "
+                        f"{done}/{total}",
+                        {
+                            "phase": phase,
+                            "progress": progress_ratio,
+                            "processed": done,
+                            "total": total,
+                            "subqueries_completed": completed_subqueries,
+                            "subqueries_total": len(subqueries),
+                            "ids_collected": ids_collected,
+                            "pages_fetched": total_pages_fetched,
+                            "fetching_done": fetching_done,
+                            "details_fetched": details_fetched,
+                            "images_downloaded": images_downloaded,
+                            "ocr_completed": ocr_completed,
+                            "failed": failed_count,
+                            "elapsed_seconds": round(elapsed, 1),
+                            "rate_per_second": round(rate, 2),
+                            "eta_seconds": round(eta, 1),
+                        },
+                    )
+                    await asyncio.sleep(1)
+
+            # Run producer, workers, and monitor concurrently
+            await asyncio.gather(
+                producer(),
+                *[worker() for _ in range(NUM_WORKERS)],
+                monitor(),
+            )
+
+    except CircuitBreakerOpenError as e:
+        celery_logger.error(f"Circuit breaker prevented query: {e}")
+        metrics = get_throttle_metrics()
+        if metrics.total_requests > 0:
+            celery_logger.info(metrics.summary())
        return []
-
-    celery_logger.info("=" * 60)
-    celery_logger.info("PHASE 2: Processing listings (fetch details, images, OCR)")
-    celery_logger.info("=" * 60)
-
-    listing_processor = ListingProcessor(repository)
-    celery_logger.info(f"Starting processing {len(ids_to_process)} listings")
-    logger.info(f"Starting processing {len(ids_to_process)} listings")
-
-    result = await dump_listings_and_monitor(
-        task=task, listing_processor=listing_processor, missing_ids=ids_to_process
-    )
+    finally:
+        metrics = get_throttle_metrics()
+        if metrics.total_requests > 0:
+            celery_logger.info(
+                f"API Stats: {metrics.total_requests} requests, "
+                f"avg {metrics.average_response_time:.2f}s, "
+                f"{metrics.total_throttling_events} throttled"
+            )

    elapsed = time.time() - start_time
    celery_logger.info("=" * 60)
-    celery_logger.info(f"COMPLETED: Processed {len(result)} listings in {elapsed:.1f}s")
+    celery_logger.info(
+        f"COMPLETED: Processed {len(processed_listings)} listings in {elapsed:.1f}s"
+    )
    celery_logger.info("=" * 60)

    invalidate_cache()

-    # Send final state so the frontend has rich data even after task completes
    _update_task_state(task, "Completed", {
        "phase": "completed", "progress": 1,
-        "processed": len(result), "total": len(ids_to_process),
-        "message": f"Processed {len(result)} listings in {elapsed:.0f}s",
+        "processed": len(processed_listings), "total": ids_collected,
+        "message": f"Processed {len(processed_listings)} listings in {elapsed:.0f}s",
    })

-    return result
-
-
-async def dump_listings_and_monitor(
-    *, task: Task, listing_processor: ListingProcessor, missing_ids: set[int]
-) -> list[Listing]:
-    task_progress = {missing_id: 0 for missing_id in missing_ids}
-    processed_count = 0
-    failed_count = 0
-    details_fetched = 0
-    images_downloaded = 0
-    ocr_completed = 0
-    start_time = time.time()
-
-    async def process(missing_id: int) -> Listing | None:
-        nonlocal processed_count, failed_count
-
-        def step_callback(step_name: str) -> None:
-            nonlocal details_fetched, images_downloaded, ocr_completed
-            if step_name == "details":
-                details_fetched += 1
-            elif step_name == "images":
-                images_downloaded += 1
-            elif step_name == "ocr":
-                ocr_completed += 1
-
-        listing = await listing_processor.process_listing(
-            missing_id, on_step_complete=step_callback
-        )
-        task_progress[missing_id] = 1
-        if listing is not None:
-            processed_count += 1
-        else:
-            failed_count += 1
-        return listing
-
-    async def monitor() -> None:
-        last_progress = 0
-        while (progress := sum(task_progress.values())) < len(missing_ids):
-            progress_ratio = round(progress / len(missing_ids), 2)
-
-            elapsed = time.time() - start_time
-            rate = progress / elapsed if elapsed > 0 else 0
-            eta = (len(missing_ids) - progress) / rate if rate > 0 else 0
-
-            # Log every 10% progress or at least every update
-            if progress_ratio >= last_progress + 0.1 or progress == 1:
-                celery_logger.info(
-                    f"Progress: {progress_ratio * 100:.0f}% "
-                    f"({progress}/{len(missing_ids)}) "
-                    f"| Elapsed: {elapsed:.0f}s "
-                    f"| Rate: {rate:.1f}/s "
-                    f"| ETA: {eta:.0f}s"
-                )
-                last_progress = progress_ratio
-
-            _update_task_state(
-                task,
-                f"Processing: {progress_ratio * 100:.0f}% ({progress}/{len(missing_ids)})",
-                {
-                    "phase": "processing",
-                    "progress": progress_ratio,
-                    "processed": progress,
-                    "total": len(missing_ids),
-                    "details_fetched": details_fetched,
-                    "images_downloaded": images_downloaded,
-                    "ocr_completed": ocr_completed,
-                    "failed": failed_count,
-                    "elapsed_seconds": round(elapsed, 1),
-                    "rate_per_second": round(rate, 2),
-                    "eta_seconds": round(eta, 1),
-                },
-            )
-            await asyncio.sleep(1)
-
-    processed_listings = await asyncio.gather(
-        *[process(id) for id in missing_ids], *[monitor()]
-    )
-    filtered_listings = [listing for listing in processed_listings if listing is not None]
-
-    celery_logger.info(
-        f"Processing complete: {processed_count} successful, {failed_count} failed"
-    )
-
-    return filtered_listings
+    return processed_listings


@app.on_after_finalize.connect
@ -297,227 +456,3 @@ def setup_periodic_tasks(sender, **kwargs):
            dump_listings_task.s(schedule.to_query_parameters().model_dump_json()),
            name=schedule.name,
        )
-
-
-async def get_ids_to_process(
-    *,
-    parameters: QueryParameters,
-    repository: ListingRepository,
-    task: Task,
-) -> set[int]:
-    """Fetch all listing IDs using intelligent query splitting.
-
-    Uses the QuerySplitter to adaptively split large queries and maximize
-    data extraction while respecting Rightmove's result caps.
-
-    Args:
-        parameters: Query parameters for the search.
-        repository: Repository for checking existing listings.
-        task: Celery task for progress updates.
-
-    Returns:
-        Set of new listing IDs that need to be processed.
-    """
-    config = ScraperConfig.from_env()
-    splitter = QuerySplitter(config)
-
-    # Reset throttle metrics
-    reset_throttle_metrics()
-
-    def on_progress(phase: str, message: str, **kwargs: Any) -> None:
-        meta: dict[str, Any] = {"phase": phase, "message": message}
-        meta.update(kwargs)
-        _update_task_state(task, message, meta)
-        celery_logger.info(f"[{phase}] {message}")
-
-    celery_logger.info("Starting query splitting and probing...")
-
-    try:
-        async with create_session(config) as session:
-            # Phase 1 & 2: Split and probe queries
-            _update_task_state(task, "Analyzing query and splitting by price bands...", {
-                "phase": "splitting", "progress": 0,
-            })
-            subqueries = await splitter.split(parameters, session, on_progress)
-
-            total_estimated = splitter.calculate_total_estimated_results(subqueries)
-            celery_logger.info(
-                f"Query split complete: {len(subqueries)} subqueries, "
-                f"~{total_estimated} estimated total results"
-            )
-            logger.info(
-                f"Split into {len(subqueries)} subqueries, "
-                f"estimated {total_estimated} total results"
-            )
-
-            # Phase 3: Fetch all pages for each subquery
-            _update_task_state(
-                task,
-                f"Fetching listings from {len(subqueries)} subqueries...",
-                {
-                    "phase": "fetching",
-                    "subqueries_completed": 0,
-                    "subqueries_total": len(subqueries),
-                    "ids_collected": 0,
-                    "pages_fetched": 0,
-                    "estimated_results": total_estimated,
-                },
-            )
-
-            celery_logger.info(f"Fetching pages from {len(subqueries)} subqueries...")
-
-            semaphore = asyncio.Semaphore(config.max_concurrent_requests)
-            identifiers: set[int] = set()
-            completed_subqueries = 0
-            total_pages_fetched = 0
-
-            async def fetch_subquery(sq: SubQuery) -> list[dict[str, Any]]:
-                """Fetch all pages for a single subquery."""
-                nonlocal completed_subqueries, total_pages_fetched
-                results: list[dict[str, Any]] = []
-
-                # Calculate how many pages we need based on estimated results
-                estimated = sq.estimated_results or 0
-                if estimated == 0:
-                    completed_subqueries += 1
-                    _update_task_state(
-                        task,
-                        f"Fetching: {completed_subqueries}/{len(subqueries)} subqueries",
-                        {
-                            "phase": "fetching",
-                            "subqueries_completed": completed_subqueries,
-                            "subqueries_total": len(subqueries),
-                            "ids_collected": len(identifiers),
-                            "pages_fetched": total_pages_fetched,
-                        },
-                    )
-                    return results
-
-                # Fetch pages up to max_pages_per_query or until no more results
-                page_size = parameters.page_size
-                max_pages = min(
-                    config.max_pages_per_query,
-                    (estimated // page_size) + 1,
-                )
-
-                for page_id in range(1, max_pages + 1):
-                    async with semaphore:
-                        await asyncio.sleep(config.request_delay_ms / 1000)
-                        try:
-                            result = await listing_query(
-                                page=page_id,
-                                channel=parameters.listing_type,
-                                min_bedrooms=sq.min_bedrooms,
-                                max_bedrooms=sq.max_bedrooms,
-                                radius=parameters.radius,
-                                min_price=sq.min_price,
-                                max_price=sq.max_price,
-                                district=sq.district,
-                                page_size=page_size,
-                                max_days_since_added=parameters.max_days_since_added,
-                                furnish_types=parameters.furnish_types or [],
-                                session=session,
-                                config=config,
-                            )
-                            results.append(result)
-                            total_pages_fetched += 1
-
-                            # Check if we've received all results
-                            properties = result.get("properties", [])
-                            if len(properties) < page_size:
-                                # No more results on next page
-                                break
-
-                        except CircuitBreakerOpenError as e:
-                            celery_logger.error(f"Circuit breaker open: {e}")
-                            break
-                        except ThrottlingError as e:
-                            celery_logger.warning(
-                                f"Throttling on {sq.district} page {page_id}: {e}"
-                            )
-                            break
-                        except Exception as e:
-                            if "GENERIC_ERROR" in str(e):
-                                # Reached end of results
-                                logger.debug(
-                                    f"Max page for {sq.district}: {page_id - 1}"
-                                )
-                                break
-                            logger.warning(
-                                f"Error fetching page {page_id} for {sq.district}: {e}"
-                            )
-                            break
-
-                completed_subqueries += 1
-                _update_task_state(
-                    task,
-                    f"Fetching: {completed_subqueries}/{len(subqueries)} subqueries",
-                    {
-                        "phase": "fetching",
-                        "subqueries_completed": completed_subqueries,
-                        "subqueries_total": len(subqueries),
-                        "ids_collected": len(identifiers),
-                        "pages_fetched": total_pages_fetched,
-                    },
-                )
-                return results
-
-            # Fetch all subqueries concurrently
-            all_results = await asyncio.gather(
-                *[fetch_subquery(sq) for sq in subqueries]
-            )
-
-            celery_logger.info(
-                f"Fetch complete: {total_pages_fetched} pages from "
-                f"{completed_subqueries} subqueries"
-            )
-
-            # Extract identifiers from all results
-            for subquery_results in all_results:
-                for response_json in subquery_results:
-                    if not response_json:
-                        continue
-                    if response_json.get("totalAvailableResults", 0) == 0:
-                        continue
-                    for property_data in response_json.get("properties", []):
-                        identifier = property_data.get("identifier")
-                        if identifier:
-                            identifiers.add(identifier)
-
-    except CircuitBreakerOpenError as e:
-        celery_logger.error(f"Circuit breaker prevented query: {e}")
-        # Log throttle metrics
-        metrics = get_throttle_metrics()
-        if metrics.total_requests > 0:
-            celery_logger.info(metrics.summary())
-        return set()
-    finally:
-        # Log throttle metrics
-        metrics = get_throttle_metrics()
-        if metrics.total_requests > 0:
-            celery_logger.info(f"API Stats: {metrics.total_requests} requests, "
-                               f"avg {metrics.average_response_time:.2f}s, "
-                               f"{metrics.total_throttling_events} throttled")
-
-    celery_logger.info(f"Found {len(identifiers)} unique listing IDs from API")
-    logger.info(f"Found {len(identifiers)} unique listings")
-
-    # Filter out listings already in the database
-    celery_logger.info("Checking database for existing listings...")
-    all_listing_ids = {listing.id for listing in await repository.get_listings()}
-    new_ids = identifiers - all_listing_ids
-
-    celery_logger.info(
-        f"Filtering: {len(identifiers)} total, "
-        f"{len(all_listing_ids)} existing in DB, "
-        f"{len(new_ids)} new to process"
-    )
-
-    _update_task_state(task, f"Found {len(new_ids)} new listings to process", {
-        "phase": "filtering",
-        "total_found": len(identifiers),
-        "existing_in_db": len(all_listing_ids),
-        "new_listings": len(new_ids),
-    })
-
-    return new_ids