Add crawl job progress drawer with phase tracking and live logs

- Add phase-aware progress reporting across all crawl phases (splitting, fetching, filtering, processing) with per-step counters - Add TaskProgressDrawer component with phase timeline stepper, detail counters, progress bar with ETA, and live worker log viewer - Add on_step_complete callback to ListingProcessor for granular tracking of details/images/OCR steps - Extend QuerySplitter on_progress callback with structured counter data - Capture celery worker logs via ring buffer handler and inject into task state updates for frontend display - Guard taskResult updates with phase presence check to prevent drawer from blanking during state transitions
2026-02-06 22:37:53 +00:00 · 2026-02-06 22:37:53 +00:00 · b4837e1603
commit b4837e1603
parent 4018503723
6 changed files with 617 additions and 24 deletions
--- a/crawler/tasks/listing_tasks.py
+++ b/crawler/tasks/listing_tasks.py
@ -1,6 +1,7 @@
 import asyncio
 import logging
 import time
+from collections import deque
 from typing import Any
 from celery import Task
 from celery.schedules import crontab
@ -31,6 +32,21 @@ if not celery_logger.handlers:
    celery_logger.setLevel(logging.INFO)

 SCRAPE_LOCK_NAME = "scrape_listings"
+LOG_BUFFER_MAX_LINES = 200
+
+
+class TaskLogHandler(logging.Handler):
+    """Captures log records into a deque for inclusion in task state updates."""
+
+    def __init__(self, buffer: deque[str]) -> None:
+        super().__init__()
+        self.buffer = buffer
+
+    def emit(self, record: logging.LogRecord) -> None:
+        try:
+            self.buffer.append(self.format(record))
+        except Exception:
+            pass


@app.task(bind=True, pydantic=True)
@ -49,9 +65,9 @@ def dump_listings_task(self: Task, parameters_json: str) -> dict[str, Any]:
        parsed_parameters = QueryParameters.model_validate_json(parameters_json)
        celery_logger.info(f"Starting scrape with parameters: {parsed_parameters}")

-        self.update_state(state="Starting...", meta={"progress": 0})
+        self.update_state(state="Starting...", meta={"phase": "splitting", "progress": 0})
        asyncio.run(dump_listings_full(task=self, parameters=parsed_parameters))
-        return {"progress": 0}
+        return {"phase": "completed", "progress": 1}


 async def async_dump_listings_task(parameters_json: str) -> dict[str, Any]:
@ -70,6 +86,39 @@ async def dump_listings_full(
    *, task: Task, parameters: QueryParameters
 ) -> list[Listing]:
    """Fetches all listings, images as well as detects floorplans"""
+    # Set up log capture: a ring buffer handler that we inject into every
+    # task.update_state() call so the frontend can display live logs.
+    log_buffer: deque[str] = deque(maxlen=LOG_BUFFER_MAX_LINES)
+    log_handler = TaskLogHandler(log_buffer)
+    log_handler.setFormatter(
+        logging.Formatter("%(asctime)s %(message)s", datefmt="%H:%M:%S")
+    )
+    celery_logger.addHandler(log_handler)
+
+    # Wrap task.update_state so every call automatically includes logs
+    _original_update_state = task.update_state
+
+    def _update_state_with_logs(
+        state: str | None = None, meta: dict[str, Any] | None = None, **kwargs: Any
+    ) -> None:
+        if meta is None:
+            meta = {}
+        meta["logs"] = list(log_buffer)
+        _original_update_state(state=state, meta=meta, **kwargs)
+
+    task.update_state = _update_state_with_logs  # type: ignore[assignment]
+
+    try:
+        return await _dump_listings_full_inner(task=task, parameters=parameters)
+    finally:
+        celery_logger.removeHandler(log_handler)
+        task.update_state = _original_update_state  # type: ignore[assignment]
+
+
+async def _dump_listings_full_inner(
+    *, task: Task, parameters: QueryParameters
+) -> list[Listing]:
+    """Inner implementation — called with log-capturing update_state wrapper."""
    start_time = time.time()
    celery_logger.info("=" * 60)
    celery_logger.info("PHASE 1: Initializing listing fetch")
@ -77,7 +126,7 @@ async def dump_listings_full(

    repository = ListingRepository(engine)

-    task.update_state(state="Identifying missing listings", meta={"progress": 0})
+    task.update_state(state="Identifying missing listings", meta={"phase": "splitting", "progress": 0})
    celery_logger.info("Querying Rightmove API to identify new listings...")
    ids_to_process = await get_ids_to_process(
        parameters=parameters, repository=repository, task=task
@ -92,7 +141,7 @@ async def dump_listings_full(
        invalidate_cache()
        task.update_state(
            state="No new listings found",
-            meta={"progress": 1, "processed": 0, "total": 0, "message": "All listings are up to date"},
+            meta={"phase": "completed", "progress": 1, "processed": 0, "total": 0, "message": "All listings are up to date"},
        )
        return []

@ -115,6 +164,18 @@ async def dump_listings_full(

    invalidate_cache()

+    # Send final state so the frontend has rich data even after task completes
+    task.update_state(
+        state="Completed",
+        meta={
+            "phase": "completed",
+            "progress": 1,
+            "processed": len(result),
+            "total": len(ids_to_process),
+            "message": f"Processed {len(result)} listings in {elapsed:.0f}s",
+        },
+    )
+
    return result


@ -124,11 +185,26 @@ async def dump_listings_and_monitor(
    task_progress = {missing_id: 0 for missing_id in missing_ids}
    processed_count = 0
    failed_count = 0
+    details_fetched = 0
+    images_downloaded = 0
+    ocr_completed = 0
    start_time = time.time()

    async def process(missing_id: int) -> Listing | None:
        nonlocal processed_count, failed_count
-        listing = await listing_processor.process_listing(missing_id)
+
+        def step_callback(step_name: str) -> None:
+            nonlocal details_fetched, images_downloaded, ocr_completed
+            if step_name == "details":
+                details_fetched += 1
+            elif step_name == "images":
+                images_downloaded += 1
+            elif step_name == "ocr":
+                ocr_completed += 1
+
+        listing = await listing_processor.process_listing(
+            missing_id, on_step_complete=step_callback
+        )
        task_progress[missing_id] = 1
        if listing is not None:
            processed_count += 1
@ -141,12 +217,12 @@ async def dump_listings_and_monitor(
        while (progress := sum(task_progress.values())) < len(missing_ids):
            progress_ratio = round(progress / len(missing_ids), 2)

+            elapsed = time.time() - start_time
+            rate = progress / elapsed if elapsed > 0 else 0
+            eta = (len(missing_ids) - progress) / rate if rate > 0 else 0
+
            # Log every 10% progress or at least every update
            if progress_ratio >= last_progress + 0.1 or progress == 1:
-                elapsed = time.time() - start_time
-                rate = progress / elapsed if elapsed > 0 else 0
-                eta = (len(missing_ids) - progress) / rate if rate > 0 else 0
-
                celery_logger.info(
                    f"Progress: {progress_ratio * 100:.0f}% "
                    f"({progress}/{len(missing_ids)}) "
@ -158,7 +234,19 @@ async def dump_listings_and_monitor(

            task.update_state(
                state=f"Processing: {progress_ratio * 100:.0f}% ({progress}/{len(missing_ids)})",
-                meta={"progress": progress_ratio, "processed": progress, "total": len(missing_ids)},
+                meta={
+                    "phase": "processing",
+                    "progress": progress_ratio,
+                    "processed": progress,
+                    "total": len(missing_ids),
+                    "details_fetched": details_fetched,
+                    "images_downloaded": images_downloaded,
+                    "ocr_completed": ocr_completed,
+                    "failed": failed_count,
+                    "elapsed_seconds": round(elapsed, 1),
+                    "rate_per_second": round(rate, 2),
+                    "eta_seconds": round(eta, 1),
+                },
            )
            await asyncio.sleep(1)

@ -224,8 +312,10 @@ async def get_ids_to_process(
    # Reset throttle metrics
    reset_throttle_metrics()

-    def on_progress(phase: str, message: str) -> None:
-        task.update_state(state=message, meta={"phase": phase})
+    def on_progress(phase: str, message: str, **kwargs: Any) -> None:
+        meta: dict[str, Any] = {"phase": phase, "message": message}
+        meta.update(kwargs)
+        task.update_state(state=message, meta=meta)
        celery_logger.info(f"[{phase}] {message}")

    celery_logger.info("Starting query splitting and probing...")
@ -254,7 +344,10 @@ async def get_ids_to_process(
                state=f"Fetching listings from {len(subqueries)} subqueries...",
                meta={
                    "phase": "fetching",
-                    "subqueries": len(subqueries),
+                    "subqueries_completed": 0,
+                    "subqueries_total": len(subqueries),
+                    "ids_collected": 0,
+                    "pages_fetched": 0,
                    "estimated_results": total_estimated,
                },
            )
@ -275,6 +368,16 @@ async def get_ids_to_process(
                estimated = sq.estimated_results or 0
                if estimated == 0:
                    completed_subqueries += 1
+                    task.update_state(
+                        state=f"Fetching: {completed_subqueries}/{len(subqueries)} subqueries",
+                        meta={
+                            "phase": "fetching",
+                            "subqueries_completed": completed_subqueries,
+                            "subqueries_total": len(subqueries),
+                            "ids_collected": len(identifiers),
+                            "pages_fetched": total_pages_fetched,
+                        },
+                    )
                    return results

                # Fetch pages up to max_pages_per_query or until no more results
@ -333,6 +436,16 @@ async def get_ids_to_process(
                            break

                completed_subqueries += 1
+                task.update_state(
+                    state=f"Fetching: {completed_subqueries}/{len(subqueries)} subqueries",
+                    meta={
+                        "phase": "fetching",
+                        "subqueries_completed": completed_subqueries,
+                        "subqueries_total": len(subqueries),
+                        "ids_collected": len(identifiers),
+                        "pages_fetched": total_pages_fetched,
+                    },
+                )
                return results

            # Fetch all subqueries concurrently
@ -391,6 +504,7 @@ async def get_ids_to_process(
        meta={
            "phase": "filtering",
            "total_found": len(identifiers),
+            "existing_in_db": len(all_listing_ids),
            "new_listings": len(new_ids),
        },
    )