Fix live logs: replace monkey-patch with direct log injection

The previous approach monkey-patched task.update_state on the Celery Task instance, but the assignment didn't take effect (Celery's Task singleton may prevent instance method shadowing). Additionally, the celery.task logger level was left at WARNING by Celery's worker setup, silencing all INFO-level log capture.

Fix:
- Replace wrapper with _update_task_state() helper that directly injects logs from a module-level _active_log_buffer into every meta dict
- Attach TaskLogHandler to BOTH celery.task and uvicorn.error loggers
- Force both loggers to INFO level during task execution
- Every task.update_state call site now uses _update_task_state()
2026-02-06 22:55:40 +00:00 · 2026-02-06 22:55:40 +00:00 · 7e8f1f0339
commit 7e8f1f0339
parent b4837e1603
1 changed file with 68 additions and 57 deletions
--- a/crawler/tasks/listing_tasks.py
+++ b/crawler/tasks/listing_tasks.py
@@ -34,6 +34,10 @@ if not celery_logger.handlers:
 SCRAPE_LOCK_NAME = "scrape_listings"
 LOG_BUFFER_MAX_LINES = 200

+# Module-level log buffer — active only during task execution.
+# The TaskLogHandler appends here; _update_task_state reads from here.
+_active_log_buffer: deque[str] | None = None
+

 class TaskLogHandler(logging.Handler):
    """Captures log records into a deque for inclusion in task state updates."""
@@ -49,6 +53,13 @@ class TaskLogHandler(logging.Handler):
            pass


+def _update_task_state(task: Task, state: str, meta: dict[str, Any]) -> None:
+    """Call task.update_state with logs injected from the active log buffer."""
+    if _active_log_buffer is not None:
+        meta["logs"] = list(_active_log_buffer)
+    task.update_state(state=state, meta=meta)
+
+
 @app.task(bind=True, pydantic=True)
 def dump_listings_task(self: Task, parameters_json: str) -> dict[str, Any]:
    with redis_lock(SCRAPE_LOCK_NAME) as acquired:
@@ -86,39 +97,44 @@ async def dump_listings_full(
    *, task: Task, parameters: QueryParameters
 ) -> list[Listing]:
    """Fetches all listings, images as well as detects floorplans"""
-    # Set up log capture: a ring buffer handler that we inject into every
-    # task.update_state() call so the frontend can display live logs.
+    global _active_log_buffer
+
+    # Set up log capture into a module-level buffer so _update_task_state
+    # can inject logs into every state update.
    log_buffer: deque[str] = deque(maxlen=LOG_BUFFER_MAX_LINES)
    log_handler = TaskLogHandler(log_buffer)
    log_handler.setFormatter(
        logging.Formatter("%(asctime)s %(message)s", datefmt="%H:%M:%S")
    )
+
+    # Attach handler to both loggers used in the codebase, and ensure
+    # they accept INFO-level messages (Celery's worker setup may leave
+    # the celery.task logger at WARNING).
+    _prev_celery_level = celery_logger.level
+    _prev_logger_level = logger.level
    celery_logger.addHandler(log_handler)
+    logger.addHandler(log_handler)
+    if celery_logger.level == logging.NOTSET or celery_logger.level > logging.INFO:
+        celery_logger.setLevel(logging.INFO)
+    if logger.level == logging.NOTSET or logger.level > logging.INFO:
+        logger.setLevel(logging.INFO)

-    # Wrap task.update_state so every call automatically includes logs
-    _original_update_state = task.update_state
-
-    def _update_state_with_logs(
-        state: str | None = None, meta: dict[str, Any] | None = None, **kwargs: Any
-    ) -> None:
-        if meta is None:
-            meta = {}
-        meta["logs"] = list(log_buffer)
-        _original_update_state(state=state, meta=meta, **kwargs)
-
-    task.update_state = _update_state_with_logs  # type: ignore[assignment]
+    _active_log_buffer = log_buffer

    try:
        return await _dump_listings_full_inner(task=task, parameters=parameters)
    finally:
+        _active_log_buffer = None
        celery_logger.removeHandler(log_handler)
-        task.update_state = _original_update_state  # type: ignore[assignment]
+        logger.removeHandler(log_handler)
+        celery_logger.setLevel(_prev_celery_level)
+        logger.setLevel(_prev_logger_level)


 async def _dump_listings_full_inner(
    *, task: Task, parameters: QueryParameters
 ) -> list[Listing]:
-    """Inner implementation — called with log-capturing update_state wrapper."""
+    """Inner implementation with log capture active."""
    start_time = time.time()
    celery_logger.info("=" * 60)
    celery_logger.info("PHASE 1: Initializing listing fetch")
@@ -126,7 +142,7 @@ async def _dump_listings_full_inner(

    repository = ListingRepository(engine)

-    task.update_state(state="Identifying missing listings", meta={"phase": "splitting", "progress": 0})
+    _update_task_state(task, "Identifying missing listings", {"phase": "splitting", "progress": 0})
    celery_logger.info("Querying Rightmove API to identify new listings...")
    ids_to_process = await get_ids_to_process(
        parameters=parameters, repository=repository, task=task
@@ -139,10 +155,10 @@ async def _dump_listings_full_inner(
        elapsed = time.time() - start_time
        celery_logger.info(f"No new listings found. Completed in {elapsed:.1f}s")
        invalidate_cache()
-        task.update_state(
-            state="No new listings found",
-            meta={"phase": "completed", "progress": 1, "processed": 0, "total": 0, "message": "All listings are up to date"},
-        )
+        _update_task_state(task, "No new listings found", {
+            "phase": "completed", "progress": 1, "processed": 0, "total": 0,
+            "message": "All listings are up to date",
+        })
        return []

    celery_logger.info("=" * 60)
@@ -165,16 +181,11 @@ async def _dump_listings_full_inner(
    invalidate_cache()

    # Send final state so the frontend has rich data even after task completes
-    task.update_state(
-        state="Completed",
-        meta={
-            "phase": "completed",
-            "progress": 1,
-            "processed": len(result),
-            "total": len(ids_to_process),
-            "message": f"Processed {len(result)} listings in {elapsed:.0f}s",
-        },
-    )
+    _update_task_state(task, "Completed", {
+        "phase": "completed", "progress": 1,
+        "processed": len(result), "total": len(ids_to_process),
+        "message": f"Processed {len(result)} listings in {elapsed:.0f}s",
+    })

    return result

@@ -232,9 +243,10 @@ async def dump_listings_and_monitor(
                )
                last_progress = progress_ratio

-            task.update_state(
-                state=f"Processing: {progress_ratio * 100:.0f}% ({progress}/{len(missing_ids)})",
-                meta={
+            _update_task_state(
+                task,
+                f"Processing: {progress_ratio * 100:.0f}% ({progress}/{len(missing_ids)})",
+                {
                    "phase": "processing",
                    "progress": progress_ratio,
                    "processed": progress,
@@ -315,7 +327,7 @@ async def get_ids_to_process(
    def on_progress(phase: str, message: str, **kwargs: Any) -> None:
        meta: dict[str, Any] = {"phase": phase, "message": message}
        meta.update(kwargs)
-        task.update_state(state=message, meta=meta)
+        _update_task_state(task, message, meta)
        celery_logger.info(f"[{phase}] {message}")

    celery_logger.info("Starting query splitting and probing...")
@@ -323,10 +335,9 @@ async def get_ids_to_process(
    try:
        async with create_session(config) as session:
            # Phase 1 & 2: Split and probe queries
-            task.update_state(
-                state="Analyzing query and splitting by price bands...",
-                meta={"phase": "splitting", "progress": 0},
-            )
+            _update_task_state(task, "Analyzing query and splitting by price bands...", {
+                "phase": "splitting", "progress": 0,
+            })
            subqueries = await splitter.split(parameters, session, on_progress)

            total_estimated = splitter.calculate_total_estimated_results(subqueries)
@@ -340,9 +351,10 @@ async def get_ids_to_process(
            )

            # Phase 3: Fetch all pages for each subquery
-            task.update_state(
-                state=f"Fetching listings from {len(subqueries)} subqueries...",
-                meta={
+            _update_task_state(
+                task,
+                f"Fetching listings from {len(subqueries)} subqueries...",
+                {
                    "phase": "fetching",
                    "subqueries_completed": 0,
                    "subqueries_total": len(subqueries),
@@ -368,9 +380,10 @@ async def get_ids_to_process(
                estimated = sq.estimated_results or 0
                if estimated == 0:
                    completed_subqueries += 1
-                    task.update_state(
-                        state=f"Fetching: {completed_subqueries}/{len(subqueries)} subqueries",
-                        meta={
+                    _update_task_state(
+                        task,
+                        f"Fetching: {completed_subqueries}/{len(subqueries)} subqueries",
+                        {
                            "phase": "fetching",
                            "subqueries_completed": completed_subqueries,
                            "subqueries_total": len(subqueries),
@@ -436,9 +449,10 @@ async def get_ids_to_process(
                            break

                completed_subqueries += 1
-                task.update_state(
-                    state=f"Fetching: {completed_subqueries}/{len(subqueries)} subqueries",
-                    meta={
+                _update_task_state(
+                    task,
+                    f"Fetching: {completed_subqueries}/{len(subqueries)} subqueries",
+                    {
                        "phase": "fetching",
                        "subqueries_completed": completed_subqueries,
                        "subqueries_total": len(subqueries),
@@ -499,14 +513,11 @@ async def get_ids_to_process(
        f"{len(new_ids)} new to process"
    )

-    task.update_state(
-        state=f"Found {len(new_ids)} new listings to process",
-        meta={
-            "phase": "filtering",
-            "total_found": len(identifiers),
-            "existing_in_db": len(all_listing_ids),
-            "new_listings": len(new_ids),
-        },
-    )
+    _update_task_state(task, f"Found {len(new_ids)} new listings to process", {
+        "phase": "filtering",
+        "total_found": len(identifiers),
+        "existing_in_db": len(all_listing_ids),
+        "new_listings": len(new_ids),
+    })

    return new_ids