Fix live logs: replace monkey-patch with direct log injection

The previous approach monkey-patched task.update_state on the Celery
Task instance, but the assignment didn't take effect (Celery's Task
singleton may prevent instance method shadowing). Additionally, the
celery.task logger level was left at WARNING by Celery's worker setup,
silencing all INFO-level log capture.

Fix:
- Replace wrapper with _update_task_state() helper that directly injects
  logs from a module-level _active_log_buffer into every meta dict
- Attach TaskLogHandler to BOTH celery.task and uvicorn.error loggers
- Raise both loggers to INFO during task execution when their level is unset
  or stricter than INFO, and restore the previous levels afterwards
- Every task.update_state call site now uses _update_task_state()
This commit is contained in:
Viktor Barzin 2026-02-06 22:55:40 +00:00
parent b4837e1603
commit 7e8f1f0339
No known key found for this signature in database
GPG key ID: 0EB088298288D958

View file

@ -34,6 +34,10 @@ if not celery_logger.handlers:
SCRAPE_LOCK_NAME = "scrape_listings"
LOG_BUFFER_MAX_LINES = 200
# Module-level log buffer — active only during task execution.
# The TaskLogHandler appends here; _update_task_state reads from here.
_active_log_buffer: deque[str] | None = None
class TaskLogHandler(logging.Handler):
"""Captures log records into a deque for inclusion in task state updates."""
@ -49,6 +53,13 @@ class TaskLogHandler(logging.Handler):
pass
def _update_task_state(task: Task, state: str, meta: dict[str, Any]) -> None:
    """Call ``task.update_state`` with logs injected from the active log buffer.

    Args:
        task: Object whose ``update_state(state=..., meta=...)`` is invoked.
        state: State string forwarded unchanged.
        meta: Metadata for the update. It is not modified; when a log buffer
            is active, a shallow copy carrying a ``"logs"`` list is sent
            instead.
    """
    # Read the module-level buffer once so the check and the snapshot below
    # operate on the same object even if the global is reset in between.
    log_buffer = _active_log_buffer
    if log_buffer is not None:
        # Build a new dict rather than writing into the caller's, so a meta
        # dict that is reused later never carries a stale "logs" snapshot.
        meta = {**meta, "logs": list(log_buffer)}
    task.update_state(state=state, meta=meta)
@app.task(bind=True, pydantic=True)
def dump_listings_task(self: Task, parameters_json: str) -> dict[str, Any]:
with redis_lock(SCRAPE_LOCK_NAME) as acquired:
@ -86,39 +97,44 @@ async def dump_listings_full(
*, task: Task, parameters: QueryParameters
) -> list[Listing]:
"""Fetches all listings, images as well as detects floorplans"""
# Set up log capture: a ring buffer handler that we inject into every
# task.update_state() call so the frontend can display live logs.
global _active_log_buffer
# Set up log capture into a module-level buffer so _update_task_state
# can inject logs into every state update.
log_buffer: deque[str] = deque(maxlen=LOG_BUFFER_MAX_LINES)
log_handler = TaskLogHandler(log_buffer)
log_handler.setFormatter(
logging.Formatter("%(asctime)s %(message)s", datefmt="%H:%M:%S")
)
# Attach handler to both loggers used in the codebase, and ensure
# they accept INFO-level messages (Celery's worker setup may leave
# the celery.task logger at WARNING).
_prev_celery_level = celery_logger.level
_prev_logger_level = logger.level
celery_logger.addHandler(log_handler)
logger.addHandler(log_handler)
if celery_logger.level == logging.NOTSET or celery_logger.level > logging.INFO:
celery_logger.setLevel(logging.INFO)
if logger.level == logging.NOTSET or logger.level > logging.INFO:
logger.setLevel(logging.INFO)
# Wrap task.update_state so every call automatically includes logs
_original_update_state = task.update_state
def _update_state_with_logs(
state: str | None = None, meta: dict[str, Any] | None = None, **kwargs: Any
) -> None:
if meta is None:
meta = {}
meta["logs"] = list(log_buffer)
_original_update_state(state=state, meta=meta, **kwargs)
task.update_state = _update_state_with_logs # type: ignore[assignment]
_active_log_buffer = log_buffer
try:
return await _dump_listings_full_inner(task=task, parameters=parameters)
finally:
_active_log_buffer = None
celery_logger.removeHandler(log_handler)
task.update_state = _original_update_state # type: ignore[assignment]
logger.removeHandler(log_handler)
celery_logger.setLevel(_prev_celery_level)
logger.setLevel(_prev_logger_level)
async def _dump_listings_full_inner(
*, task: Task, parameters: QueryParameters
) -> list[Listing]:
"""Inner implementation — called with log-capturing update_state wrapper."""
"""Inner implementation with log capture active."""
start_time = time.time()
celery_logger.info("=" * 60)
celery_logger.info("PHASE 1: Initializing listing fetch")
@ -126,7 +142,7 @@ async def _dump_listings_full_inner(
repository = ListingRepository(engine)
task.update_state(state="Identifying missing listings", meta={"phase": "splitting", "progress": 0})
_update_task_state(task, "Identifying missing listings", {"phase": "splitting", "progress": 0})
celery_logger.info("Querying Rightmove API to identify new listings...")
ids_to_process = await get_ids_to_process(
parameters=parameters, repository=repository, task=task
@ -139,10 +155,10 @@ async def _dump_listings_full_inner(
elapsed = time.time() - start_time
celery_logger.info(f"No new listings found. Completed in {elapsed:.1f}s")
invalidate_cache()
task.update_state(
state="No new listings found",
meta={"phase": "completed", "progress": 1, "processed": 0, "total": 0, "message": "All listings are up to date"},
)
_update_task_state(task, "No new listings found", {
"phase": "completed", "progress": 1, "processed": 0, "total": 0,
"message": "All listings are up to date",
})
return []
celery_logger.info("=" * 60)
@ -165,16 +181,11 @@ async def _dump_listings_full_inner(
invalidate_cache()
# Send final state so the frontend has rich data even after task completes
task.update_state(
state="Completed",
meta={
"phase": "completed",
"progress": 1,
"processed": len(result),
"total": len(ids_to_process),
"message": f"Processed {len(result)} listings in {elapsed:.0f}s",
},
)
_update_task_state(task, "Completed", {
"phase": "completed", "progress": 1,
"processed": len(result), "total": len(ids_to_process),
"message": f"Processed {len(result)} listings in {elapsed:.0f}s",
})
return result
@ -232,9 +243,10 @@ async def dump_listings_and_monitor(
)
last_progress = progress_ratio
task.update_state(
state=f"Processing: {progress_ratio * 100:.0f}% ({progress}/{len(missing_ids)})",
meta={
_update_task_state(
task,
f"Processing: {progress_ratio * 100:.0f}% ({progress}/{len(missing_ids)})",
{
"phase": "processing",
"progress": progress_ratio,
"processed": progress,
@ -315,7 +327,7 @@ async def get_ids_to_process(
def on_progress(phase: str, message: str, **kwargs: Any) -> None:
meta: dict[str, Any] = {"phase": phase, "message": message}
meta.update(kwargs)
task.update_state(state=message, meta=meta)
_update_task_state(task, message, meta)
celery_logger.info(f"[{phase}] {message}")
celery_logger.info("Starting query splitting and probing...")
@ -323,10 +335,9 @@ async def get_ids_to_process(
try:
async with create_session(config) as session:
# Phase 1 & 2: Split and probe queries
task.update_state(
state="Analyzing query and splitting by price bands...",
meta={"phase": "splitting", "progress": 0},
)
_update_task_state(task, "Analyzing query and splitting by price bands...", {
"phase": "splitting", "progress": 0,
})
subqueries = await splitter.split(parameters, session, on_progress)
total_estimated = splitter.calculate_total_estimated_results(subqueries)
@ -340,9 +351,10 @@ async def get_ids_to_process(
)
# Phase 3: Fetch all pages for each subquery
task.update_state(
state=f"Fetching listings from {len(subqueries)} subqueries...",
meta={
_update_task_state(
task,
f"Fetching listings from {len(subqueries)} subqueries...",
{
"phase": "fetching",
"subqueries_completed": 0,
"subqueries_total": len(subqueries),
@ -368,9 +380,10 @@ async def get_ids_to_process(
estimated = sq.estimated_results or 0
if estimated == 0:
completed_subqueries += 1
task.update_state(
state=f"Fetching: {completed_subqueries}/{len(subqueries)} subqueries",
meta={
_update_task_state(
task,
f"Fetching: {completed_subqueries}/{len(subqueries)} subqueries",
{
"phase": "fetching",
"subqueries_completed": completed_subqueries,
"subqueries_total": len(subqueries),
@ -436,9 +449,10 @@ async def get_ids_to_process(
break
completed_subqueries += 1
task.update_state(
state=f"Fetching: {completed_subqueries}/{len(subqueries)} subqueries",
meta={
_update_task_state(
task,
f"Fetching: {completed_subqueries}/{len(subqueries)} subqueries",
{
"phase": "fetching",
"subqueries_completed": completed_subqueries,
"subqueries_total": len(subqueries),
@ -499,14 +513,11 @@ async def get_ids_to_process(
f"{len(new_ids)} new to process"
)
task.update_state(
state=f"Found {len(new_ids)} new listings to process",
meta={
"phase": "filtering",
"total_found": len(identifiers),
"existing_in_db": len(all_listing_ids),
"new_listings": len(new_ids),
},
)
_update_task_state(task, f"Found {len(new_ids)} new listings to process", {
"phase": "filtering",
"total_found": len(identifiers),
"existing_in_db": len(all_listing_ids),
"new_listings": len(new_ids),
})
return new_ids