Add crawl job progress drawer with phase tracking and live logs
- Add phase-aware progress reporting across all crawl phases (splitting, fetching, filtering, processing) with per-step counters - Add TaskProgressDrawer component with phase timeline stepper, detail counters, progress bar with ETA, and live worker log viewer - Add on_step_complete callback to ListingProcessor for granular tracking of details/images/OCR steps - Extend QuerySplitter on_progress callback with structured counter data - Capture celery worker logs via ring buffer handler and inject into task state updates for frontend display - Guard taskResult updates with phase presence check to prevent drawer from blanking during state transitions
This commit is contained in:
parent
4018503723
commit
b4837e1603
6 changed files with 617 additions and 24 deletions
|
|
@ -1,6 +1,7 @@
|
|||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
from collections import deque
|
||||
from typing import Any
|
||||
from celery import Task
|
||||
from celery.schedules import crontab
|
||||
|
|
@ -31,6 +32,21 @@ if not celery_logger.handlers:
|
|||
celery_logger.setLevel(logging.INFO)
|
||||
|
||||
SCRAPE_LOCK_NAME = "scrape_listings"
|
||||
LOG_BUFFER_MAX_LINES = 200
|
||||
|
||||
|
||||
class TaskLogHandler(logging.Handler):
|
||||
"""Captures log records into a deque for inclusion in task state updates."""
|
||||
|
||||
def __init__(self, buffer: deque[str]) -> None:
|
||||
super().__init__()
|
||||
self.buffer = buffer
|
||||
|
||||
def emit(self, record: logging.LogRecord) -> None:
|
||||
try:
|
||||
self.buffer.append(self.format(record))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
@app.task(bind=True, pydantic=True)
|
||||
|
|
@ -49,9 +65,9 @@ def dump_listings_task(self: Task, parameters_json: str) -> dict[str, Any]:
|
|||
parsed_parameters = QueryParameters.model_validate_json(parameters_json)
|
||||
celery_logger.info(f"Starting scrape with parameters: {parsed_parameters}")
|
||||
|
||||
self.update_state(state="Starting...", meta={"progress": 0})
|
||||
self.update_state(state="Starting...", meta={"phase": "splitting", "progress": 0})
|
||||
asyncio.run(dump_listings_full(task=self, parameters=parsed_parameters))
|
||||
return {"progress": 0}
|
||||
return {"phase": "completed", "progress": 1}
|
||||
|
||||
|
||||
async def async_dump_listings_task(parameters_json: str) -> dict[str, Any]:
|
||||
|
|
@ -70,6 +86,39 @@ async def dump_listings_full(
|
|||
*, task: Task, parameters: QueryParameters
|
||||
) -> list[Listing]:
|
||||
"""Fetches all listings, images as well as detects floorplans"""
|
||||
# Set up log capture: a ring buffer handler that we inject into every
|
||||
# task.update_state() call so the frontend can display live logs.
|
||||
log_buffer: deque[str] = deque(maxlen=LOG_BUFFER_MAX_LINES)
|
||||
log_handler = TaskLogHandler(log_buffer)
|
||||
log_handler.setFormatter(
|
||||
logging.Formatter("%(asctime)s %(message)s", datefmt="%H:%M:%S")
|
||||
)
|
||||
celery_logger.addHandler(log_handler)
|
||||
|
||||
# Wrap task.update_state so every call automatically includes logs
|
||||
_original_update_state = task.update_state
|
||||
|
||||
def _update_state_with_logs(
|
||||
state: str | None = None, meta: dict[str, Any] | None = None, **kwargs: Any
|
||||
) -> None:
|
||||
if meta is None:
|
||||
meta = {}
|
||||
meta["logs"] = list(log_buffer)
|
||||
_original_update_state(state=state, meta=meta, **kwargs)
|
||||
|
||||
task.update_state = _update_state_with_logs # type: ignore[assignment]
|
||||
|
||||
try:
|
||||
return await _dump_listings_full_inner(task=task, parameters=parameters)
|
||||
finally:
|
||||
celery_logger.removeHandler(log_handler)
|
||||
task.update_state = _original_update_state # type: ignore[assignment]
|
||||
|
||||
|
||||
async def _dump_listings_full_inner(
|
||||
*, task: Task, parameters: QueryParameters
|
||||
) -> list[Listing]:
|
||||
"""Inner implementation — called with log-capturing update_state wrapper."""
|
||||
start_time = time.time()
|
||||
celery_logger.info("=" * 60)
|
||||
celery_logger.info("PHASE 1: Initializing listing fetch")
|
||||
|
|
@ -77,7 +126,7 @@ async def dump_listings_full(
|
|||
|
||||
repository = ListingRepository(engine)
|
||||
|
||||
task.update_state(state="Identifying missing listings", meta={"progress": 0})
|
||||
task.update_state(state="Identifying missing listings", meta={"phase": "splitting", "progress": 0})
|
||||
celery_logger.info("Querying Rightmove API to identify new listings...")
|
||||
ids_to_process = await get_ids_to_process(
|
||||
parameters=parameters, repository=repository, task=task
|
||||
|
|
@ -92,7 +141,7 @@ async def dump_listings_full(
|
|||
invalidate_cache()
|
||||
task.update_state(
|
||||
state="No new listings found",
|
||||
meta={"progress": 1, "processed": 0, "total": 0, "message": "All listings are up to date"},
|
||||
meta={"phase": "completed", "progress": 1, "processed": 0, "total": 0, "message": "All listings are up to date"},
|
||||
)
|
||||
return []
|
||||
|
||||
|
|
@ -115,6 +164,18 @@ async def dump_listings_full(
|
|||
|
||||
invalidate_cache()
|
||||
|
||||
# Send final state so the frontend has rich data even after task completes
|
||||
task.update_state(
|
||||
state="Completed",
|
||||
meta={
|
||||
"phase": "completed",
|
||||
"progress": 1,
|
||||
"processed": len(result),
|
||||
"total": len(ids_to_process),
|
||||
"message": f"Processed {len(result)} listings in {elapsed:.0f}s",
|
||||
},
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
|
|
@ -124,11 +185,26 @@ async def dump_listings_and_monitor(
|
|||
task_progress = {missing_id: 0 for missing_id in missing_ids}
|
||||
processed_count = 0
|
||||
failed_count = 0
|
||||
details_fetched = 0
|
||||
images_downloaded = 0
|
||||
ocr_completed = 0
|
||||
start_time = time.time()
|
||||
|
||||
async def process(missing_id: int) -> Listing | None:
|
||||
nonlocal processed_count, failed_count
|
||||
listing = await listing_processor.process_listing(missing_id)
|
||||
|
||||
def step_callback(step_name: str) -> None:
|
||||
nonlocal details_fetched, images_downloaded, ocr_completed
|
||||
if step_name == "details":
|
||||
details_fetched += 1
|
||||
elif step_name == "images":
|
||||
images_downloaded += 1
|
||||
elif step_name == "ocr":
|
||||
ocr_completed += 1
|
||||
|
||||
listing = await listing_processor.process_listing(
|
||||
missing_id, on_step_complete=step_callback
|
||||
)
|
||||
task_progress[missing_id] = 1
|
||||
if listing is not None:
|
||||
processed_count += 1
|
||||
|
|
@ -141,12 +217,12 @@ async def dump_listings_and_monitor(
|
|||
while (progress := sum(task_progress.values())) < len(missing_ids):
|
||||
progress_ratio = round(progress / len(missing_ids), 2)
|
||||
|
||||
elapsed = time.time() - start_time
|
||||
rate = progress / elapsed if elapsed > 0 else 0
|
||||
eta = (len(missing_ids) - progress) / rate if rate > 0 else 0
|
||||
|
||||
# Log every 10% progress or at least every update
|
||||
if progress_ratio >= last_progress + 0.1 or progress == 1:
|
||||
elapsed = time.time() - start_time
|
||||
rate = progress / elapsed if elapsed > 0 else 0
|
||||
eta = (len(missing_ids) - progress) / rate if rate > 0 else 0
|
||||
|
||||
celery_logger.info(
|
||||
f"Progress: {progress_ratio * 100:.0f}% "
|
||||
f"({progress}/{len(missing_ids)}) "
|
||||
|
|
@ -158,7 +234,19 @@ async def dump_listings_and_monitor(
|
|||
|
||||
task.update_state(
|
||||
state=f"Processing: {progress_ratio * 100:.0f}% ({progress}/{len(missing_ids)})",
|
||||
meta={"progress": progress_ratio, "processed": progress, "total": len(missing_ids)},
|
||||
meta={
|
||||
"phase": "processing",
|
||||
"progress": progress_ratio,
|
||||
"processed": progress,
|
||||
"total": len(missing_ids),
|
||||
"details_fetched": details_fetched,
|
||||
"images_downloaded": images_downloaded,
|
||||
"ocr_completed": ocr_completed,
|
||||
"failed": failed_count,
|
||||
"elapsed_seconds": round(elapsed, 1),
|
||||
"rate_per_second": round(rate, 2),
|
||||
"eta_seconds": round(eta, 1),
|
||||
},
|
||||
)
|
||||
await asyncio.sleep(1)
|
||||
|
||||
|
|
@ -224,8 +312,10 @@ async def get_ids_to_process(
|
|||
# Reset throttle metrics
|
||||
reset_throttle_metrics()
|
||||
|
||||
def on_progress(phase: str, message: str) -> None:
|
||||
task.update_state(state=message, meta={"phase": phase})
|
||||
def on_progress(phase: str, message: str, **kwargs: Any) -> None:
|
||||
meta: dict[str, Any] = {"phase": phase, "message": message}
|
||||
meta.update(kwargs)
|
||||
task.update_state(state=message, meta=meta)
|
||||
celery_logger.info(f"[{phase}] {message}")
|
||||
|
||||
celery_logger.info("Starting query splitting and probing...")
|
||||
|
|
@ -254,7 +344,10 @@ async def get_ids_to_process(
|
|||
state=f"Fetching listings from {len(subqueries)} subqueries...",
|
||||
meta={
|
||||
"phase": "fetching",
|
||||
"subqueries": len(subqueries),
|
||||
"subqueries_completed": 0,
|
||||
"subqueries_total": len(subqueries),
|
||||
"ids_collected": 0,
|
||||
"pages_fetched": 0,
|
||||
"estimated_results": total_estimated,
|
||||
},
|
||||
)
|
||||
|
|
@ -275,6 +368,16 @@ async def get_ids_to_process(
|
|||
estimated = sq.estimated_results or 0
|
||||
if estimated == 0:
|
||||
completed_subqueries += 1
|
||||
task.update_state(
|
||||
state=f"Fetching: {completed_subqueries}/{len(subqueries)} subqueries",
|
||||
meta={
|
||||
"phase": "fetching",
|
||||
"subqueries_completed": completed_subqueries,
|
||||
"subqueries_total": len(subqueries),
|
||||
"ids_collected": len(identifiers),
|
||||
"pages_fetched": total_pages_fetched,
|
||||
},
|
||||
)
|
||||
return results
|
||||
|
||||
# Fetch pages up to max_pages_per_query or until no more results
|
||||
|
|
@ -333,6 +436,16 @@ async def get_ids_to_process(
|
|||
break
|
||||
|
||||
completed_subqueries += 1
|
||||
task.update_state(
|
||||
state=f"Fetching: {completed_subqueries}/{len(subqueries)} subqueries",
|
||||
meta={
|
||||
"phase": "fetching",
|
||||
"subqueries_completed": completed_subqueries,
|
||||
"subqueries_total": len(subqueries),
|
||||
"ids_collected": len(identifiers),
|
||||
"pages_fetched": total_pages_fetched,
|
||||
},
|
||||
)
|
||||
return results
|
||||
|
||||
# Fetch all subqueries concurrently
|
||||
|
|
@ -391,6 +504,7 @@ async def get_ids_to_process(
|
|||
meta={
|
||||
"phase": "filtering",
|
||||
"total_found": len(identifiers),
|
||||
"existing_in_db": len(all_listing_ids),
|
||||
"new_listings": len(new_ids),
|
||||
},
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue