wrongmove/crawler/tasks/listing_tasks.py
Viktor Barzin 7e8f1f0339
Fix live logs: replace monkey-patch with direct log injection
The previous approach monkey-patched task.update_state on the Celery
Task instance, but the assignment didn't take effect (Celery's Task
singleton may prevent instance method shadowing). Additionally, the
celery.task logger level was left at WARNING by Celery's worker setup,
silencing all INFO-level log capture.

Fix:
- Replace wrapper with _update_task_state() helper that directly injects
  logs from a module-level _active_log_buffer into every meta dict
- Attach TaskLogHandler to BOTH celery.task and uvicorn.error loggers
- Force both loggers to INFO level during task execution
- Every task.update_state call site now uses _update_task_state()
2026-02-06 23:13:01 +00:00

523 lines
20 KiB
Python

import asyncio
import logging
import time
from collections import deque
from typing import Any
from celery import Task
from celery.schedules import crontab
from celery_app import app
from config.schedule_config import SchedulesConfig
from config.scraper_config import ScraperConfig
from listing_processor import ListingProcessor
from models.listing import Listing, QueryParameters
from rec.query import create_session, listing_query
from rec.exceptions import CircuitBreakerOpenError, ThrottlingError
from rec.throttle_detector import get_throttle_metrics, reset_throttle_metrics
from repositories.listing_repository import ListingRepository
from database import engine
from services.query_splitter import QuerySplitter, SubQuery
from utils.redis_lock import redis_lock
from services.listing_cache import invalidate_cache
logger = logging.getLogger("uvicorn.error")
# Also configure a celery-specific logger that always outputs to stdout
celery_logger = logging.getLogger("celery.task")
if not celery_logger.handlers:
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(
"%(asctime)s [%(levelname)s] %(name)s: %(message)s"
))
celery_logger.addHandler(handler)
celery_logger.setLevel(logging.INFO)
SCRAPE_LOCK_NAME = "scrape_listings"
LOG_BUFFER_MAX_LINES = 200
# Module-level log buffer — active only during task execution.
# The TaskLogHandler appends here; _update_task_state reads from here.
_active_log_buffer: deque[str] | None = None
class TaskLogHandler(logging.Handler):
"""Captures log records into a deque for inclusion in task state updates."""
def __init__(self, buffer: deque[str]) -> None:
super().__init__()
self.buffer = buffer
def emit(self, record: logging.LogRecord) -> None:
try:
self.buffer.append(self.format(record))
except Exception:
pass
def _update_task_state(task: Task, state: str, meta: dict[str, Any]) -> None:
"""Call task.update_state with logs injected from the active log buffer."""
if _active_log_buffer is not None:
meta["logs"] = list(_active_log_buffer)
task.update_state(state=state, meta=meta)
@app.task(bind=True, pydantic=True)
def dump_listings_task(self: Task, parameters_json: str) -> dict[str, Any]:
with redis_lock(SCRAPE_LOCK_NAME) as acquired:
if not acquired:
msg = "Another scrape job is already running, skipping this execution"
logger.warning(msg)
celery_logger.warning(msg)
self.update_state(state="SKIPPED", meta={"reason": "Another scrape job is running"})
return {"status": "skipped", "reason": "another_job_running"}
celery_logger.info(f"Acquired lock: {SCRAPE_LOCK_NAME}")
logger.info(f"Acquired lock: {SCRAPE_LOCK_NAME}")
parsed_parameters = QueryParameters.model_validate_json(parameters_json)
celery_logger.info(f"Starting scrape with parameters: {parsed_parameters}")
self.update_state(state="Starting...", meta={"phase": "splitting", "progress": 0})
asyncio.run(dump_listings_full(task=self, parameters=parsed_parameters))
return {"phase": "completed", "progress": 1}
async def async_dump_listings_task(parameters_json: str) -> dict[str, Any]:
with redis_lock(SCRAPE_LOCK_NAME) as acquired:
if not acquired:
logger.warning("Another scrape job is already running, skipping this execution")
return {"status": "skipped", "reason": "another_job_running"}
logger.info(f"Acquired lock: {SCRAPE_LOCK_NAME}")
parsed_parameters = QueryParameters.model_validate_json(parameters_json)
await dump_listings_full(task=Task(), parameters=parsed_parameters)
return {"progress": 0}
async def dump_listings_full(
*, task: Task, parameters: QueryParameters
) -> list[Listing]:
"""Fetches all listings, images as well as detects floorplans"""
global _active_log_buffer
# Set up log capture into a module-level buffer so _update_task_state
# can inject logs into every state update.
log_buffer: deque[str] = deque(maxlen=LOG_BUFFER_MAX_LINES)
log_handler = TaskLogHandler(log_buffer)
log_handler.setFormatter(
logging.Formatter("%(asctime)s %(message)s", datefmt="%H:%M:%S")
)
# Attach handler to both loggers used in the codebase, and ensure
# they accept INFO-level messages (Celery's worker setup may leave
# the celery.task logger at WARNING).
_prev_celery_level = celery_logger.level
_prev_logger_level = logger.level
celery_logger.addHandler(log_handler)
logger.addHandler(log_handler)
if celery_logger.level == logging.NOTSET or celery_logger.level > logging.INFO:
celery_logger.setLevel(logging.INFO)
if logger.level == logging.NOTSET or logger.level > logging.INFO:
logger.setLevel(logging.INFO)
_active_log_buffer = log_buffer
try:
return await _dump_listings_full_inner(task=task, parameters=parameters)
finally:
_active_log_buffer = None
celery_logger.removeHandler(log_handler)
logger.removeHandler(log_handler)
celery_logger.setLevel(_prev_celery_level)
logger.setLevel(_prev_logger_level)
async def _dump_listings_full_inner(
*, task: Task, parameters: QueryParameters
) -> list[Listing]:
"""Inner implementation with log capture active."""
start_time = time.time()
celery_logger.info("=" * 60)
celery_logger.info("PHASE 1: Initializing listing fetch")
celery_logger.info("=" * 60)
repository = ListingRepository(engine)
_update_task_state(task, "Identifying missing listings", {"phase": "splitting", "progress": 0})
celery_logger.info("Querying Rightmove API to identify new listings...")
ids_to_process = await get_ids_to_process(
parameters=parameters, repository=repository, task=task
)
celery_logger.info(f"Found {len(ids_to_process)} new listings to process")
logger.info(f"Found {len(ids_to_process)} listings to process")
if len(ids_to_process) == 0:
elapsed = time.time() - start_time
celery_logger.info(f"No new listings found. Completed in {elapsed:.1f}s")
invalidate_cache()
_update_task_state(task, "No new listings found", {
"phase": "completed", "progress": 1, "processed": 0, "total": 0,
"message": "All listings are up to date",
})
return []
celery_logger.info("=" * 60)
celery_logger.info("PHASE 2: Processing listings (fetch details, images, OCR)")
celery_logger.info("=" * 60)
listing_processor = ListingProcessor(repository)
celery_logger.info(f"Starting processing {len(ids_to_process)} listings")
logger.info(f"Starting processing {len(ids_to_process)} listings")
result = await dump_listings_and_monitor(
task=task, listing_processor=listing_processor, missing_ids=ids_to_process
)
elapsed = time.time() - start_time
celery_logger.info("=" * 60)
celery_logger.info(f"COMPLETED: Processed {len(result)} listings in {elapsed:.1f}s")
celery_logger.info("=" * 60)
invalidate_cache()
# Send final state so the frontend has rich data even after task completes
_update_task_state(task, "Completed", {
"phase": "completed", "progress": 1,
"processed": len(result), "total": len(ids_to_process),
"message": f"Processed {len(result)} listings in {elapsed:.0f}s",
})
return result
async def dump_listings_and_monitor(
*, task: Task, listing_processor: ListingProcessor, missing_ids: set[int]
) -> list[Listing]:
task_progress = {missing_id: 0 for missing_id in missing_ids}
processed_count = 0
failed_count = 0
details_fetched = 0
images_downloaded = 0
ocr_completed = 0
start_time = time.time()
async def process(missing_id: int) -> Listing | None:
nonlocal processed_count, failed_count
def step_callback(step_name: str) -> None:
nonlocal details_fetched, images_downloaded, ocr_completed
if step_name == "details":
details_fetched += 1
elif step_name == "images":
images_downloaded += 1
elif step_name == "ocr":
ocr_completed += 1
listing = await listing_processor.process_listing(
missing_id, on_step_complete=step_callback
)
task_progress[missing_id] = 1
if listing is not None:
processed_count += 1
else:
failed_count += 1
return listing
async def monitor() -> None:
last_progress = 0
while (progress := sum(task_progress.values())) < len(missing_ids):
progress_ratio = round(progress / len(missing_ids), 2)
elapsed = time.time() - start_time
rate = progress / elapsed if elapsed > 0 else 0
eta = (len(missing_ids) - progress) / rate if rate > 0 else 0
# Log every 10% progress or at least every update
if progress_ratio >= last_progress + 0.1 or progress == 1:
celery_logger.info(
f"Progress: {progress_ratio * 100:.0f}% "
f"({progress}/{len(missing_ids)}) "
f"| Elapsed: {elapsed:.0f}s "
f"| Rate: {rate:.1f}/s "
f"| ETA: {eta:.0f}s"
)
last_progress = progress_ratio
_update_task_state(
task,
f"Processing: {progress_ratio * 100:.0f}% ({progress}/{len(missing_ids)})",
{
"phase": "processing",
"progress": progress_ratio,
"processed": progress,
"total": len(missing_ids),
"details_fetched": details_fetched,
"images_downloaded": images_downloaded,
"ocr_completed": ocr_completed,
"failed": failed_count,
"elapsed_seconds": round(elapsed, 1),
"rate_per_second": round(rate, 2),
"eta_seconds": round(eta, 1),
},
)
await asyncio.sleep(1)
processed_listings = await asyncio.gather(
*[process(id) for id in missing_ids], *[monitor()]
)
filtered_listings = [listing for listing in processed_listings if listing is not None]
celery_logger.info(
f"Processing complete: {processed_count} successful, {failed_count} failed"
)
return filtered_listings
@app.on_after_finalize.connect
def setup_periodic_tasks(sender, **kwargs):
"""Register periodic tasks from environment configuration."""
try:
config = SchedulesConfig.from_env()
except ValueError as e:
logger.error(f"Failed to load schedule configuration: {e}")
return
for schedule in config.get_enabled_schedules():
logger.info(
f"Registering periodic task: {schedule.name} at {schedule.hour}:{schedule.minute}"
)
sender.add_periodic_task(
crontab(
minute=schedule.minute,
hour=schedule.hour,
day_of_week=schedule.day_of_week,
),
dump_listings_task.s(schedule.to_query_parameters().model_dump_json()),
name=schedule.name,
)
async def get_ids_to_process(
*,
parameters: QueryParameters,
repository: ListingRepository,
task: Task,
) -> set[int]:
"""Fetch all listing IDs using intelligent query splitting.
Uses the QuerySplitter to adaptively split large queries and maximize
data extraction while respecting Rightmove's result caps.
Args:
parameters: Query parameters for the search.
repository: Repository for checking existing listings.
task: Celery task for progress updates.
Returns:
Set of new listing IDs that need to be processed.
"""
config = ScraperConfig.from_env()
splitter = QuerySplitter(config)
# Reset throttle metrics
reset_throttle_metrics()
def on_progress(phase: str, message: str, **kwargs: Any) -> None:
meta: dict[str, Any] = {"phase": phase, "message": message}
meta.update(kwargs)
_update_task_state(task, message, meta)
celery_logger.info(f"[{phase}] {message}")
celery_logger.info("Starting query splitting and probing...")
try:
async with create_session(config) as session:
# Phase 1 & 2: Split and probe queries
_update_task_state(task, "Analyzing query and splitting by price bands...", {
"phase": "splitting", "progress": 0,
})
subqueries = await splitter.split(parameters, session, on_progress)
total_estimated = splitter.calculate_total_estimated_results(subqueries)
celery_logger.info(
f"Query split complete: {len(subqueries)} subqueries, "
f"~{total_estimated} estimated total results"
)
logger.info(
f"Split into {len(subqueries)} subqueries, "
f"estimated {total_estimated} total results"
)
# Phase 3: Fetch all pages for each subquery
_update_task_state(
task,
f"Fetching listings from {len(subqueries)} subqueries...",
{
"phase": "fetching",
"subqueries_completed": 0,
"subqueries_total": len(subqueries),
"ids_collected": 0,
"pages_fetched": 0,
"estimated_results": total_estimated,
},
)
celery_logger.info(f"Fetching pages from {len(subqueries)} subqueries...")
semaphore = asyncio.Semaphore(config.max_concurrent_requests)
identifiers: set[int] = set()
completed_subqueries = 0
total_pages_fetched = 0
async def fetch_subquery(sq: SubQuery) -> list[dict[str, Any]]:
"""Fetch all pages for a single subquery."""
nonlocal completed_subqueries, total_pages_fetched
results: list[dict[str, Any]] = []
# Calculate how many pages we need based on estimated results
estimated = sq.estimated_results or 0
if estimated == 0:
completed_subqueries += 1
_update_task_state(
task,
f"Fetching: {completed_subqueries}/{len(subqueries)} subqueries",
{
"phase": "fetching",
"subqueries_completed": completed_subqueries,
"subqueries_total": len(subqueries),
"ids_collected": len(identifiers),
"pages_fetched": total_pages_fetched,
},
)
return results
# Fetch pages up to max_pages_per_query or until no more results
page_size = parameters.page_size
max_pages = min(
config.max_pages_per_query,
(estimated // page_size) + 1,
)
for page_id in range(1, max_pages + 1):
async with semaphore:
await asyncio.sleep(config.request_delay_ms / 1000)
try:
result = await listing_query(
page=page_id,
channel=parameters.listing_type,
min_bedrooms=sq.min_bedrooms,
max_bedrooms=sq.max_bedrooms,
radius=parameters.radius,
min_price=sq.min_price,
max_price=sq.max_price,
district=sq.district,
page_size=page_size,
max_days_since_added=parameters.max_days_since_added,
furnish_types=parameters.furnish_types or [],
session=session,
config=config,
)
results.append(result)
total_pages_fetched += 1
# Check if we've received all results
properties = result.get("properties", [])
if len(properties) < page_size:
# No more results on next page
break
except CircuitBreakerOpenError as e:
celery_logger.error(f"Circuit breaker open: {e}")
break
except ThrottlingError as e:
celery_logger.warning(
f"Throttling on {sq.district} page {page_id}: {e}"
)
break
except Exception as e:
if "GENERIC_ERROR" in str(e):
# Reached end of results
logger.debug(
f"Max page for {sq.district}: {page_id - 1}"
)
break
logger.warning(
f"Error fetching page {page_id} for {sq.district}: {e}"
)
break
completed_subqueries += 1
_update_task_state(
task,
f"Fetching: {completed_subqueries}/{len(subqueries)} subqueries",
{
"phase": "fetching",
"subqueries_completed": completed_subqueries,
"subqueries_total": len(subqueries),
"ids_collected": len(identifiers),
"pages_fetched": total_pages_fetched,
},
)
return results
# Fetch all subqueries concurrently
all_results = await asyncio.gather(
*[fetch_subquery(sq) for sq in subqueries]
)
celery_logger.info(
f"Fetch complete: {total_pages_fetched} pages from "
f"{completed_subqueries} subqueries"
)
# Extract identifiers from all results
for subquery_results in all_results:
for response_json in subquery_results:
if not response_json:
continue
if response_json.get("totalAvailableResults", 0) == 0:
continue
for property_data in response_json.get("properties", []):
identifier = property_data.get("identifier")
if identifier:
identifiers.add(identifier)
except CircuitBreakerOpenError as e:
celery_logger.error(f"Circuit breaker prevented query: {e}")
# Log throttle metrics
metrics = get_throttle_metrics()
if metrics.total_requests > 0:
celery_logger.info(metrics.summary())
return set()
finally:
# Log throttle metrics
metrics = get_throttle_metrics()
if metrics.total_requests > 0:
celery_logger.info(f"API Stats: {metrics.total_requests} requests, "
f"avg {metrics.average_response_time:.2f}s, "
f"{metrics.total_throttling_events} throttled")
celery_logger.info(f"Found {len(identifiers)} unique listing IDs from API")
logger.info(f"Found {len(identifiers)} unique listings")
# Filter out listings already in the database
celery_logger.info("Checking database for existing listings...")
all_listing_ids = {listing.id for listing in await repository.get_listings()}
new_ids = identifiers - all_listing_ids
celery_logger.info(
f"Filtering: {len(identifiers)} total, "
f"{len(all_listing_ids)} existing in DB, "
f"{len(new_ids)} new to process"
)
_update_task_state(task, f"Found {len(new_ids)} new listings to process", {
"phase": "filtering",
"total_found": len(identifiers),
"existing_in_db": len(all_listing_ids),
"new_listings": len(new_ids),
})
return new_ids