Refactor codebase following Clean Code principles and add 229 tests
- Extract helpers to reduce function sizes (listing_tasks, app.py, query.py, listing_fetcher) - Replace nonlocal mutations with _PipelineState dataclass in listing_tasks - Fix bugs: isinstance→equality check in repository, verify_exp for OIDC tokens - Consolidate duplicate filter methods in listing_repository - Move hardcoded config to env vars with backward-compatible defaults - Simplify CLI decorator to auto-build QueryParameters - Add deprecation docstring to data_access.py - Test count: 158 → 387 (all passing)
This commit is contained in:
parent
7e05b3c971
commit
150342bb9e
48 changed files with 5029 additions and 990 deletions
|
|
@ -2,6 +2,7 @@ import asyncio
|
|||
import logging
|
||||
import time
|
||||
from collections import deque
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
from celery import Task
|
||||
from celery.schedules import crontab
|
||||
|
|
@ -34,11 +35,38 @@ if not celery_logger.handlers:
|
|||
SCRAPE_LOCK_NAME = "scrape_listings"
|
||||
LOG_BUFFER_MAX_LINES = 200
|
||||
|
||||
# Number of concurrent consumer workers that process listings from the queue.
|
||||
NUM_WORKERS = 20
|
||||
|
||||
# Phase constants for task state reporting
|
||||
PHASE_SPLITTING = "splitting"
|
||||
PHASE_FETCHING = "fetching"
|
||||
PHASE_PROCESSING = "processing"
|
||||
PHASE_COMPLETED = "completed"
|
||||
|
||||
# Module-level log buffer — active only during task execution.
|
||||
# The TaskLogHandler appends here; _update_task_state reads from here.
|
||||
# This is safe as module-level mutable state because Celery workers use a
|
||||
# prefork pool: each worker process handles one task at a time, so there is
|
||||
# no concurrent access within a single process. The TaskLogHandler appends
|
||||
# here; _update_task_state reads from here.
|
||||
_active_log_buffer: deque[str] | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class _PipelineState:
|
||||
"""Shared mutable state for the streaming fetch-and-process pipeline."""
|
||||
ids_collected: int = 0
|
||||
completed_subqueries: int = 0
|
||||
total_pages_fetched: int = 0
|
||||
fetching_done: bool = False
|
||||
processed_count: int = 0
|
||||
failed_count: int = 0
|
||||
details_fetched: int = 0
|
||||
images_downloaded: int = 0
|
||||
ocr_completed: int = 0
|
||||
processed_listings: list[Listing] = field(default_factory=list)
|
||||
|
||||
|
||||
class TaskLogHandler(logging.Handler):
|
||||
"""Captures log records into a deque for inclusion in task state updates."""
|
||||
|
||||
|
|
@ -60,34 +88,204 @@ def _update_task_state(task: Task, state: str, meta: dict[str, Any]) -> None:
|
|||
task.update_state(state=state, meta=meta)
|
||||
|
||||
|
||||
async def _fetch_subquery(
|
||||
sq: SubQuery,
|
||||
parameters: QueryParameters,
|
||||
session: object,
|
||||
config: ScraperConfig,
|
||||
semaphore: asyncio.Semaphore,
|
||||
existing_ids: set[int],
|
||||
queue: asyncio.Queue[int | None],
|
||||
state: _PipelineState,
|
||||
) -> None:
|
||||
"""Fetch pages for a single subquery and enqueue new listing IDs."""
|
||||
estimated = sq.estimated_results or 0
|
||||
if estimated == 0:
|
||||
state.completed_subqueries += 1
|
||||
return
|
||||
|
||||
page_size = parameters.page_size
|
||||
max_pages = min(
|
||||
config.max_pages_per_query,
|
||||
(estimated // page_size) + 1,
|
||||
)
|
||||
|
||||
for page_id in range(1, max_pages + 1):
|
||||
async with semaphore:
|
||||
await asyncio.sleep(config.request_delay_ms / 1000)
|
||||
try:
|
||||
result = await listing_query(
|
||||
page=page_id,
|
||||
channel=parameters.listing_type,
|
||||
min_bedrooms=sq.min_bedrooms,
|
||||
max_bedrooms=sq.max_bedrooms,
|
||||
radius=parameters.radius,
|
||||
min_price=sq.min_price,
|
||||
max_price=sq.max_price,
|
||||
district=sq.district,
|
||||
page_size=page_size,
|
||||
max_days_since_added=parameters.max_days_since_added,
|
||||
furnish_types=parameters.furnish_types or [],
|
||||
session=session,
|
||||
config=config,
|
||||
)
|
||||
state.total_pages_fetched += 1
|
||||
|
||||
properties = result.get("properties", [])
|
||||
for prop in properties:
|
||||
identifier = prop.get("identifier")
|
||||
if identifier and identifier not in existing_ids:
|
||||
existing_ids.add(identifier)
|
||||
state.ids_collected += 1
|
||||
await queue.put(identifier)
|
||||
|
||||
if len(properties) < page_size:
|
||||
break
|
||||
|
||||
except CircuitBreakerOpenError as e:
|
||||
celery_logger.error(f"Circuit breaker open: {e}")
|
||||
break
|
||||
except ThrottlingError as e:
|
||||
celery_logger.warning(
|
||||
f"Throttling on {sq.district} page {page_id}: {e}"
|
||||
)
|
||||
break
|
||||
except Exception as e:
|
||||
if "GENERIC_ERROR" in str(e):
|
||||
celery_logger.debug(
|
||||
f"Max page for {sq.district}: {page_id - 1}"
|
||||
)
|
||||
break
|
||||
celery_logger.warning(
|
||||
f"Error fetching page {page_id} for "
|
||||
f"{sq.district}: {e}"
|
||||
)
|
||||
break
|
||||
|
||||
state.completed_subqueries += 1
|
||||
|
||||
|
||||
async def _process_worker(
|
||||
queue: asyncio.Queue[int | None],
|
||||
processor: ListingProcessor,
|
||||
state: _PipelineState,
|
||||
) -> None:
|
||||
"""Consumer worker: pull listing IDs from the queue and process them."""
|
||||
while True:
|
||||
listing_id = await queue.get()
|
||||
if listing_id is None:
|
||||
break
|
||||
|
||||
def step_callback(step_name: str) -> None:
|
||||
if step_name == "details":
|
||||
state.details_fetched += 1
|
||||
elif step_name == "images":
|
||||
state.images_downloaded += 1
|
||||
elif step_name == "ocr":
|
||||
state.ocr_completed += 1
|
||||
|
||||
listing = await processor.process_listing(
|
||||
listing_id, on_step_complete=step_callback
|
||||
)
|
||||
if listing is not None:
|
||||
state.processed_count += 1
|
||||
state.processed_listings.append(listing)
|
||||
else:
|
||||
state.failed_count += 1
|
||||
|
||||
|
||||
async def _monitor_progress(
|
||||
task: Task,
|
||||
state: _PipelineState,
|
||||
subqueries_total: int,
|
||||
start_time: float,
|
||||
) -> None:
|
||||
"""Periodically report pipeline progress via task state updates."""
|
||||
last_progress = 0.0
|
||||
|
||||
while True:
|
||||
total = state.ids_collected
|
||||
done = state.processed_count + state.failed_count
|
||||
|
||||
if state.fetching_done and done >= total and total > 0:
|
||||
break
|
||||
if state.fetching_done and total == 0:
|
||||
break
|
||||
|
||||
phase = PHASE_PROCESSING if state.fetching_done else PHASE_FETCHING
|
||||
|
||||
if total > 0:
|
||||
progress_ratio = round(done / total, 2)
|
||||
else:
|
||||
progress_ratio = 0.0
|
||||
|
||||
elapsed = time.time() - start_time
|
||||
rate = done / elapsed if elapsed > 0 else 0
|
||||
remaining = (total - done) if total > 0 else 0
|
||||
eta = remaining / rate if rate > 0 else 0
|
||||
|
||||
if progress_ratio >= last_progress + 0.1 or done == 1:
|
||||
celery_logger.info(
|
||||
f"Progress: {progress_ratio * 100:.0f}% "
|
||||
f"({done}/{total}) "
|
||||
f"| Elapsed: {elapsed:.0f}s "
|
||||
f"| Rate: {rate:.1f}/s "
|
||||
f"| ETA: {eta:.0f}s"
|
||||
)
|
||||
last_progress = progress_ratio
|
||||
|
||||
_update_task_state(
|
||||
task,
|
||||
f"{'Processing' if state.fetching_done else 'Fetching & processing'}: "
|
||||
f"{done}/{total}",
|
||||
{
|
||||
"phase": phase,
|
||||
"progress": progress_ratio,
|
||||
"processed": done,
|
||||
"total": total,
|
||||
"subqueries_completed": state.completed_subqueries,
|
||||
"subqueries_total": subqueries_total,
|
||||
"ids_collected": state.ids_collected,
|
||||
"pages_fetched": state.total_pages_fetched,
|
||||
"fetching_done": state.fetching_done,
|
||||
"details_fetched": state.details_fetched,
|
||||
"images_downloaded": state.images_downloaded,
|
||||
"ocr_completed": state.ocr_completed,
|
||||
"failed": state.failed_count,
|
||||
"elapsed_seconds": round(elapsed, 1),
|
||||
"rate_per_second": round(rate, 2),
|
||||
"eta_seconds": round(eta, 1),
|
||||
},
|
||||
)
|
||||
await asyncio.sleep(1)
|
||||
|
||||
|
||||
@app.task(bind=True, pydantic=True)
|
||||
def dump_listings_task(self: Task, parameters_json: str) -> dict[str, Any]:
|
||||
with redis_lock(SCRAPE_LOCK_NAME) as acquired:
|
||||
if not acquired:
|
||||
msg = "Another scrape job is already running, skipping this execution"
|
||||
logger.warning(msg)
|
||||
celery_logger.warning(msg)
|
||||
self.update_state(state="SKIPPED", meta={"reason": "Another scrape job is running"})
|
||||
return {"status": "skipped", "reason": "another_job_running"}
|
||||
|
||||
celery_logger.info(f"Acquired lock: {SCRAPE_LOCK_NAME}")
|
||||
logger.info(f"Acquired lock: {SCRAPE_LOCK_NAME}")
|
||||
|
||||
parsed_parameters = QueryParameters.model_validate_json(parameters_json)
|
||||
celery_logger.info(f"Starting scrape with parameters: {parsed_parameters}")
|
||||
|
||||
self.update_state(state="Starting...", meta={"phase": "splitting", "progress": 0})
|
||||
self.update_state(state="Starting...", meta={"phase": PHASE_SPLITTING, "progress": 0})
|
||||
asyncio.run(dump_listings_full(task=self, parameters=parsed_parameters))
|
||||
return {"phase": "completed", "progress": 1}
|
||||
return {"phase": PHASE_COMPLETED, "progress": 1}
|
||||
|
||||
|
||||
async def async_dump_listings_task(parameters_json: str) -> dict[str, Any]:
|
||||
with redis_lock(SCRAPE_LOCK_NAME) as acquired:
|
||||
if not acquired:
|
||||
logger.warning("Another scrape job is already running, skipping this execution")
|
||||
celery_logger.warning("Another scrape job is already running, skipping this execution")
|
||||
return {"status": "skipped", "reason": "another_job_running"}
|
||||
|
||||
logger.info(f"Acquired lock: {SCRAPE_LOCK_NAME}")
|
||||
celery_logger.info(f"Acquired lock: {SCRAPE_LOCK_NAME}")
|
||||
parsed_parameters = QueryParameters.model_validate_json(parameters_json)
|
||||
await dump_listings_full(task=Task(), parameters=parsed_parameters)
|
||||
return {"progress": 0}
|
||||
|
|
@ -141,17 +339,16 @@ async def _dump_listings_full_inner(
|
|||
soon as IDs become available from each subquery.
|
||||
"""
|
||||
start_time = time.time()
|
||||
NUM_WORKERS = 20
|
||||
state = _PipelineState()
|
||||
|
||||
celery_logger.info("=" * 60)
|
||||
celery_logger.info("PHASE 1: Splitting queries")
|
||||
celery_logger.info(f"PHASE 1: Splitting queries")
|
||||
celery_logger.info("=" * 60)
|
||||
|
||||
repository = ListingRepository(engine)
|
||||
config = ScraperConfig.from_env()
|
||||
splitter = QuerySplitter(config)
|
||||
|
||||
# Reset throttle metrics
|
||||
reset_throttle_metrics()
|
||||
|
||||
def on_progress(phase: str, message: str, **kwargs: Any) -> None:
|
||||
|
|
@ -161,7 +358,7 @@ async def _dump_listings_full_inner(
|
|||
celery_logger.info(f"[{phase}] {message}")
|
||||
|
||||
_update_task_state(task, "Analyzing query and splitting by price bands...", {
|
||||
"phase": "splitting", "progress": 0,
|
||||
"phase": PHASE_SPLITTING, "progress": 0,
|
||||
})
|
||||
celery_logger.info("Starting query splitting and probing...")
|
||||
|
||||
|
|
@ -175,34 +372,22 @@ async def _dump_listings_full_inner(
|
|||
f"~{total_estimated} estimated total results"
|
||||
)
|
||||
|
||||
# Load existing IDs (fast, ID-only projection)
|
||||
celery_logger.info("Loading existing listing IDs from database...")
|
||||
existing_ids = repository.get_listing_ids(parameters.listing_type)
|
||||
celery_logger.info(f"Found {len(existing_ids)} existing listings in DB")
|
||||
|
||||
celery_logger.info("=" * 60)
|
||||
celery_logger.info("PHASE 2: Streaming fetch & process")
|
||||
celery_logger.info(f"PHASE 2: Streaming fetch & process")
|
||||
celery_logger.info("=" * 60)
|
||||
|
||||
# Shared state for the streaming pipeline
|
||||
queue: asyncio.Queue[int | None] = asyncio.Queue()
|
||||
ids_collected = 0
|
||||
completed_subqueries = 0
|
||||
total_pages_fetched = 0
|
||||
fetching_done = False
|
||||
processed_count = 0
|
||||
failed_count = 0
|
||||
details_fetched = 0
|
||||
images_downloaded = 0
|
||||
ocr_completed = 0
|
||||
processed_listings: list[Listing] = []
|
||||
semaphore = asyncio.Semaphore(config.max_concurrent_requests)
|
||||
|
||||
_update_task_state(
|
||||
task,
|
||||
f"Fetching listings from {len(subqueries)} subqueries...",
|
||||
{
|
||||
"phase": "fetching",
|
||||
"phase": PHASE_FETCHING,
|
||||
"subqueries_completed": 0,
|
||||
"subqueries_total": len(subqueries),
|
||||
"ids_collected": 0,
|
||||
|
|
@ -214,190 +399,32 @@ async def _dump_listings_full_inner(
|
|||
|
||||
listing_processor = ListingProcessor(repository)
|
||||
|
||||
# --- Producer: fetch subquery pages and enqueue new IDs ---
|
||||
# Producer: fetch all subqueries concurrently, then signal workers to stop
|
||||
async def producer() -> None:
|
||||
nonlocal ids_collected, completed_subqueries, total_pages_fetched
|
||||
nonlocal fetching_done
|
||||
|
||||
async def fetch_subquery(sq: SubQuery) -> None:
|
||||
nonlocal ids_collected, completed_subqueries, total_pages_fetched
|
||||
|
||||
estimated = sq.estimated_results or 0
|
||||
if estimated == 0:
|
||||
completed_subqueries += 1
|
||||
return
|
||||
|
||||
page_size = parameters.page_size
|
||||
max_pages = min(
|
||||
config.max_pages_per_query,
|
||||
(estimated // page_size) + 1,
|
||||
)
|
||||
|
||||
for page_id in range(1, max_pages + 1):
|
||||
async with semaphore:
|
||||
await asyncio.sleep(config.request_delay_ms / 1000)
|
||||
try:
|
||||
result = await listing_query(
|
||||
page=page_id,
|
||||
channel=parameters.listing_type,
|
||||
min_bedrooms=sq.min_bedrooms,
|
||||
max_bedrooms=sq.max_bedrooms,
|
||||
radius=parameters.radius,
|
||||
min_price=sq.min_price,
|
||||
max_price=sq.max_price,
|
||||
district=sq.district,
|
||||
page_size=page_size,
|
||||
max_days_since_added=parameters.max_days_since_added,
|
||||
furnish_types=parameters.furnish_types or [],
|
||||
session=session,
|
||||
config=config,
|
||||
)
|
||||
total_pages_fetched += 1
|
||||
|
||||
# Extract and enqueue new IDs inline
|
||||
properties = result.get("properties", [])
|
||||
for prop in properties:
|
||||
identifier = prop.get("identifier")
|
||||
if identifier and identifier not in existing_ids:
|
||||
existing_ids.add(identifier)
|
||||
ids_collected += 1
|
||||
await queue.put(identifier)
|
||||
|
||||
if len(properties) < page_size:
|
||||
break
|
||||
|
||||
except CircuitBreakerOpenError as e:
|
||||
celery_logger.error(f"Circuit breaker open: {e}")
|
||||
break
|
||||
except ThrottlingError as e:
|
||||
celery_logger.warning(
|
||||
f"Throttling on {sq.district} page {page_id}: {e}"
|
||||
)
|
||||
break
|
||||
except Exception as e:
|
||||
if "GENERIC_ERROR" in str(e):
|
||||
logger.debug(
|
||||
f"Max page for {sq.district}: {page_id - 1}"
|
||||
)
|
||||
break
|
||||
logger.warning(
|
||||
f"Error fetching page {page_id} for "
|
||||
f"{sq.district}: {e}"
|
||||
)
|
||||
break
|
||||
|
||||
completed_subqueries += 1
|
||||
|
||||
# Fetch all subqueries concurrently
|
||||
await asyncio.gather(
|
||||
*[fetch_subquery(sq) for sq in subqueries]
|
||||
*[
|
||||
_fetch_subquery(
|
||||
sq, parameters, session, config,
|
||||
semaphore, existing_ids, queue, state,
|
||||
)
|
||||
for sq in subqueries
|
||||
]
|
||||
)
|
||||
|
||||
celery_logger.info(
|
||||
f"Fetch complete: {total_pages_fetched} pages from "
|
||||
f"{completed_subqueries} subqueries, "
|
||||
f"{ids_collected} new IDs"
|
||||
f"Fetch complete: {state.total_pages_fetched} pages from "
|
||||
f"{state.completed_subqueries} subqueries, "
|
||||
f"{state.ids_collected} new IDs"
|
||||
)
|
||||
fetching_done = True
|
||||
state.fetching_done = True
|
||||
|
||||
# Send sentinel values to stop workers
|
||||
for _ in range(NUM_WORKERS):
|
||||
await queue.put(None)
|
||||
|
||||
# --- Consumer workers: process listings from queue ---
|
||||
async def worker() -> None:
|
||||
nonlocal processed_count, failed_count
|
||||
nonlocal details_fetched, images_downloaded, ocr_completed
|
||||
|
||||
while True:
|
||||
listing_id = await queue.get()
|
||||
if listing_id is None:
|
||||
break
|
||||
|
||||
def step_callback(step_name: str) -> None:
|
||||
nonlocal details_fetched, images_downloaded, ocr_completed
|
||||
if step_name == "details":
|
||||
details_fetched += 1
|
||||
elif step_name == "images":
|
||||
images_downloaded += 1
|
||||
elif step_name == "ocr":
|
||||
ocr_completed += 1
|
||||
|
||||
listing = await listing_processor.process_listing(
|
||||
listing_id, on_step_complete=step_callback
|
||||
)
|
||||
if listing is not None:
|
||||
processed_count += 1
|
||||
processed_listings.append(listing)
|
||||
else:
|
||||
failed_count += 1
|
||||
|
||||
# --- Monitor: reports combined progress ---
|
||||
async def monitor() -> None:
|
||||
last_progress = 0.0
|
||||
|
||||
while True:
|
||||
total = ids_collected
|
||||
done = processed_count + failed_count
|
||||
|
||||
if fetching_done and done >= total and total > 0:
|
||||
break
|
||||
if fetching_done and total == 0:
|
||||
break
|
||||
|
||||
# Determine phase label
|
||||
phase = "processing" if fetching_done else "fetching"
|
||||
|
||||
if total > 0:
|
||||
progress_ratio = round(done / total, 2)
|
||||
else:
|
||||
progress_ratio = 0.0
|
||||
|
||||
elapsed = time.time() - start_time
|
||||
rate = done / elapsed if elapsed > 0 else 0
|
||||
remaining = (total - done) if total > 0 else 0
|
||||
eta = remaining / rate if rate > 0 else 0
|
||||
|
||||
if progress_ratio >= last_progress + 0.1 or done == 1:
|
||||
celery_logger.info(
|
||||
f"Progress: {progress_ratio * 100:.0f}% "
|
||||
f"({done}/{total}) "
|
||||
f"| Elapsed: {elapsed:.0f}s "
|
||||
f"| Rate: {rate:.1f}/s "
|
||||
f"| ETA: {eta:.0f}s"
|
||||
)
|
||||
last_progress = progress_ratio
|
||||
|
||||
_update_task_state(
|
||||
task,
|
||||
f"{'Processing' if fetching_done else 'Fetching & processing'}: "
|
||||
f"{done}/{total}",
|
||||
{
|
||||
"phase": phase,
|
||||
"progress": progress_ratio,
|
||||
"processed": done,
|
||||
"total": total,
|
||||
"subqueries_completed": completed_subqueries,
|
||||
"subqueries_total": len(subqueries),
|
||||
"ids_collected": ids_collected,
|
||||
"pages_fetched": total_pages_fetched,
|
||||
"fetching_done": fetching_done,
|
||||
"details_fetched": details_fetched,
|
||||
"images_downloaded": images_downloaded,
|
||||
"ocr_completed": ocr_completed,
|
||||
"failed": failed_count,
|
||||
"elapsed_seconds": round(elapsed, 1),
|
||||
"rate_per_second": round(rate, 2),
|
||||
"eta_seconds": round(eta, 1),
|
||||
},
|
||||
)
|
||||
await asyncio.sleep(1)
|
||||
|
||||
# Run producer, workers, and monitor concurrently
|
||||
await asyncio.gather(
|
||||
producer(),
|
||||
*[worker() for _ in range(NUM_WORKERS)],
|
||||
monitor(),
|
||||
*[_process_worker(queue, listing_processor, state) for _ in range(NUM_WORKERS)],
|
||||
_monitor_progress(task, state, len(subqueries), start_time),
|
||||
)
|
||||
|
||||
except CircuitBreakerOpenError as e:
|
||||
|
|
@ -418,19 +445,19 @@ async def _dump_listings_full_inner(
|
|||
elapsed = time.time() - start_time
|
||||
celery_logger.info("=" * 60)
|
||||
celery_logger.info(
|
||||
f"COMPLETED: Processed {len(processed_listings)} listings in {elapsed:.1f}s"
|
||||
f"COMPLETED: Processed {len(state.processed_listings)} listings in {elapsed:.1f}s"
|
||||
)
|
||||
celery_logger.info("=" * 60)
|
||||
|
||||
invalidate_cache()
|
||||
|
||||
_update_task_state(task, "Completed", {
|
||||
"phase": "completed", "progress": 1,
|
||||
"processed": len(processed_listings), "total": ids_collected,
|
||||
"message": f"Processed {len(processed_listings)} listings in {elapsed:.0f}s",
|
||||
"phase": PHASE_COMPLETED, "progress": 1,
|
||||
"processed": len(state.processed_listings), "total": state.ids_collected,
|
||||
"message": f"Processed {len(state.processed_listings)} listings in {elapsed:.0f}s",
|
||||
})
|
||||
|
||||
return processed_listings
|
||||
return state.processed_listings
|
||||
|
||||
|
||||
@app.on_after_finalize.connect
|
||||
|
|
@ -439,11 +466,11 @@ def setup_periodic_tasks(sender, **kwargs):
|
|||
try:
|
||||
config = SchedulesConfig.from_env()
|
||||
except ValueError as e:
|
||||
logger.error(f"Failed to load schedule configuration: {e}")
|
||||
celery_logger.error(f"Failed to load schedule configuration: {e}")
|
||||
return
|
||||
|
||||
for schedule in config.get_enabled_schedules():
|
||||
logger.info(
|
||||
celery_logger.info(
|
||||
f"Registering periodic task: {schedule.name} at {schedule.hour}:{schedule.minute}"
|
||||
)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue