Refactor codebase following Clean Code principles and add 229 tests

- Extract helpers to reduce function sizes (listing_tasks, app.py, query.py, listing_fetcher) - Replace nonlocal mutations with _PipelineState dataclass in listing_tasks - Fix bugs: isinstance→equality check in repository, verify_exp for OIDC tokens - Consolidate duplicate filter methods in listing_repository - Move hardcoded config to env vars with backward-compatible defaults - Simplify CLI decorator to auto-build QueryParameters - Add deprecation docstring to data_access.py - Test count: 158 → 387 (all passing)
2026-02-07 20:19:57 +00:00 · 2026-02-07 20:19:57 +00:00 · 150342bb9e
commit 150342bb9e
parent 7e05b3c971
48 changed files with 5029 additions and 990 deletions
--- a/crawler/tasks/listing_tasks.py
+++ b/crawler/tasks/listing_tasks.py
@ -2,6 +2,7 @@ import asyncio
 import logging
 import time
 from collections import deque
+from dataclasses import dataclass, field
 from typing import Any
 from celery import Task
 from celery.schedules import crontab
@ -34,11 +35,38 @@ if not celery_logger.handlers:
 SCRAPE_LOCK_NAME = "scrape_listings"
 LOG_BUFFER_MAX_LINES = 200

+# Number of concurrent consumer workers that process listings from the queue.
+NUM_WORKERS = 20
+
+# Phase constants for task state reporting
+PHASE_SPLITTING = "splitting"
+PHASE_FETCHING = "fetching"
+PHASE_PROCESSING = "processing"
+PHASE_COMPLETED = "completed"
+
 # Module-level log buffer — active only during task execution.
-# The TaskLogHandler appends here; _update_task_state reads from here.
+# This is safe as module-level mutable state because Celery workers use a
+# prefork pool: each worker process handles one task at a time, so there is
+# no concurrent access within a single process.  The TaskLogHandler appends
+# here; _update_task_state reads from here.
 _active_log_buffer: deque[str] | None = None


+@dataclass
+class _PipelineState:
+    """Shared mutable state for the streaming fetch-and-process pipeline."""
+    ids_collected: int = 0
+    completed_subqueries: int = 0
+    total_pages_fetched: int = 0
+    fetching_done: bool = False
+    processed_count: int = 0
+    failed_count: int = 0
+    details_fetched: int = 0
+    images_downloaded: int = 0
+    ocr_completed: int = 0
+    processed_listings: list[Listing] = field(default_factory=list)
+
+
 class TaskLogHandler(logging.Handler):
    """Captures log records into a deque for inclusion in task state updates."""

@ -60,34 +88,204 @@ def _update_task_state(task: Task, state: str, meta: dict[str, Any]) -> None:
    task.update_state(state=state, meta=meta)


+async def _fetch_subquery(
+    sq: SubQuery,
+    parameters: QueryParameters,
+    session: object,
+    config: ScraperConfig,
+    semaphore: asyncio.Semaphore,
+    existing_ids: set[int],
+    queue: asyncio.Queue[int | None],
+    state: _PipelineState,
+) -> None:
+    """Fetch pages for a single subquery and enqueue new listing IDs."""
+    estimated = sq.estimated_results or 0
+    if estimated == 0:
+        state.completed_subqueries += 1
+        return
+
+    page_size = parameters.page_size
+    max_pages = min(
+        config.max_pages_per_query,
+        (estimated // page_size) + 1,
+    )
+
+    for page_id in range(1, max_pages + 1):
+        async with semaphore:
+            await asyncio.sleep(config.request_delay_ms / 1000)
+            try:
+                result = await listing_query(
+                    page=page_id,
+                    channel=parameters.listing_type,
+                    min_bedrooms=sq.min_bedrooms,
+                    max_bedrooms=sq.max_bedrooms,
+                    radius=parameters.radius,
+                    min_price=sq.min_price,
+                    max_price=sq.max_price,
+                    district=sq.district,
+                    page_size=page_size,
+                    max_days_since_added=parameters.max_days_since_added,
+                    furnish_types=parameters.furnish_types or [],
+                    session=session,
+                    config=config,
+                )
+                state.total_pages_fetched += 1
+
+                properties = result.get("properties", [])
+                for prop in properties:
+                    identifier = prop.get("identifier")
+                    if identifier and identifier not in existing_ids:
+                        existing_ids.add(identifier)
+                        state.ids_collected += 1
+                        await queue.put(identifier)
+
+                if len(properties) < page_size:
+                    break
+
+            except CircuitBreakerOpenError as e:
+                celery_logger.error(f"Circuit breaker open: {e}")
+                break
+            except ThrottlingError as e:
+                celery_logger.warning(
+                    f"Throttling on {sq.district} page {page_id}: {e}"
+                )
+                break
+            except Exception as e:
+                if "GENERIC_ERROR" in str(e):
+                    celery_logger.debug(
+                        f"Max page for {sq.district}: {page_id - 1}"
+                    )
+                    break
+                celery_logger.warning(
+                    f"Error fetching page {page_id} for "
+                    f"{sq.district}: {e}"
+                )
+                break
+
+    state.completed_subqueries += 1
+
+
+async def _process_worker(
+    queue: asyncio.Queue[int | None],
+    processor: ListingProcessor,
+    state: _PipelineState,
+) -> None:
+    """Consumer worker: pull listing IDs from the queue and process them."""
+    while True:
+        listing_id = await queue.get()
+        if listing_id is None:
+            break
+
+        def step_callback(step_name: str) -> None:
+            if step_name == "details":
+                state.details_fetched += 1
+            elif step_name == "images":
+                state.images_downloaded += 1
+            elif step_name == "ocr":
+                state.ocr_completed += 1
+
+        listing = await processor.process_listing(
+            listing_id, on_step_complete=step_callback
+        )
+        if listing is not None:
+            state.processed_count += 1
+            state.processed_listings.append(listing)
+        else:
+            state.failed_count += 1
+
+
+async def _monitor_progress(
+    task: Task,
+    state: _PipelineState,
+    subqueries_total: int,
+    start_time: float,
+) -> None:
+    """Periodically report pipeline progress via task state updates."""
+    last_progress = 0.0
+
+    while True:
+        total = state.ids_collected
+        done = state.processed_count + state.failed_count
+
+        if state.fetching_done and done >= total and total > 0:
+            break
+        if state.fetching_done and total == 0:
+            break
+
+        phase = PHASE_PROCESSING if state.fetching_done else PHASE_FETCHING
+
+        if total > 0:
+            progress_ratio = round(done / total, 2)
+        else:
+            progress_ratio = 0.0
+
+        elapsed = time.time() - start_time
+        rate = done / elapsed if elapsed > 0 else 0
+        remaining = (total - done) if total > 0 else 0
+        eta = remaining / rate if rate > 0 else 0
+
+        if progress_ratio >= last_progress + 0.1 or done == 1:
+            celery_logger.info(
+                f"Progress: {progress_ratio * 100:.0f}% "
+                f"({done}/{total}) "
+                f"| Elapsed: {elapsed:.0f}s "
+                f"| Rate: {rate:.1f}/s "
+                f"| ETA: {eta:.0f}s"
+            )
+            last_progress = progress_ratio
+
+        _update_task_state(
+            task,
+            f"{'Processing' if state.fetching_done else 'Fetching & processing'}: "
+            f"{done}/{total}",
+            {
+                "phase": phase,
+                "progress": progress_ratio,
+                "processed": done,
+                "total": total,
+                "subqueries_completed": state.completed_subqueries,
+                "subqueries_total": subqueries_total,
+                "ids_collected": state.ids_collected,
+                "pages_fetched": state.total_pages_fetched,
+                "fetching_done": state.fetching_done,
+                "details_fetched": state.details_fetched,
+                "images_downloaded": state.images_downloaded,
+                "ocr_completed": state.ocr_completed,
+                "failed": state.failed_count,
+                "elapsed_seconds": round(elapsed, 1),
+                "rate_per_second": round(rate, 2),
+                "eta_seconds": round(eta, 1),
+            },
+        )
+        await asyncio.sleep(1)
+
+
@app.task(bind=True, pydantic=True)
 def dump_listings_task(self: Task, parameters_json: str) -> dict[str, Any]:
    with redis_lock(SCRAPE_LOCK_NAME) as acquired:
        if not acquired:
            msg = "Another scrape job is already running, skipping this execution"
-            logger.warning(msg)
            celery_logger.warning(msg)
            self.update_state(state="SKIPPED", meta={"reason": "Another scrape job is running"})
            return {"status": "skipped", "reason": "another_job_running"}

        celery_logger.info(f"Acquired lock: {SCRAPE_LOCK_NAME}")
-        logger.info(f"Acquired lock: {SCRAPE_LOCK_NAME}")

        parsed_parameters = QueryParameters.model_validate_json(parameters_json)
        celery_logger.info(f"Starting scrape with parameters: {parsed_parameters}")

-        self.update_state(state="Starting...", meta={"phase": "splitting", "progress": 0})
+        self.update_state(state="Starting...", meta={"phase": PHASE_SPLITTING, "progress": 0})
        asyncio.run(dump_listings_full(task=self, parameters=parsed_parameters))
-        return {"phase": "completed", "progress": 1}
+        return {"phase": PHASE_COMPLETED, "progress": 1}


 async def async_dump_listings_task(parameters_json: str) -> dict[str, Any]:
    with redis_lock(SCRAPE_LOCK_NAME) as acquired:
        if not acquired:
-            logger.warning("Another scrape job is already running, skipping this execution")
+            celery_logger.warning("Another scrape job is already running, skipping this execution")
            return {"status": "skipped", "reason": "another_job_running"}

-        logger.info(f"Acquired lock: {SCRAPE_LOCK_NAME}")
+        celery_logger.info(f"Acquired lock: {SCRAPE_LOCK_NAME}")
        parsed_parameters = QueryParameters.model_validate_json(parameters_json)
        await dump_listings_full(task=Task(), parameters=parsed_parameters)
        return {"progress": 0}
@ -141,17 +339,16 @@ async def _dump_listings_full_inner(
    soon as IDs become available from each subquery.
    """
    start_time = time.time()
-    NUM_WORKERS = 20
+    state = _PipelineState()

    celery_logger.info("=" * 60)
-    celery_logger.info("PHASE 1: Splitting queries")
+    celery_logger.info(f"PHASE 1: Splitting queries")
    celery_logger.info("=" * 60)

    repository = ListingRepository(engine)
    config = ScraperConfig.from_env()
    splitter = QuerySplitter(config)

-    # Reset throttle metrics
    reset_throttle_metrics()

    def on_progress(phase: str, message: str, **kwargs: Any) -> None:
@ -161,7 +358,7 @@ async def _dump_listings_full_inner(
        celery_logger.info(f"[{phase}] {message}")

    _update_task_state(task, "Analyzing query and splitting by price bands...", {
-        "phase": "splitting", "progress": 0,
+        "phase": PHASE_SPLITTING, "progress": 0,
    })
    celery_logger.info("Starting query splitting and probing...")

@ -175,34 +372,22 @@ async def _dump_listings_full_inner(
                f"~{total_estimated} estimated total results"
            )

-            # Load existing IDs (fast, ID-only projection)
            celery_logger.info("Loading existing listing IDs from database...")
            existing_ids = repository.get_listing_ids(parameters.listing_type)
            celery_logger.info(f"Found {len(existing_ids)} existing listings in DB")

            celery_logger.info("=" * 60)
-            celery_logger.info("PHASE 2: Streaming fetch & process")
+            celery_logger.info(f"PHASE 2: Streaming fetch & process")
            celery_logger.info("=" * 60)

-            # Shared state for the streaming pipeline
            queue: asyncio.Queue[int | None] = asyncio.Queue()
-            ids_collected = 0
-            completed_subqueries = 0
-            total_pages_fetched = 0
-            fetching_done = False
-            processed_count = 0
-            failed_count = 0
-            details_fetched = 0
-            images_downloaded = 0
-            ocr_completed = 0
-            processed_listings: list[Listing] = []
            semaphore = asyncio.Semaphore(config.max_concurrent_requests)

            _update_task_state(
                task,
                f"Fetching listings from {len(subqueries)} subqueries...",
                {
-                    "phase": "fetching",
+                    "phase": PHASE_FETCHING,
                    "subqueries_completed": 0,
                    "subqueries_total": len(subqueries),
                    "ids_collected": 0,
@ -214,190 +399,32 @@ async def _dump_listings_full_inner(

            listing_processor = ListingProcessor(repository)

-            # --- Producer: fetch subquery pages and enqueue new IDs ---
+            # Producer: fetch all subqueries concurrently, then signal workers to stop
            async def producer() -> None:
-                nonlocal ids_collected, completed_subqueries, total_pages_fetched
-                nonlocal fetching_done
-
-                async def fetch_subquery(sq: SubQuery) -> None:
-                    nonlocal ids_collected, completed_subqueries, total_pages_fetched
-
-                    estimated = sq.estimated_results or 0
-                    if estimated == 0:
-                        completed_subqueries += 1
-                        return
-
-                    page_size = parameters.page_size
-                    max_pages = min(
-                        config.max_pages_per_query,
-                        (estimated // page_size) + 1,
-                    )
-
-                    for page_id in range(1, max_pages + 1):
-                        async with semaphore:
-                            await asyncio.sleep(config.request_delay_ms / 1000)
-                            try:
-                                result = await listing_query(
-                                    page=page_id,
-                                    channel=parameters.listing_type,
-                                    min_bedrooms=sq.min_bedrooms,
-                                    max_bedrooms=sq.max_bedrooms,
-                                    radius=parameters.radius,
-                                    min_price=sq.min_price,
-                                    max_price=sq.max_price,
-                                    district=sq.district,
-                                    page_size=page_size,
-                                    max_days_since_added=parameters.max_days_since_added,
-                                    furnish_types=parameters.furnish_types or [],
-                                    session=session,
-                                    config=config,
-                                )
-                                total_pages_fetched += 1
-
-                                # Extract and enqueue new IDs inline
-                                properties = result.get("properties", [])
-                                for prop in properties:
-                                    identifier = prop.get("identifier")
-                                    if identifier and identifier not in existing_ids:
-                                        existing_ids.add(identifier)
-                                        ids_collected += 1
-                                        await queue.put(identifier)
-
-                                if len(properties) < page_size:
-                                    break
-
-                            except CircuitBreakerOpenError as e:
-                                celery_logger.error(f"Circuit breaker open: {e}")
-                                break
-                            except ThrottlingError as e:
-                                celery_logger.warning(
-                                    f"Throttling on {sq.district} page {page_id}: {e}"
-                                )
-                                break
-                            except Exception as e:
-                                if "GENERIC_ERROR" in str(e):
-                                    logger.debug(
-                                        f"Max page for {sq.district}: {page_id - 1}"
-                                    )
-                                    break
-                                logger.warning(
-                                    f"Error fetching page {page_id} for "
-                                    f"{sq.district}: {e}"
-                                )
-                                break
-
-                    completed_subqueries += 1
-
-                # Fetch all subqueries concurrently
                await asyncio.gather(
-                    *[fetch_subquery(sq) for sq in subqueries]
+                    *[
+                        _fetch_subquery(
+                            sq, parameters, session, config,
+                            semaphore, existing_ids, queue, state,
+                        )
+                        for sq in subqueries
+                    ]
                )

                celery_logger.info(
-                    f"Fetch complete: {total_pages_fetched} pages from "
-                    f"{completed_subqueries} subqueries, "
-                    f"{ids_collected} new IDs"
+                    f"Fetch complete: {state.total_pages_fetched} pages from "
+                    f"{state.completed_subqueries} subqueries, "
+                    f"{state.ids_collected} new IDs"
                )
-                fetching_done = True
+                state.fetching_done = True

-                # Send sentinel values to stop workers
                for _ in range(NUM_WORKERS):
                    await queue.put(None)

-            # --- Consumer workers: process listings from queue ---
-            async def worker() -> None:
-                nonlocal processed_count, failed_count
-                nonlocal details_fetched, images_downloaded, ocr_completed
-
-                while True:
-                    listing_id = await queue.get()
-                    if listing_id is None:
-                        break
-
-                    def step_callback(step_name: str) -> None:
-                        nonlocal details_fetched, images_downloaded, ocr_completed
-                        if step_name == "details":
-                            details_fetched += 1
-                        elif step_name == "images":
-                            images_downloaded += 1
-                        elif step_name == "ocr":
-                            ocr_completed += 1
-
-                    listing = await listing_processor.process_listing(
-                        listing_id, on_step_complete=step_callback
-                    )
-                    if listing is not None:
-                        processed_count += 1
-                        processed_listings.append(listing)
-                    else:
-                        failed_count += 1
-
-            # --- Monitor: reports combined progress ---
-            async def monitor() -> None:
-                last_progress = 0.0
-
-                while True:
-                    total = ids_collected
-                    done = processed_count + failed_count
-
-                    if fetching_done and done >= total and total > 0:
-                        break
-                    if fetching_done and total == 0:
-                        break
-
-                    # Determine phase label
-                    phase = "processing" if fetching_done else "fetching"
-
-                    if total > 0:
-                        progress_ratio = round(done / total, 2)
-                    else:
-                        progress_ratio = 0.0
-
-                    elapsed = time.time() - start_time
-                    rate = done / elapsed if elapsed > 0 else 0
-                    remaining = (total - done) if total > 0 else 0
-                    eta = remaining / rate if rate > 0 else 0
-
-                    if progress_ratio >= last_progress + 0.1 or done == 1:
-                        celery_logger.info(
-                            f"Progress: {progress_ratio * 100:.0f}% "
-                            f"({done}/{total}) "
-                            f"| Elapsed: {elapsed:.0f}s "
-                            f"| Rate: {rate:.1f}/s "
-                            f"| ETA: {eta:.0f}s"
-                        )
-                        last_progress = progress_ratio
-
-                    _update_task_state(
-                        task,
-                        f"{'Processing' if fetching_done else 'Fetching & processing'}: "
-                        f"{done}/{total}",
-                        {
-                            "phase": phase,
-                            "progress": progress_ratio,
-                            "processed": done,
-                            "total": total,
-                            "subqueries_completed": completed_subqueries,
-                            "subqueries_total": len(subqueries),
-                            "ids_collected": ids_collected,
-                            "pages_fetched": total_pages_fetched,
-                            "fetching_done": fetching_done,
-                            "details_fetched": details_fetched,
-                            "images_downloaded": images_downloaded,
-                            "ocr_completed": ocr_completed,
-                            "failed": failed_count,
-                            "elapsed_seconds": round(elapsed, 1),
-                            "rate_per_second": round(rate, 2),
-                            "eta_seconds": round(eta, 1),
-                        },
-                    )
-                    await asyncio.sleep(1)
-
-            # Run producer, workers, and monitor concurrently
            await asyncio.gather(
                producer(),
-                *[worker() for _ in range(NUM_WORKERS)],
-                monitor(),
+                *[_process_worker(queue, listing_processor, state) for _ in range(NUM_WORKERS)],
+                _monitor_progress(task, state, len(subqueries), start_time),
            )

    except CircuitBreakerOpenError as e:
@ -418,19 +445,19 @@ async def _dump_listings_full_inner(
    elapsed = time.time() - start_time
    celery_logger.info("=" * 60)
    celery_logger.info(
-        f"COMPLETED: Processed {len(processed_listings)} listings in {elapsed:.1f}s"
+        f"COMPLETED: Processed {len(state.processed_listings)} listings in {elapsed:.1f}s"
    )
    celery_logger.info("=" * 60)

    invalidate_cache()

    _update_task_state(task, "Completed", {
-        "phase": "completed", "progress": 1,
-        "processed": len(processed_listings), "total": ids_collected,
-        "message": f"Processed {len(processed_listings)} listings in {elapsed:.0f}s",
+        "phase": PHASE_COMPLETED, "progress": 1,
+        "processed": len(state.processed_listings), "total": state.ids_collected,
+        "message": f"Processed {len(state.processed_listings)} listings in {elapsed:.0f}s",
    })

-    return processed_listings
+    return state.processed_listings


@app.on_after_finalize.connect
@ -439,11 +466,11 @@ def setup_periodic_tasks(sender, **kwargs):
    try:
        config = SchedulesConfig.from_env()
    except ValueError as e:
-        logger.error(f"Failed to load schedule configuration: {e}")
+        celery_logger.error(f"Failed to load schedule configuration: {e}")
        return

    for schedule in config.get_enabled_schedules():
-        logger.info(
+        celery_logger.info(
            f"Registering periodic task: {schedule.name} at {schedule.hour}:{schedule.minute}"
        )