Flatten repo structure: move crawler/ to root, remove vqa/ and immoweb/

The crawler subdirectory was the only active project. Moving it to the
repo root simplifies paths and removes the unnecessary nesting. The
vqa/ and immoweb/ directories were legacy/unused and have been removed.

Updated .drone.yml, .gitignore, .claude/ docs, and skills to reflect
the new flat structure.
This commit is contained in:
Viktor Barzin 2026-02-07 23:01:20 +00:00
parent e2247be700
commit eafbc1ac52
No known key found for this signature in database
GPG key ID: 0EB088298288D958
221 changed files with 70 additions and 146140 deletions

485
tasks/listing_tasks.py Normal file
View file

@ -0,0 +1,485 @@
import asyncio
import logging
import time
from collections import deque
from dataclasses import dataclass, field
from typing import Any
from celery import Task
from celery.schedules import crontab
from celery_app import app
from config.schedule_config import SchedulesConfig
from config.scraper_config import ScraperConfig
from listing_processor import ListingProcessor
from models.listing import Listing, QueryParameters
from rec.query import create_session, listing_query
from rec.exceptions import CircuitBreakerOpenError, ThrottlingError
from rec.throttle_detector import get_throttle_metrics, reset_throttle_metrics
from repositories.listing_repository import ListingRepository
from database import engine
from services.query_splitter import QuerySplitter, SubQuery
from utils.redis_lock import redis_lock
from services.listing_cache import invalidate_cache
logger = logging.getLogger("uvicorn.error")
# Also configure a celery-specific logger that always outputs to stdout
celery_logger = logging.getLogger("celery.task")
if not celery_logger.handlers:
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(
"%(asctime)s [%(levelname)s] %(name)s: %(message)s"
))
celery_logger.addHandler(handler)
celery_logger.setLevel(logging.INFO)
SCRAPE_LOCK_NAME = "scrape_listings"
LOG_BUFFER_MAX_LINES = 200
# Number of concurrent consumer workers that process listings from the queue.
NUM_WORKERS = 20
# Phase constants for task state reporting
PHASE_SPLITTING = "splitting"
PHASE_FETCHING = "fetching"
PHASE_PROCESSING = "processing"
PHASE_COMPLETED = "completed"
# Module-level log buffer — active only during task execution.
# This is safe as module-level mutable state because Celery workers use a
# prefork pool: each worker process handles one task at a time, so there is
# no concurrent access within a single process. The TaskLogHandler appends
# here; _update_task_state reads from here.
_active_log_buffer: deque[str] | None = None
@dataclass
class _PipelineState:
"""Shared mutable state for the streaming fetch-and-process pipeline."""
ids_collected: int = 0
completed_subqueries: int = 0
total_pages_fetched: int = 0
fetching_done: bool = False
processed_count: int = 0
failed_count: int = 0
details_fetched: int = 0
images_downloaded: int = 0
ocr_completed: int = 0
processed_listings: list[Listing] = field(default_factory=list)
class TaskLogHandler(logging.Handler):
"""Captures log records into a deque for inclusion in task state updates."""
def __init__(self, buffer: deque[str]) -> None:
super().__init__()
self.buffer = buffer
def emit(self, record: logging.LogRecord) -> None:
try:
self.buffer.append(self.format(record))
except Exception:
pass
def _update_task_state(task: Task, state: str, meta: dict[str, Any]) -> None:
"""Call task.update_state with logs injected from the active log buffer."""
if _active_log_buffer is not None:
meta["logs"] = list(_active_log_buffer)
task.update_state(state=state, meta=meta)
async def _fetch_subquery(
sq: SubQuery,
parameters: QueryParameters,
session: object,
config: ScraperConfig,
semaphore: asyncio.Semaphore,
existing_ids: set[int],
queue: asyncio.Queue[int | None],
state: _PipelineState,
) -> None:
"""Fetch pages for a single subquery and enqueue new listing IDs."""
estimated = sq.estimated_results or 0
if estimated == 0:
state.completed_subqueries += 1
return
page_size = parameters.page_size
max_pages = min(
config.max_pages_per_query,
(estimated // page_size) + 1,
)
for page_id in range(1, max_pages + 1):
async with semaphore:
await asyncio.sleep(config.request_delay_ms / 1000)
try:
result = await listing_query(
page=page_id,
channel=parameters.listing_type,
min_bedrooms=sq.min_bedrooms,
max_bedrooms=sq.max_bedrooms,
radius=parameters.radius,
min_price=sq.min_price,
max_price=sq.max_price,
district=sq.district,
page_size=page_size,
max_days_since_added=parameters.max_days_since_added,
furnish_types=parameters.furnish_types or [],
session=session,
config=config,
)
state.total_pages_fetched += 1
properties = result.get("properties", [])
for prop in properties:
identifier = prop.get("identifier")
if identifier and identifier not in existing_ids:
existing_ids.add(identifier)
state.ids_collected += 1
await queue.put(identifier)
if len(properties) < page_size:
break
except CircuitBreakerOpenError as e:
celery_logger.error(f"Circuit breaker open: {e}")
break
except ThrottlingError as e:
celery_logger.warning(
f"Throttling on {sq.district} page {page_id}: {e}"
)
break
except Exception as e:
if "GENERIC_ERROR" in str(e):
celery_logger.debug(
f"Max page for {sq.district}: {page_id - 1}"
)
break
celery_logger.warning(
f"Error fetching page {page_id} for "
f"{sq.district}: {e}"
)
break
state.completed_subqueries += 1
async def _process_worker(
queue: asyncio.Queue[int | None],
processor: ListingProcessor,
state: _PipelineState,
) -> None:
"""Consumer worker: pull listing IDs from the queue and process them."""
while True:
listing_id = await queue.get()
if listing_id is None:
break
def step_callback(step_name: str) -> None:
if step_name == "details":
state.details_fetched += 1
elif step_name == "images":
state.images_downloaded += 1
elif step_name == "ocr":
state.ocr_completed += 1
listing = await processor.process_listing(
listing_id, on_step_complete=step_callback
)
if listing is not None:
state.processed_count += 1
state.processed_listings.append(listing)
else:
state.failed_count += 1
async def _monitor_progress(
task: Task,
state: _PipelineState,
subqueries_total: int,
start_time: float,
) -> None:
"""Periodically report pipeline progress via task state updates."""
last_progress = 0.0
while True:
total = state.ids_collected
done = state.processed_count + state.failed_count
if state.fetching_done and done >= total and total > 0:
break
if state.fetching_done and total == 0:
break
phase = PHASE_PROCESSING if state.fetching_done else PHASE_FETCHING
if total > 0:
progress_ratio = round(done / total, 2)
else:
progress_ratio = 0.0
elapsed = time.time() - start_time
rate = done / elapsed if elapsed > 0 else 0
remaining = (total - done) if total > 0 else 0
eta = remaining / rate if rate > 0 else 0
if progress_ratio >= last_progress + 0.1 or done == 1:
celery_logger.info(
f"Progress: {progress_ratio * 100:.0f}% "
f"({done}/{total}) "
f"| Elapsed: {elapsed:.0f}s "
f"| Rate: {rate:.1f}/s "
f"| ETA: {eta:.0f}s"
)
last_progress = progress_ratio
_update_task_state(
task,
f"{'Processing' if state.fetching_done else 'Fetching & processing'}: "
f"{done}/{total}",
{
"phase": phase,
"progress": progress_ratio,
"processed": done,
"total": total,
"subqueries_completed": state.completed_subqueries,
"subqueries_total": subqueries_total,
"ids_collected": state.ids_collected,
"pages_fetched": state.total_pages_fetched,
"fetching_done": state.fetching_done,
"details_fetched": state.details_fetched,
"images_downloaded": state.images_downloaded,
"ocr_completed": state.ocr_completed,
"failed": state.failed_count,
"elapsed_seconds": round(elapsed, 1),
"rate_per_second": round(rate, 2),
"eta_seconds": round(eta, 1),
},
)
await asyncio.sleep(1)
@app.task(bind=True, pydantic=True)
def dump_listings_task(self: Task, parameters_json: str) -> dict[str, Any]:
with redis_lock(SCRAPE_LOCK_NAME) as acquired:
if not acquired:
msg = "Another scrape job is already running, skipping this execution"
celery_logger.warning(msg)
self.update_state(state="SKIPPED", meta={"reason": "Another scrape job is running"})
return {"status": "skipped", "reason": "another_job_running"}
celery_logger.info(f"Acquired lock: {SCRAPE_LOCK_NAME}")
parsed_parameters = QueryParameters.model_validate_json(parameters_json)
celery_logger.info(f"Starting scrape with parameters: {parsed_parameters}")
self.update_state(state="Starting...", meta={"phase": PHASE_SPLITTING, "progress": 0})
asyncio.run(dump_listings_full(task=self, parameters=parsed_parameters))
return {"phase": PHASE_COMPLETED, "progress": 1}
async def async_dump_listings_task(parameters_json: str) -> dict[str, Any]:
with redis_lock(SCRAPE_LOCK_NAME) as acquired:
if not acquired:
celery_logger.warning("Another scrape job is already running, skipping this execution")
return {"status": "skipped", "reason": "another_job_running"}
celery_logger.info(f"Acquired lock: {SCRAPE_LOCK_NAME}")
parsed_parameters = QueryParameters.model_validate_json(parameters_json)
await dump_listings_full(task=Task(), parameters=parsed_parameters)
return {"progress": 0}
async def dump_listings_full(
*, task: Task, parameters: QueryParameters
) -> list[Listing]:
"""Fetches all listings, images as well as detects floorplans"""
global _active_log_buffer
# Set up log capture into a module-level buffer so _update_task_state
# can inject logs into every state update.
log_buffer: deque[str] = deque(maxlen=LOG_BUFFER_MAX_LINES)
log_handler = TaskLogHandler(log_buffer)
log_handler.setFormatter(
logging.Formatter("%(asctime)s %(message)s", datefmt="%H:%M:%S")
)
# Attach handler to both loggers used in the codebase, and ensure
# they accept INFO-level messages (Celery's worker setup may leave
# the celery.task logger at WARNING).
_prev_celery_level = celery_logger.level
_prev_logger_level = logger.level
celery_logger.addHandler(log_handler)
logger.addHandler(log_handler)
if celery_logger.level == logging.NOTSET or celery_logger.level > logging.INFO:
celery_logger.setLevel(logging.INFO)
if logger.level == logging.NOTSET or logger.level > logging.INFO:
logger.setLevel(logging.INFO)
_active_log_buffer = log_buffer
try:
return await _dump_listings_full_inner(task=task, parameters=parameters)
finally:
_active_log_buffer = None
celery_logger.removeHandler(log_handler)
logger.removeHandler(log_handler)
celery_logger.setLevel(_prev_celery_level)
logger.setLevel(_prev_logger_level)
async def _dump_listings_full_inner(
*, task: Task, parameters: QueryParameters
) -> list[Listing]:
"""Inner implementation with log capture active.
Uses a streaming pipeline: an asyncio.Queue bridges the fetcher (producer)
and processor workers (consumers) so that listing processing starts as
soon as IDs become available from each subquery.
"""
start_time = time.time()
state = _PipelineState()
celery_logger.info("=" * 60)
celery_logger.info(f"PHASE 1: Splitting queries")
celery_logger.info("=" * 60)
repository = ListingRepository(engine)
config = ScraperConfig.from_env()
splitter = QuerySplitter(config)
reset_throttle_metrics()
def on_progress(phase: str, message: str, **kwargs: Any) -> None:
meta: dict[str, Any] = {"phase": phase, "message": message}
meta.update(kwargs)
_update_task_state(task, message, meta)
celery_logger.info(f"[{phase}] {message}")
_update_task_state(task, "Analyzing query and splitting by price bands...", {
"phase": PHASE_SPLITTING, "progress": 0,
})
celery_logger.info("Starting query splitting and probing...")
try:
async with create_session(config) as session:
subqueries = await splitter.split(parameters, session, on_progress)
total_estimated = splitter.calculate_total_estimated_results(subqueries)
celery_logger.info(
f"Query split complete: {len(subqueries)} subqueries, "
f"~{total_estimated} estimated total results"
)
celery_logger.info("Loading existing listing IDs from database...")
existing_ids = repository.get_listing_ids(parameters.listing_type)
celery_logger.info(f"Found {len(existing_ids)} existing listings in DB")
celery_logger.info("=" * 60)
celery_logger.info(f"PHASE 2: Streaming fetch & process")
celery_logger.info("=" * 60)
queue: asyncio.Queue[int | None] = asyncio.Queue()
semaphore = asyncio.Semaphore(config.max_concurrent_requests)
_update_task_state(
task,
f"Fetching listings from {len(subqueries)} subqueries...",
{
"phase": PHASE_FETCHING,
"subqueries_completed": 0,
"subqueries_total": len(subqueries),
"ids_collected": 0,
"pages_fetched": 0,
"estimated_results": total_estimated,
"fetching_done": False,
},
)
listing_processor = ListingProcessor(repository)
# Producer: fetch all subqueries concurrently, then signal workers to stop
async def producer() -> None:
await asyncio.gather(
*[
_fetch_subquery(
sq, parameters, session, config,
semaphore, existing_ids, queue, state,
)
for sq in subqueries
]
)
celery_logger.info(
f"Fetch complete: {state.total_pages_fetched} pages from "
f"{state.completed_subqueries} subqueries, "
f"{state.ids_collected} new IDs"
)
state.fetching_done = True
for _ in range(NUM_WORKERS):
await queue.put(None)
await asyncio.gather(
producer(),
*[_process_worker(queue, listing_processor, state) for _ in range(NUM_WORKERS)],
_monitor_progress(task, state, len(subqueries), start_time),
)
except CircuitBreakerOpenError as e:
celery_logger.error(f"Circuit breaker prevented query: {e}")
metrics = get_throttle_metrics()
if metrics.total_requests > 0:
celery_logger.info(metrics.summary())
return []
finally:
metrics = get_throttle_metrics()
if metrics.total_requests > 0:
celery_logger.info(
f"API Stats: {metrics.total_requests} requests, "
f"avg {metrics.average_response_time:.2f}s, "
f"{metrics.total_throttling_events} throttled"
)
elapsed = time.time() - start_time
celery_logger.info("=" * 60)
celery_logger.info(
f"COMPLETED: Processed {len(state.processed_listings)} listings in {elapsed:.1f}s"
)
celery_logger.info("=" * 60)
invalidate_cache()
_update_task_state(task, "Completed", {
"phase": PHASE_COMPLETED, "progress": 1,
"processed": len(state.processed_listings), "total": state.ids_collected,
"message": f"Processed {len(state.processed_listings)} listings in {elapsed:.0f}s",
})
return state.processed_listings
@app.on_after_finalize.connect
def setup_periodic_tasks(sender, **kwargs):
"""Register periodic tasks from environment configuration."""
try:
config = SchedulesConfig.from_env()
except ValueError as e:
celery_logger.error(f"Failed to load schedule configuration: {e}")
return
for schedule in config.get_enabled_schedules():
celery_logger.info(
f"Registering periodic task: {schedule.name} at {schedule.hour}:{schedule.minute}"
)
sender.add_periodic_task(
crontab(
minute=schedule.minute,
hour=schedule.hour,
day_of_week=schedule.day_of_week,
),
dump_listings_task.s(schedule.to_query_parameters().model_dump_json()),
name=schedule.name,
)

8
tasks/task_state.py Normal file
View file

@ -0,0 +1,8 @@
import enum
class TaskStatus(enum.StrEnum):
QUEUED = "queued"
PROCESSING = "processing"
COMPLETED = "completed"
FAILED = "failed"