wrongmove/crawler/tests/unit/test_listing_tasks.py
Viktor Barzin 150342bb9e
Refactor codebase following Clean Code principles and add 229 tests
- Extract helpers to reduce function sizes (listing_tasks, app.py, query.py, listing_fetcher)
  - Replace nonlocal mutations with _PipelineState dataclass in listing_tasks
  - Fix bugs: isinstance→equality check in repository, verify_exp for OIDC tokens
  - Consolidate duplicate filter methods in listing_repository
  - Move hardcoded config to env vars with backward-compatible defaults
  - Simplify CLI decorator to auto-build QueryParameters
  - Add deprecation docstring to data_access.py
  - Test count: 158 → 387 (all passing)
2026-02-07 20:19:57 +00:00

295 lines
10 KiB
Python

"""Unit tests for tasks/listing_tasks.py."""
import json
import os
from collections import deque
from unittest.mock import MagicMock, patch, AsyncMock, call
import pytest
import tasks.listing_tasks as module
from tasks.listing_tasks import (
_update_task_state,
_PipelineState,
TaskLogHandler,
SCRAPE_LOCK_NAME,
LOG_BUFFER_MAX_LINES,
NUM_WORKERS,
PHASE_SPLITTING,
PHASE_FETCHING,
PHASE_PROCESSING,
PHASE_COMPLETED,
)
class TestUpdateTaskState:
    """Exercise _update_task_state against different log-buffer states."""

    @staticmethod
    def _invoke(buffer, state, meta):
        """Call _update_task_state with *buffer* installed as the active buffer.

        Returns the mock task so callers can inspect ``update_state``.
        """
        fake_task = MagicMock()
        # patch.object puts the previous module-level buffer back on exit,
        # including when the call under test raises.
        with patch.object(module, "_active_log_buffer", buffer):
            _update_task_state(fake_task, state, meta)
        return fake_task

    def test_injects_logs_from_active_buffer(self):
        """Buffered lines appear under "logs" next to the caller's own keys."""
        buffered = deque(["log line 1", "log line 2"])
        fake_task = self._invoke(buffered, "test_state", {"key": "value"})

        fake_task.update_state.assert_called_once()
        sent_meta = fake_task.update_state.call_args.kwargs["meta"]
        assert sent_meta["logs"] == ["log line 1", "log line 2"]
        assert sent_meta["key"] == "value"

    def test_works_when_buffer_is_none(self):
        """With no active buffer the meta is forwarded without a "logs" key."""
        fake_task = self._invoke(None, "some_state", {"phase": "testing"})

        fake_task.update_state.assert_called_once_with(
            state="some_state", meta={"phase": "testing"}
        )
        # No "logs" key should be injected
        sent_meta = fake_task.update_state.call_args.kwargs["meta"]
        assert "logs" not in sent_meta

    def test_state_string_is_passed_through(self):
        """The state argument reaches update_state unchanged."""
        fake_task = self._invoke(None, "PROGRESS", {})

        fake_task.update_state.assert_called_once_with(state="PROGRESS", meta={})

    def test_empty_buffer_injects_empty_list(self):
        """An empty (but present) buffer still yields a "logs" entry: []."""
        fake_task = self._invoke(deque(), "state", {"a": 1})

        sent_meta = fake_task.update_state.call_args.kwargs["meta"]
        assert sent_meta["logs"] == []
class TestTaskLogHandler:
    """Tests for the TaskLogHandler."""

    @staticmethod
    def _emit_all(buf, messages):
        """Emit one INFO record per message through a handler bound to *buf*.

        A single TaskLogHandler is created with a bare ``%(message)s``
        formatter, so whatever the handler stores can be compared directly
        against the raw message text.

        Args:
            buf: The deque handed to TaskLogHandler.
            messages: Message strings to emit, in order.
        """
        # Function-scope import: this test module has no top-level
        # ``import logging``.
        import logging

        handler = TaskLogHandler(buf)
        handler.setFormatter(logging.Formatter("%(message)s"))
        for message in messages:
            record = logging.LogRecord(
                name="test",
                level=logging.INFO,
                pathname="",
                lineno=0,
                msg=message,
                args=(),
                exc_info=None,
            )
            handler.emit(record)

    def test_emit_appends_to_buffer(self):
        """An emitted record's formatted message ends up in the buffer."""
        buf = deque(maxlen=10)
        self._emit_all(buf, ["hello"])
        assert "hello" in buf

    def test_buffer_respects_maxlen(self):
        """Only the newest ``maxlen`` messages survive when more are emitted."""
        buf = deque(maxlen=2)
        self._emit_all(buf, [f"msg{i}" for i in range(5)])
        assert len(buf) == 2
        assert list(buf) == ["msg3", "msg4"]
class TestDumpListingsTask:
    """Tests for dump_listings_task Celery task."""

    @staticmethod
    def _lock_context(acquired):
        """Build a stand-in for the ``redis_lock(...)`` context manager.

        Args:
            acquired: Value yielded by ``__enter__`` (True when the lock was
                obtained, False when another job holds it).

        Returns:
            A MagicMock usable in a ``with`` statement.
        """
        lock_cm = MagicMock()
        lock_cm.__enter__.return_value = acquired
        # A falsy __exit__ result means exceptions are not swallowed.
        lock_cm.__exit__.return_value = False
        return lock_cm

    @patch("tasks.listing_tasks.redis_lock")
    def test_skips_when_lock_not_acquired(self, mock_redis_lock):
        """Task should skip when another scrape is running."""
        mock_redis_lock.return_value = self._lock_context(acquired=False)
        task = module.dump_listings_task

        # The task object lives at module level and is shared by every test,
        # so update_state is stubbed via patch.object and restored on exit
        # instead of being overwritten permanently.
        with patch.object(task, "update_state"):
            # Use run() which handles bind=True properly
            result = task.run('{"listing_type": "RENT"}')

        assert result["status"] == "skipped"
        assert result["reason"] == "another_job_running"
        mock_redis_lock.assert_called_once_with(SCRAPE_LOCK_NAME)

    @patch("tasks.listing_tasks.asyncio.run")
    @patch("tasks.listing_tasks.redis_lock")
    def test_calls_dump_listings_full_when_lock_acquired(
        self, mock_redis_lock, mock_asyncio_run
    ):
        """Task should call dump_listings_full when lock is acquired.

        The check is indirect: ``asyncio.run`` is mocked and must be invoked
        exactly once, and the returned summary must report completion.
        """
        mock_redis_lock.return_value = self._lock_context(acquired=True)
        mock_asyncio_run.return_value = []
        task = module.dump_listings_task
        params_json = (
            '{"listing_type": "RENT", "min_price": 1000, "max_price": 5000}'
        )

        # Scoped stub: see the note in test_skips_when_lock_not_acquired.
        with patch.object(task, "update_state"):
            result = task.run(params_json)

        assert result["phase"] == "completed"
        assert result["progress"] == 1
        mock_asyncio_run.assert_called_once()
        mock_redis_lock.assert_called_once_with(SCRAPE_LOCK_NAME)
class TestSetupPeriodicTasks:
    """Verify how setup_periodic_tasks registers schedules on the sender."""

    @staticmethod
    def _run_setup(mock_from_env, schedules):
        """Run setup_periodic_tasks with *schedules* reported as enabled.

        Returns the mock sender that was passed in.
        """
        fake_config = MagicMock()
        fake_config.get_enabled_schedules.return_value = schedules
        mock_from_env.return_value = fake_config

        sender = MagicMock()
        module.setup_periodic_tasks(sender)
        return sender

    @patch("tasks.listing_tasks.SchedulesConfig.from_env")
    def test_registers_enabled_schedules(self, mock_from_env):
        """One enabled schedule leads to one registration under its name."""
        from config.schedule_config import ScheduleConfig
        from models.listing import ListingType

        only_schedule = ScheduleConfig(
            name="Test Schedule",
            listing_type=ListingType.RENT,
            hour="3",
            minute="30",
        )
        sender = self._run_setup(mock_from_env, [only_schedule])

        sender.add_periodic_task.assert_called_once()
        registered_kwargs = sender.add_periodic_task.call_args.kwargs
        assert registered_kwargs["name"] == "Test Schedule"

    @patch("tasks.listing_tasks.SchedulesConfig.from_env")
    def test_handles_config_error_gracefully(self, mock_from_env):
        """A ValueError while loading the config registers nothing."""
        mock_from_env.side_effect = ValueError("bad config")
        sender = MagicMock()

        module.setup_periodic_tasks(sender)

        sender.add_periodic_task.assert_not_called()

    @patch("tasks.listing_tasks.SchedulesConfig.from_env")
    def test_registers_nothing_when_no_schedules(self, mock_from_env):
        """An empty list of enabled schedules registers nothing."""
        sender = self._run_setup(mock_from_env, [])

        sender.add_periodic_task.assert_not_called()

    @patch("tasks.listing_tasks.SchedulesConfig.from_env")
    def test_registers_multiple_schedules(self, mock_from_env):
        """Each enabled schedule produces its own registration call."""
        from config.schedule_config import ScheduleConfig
        from models.listing import ListingType

        rent = ScheduleConfig(name="Rent", listing_type=ListingType.RENT, hour="2")
        buy = ScheduleConfig(name="Buy", listing_type=ListingType.BUY, hour="4")
        sender = self._run_setup(mock_from_env, [rent, buy])

        assert sender.add_periodic_task.call_count == 2
class TestPipelineState:
    """Checks on the _PipelineState container used by the pipeline."""

    def test_default_initialization(self):
        """A fresh state has zeroed counters, a False flag and no listings."""
        state = _PipelineState()
        counter_fields = (
            "ids_collected",
            "completed_subqueries",
            "total_pages_fetched",
            "processed_count",
            "failed_count",
            "details_fetched",
            "images_downloaded",
            "ocr_completed",
        )
        for field_name in counter_fields:
            assert getattr(state, field_name) == 0, field_name
        assert state.fetching_done is False
        assert state.processed_listings == []

    def test_incrementing_counters(self):
        """Each counter can be bumped independently of the others."""
        increments = {
            "ids_collected": 5,
            "completed_subqueries": 3,
            "total_pages_fetched": 10,
            "processed_count": 4,
            "failed_count": 1,
            "details_fetched": 4,
            "images_downloaded": 3,
            "ocr_completed": 2,
        }
        state = _PipelineState()

        # Apply every increment first, then verify, so a counter that
        # bleeds into another one would be caught.
        for field_name, amount in increments.items():
            setattr(state, field_name, getattr(state, field_name) + amount)

        for field_name, expected in increments.items():
            assert getattr(state, field_name) == expected, field_name

    def test_appending_to_processed_listings(self):
        """Appended listings are kept in insertion order."""
        state = _PipelineState()
        for listing in ("listing_a", "listing_b"):
            state.processed_listings.append(listing)

        assert len(state.processed_listings) == 2
        assert state.processed_listings == ["listing_a", "listing_b"]

    def test_separate_instances_have_independent_lists(self):
        """Mutating one instance's list must not affect another instance."""
        first = _PipelineState()
        second = _PipelineState()

        first.processed_listings.append("only_a")

        assert second.processed_listings == []

    def test_fetching_done_toggle(self):
        """fetching_done starts False and can be switched to True."""
        state = _PipelineState()
        assert state.fetching_done is False

        state.fetching_done = True

        assert state.fetching_done is True
class TestPhaseConstants:
    """Pin the literal values of the phase names and the worker count."""

    def test_phase_splitting(self):
        """PHASE_SPLITTING is the string "splitting"."""
        assert PHASE_SPLITTING == "splitting"

    def test_phase_fetching(self):
        """PHASE_FETCHING is the string "fetching"."""
        assert PHASE_FETCHING == "fetching"

    def test_phase_processing(self):
        """PHASE_PROCESSING is the string "processing"."""
        assert PHASE_PROCESSING == "processing"

    def test_phase_completed(self):
        """PHASE_COMPLETED is the string "completed"."""
        assert PHASE_COMPLETED == "completed"

    def test_num_workers(self):
        """NUM_WORKERS is 20."""
        assert NUM_WORKERS == 20