# wrongmove/crawler/tests/unit/test_query_splitter.py
# (repository viewer metadata: 375 lines, 13 KiB, Python)

"""Unit tests for QuerySplitter service."""
import pytest
from unittest.mock import AsyncMock, patch
from config.scraper_config import ScraperConfig
from models.listing import ListingType, QueryParameters
from services.query_splitter import QuerySplitter, SubQuery
class TestScraperConfig:
    """Checks for ScraperConfig defaults and environment-driven construction."""

    def test_default_values(self) -> None:
        """A ScraperConfig built with no arguments carries the stock settings."""
        expected_defaults = {
            "max_concurrent_requests": 5,
            "request_delay_ms": 100,
            "result_cap": 1500,
            "split_threshold": 1200,
            "min_price_band": 100,
            "max_pages_per_query": 60,
        }

        config = ScraperConfig()

        for attribute, expected in expected_defaults.items():
            assert getattr(config, attribute) == expected
        # Without an explicit value the proxy setting is None.
        assert config.proxy_url is None

    def test_from_env(self) -> None:
        """from_env() reflects each RIGHTMOVE_* value placed in os.environ."""
        environment = {
            "RIGHTMOVE_MAX_CONCURRENT": "10",
            "RIGHTMOVE_REQUEST_DELAY_MS": "200",
            "RIGHTMOVE_SPLIT_THRESHOLD": "1000",
            "RIGHTMOVE_MIN_PRICE_BAND": "50",
            "RIGHTMOVE_MAX_PAGES": "30",
            "RIGHTMOVE_PROXY_URL": "socks5://localhost:9050",
        }
        expected_settings = {
            "max_concurrent_requests": 10,
            "request_delay_ms": 200,
            "split_threshold": 1000,
            "min_price_band": 50,
            "max_pages_per_query": 30,
            "proxy_url": "socks5://localhost:9050",
        }

        with patch.dict("os.environ", environment):
            config = ScraperConfig.from_env()

            for attribute, expected in expected_settings.items():
                assert getattr(config, attribute) == expected

    def test_from_env_empty_proxy(self) -> None:
        """An empty RIGHTMOVE_PROXY_URL yields proxy_url of None."""
        blank_proxy = {"RIGHTMOVE_PROXY_URL": ""}

        # clear=False leaves every other environment variable untouched.
        with patch.dict("os.environ", blank_proxy, clear=False):
            config = ScraperConfig.from_env()

            assert config.proxy_url is None
class TestSubQuery:
    """Checks for SubQuery."""

    def test_price_range_calculation(self) -> None:
        """price_range is 1000 for a subquery priced from 1000 to 2000."""
        band = SubQuery(
            min_price=1000,
            max_price=2000,
            min_bedrooms=2,
            max_bedrooms=2,
            district="Kings Cross",
        )

        assert band.price_range == 1000
class TestQuerySplitter:
    """Exercises QuerySplitter: initial subqueries, price splits and probing."""

    # Names patched inside the module under test.
    _PROBE_QUERY = "services.query_splitter.probe_query"
    _GET_DISTRICTS = "services.query_splitter.get_districts"

    @staticmethod
    def _kings_cross_two_bed(
        min_price: int = 1000, max_price: int = 5000
    ) -> SubQuery:
        """Build the 2-bedroom Kings Cross SubQuery most tests start from."""
        return SubQuery(
            district="Kings Cross",
            min_bedrooms=2,
            max_bedrooms=2,
            min_price=min_price,
            max_price=max_price,
        )

    @pytest.fixture
    def config(self) -> ScraperConfig:
        """Provide a ScraperConfig with every setting given explicitly."""
        return ScraperConfig(
            proxy_url=None,
            max_pages_per_query=60,
            min_price_band=100,
            split_threshold=1200,
            result_cap=1500,
            request_delay_ms=10,  # short delay keeps the tests quick
            max_concurrent_requests=5,
        )

    @pytest.fixture
    def splitter(self, config: ScraperConfig) -> QuerySplitter:
        """Provide a QuerySplitter built from the ``config`` fixture."""
        return QuerySplitter(config)

    @pytest.fixture
    def parameters(self) -> QueryParameters:
        """Provide rental query parameters: 2-3 bedrooms, 1000-5000, two districts."""
        return QueryParameters(
            listing_type=ListingType.RENT,
            district_names={"Kings Cross", "Angel"},
            min_bedrooms=2,
            max_bedrooms=3,
            min_price=1000,
            max_price=5000,
        )

    def test_create_initial_subqueries(
        self, splitter: QuerySplitter, parameters: QueryParameters
    ) -> None:
        """create_initial_subqueries returns one subquery per district and bedroom count."""
        district_ids = {"Kings Cross": "STATION^5168", "Angel": "STATION^1234"}

        subqueries = splitter.create_initial_subqueries(parameters, district_ids)

        # Two districts, each with bedroom counts 2 and 3, gives four subqueries.
        assert len(subqueries) == 4

        # The first entry spans the full price range for Kings Cross, 2 bedrooms.
        first = subqueries[0]
        assert first.district == "Kings Cross"
        assert first.min_bedrooms == 2
        assert first.max_bedrooms == 2
        assert first.min_price == 1000
        assert first.max_price == 5000

    def test_split_by_price(self, splitter: QuerySplitter) -> None:
        """split_by_price cuts a 1000-5000 band into 1000-3000 and 3000-5000."""
        halves = splitter.split_by_price(self._kings_cross_two_bed())

        assert len(halves) == 2
        lower, upper = halves
        assert lower.min_price == 1000
        assert lower.max_price == 3000  # midpoint of 1000 and 5000
        assert upper.min_price == 3000
        assert upper.max_price == 5000

        # District and bedroom range carry over unchanged to both halves.
        for piece in halves:
            assert piece.district == "Kings Cross"
            assert piece.min_bedrooms == 2
            assert piece.max_bedrooms == 2

    @pytest.mark.asyncio
    async def test_probe_result_count(
        self, splitter: QuerySplitter, parameters: QueryParameters
    ) -> None:
        """probe_result_count returns the totalAvailableResults from probe_query."""
        subquery = self._kings_cross_two_bed()
        session = AsyncMock()

        # probe_query is replaced so no real request is made.
        with patch(
            self._PROBE_QUERY, return_value={"totalAvailableResults": 800}
        ) as mock_probe:
            count = await splitter.probe_result_count(subquery, session, parameters)

            assert count == 800
            mock_probe.assert_called_once()

    @pytest.mark.asyncio
    async def test_probe_result_count_handles_error(
        self, splitter: QuerySplitter, parameters: QueryParameters
    ) -> None:
        """probe_result_count returns 0 when probe_query raises."""
        subquery = self._kings_cross_two_bed()
        session = AsyncMock()

        with patch(self._PROBE_QUERY, side_effect=Exception("API error")):
            count = await splitter.probe_result_count(subquery, session, parameters)

            # The failure is reported as a zero count rather than an exception.
            assert count == 0

    @pytest.mark.asyncio
    async def test_adaptive_split_no_split_needed(
        self, splitter: QuerySplitter, parameters: QueryParameters
    ) -> None:
        """adaptive_split stops after one cut when both halves are under threshold."""
        subquery = self._kings_cross_two_bed(min_price=1000, max_price=2000)
        session = AsyncMock()
        semaphore = AsyncMock()
        # Probe answers in the order the test expects them: first half, second half.
        half_counts = (600, 500)

        with patch(self._PROBE_QUERY) as mock_probe:
            mock_probe.side_effect = [
                {"totalAvailableResults": count} for count in half_counts
            ]

            result = await splitter.adaptive_split(
                subquery, session, parameters, semaphore
            )

            # Both halves sit below the 1200 threshold, so exactly two come back.
            assert len(result) == 2
            assert result[0].estimated_results == 600
            assert result[1].estimated_results == 500

    @pytest.mark.asyncio
    async def test_adaptive_split_recursive_splitting(
        self, splitter: QuerySplitter, parameters: QueryParameters
    ) -> None:
        """adaptive_split keeps cutting a half that is still over the threshold."""
        subquery = self._kings_cross_two_bed()
        session = AsyncMock()
        semaphore = AsyncMock()
        # Scenario, in the probe order the test expects:
        #   1000-3000 is over the threshold and 3000-5000 is fine, then the
        #   oversized half is cut again into 1000-2000 and 2000-3000.
        probe_counts = (
            1300,  # first half - needs more splitting
            800,  # second half - OK
            700,  # first quarter - OK
            600,  # second quarter - OK
        )

        with patch(self._PROBE_QUERY) as mock_probe:
            mock_probe.side_effect = [
                {"totalAvailableResults": count} for count in probe_counts
            ]

            result = await splitter.adaptive_split(
                subquery, session, parameters, semaphore
            )

            # Expected pieces: 1000-2000 (700), 2000-3000 (600), 3000-5000 (800).
            assert len(result) == 3

    @pytest.mark.asyncio
    async def test_adaptive_split_respects_min_price_band(
        self, splitter: QuerySplitter, parameters: QueryParameters
    ) -> None:
        """adaptive_split returns a band narrower than min_price_band unsplit."""
        narrow = SubQuery(
            district="Kings Cross",
            min_bedrooms=2,
            max_bedrooms=2,
            min_price=1000,
            max_price=1050,  # 50 wide, under the configured min_price_band of 100
            estimated_results=1500,  # over the threshold, yet too narrow to cut
        )
        session = AsyncMock()
        semaphore = AsyncMock()

        result = await splitter.adaptive_split(
            narrow, session, parameters, semaphore
        )

        # The original band comes back untouched.
        assert len(result) == 1
        only = result[0]
        assert only.min_price == 1000
        assert only.max_price == 1050

    def test_calculate_total_estimated_results(
        self, splitter: QuerySplitter
    ) -> None:
        """calculate_total_estimated_results sums estimates, counting None as 0."""
        # (district, bedrooms, estimated_results); None marks an unprobed subquery.
        specs = (
            ("Kings Cross", 2, 500),
            ("Kings Cross", 3, 300),
            ("Angel", 2, None),
        )
        subqueries = [
            SubQuery(
                district=district,
                min_bedrooms=bedrooms,
                max_bedrooms=bedrooms,
                min_price=1000,
                max_price=2000,
                estimated_results=estimate,
            )
            for district, bedrooms, estimate in specs
        ]

        total = splitter.calculate_total_estimated_results(subqueries)

        assert total == 800  # 500 + 300 + 0

    @pytest.mark.asyncio
    async def test_split_integration(
        self, splitter: QuerySplitter, parameters: QueryParameters
    ) -> None:
        """split() runs the whole workflow and cuts only the oversized subquery."""
        session = AsyncMock()
        district_ids = {"Kings Cross": "STATION^5168", "Angel": "STATION^1234"}
        # Four initial subqueries (2 districts x 2 bedroom counts), then the two
        # halves of the one over the threshold, in the probe order the test expects.
        probe_counts = (
            500,  # Kings Cross 2BR - OK
            1300,  # Kings Cross 3BR - needs split
            600,  # Angel 2BR - OK
            800,  # Angel 3BR - OK
            700,  # Kings Cross 3BR, first half
            600,  # Kings Cross 3BR, second half
        )

        with patch(self._GET_DISTRICTS, return_value=district_ids), patch(
            self._PROBE_QUERY
        ) as mock_probe:
            mock_probe.side_effect = [
                {"totalAvailableResults": count} for count in probe_counts
            ]

            result = await splitter.split(parameters, session)

            # Five pieces: KC 2BR, KC 3BR as two halves, Angel 2BR, Angel 3BR.
            assert len(result) == 5

            total = splitter.calculate_total_estimated_results(result)
            assert total == 3200  # 500 + 700 + 600 + 600 + 800

    @pytest.mark.asyncio
    async def test_split_with_on_progress_callback(
        self, splitter: QuerySplitter, parameters: QueryParameters
    ) -> None:
        """split() reports progress through the callback it is handed."""
        session = AsyncMock()
        district_ids = {"Kings Cross": "STATION^5168", "Angel": "STATION^1234"}
        progress_calls: list[tuple[str, str]] = []

        def record(phase: str, message: str) -> None:
            progress_calls.append((phase, message))

        with patch(self._GET_DISTRICTS, return_value=district_ids), patch(
            self._PROBE_QUERY, return_value={"totalAvailableResults": 500}
        ):
            await splitter.split(parameters, session, record)

            # At least two updates, covering both the start and end phases.
            assert len(progress_calls) >= 2
            seen_phases = {phase for phase, _ in progress_calls}
            assert "splitting" in seen_phases
            assert "splitting_complete" in seen_phases