wrongmove/crawler/tests/unit/test_query_splitter.py

374 lines
13 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Unit tests for QuerySplitter service."""
import pytest
from unittest.mock import AsyncMock, patch
from config.scraper_config import ScraperConfig
from models.listing import ListingType, QueryParameters
from services.query_splitter import QuerySplitter, SubQuery
class TestScraperConfig:
    """Checks ScraperConfig defaults and loading from environment variables."""

    def test_default_values(self) -> None:
        """A ScraperConfig built with no arguments exposes the expected defaults."""
        defaults = ScraperConfig()
        expected_numbers = {
            "max_concurrent_requests": 5,
            "request_delay_ms": 100,
            "result_cap": 1500,
            "split_threshold": 1200,
            "min_price_band": 100,
            "max_pages_per_query": 60,
        }
        for field_name, expected in expected_numbers.items():
            assert getattr(defaults, field_name) == expected, field_name
        # The proxy is checked by identity: it must be None, not merely falsy.
        assert defaults.proxy_url is None

    def test_from_env(self) -> None:
        """from_env() reflects the RIGHTMOVE_* variables patched into os.environ."""
        env_overrides = {
            "RIGHTMOVE_MAX_CONCURRENT": "10",
            "RIGHTMOVE_REQUEST_DELAY_MS": "200",
            "RIGHTMOVE_SPLIT_THRESHOLD": "1000",
            "RIGHTMOVE_MIN_PRICE_BAND": "50",
            "RIGHTMOVE_MAX_PAGES": "30",
            "RIGHTMOVE_PROXY_URL": "socks5://localhost:9050",
        }
        expected_fields = {
            "max_concurrent_requests": 10,
            "request_delay_ms": 200,
            "split_threshold": 1000,
            "min_price_band": 50,
            "max_pages_per_query": 30,
            "proxy_url": "socks5://localhost:9050",
        }
        with patch.dict("os.environ", env_overrides):
            loaded = ScraperConfig.from_env()
            for field_name, expected in expected_fields.items():
                assert getattr(loaded, field_name) == expected, field_name

    def test_from_env_empty_proxy(self) -> None:
        """An empty RIGHTMOVE_PROXY_URL yields proxy_url of None."""
        # patch.dict leaves the other environment variables in place by default.
        with patch.dict("os.environ", {"RIGHTMOVE_PROXY_URL": ""}):
            loaded = ScraperConfig.from_env()
            assert loaded.proxy_url is None
class TestSubQuery:
    """Checks values derived by the SubQuery dataclass."""

    def test_price_range_calculation(self) -> None:
        """price_range is the difference between max_price and min_price."""
        lowest, highest = 1000, 2000
        band = SubQuery(
            district="Kings Cross",
            min_bedrooms=2,
            max_bedrooms=2,
            min_price=lowest,
            max_price=highest,
        )
        assert band.price_range == highest - lowest
class TestQuerySplitter:
    """Checks QuerySplitter: initial subqueries, price splitting, probing and adaptive splits."""

    # Dotted paths of the collaborators replaced with mocks in these tests.
    _PROBE_TARGET = "services.query_splitter.probe_query"
    _DISTRICTS_TARGET = "services.query_splitter.get_districts"

    @staticmethod
    def _kings_cross_band(max_price: int = 5000, **extra: int) -> SubQuery:
        """Build a 2-bedroom Kings Cross SubQuery starting at 1000.

        Args:
            max_price: Upper price bound of the band.
            **extra: Additional SubQuery keyword arguments (for example
                ``estimated_results``); omitted ones are not passed at all.
        """
        return SubQuery(
            district="Kings Cross",
            min_bedrooms=2,
            max_bedrooms=2,
            min_price=1000,
            max_price=max_price,
            **extra,
        )

    @staticmethod
    def _probe_payloads(*counts: int) -> list:
        """Return one fake probe response per count, in the order given."""
        return [{"totalAvailableResults": count} for count in counts]

    @pytest.fixture
    def config(self) -> ScraperConfig:
        """Provide the ScraperConfig used by the splitter under test."""
        settings = {
            "max_concurrent_requests": 5,
            "request_delay_ms": 10,  # Faster for testing
            "result_cap": 1500,
            "split_threshold": 1200,
            "min_price_band": 100,
            "max_pages_per_query": 60,
            "proxy_url": None,
        }
        return ScraperConfig(**settings)

    @pytest.fixture
    def splitter(self, config: ScraperConfig) -> QuerySplitter:
        """Provide a QuerySplitter built from the ``config`` fixture."""
        return QuerySplitter(config)

    @pytest.fixture
    def parameters(self) -> QueryParameters:
        """Provide rental query parameters: 2-3 bedrooms, 1000-5000, two districts."""
        return QueryParameters(
            listing_type=ListingType.RENT,
            min_bedrooms=2,
            max_bedrooms=3,
            min_price=1000,
            max_price=5000,
            district_names={"Kings Cross", "Angel"},
        )

    def test_create_initial_subqueries(
        self, splitter: QuerySplitter, parameters: QueryParameters
    ) -> None:
        """Initial subqueries cover each district and bedroom count."""
        district_lookup = {"Kings Cross": "STATION^5168", "Angel": "STATION^1234"}

        created = splitter.create_initial_subqueries(parameters, district_lookup)

        # Two districts, each with two bedroom counts (2 and 3), gives four subqueries.
        assert len(created) == 4

        # The first subquery is the 2-bedroom Kings Cross one with the full price range.
        first = created[0]
        assert first.district == "Kings Cross"
        assert (first.min_bedrooms, first.max_bedrooms) == (2, 2)
        assert (first.min_price, first.max_price) == (1000, 5000)

    def test_split_by_price(self, splitter: QuerySplitter) -> None:
        """split_by_price cuts a band in two at the price midpoint."""
        halves = splitter.split_by_price(self._kings_cross_band())

        assert len(halves) == 2
        lower, upper = halves
        # 3000 is the midpoint of the 1000-5000 band and is shared by both halves.
        assert (lower.min_price, lower.max_price) == (1000, 3000)
        assert (upper.min_price, upper.max_price) == (3000, 5000)

        # District and bedroom range carry over to both halves unchanged.
        for part in halves:
            assert part.district == "Kings Cross"
            assert (part.min_bedrooms, part.max_bedrooms) == (2, 2)

    @pytest.mark.asyncio
    async def test_probe_result_count(
        self, splitter: QuerySplitter, parameters: QueryParameters
    ) -> None:
        """probe_result_count returns the total reported by probe_query."""
        band = self._kings_cross_band()
        session = AsyncMock()

        # Replace probe_query with a mock that reports 800 available results.
        with patch(
            self._PROBE_TARGET, return_value={"totalAvailableResults": 800}
        ) as mock_probe:
            count = await splitter.probe_result_count(band, session, parameters)

        assert count == 800
        mock_probe.assert_called_once()

    @pytest.mark.asyncio
    async def test_probe_result_count_handles_error(
        self, splitter: QuerySplitter, parameters: QueryParameters
    ) -> None:
        """probe_result_count returns 0 when probe_query raises."""
        band = self._kings_cross_band()
        session = AsyncMock()

        with patch(self._PROBE_TARGET, side_effect=Exception("API error")):
            count = await splitter.probe_result_count(band, session, parameters)

        # A failed probe is reported as zero results rather than propagating.
        assert count == 0

    @pytest.mark.asyncio
    async def test_adaptive_split_no_split_needed(
        self, splitter: QuerySplitter, parameters: QueryParameters
    ) -> None:
        """Halves that probe below the threshold are returned without further splitting."""
        band = self._kings_cross_band(max_price=2000)
        session = AsyncMock()
        semaphore = AsyncMock()

        # The first probed half reports 600 results, the second 500.
        with patch(self._PROBE_TARGET, side_effect=self._probe_payloads(600, 500)):
            result = await splitter.adaptive_split(
                band, session, parameters, semaphore
            )

        # Both counts are under the 1200 threshold, so exactly two subqueries return.
        assert len(result) == 2
        assert [part.estimated_results for part in result] == [600, 500]

    @pytest.mark.asyncio
    async def test_adaptive_split_recursive_splitting(
        self, splitter: QuerySplitter, parameters: QueryParameters
    ) -> None:
        """A half that probes above the threshold is split again."""
        band = self._kings_cross_band()
        session = AsyncMock()
        semaphore = AsyncMock()

        # Probe responses, in call order:
        #   1300 - first half (1000-3000), over the threshold so it is split again
        #    800 - second half (3000-5000), fine as is
        #    700 - first quarter (1000-2000), fine as is
        #    600 - second quarter (2000-3000), fine as is
        responses = self._probe_payloads(1300, 800, 700, 600)
        with patch(self._PROBE_TARGET, side_effect=responses):
            result = await splitter.adaptive_split(
                band, session, parameters, semaphore
            )

        # Expected bands: 1000-2000 (700), 2000-3000 (600) and 3000-5000 (800).
        assert len(result) == 3

    @pytest.mark.asyncio
    async def test_adaptive_split_respects_min_price_band(
        self, splitter: QuerySplitter, parameters: QueryParameters
    ) -> None:
        """A band narrower than min_price_band is returned as is."""
        # The 50-wide range is below the configured min_price_band of 100, and the
        # estimate of 1500 is over the threshold, so it would be split if it could be.
        band = self._kings_cross_band(max_price=1050, estimated_results=1500)
        session = AsyncMock()
        semaphore = AsyncMock()

        result = await splitter.adaptive_split(band, session, parameters, semaphore)

        # The original band comes back alone with its bounds untouched.
        assert len(result) == 1
        only = result[0]
        assert (only.min_price, only.max_price) == (1000, 1050)

    def test_calculate_total_estimated_results(
        self, splitter: QuerySplitter
    ) -> None:
        """Estimates are summed, with a missing estimate counted as zero."""
        # (district, bedrooms, estimated_results); None marks a band not yet probed.
        specs = [
            ("Kings Cross", 2, 500),
            ("Kings Cross", 3, 300),
            ("Angel", 2, None),
        ]
        subqueries = [
            SubQuery(
                district=district,
                min_bedrooms=bedrooms,
                max_bedrooms=bedrooms,
                min_price=1000,
                max_price=2000,
                estimated_results=estimate,
            )
            for district, bedrooms, estimate in specs
        ]

        total = splitter.calculate_total_estimated_results(subqueries)

        assert total == 800  # 500 + 300 + 0

    @pytest.mark.asyncio
    async def test_split_integration(
        self, splitter: QuerySplitter, parameters: QueryParameters
    ) -> None:
        """Run the whole split workflow against mocked districts and probes."""
        session = AsyncMock()
        district_lookup = {"Kings Cross": "STATION^5168", "Angel": "STATION^1234"}

        # Probe responses, in call order. The first four answer the initial
        # subqueries (two districts, two bedroom counts each):
        #    500 - Kings Cross 2BR, fine as is
        #   1300 - Kings Cross 3BR, needs a split
        #    600 - Angel 2BR, fine as is
        #    800 - Angel 3BR, fine as is
        # The last two answer the halves of the split Kings Cross 3BR band:
        #    700 - first half
        #    600 - second half
        responses = self._probe_payloads(500, 1300, 600, 800, 700, 600)

        districts_patch = patch(self._DISTRICTS_TARGET, return_value=district_lookup)
        probe_patch = patch(self._PROBE_TARGET, side_effect=responses)
        with districts_patch, probe_patch:
            result = await splitter.split(parameters, session)

            # Five subqueries: KC 2BR (500), KC 3BR halves (700 and 600),
            # Angel 2BR (600) and Angel 3BR (800).
            assert len(result) == 5

            # The estimates add up across all returned subqueries.
            total = splitter.calculate_total_estimated_results(result)
            assert total == 3200  # 500 + 700 + 600 + 600 + 800

    @pytest.mark.asyncio
    async def test_split_with_on_progress_callback(
        self, splitter: QuerySplitter, parameters: QueryParameters
    ) -> None:
        """split() reports progress through the supplied callback."""
        session = AsyncMock()
        district_lookup = {"Kings Cross": "STATION^5168", "Angel": "STATION^1234"}
        recorded: list[tuple[str, str]] = []

        def on_progress(phase: str, message: str) -> None:
            # Keep every notification so the phases can be inspected afterwards.
            recorded.append((phase, message))

        districts_patch = patch(self._DISTRICTS_TARGET, return_value=district_lookup)
        probe_patch = patch(
            self._PROBE_TARGET, return_value={"totalAvailableResults": 500}
        )
        with districts_patch, probe_patch:
            await splitter.split(parameters, session, on_progress)

        # At least two updates are expected, including the start and completion phases.
        assert len(recorded) >= 2
        seen_phases = {phase for phase, _ in recorded}
        assert "splitting" in seen_phases
        assert "splitting_complete" in seen_phases