375 lines
13 KiB
Python
375 lines
13 KiB
Python
|
|
"""Unit tests for QuerySplitter service."""
|
|||
|
|
import pytest
|
|||
|
|
from unittest.mock import AsyncMock, patch
|
|||
|
|
|
|||
|
|
from config.scraper_config import ScraperConfig
|
|||
|
|
from models.listing import ListingType, QueryParameters
|
|||
|
|
from services.query_splitter import QuerySplitter, SubQuery
|
|||
|
|
|
|||
|
|
|
|||
|
|
class TestScraperConfig:
    """Checks on ScraperConfig: constructor defaults and loading via from_env()."""

    def test_default_values(self) -> None:
        """A ScraperConfig built with no arguments has the default values pinned here."""
        defaults = ScraperConfig()

        expected_numbers = {
            "max_concurrent_requests": 5,
            "request_delay_ms": 100,
            "result_cap": 1500,
            "split_threshold": 1200,
            "min_price_band": 100,
            "max_pages_per_query": 60,
        }
        for field_name, expected in expected_numbers.items():
            # The field name is the assertion message so a failure says which one drifted.
            assert getattr(defaults, field_name) == expected, field_name
        assert defaults.proxy_url is None

    def test_from_env(self) -> None:
        """from_env() picks up each RIGHTMOVE_* variable patched into os.environ."""
        env_overrides = {
            "RIGHTMOVE_MAX_CONCURRENT": "10",
            "RIGHTMOVE_REQUEST_DELAY_MS": "200",
            "RIGHTMOVE_SPLIT_THRESHOLD": "1000",
            "RIGHTMOVE_MIN_PRICE_BAND": "50",
            "RIGHTMOVE_MAX_PAGES": "30",
            "RIGHTMOVE_PROXY_URL": "socks5://localhost:9050",
        }
        with patch.dict("os.environ", env_overrides):
            loaded = ScraperConfig.from_env()

            # Asserted while the patched environment is still active.
            assert loaded.max_concurrent_requests == 10
            assert loaded.request_delay_ms == 200
            assert loaded.split_threshold == 1000
            assert loaded.min_price_band == 50
            assert loaded.max_pages_per_query == 30
            assert loaded.proxy_url == "socks5://localhost:9050"

    def test_from_env_empty_proxy(self) -> None:
        """An empty RIGHTMOVE_PROXY_URL is loaded as proxy_url=None."""
        blank_proxy = {"RIGHTMOVE_PROXY_URL": ""}
        # clear=False: only the proxy variable is overridden, the rest of the
        # environment is left as it is.
        with patch.dict("os.environ", blank_proxy, clear=False):
            loaded = ScraperConfig.from_env()
            assert loaded.proxy_url is None
|
|||
|
|
|
|||
|
|
|
|||
|
|
class TestSubQuery:
    """Checks on the SubQuery dataclass."""

    def test_price_range_calculation(self) -> None:
        """price_range equals 1000 for a subquery priced from 1000 to 2000."""
        lower_bound, upper_bound = 1000, 2000
        subquery = SubQuery(
            district="Kings Cross",
            min_bedrooms=2,
            max_bedrooms=2,
            min_price=lower_bound,
            max_price=upper_bound,
        )

        assert subquery.price_range == 1000
|
|||
|
|
|
|||
|
|
|
|||
|
|
class TestQuerySplitter:
    """Checks on QuerySplitter: subquery creation, probing, and adaptive splitting."""

    # Patch targets inside the module under test.
    _PROBE_TARGET = "services.query_splitter.probe_query"
    _DISTRICTS_TARGET = "services.query_splitter.get_districts"

    @staticmethod
    def _kings_cross_two_bed(max_price: int = 5000) -> SubQuery:
        """Build the 2-bedroom Kings Cross subquery (min price 1000) shared by several tests."""
        return SubQuery(
            district="Kings Cross",
            min_bedrooms=2,
            max_bedrooms=2,
            min_price=1000,
            max_price=max_price,
        )

    @pytest.fixture
    def config(self) -> ScraperConfig:
        """Provide the ScraperConfig used by every splitter test."""
        settings = ScraperConfig(
            max_concurrent_requests=5,
            request_delay_ms=10,  # Faster for testing
            result_cap=1500,
            split_threshold=1200,
            min_price_band=100,
            max_pages_per_query=60,
            proxy_url=None,
        )
        return settings

    @pytest.fixture
    def splitter(self, config: ScraperConfig) -> QuerySplitter:
        """Provide a QuerySplitter built from the test configuration."""
        return QuerySplitter(config)

    @pytest.fixture
    def parameters(self) -> QueryParameters:
        """Provide rental query parameters: 2-3 bedrooms, 1000-5000, two districts."""
        return QueryParameters(
            listing_type=ListingType.RENT,
            min_bedrooms=2,
            max_bedrooms=3,
            min_price=1000,
            max_price=5000,
            district_names={"Kings Cross", "Angel"},
        )

    def test_create_initial_subqueries(
        self, splitter: QuerySplitter, parameters: QueryParameters
    ) -> None:
        """create_initial_subqueries yields one subquery per district and bedroom count."""
        district_ids = {"Kings Cross": "STATION^5168", "Angel": "STATION^1234"}

        created = splitter.create_initial_subqueries(parameters, district_ids)

        # 2 districts x 2 bedroom counts (2, 3) = 4 subqueries
        assert len(created) == 4

        # The first subquery carries the full price range of the parameters.
        first = created[0]
        assert first.district == "Kings Cross"
        assert first.min_bedrooms == 2
        assert first.max_bedrooms == 2
        assert first.min_price == 1000
        assert first.max_price == 5000

    def test_split_by_price(self, splitter: QuerySplitter) -> None:
        """split_by_price cuts a subquery into two halves at the price midpoint."""
        pieces = splitter.split_by_price(self._kings_cross_two_bed())

        assert len(pieces) == 2
        lower, upper = pieces
        assert lower.min_price == 1000
        assert lower.max_price == 3000  # midpoint
        assert upper.min_price == 3000
        assert upper.max_price == 5000

        # District and bedroom range are carried over unchanged to both halves.
        for piece in pieces:
            assert piece.district == "Kings Cross"
            assert piece.min_bedrooms == 2
            assert piece.max_bedrooms == 2

    @pytest.mark.asyncio
    async def test_probe_result_count(
        self, splitter: QuerySplitter, parameters: QueryParameters
    ) -> None:
        """probe_result_count returns the totalAvailableResults reported by the probe."""
        subquery = self._kings_cross_two_bed()
        session = AsyncMock()

        # Replace the probe_query function with a mock.
        with patch(self._PROBE_TARGET) as mock_probe:
            mock_probe.return_value = {"totalAvailableResults": 800}

            reported = await splitter.probe_result_count(subquery, session, parameters)

            assert reported == 800
            mock_probe.assert_called_once()

    @pytest.mark.asyncio
    async def test_probe_result_count_handles_error(
        self, splitter: QuerySplitter, parameters: QueryParameters
    ) -> None:
        """probe_result_count reports 0 when the probe raises."""
        subquery = self._kings_cross_two_bed()
        session = AsyncMock()

        with patch(self._PROBE_TARGET) as mock_probe:
            mock_probe.side_effect = Exception("API error")

            reported = await splitter.probe_result_count(subquery, session, parameters)

            # Should return 0 on error
            assert reported == 0

    @pytest.mark.asyncio
    async def test_adaptive_split_no_split_needed(
        self, splitter: QuerySplitter, parameters: QueryParameters
    ) -> None:
        """adaptive_split stops recursing once both halves are below the threshold."""
        subquery = self._kings_cross_two_bed(max_price=2000)
        session = AsyncMock()
        semaphore = AsyncMock()

        with patch(self._PROBE_TARGET) as mock_probe:
            # First half has 600 results, second half has 500
            mock_probe.side_effect = [
                {"totalAvailableResults": count} for count in (600, 500)
            ]

            leaves = await splitter.adaptive_split(
                subquery, session, parameters, semaphore
            )

        # Both halves are under threshold (1200), so we get 2 subqueries back
        assert len(leaves) == 2
        assert leaves[0].estimated_results == 600
        assert leaves[1].estimated_results == 500

    @pytest.mark.asyncio
    async def test_adaptive_split_recursive_splitting(
        self, splitter: QuerySplitter, parameters: QueryParameters
    ) -> None:
        """adaptive_split keeps splitting a half whose probe exceeds the threshold."""
        subquery = self._kings_cross_two_bed()
        session = AsyncMock()
        semaphore = AsyncMock()

        # First split: 1000-3000 has 1300 (over threshold), 3000-5000 has 800
        # Second split of 1000-3000: 1000-2000 has 700, 2000-3000 has 600
        probe_counts = (
            1300,  # First half - needs more splitting
            800,  # Second half - OK
            700,  # First quarter - OK
            600,  # Second quarter - OK
        )

        with patch(self._PROBE_TARGET) as mock_probe:
            mock_probe.side_effect = [
                {"totalAvailableResults": count} for count in probe_counts
            ]

            leaves = await splitter.adaptive_split(
                subquery, session, parameters, semaphore
            )

        # Should get 3 subqueries: [1000-2000 (700), 2000-3000 (600), 3000-5000 (800)]
        assert len(leaves) == 3

    @pytest.mark.asyncio
    async def test_adaptive_split_respects_min_price_band(
        self, splitter: QuerySplitter, parameters: QueryParameters
    ) -> None:
        """adaptive_split returns the subquery unsplit when its range is under min_price_band."""
        narrow = SubQuery(
            district="Kings Cross",
            min_bedrooms=2,
            max_bedrooms=2,
            min_price=1000,
            max_price=1050,  # Only 50 range, below min_price_band of 100
            estimated_results=1500,  # Over threshold but can't split
        )
        session = AsyncMock()
        semaphore = AsyncMock()

        leaves = await splitter.adaptive_split(narrow, session, parameters, semaphore)

        # Can't split below min_price_band, should return original
        assert len(leaves) == 1
        only = leaves[0]
        assert only.min_price == 1000
        assert only.max_price == 1050

    def test_calculate_total_estimated_results(
        self, splitter: QuerySplitter
    ) -> None:
        """calculate_total_estimated_results sums estimates, counting None as 0."""
        # (district, bedrooms, estimated_results); None means "not probed".
        rows = (
            ("Kings Cross", 2, 500),
            ("Kings Cross", 3, 300),
            ("Angel", 2, None),  # Not probed
        )
        subqueries = [
            SubQuery(
                district=district,
                min_bedrooms=bedrooms,
                max_bedrooms=bedrooms,
                min_price=1000,
                max_price=2000,
                estimated_results=estimate,
            )
            for district, bedrooms, estimate in rows
        ]

        grand_total = splitter.calculate_total_estimated_results(subqueries)

        assert grand_total == 800  # 500 + 300 + 0

    @pytest.mark.asyncio
    async def test_split_integration(
        self, splitter: QuerySplitter, parameters: QueryParameters
    ) -> None:
        """split() runs the whole workflow: initial subqueries, probes, one re-split."""
        session = AsyncMock()
        mock_districts = {"Kings Cross": "STATION^5168", "Angel": "STATION^1234"}

        # Mock probe results for each initial subquery
        # 2 districts x 2 bedroom counts = 4 initial subqueries
        probe_counts = (
            500,  # KC 2BR - OK
            1300,  # KC 3BR - needs split
            600,  # Angel 2BR - OK
            800,  # Angel 3BR - OK
            # Split KC 3BR
            700,  # KC 3BR first half
            600,  # KC 3BR second half
        )

        districts_patch = patch(self._DISTRICTS_TARGET, return_value=mock_districts)
        probe_patch = patch(self._PROBE_TARGET)
        with districts_patch, probe_patch as mock_probe:
            mock_probe.side_effect = [
                {"totalAvailableResults": count} for count in probe_counts
            ]

            leaves = await splitter.split(parameters, session)

        # Should have 5 subqueries total:
        # KC 2BR (500), KC 3BR split into 2 (700+600), Angel 2BR (600), Angel 3BR (800)
        assert len(leaves) == 5

        # Verify total estimated results
        grand_total = splitter.calculate_total_estimated_results(leaves)
        assert grand_total == 3200  # 500 + 700 + 600 + 600 + 800

    @pytest.mark.asyncio
    async def test_split_with_on_progress_callback(
        self, splitter: QuerySplitter, parameters: QueryParameters
    ) -> None:
        """split() reports progress through the on_progress callback it is given."""
        session = AsyncMock()
        mock_districts = {"Kings Cross": "STATION^5168", "Angel": "STATION^1234"}
        recorded: list[tuple[str, str]] = []

        def on_progress(phase: str, message: str) -> None:
            # Parameter names are kept as-is in case the splitter passes them by keyword.
            recorded.append((phase, message))

        districts_patch = patch(self._DISTRICTS_TARGET, return_value=mock_districts)
        probe_patch = patch(self._PROBE_TARGET)
        with districts_patch, probe_patch as mock_probe:
            mock_probe.return_value = {"totalAvailableResults": 500}

            await splitter.split(parameters, session, on_progress)

        # Should have received at least 2 progress updates
        assert len(recorded) >= 2
        seen_phases = [phase for phase, _ in recorded]
        assert "splitting" in seen_phases
        assert "splitting_complete" in seen_phases
|