Refactor codebase following Clean Code principles and add 229 tests
- Extract helpers to reduce function sizes (listing_tasks, app.py, query.py, listing_fetcher)
- Replace nonlocal mutations with _PipelineState dataclass in listing_tasks
- Fix bugs: isinstance→equality check in repository, verify_exp for OIDC tokens
- Consolidate duplicate filter methods in listing_repository
- Move hardcoded config to env vars with backward-compatible defaults
- Simplify CLI decorator to auto-build QueryParameters
- Add deprecation docstring to data_access.py
- Test count: 158 → 387 (all passing)
This commit is contained in:
parent
7e05b3c971
commit
150342bb9e
48 changed files with 5029 additions and 990 deletions
|
|
@ -28,6 +28,11 @@ logger = logging.getLogger("uvicorn.error")
|
|||
# Global circuit breaker instance
|
||||
_circuit_breaker: CircuitBreaker | None = None
|
||||
|
||||
# API constants
|
||||
# Sent as the "appVersion" query parameter on property-detail requests.
ANDROID_APP_VERSION = "3.70.0"
|
||||
# Sent as the "appVersion" query parameter on property-listing requests
# (see _build_base_params).
ANDROID_APP_VERSION_LISTING = "4.28.0"
|
||||
# Root of the API; the detail URL is built as f"{RIGHTMOVE_API_BASE}/property/{id}".
RIGHTMOVE_API_BASE = "https://api.rightmove.co.uk/api"
|
||||
# Endpoint used by both the listing query and the probe query.
PROPERTY_LISTING_ENDPOINT = f"{RIGHTMOVE_API_BASE}/property-listing"
|
||||
|
||||
DEFAULT_HEADERS = {
|
||||
"Host": "api.rightmove.co.uk",
|
||||
|
|
@ -35,6 +40,11 @@ DEFAULT_HEADERS = {
|
|||
"Connection": "keep-alive",
|
||||
}
|
||||
|
||||
# Headers for property-listing / probe requests: everything in DEFAULT_HEADERS
# plus an explicit Accept-Encoding entry.
# NOTE(review): DEFAULT_HEADERS is not fully visible here -- confirm it carries
# the same User-Agent ("okhttp/4.12.0") that the inline listing headers this
# constant replaces used to send.
LISTING_HEADERS = {
|
||||
**DEFAULT_HEADERS,
|
||||
"Accept-Encoding": "gzip, deflate, br",
|
||||
}
|
||||
|
||||
|
||||
class PropertyType(enum.StrEnum):
|
||||
BUNGALOW = "bungalow"
|
||||
|
|
@ -129,6 +139,177 @@ def check_circuit_breaker(config: ScraperConfig | None = None) -> None:
|
|||
cb.call()
|
||||
|
||||
|
||||
def _build_base_params(
    *,
    channel: ListingType,
    page: int,
    page_size: int,
    radius: float,
    min_price: int,
    max_price: int,
    min_bedrooms: int,
    max_bedrooms: int,
    district: str,
) -> dict[str, str]:
    """Assemble the query parameters shared by listing and probe requests.

    Every value is stringified. ``district`` is looked up in
    ``districts.get_districts()`` to obtain the location identifier, so an
    unknown district propagates whatever that lookup raises.

    Returns:
        A new dict of query parameters; key insertion order is fixed.
    """
    # Resolve the district first, exactly as the dict literal would.
    query: dict[str, str] = {}
    query["locationIdentifier"] = districts.get_districts()[district]
    query["channel"] = str(channel).upper()

    # Paging and search geometry.
    query["page"] = str(page)
    query["numberOfPropertiesPerPage"] = str(page_size)
    query["radius"] = str(radius)
    query["sortBy"] = "distance"
    query["includeUnavailableProperties"] = "false"

    # Price and bedroom bounds.
    query["minPrice"] = str(min_price)
    query["maxPrice"] = str(max_price)
    query["minBedrooms"] = str(min_bedrooms)
    query["maxBedrooms"] = str(max_bedrooms)

    # Client identification expected by the API.
    query["apiApplication"] = "ANDROID"
    query["appVersion"] = ANDROID_APP_VERSION_LISTING
    return query
|
||||
|
||||
|
||||
# Build the full query-parameter dict for a property-listing request.
# Starts from _build_base_params(...) and then layers on the optional filters
# (dontShow, propertyTypes, maxDaysSinceAdded, mustHave, furnishTypes).
# Returns the resulting dict[str, str]. Raises Exception when
# max_days_since_added is not None and is not one of 1, 3, 7, 14.
# NOTE(review): indentation is not visible in this view, so it cannot be
# confirmed whether the propertyTypes / max-days checks sit inside the BUY
# branch or at function level -- verify against the real file.
def _build_listing_params(
|
||||
*,
|
||||
page: int,
|
||||
channel: ListingType,
|
||||
min_bedrooms: int,
|
||||
max_bedrooms: int,
|
||||
radius: float,
|
||||
min_price: int,
|
||||
max_price: int,
|
||||
district: str,
|
||||
mustNewHome: bool,
|
||||
max_days_since_added: int,
|
||||
property_type: list[PropertyType],
|
||||
page_size: int,
|
||||
furnish_types: list[FurnishType],
|
||||
) -> dict[str, str]:
|
||||
params = _build_base_params(
|
||||
channel=channel,
|
||||
page=page,
|
||||
page_size=page_size,
|
||||
radius=radius,
|
||||
min_price=min_price,
|
||||
max_price=max_price,
|
||||
min_bedrooms=min_bedrooms,
|
||||
max_bedrooms=max_bedrooms,
|
||||
district=district,
|
||||
)
|
||||
# BUY searches send dontShow=sharedOwnership,retirement.
if channel is ListingType.BUY:
|
||||
params["dontShow"] = "sharedOwnership,retirement"
|
||||
# PropertyType is a StrEnum, so its members can be joined directly.
if len(property_type) > 0:
|
||||
params["propertyTypes"] = ",".join(property_type)
|
||||
# Only 1, 3, 7 or 14 are accepted here; any other non-None value raises.
# NOTE(review): listing_query defaults max_days_since_added to 30, which is
# not in this list -- confirm callers always pass an allowed value.
if max_days_since_added is not None and max_days_since_added not in [
|
||||
1,
|
||||
3,
|
||||
7,
|
||||
14,
|
||||
]:
|
||||
# The exception is built with two positional args: the message and the
# list of allowed values.
raise Exception(
|
||||
f"Invalid max days - {max_days_since_added} Can only be got",
|
||||
[1, 3, 7, 14],
|
||||
)
|
||||
params["maxDaysSinceAdded"] = str(max_days_since_added)
|
||||
|
||||
if mustNewHome:
|
||||
params["mustHave"] = "newHome"
|
||||
# furnishTypes is only sent for RENT searches with a non-empty list.
# NOTE(review): joining assumes FurnishType members are str -- confirm.
if channel is ListingType.RENT:
|
||||
if furnish_types:
|
||||
params["furnishTypes"] = ",".join(furnish_types)
|
||||
return params
|
||||
|
||||
|
||||
# Build the query-parameter dict for a probe request.
# Uses _build_base_params(...) pinned to page 1 with a page size of 1, then
# adds dontShow (BUY), maxDaysSinceAdded (only for 1, 3, 7 or 14) and
# furnishTypes (RENT with a non-empty list). Unlike _build_listing_params,
# an unsupported max_days_since_added is skipped here rather than raising.
# NOTE(review): indentation is not visible in this view, so it cannot be
# confirmed whether the max-days check sits inside the BUY branch or at
# function level -- verify against the real file.
def _build_probe_params(
|
||||
*,
|
||||
channel: ListingType,
|
||||
min_bedrooms: int,
|
||||
max_bedrooms: int,
|
||||
radius: float,
|
||||
min_price: int,
|
||||
max_price: int,
|
||||
district: str,
|
||||
max_days_since_added: int,
|
||||
furnish_types: list[FurnishType],
|
||||
) -> dict[str, str]:
|
||||
params = _build_base_params(
|
||||
channel=channel,
|
||||
page=1,
|
||||
page_size=1, # Minimal page size for probing
|
||||
radius=radius,
|
||||
min_price=min_price,
|
||||
max_price=max_price,
|
||||
min_bedrooms=min_bedrooms,
|
||||
max_bedrooms=max_bedrooms,
|
||||
district=district,
|
||||
)
|
||||
# BUY searches send dontShow=sharedOwnership,retirement.
if channel is ListingType.BUY:
|
||||
params["dontShow"] = "sharedOwnership,retirement"
|
||||
# The filter is only forwarded for the values 1, 3, 7 and 14.
if max_days_since_added is not None and max_days_since_added in [
|
||||
1,
|
||||
3,
|
||||
7,
|
||||
14,
|
||||
]:
|
||||
params["maxDaysSinceAdded"] = str(max_days_since_added)
|
||||
|
||||
# furnishTypes is only sent for RENT searches with a non-empty list.
# NOTE(review): joining assumes FurnishType members are str -- confirm.
if channel is ListingType.RENT:
|
||||
if furnish_types:
|
||||
params["furnishTypes"] = ",".join(furnish_types)
|
||||
return params
|
||||
|
||||
|
||||
async def _execute_api_request(
    *,
    url: str,
    params: dict[str, str],
    headers: dict[str, str],
    session: aiohttp.ClientSession | None,
    config: ScraperConfig,
    expect_data: bool = True,
    error_context: str = "",
) -> dict[str, Any]:
    """Perform one GET against the API with circuit-breaker bookkeeping.

    The circuit breaker is checked before the request. The response is
    passed to ``validate_response`` together with the elapsed time and
    ``config.slow_response_threshold``. A success or failure is then
    recorded on the breaker when one is configured.

    Args:
        url: Endpoint to request.
        params: Query-string parameters.
        headers: Request headers.
        session: Session to reuse; when falsy a temporary
            ``aiohttp.ClientSession(trust_env=True)`` is opened for this call.
        config: Scraper configuration supplying the breaker and threshold.
        expect_data: Forwarded to ``validate_response``.
        error_context: Prefix for the message raised on a non-200 status.

    Returns:
        The decoded JSON body of a 200 response.

    Raises:
        ThrottlingError: Re-raised unchanged after recording a failure.
        Exception: For a non-200 status, or anything else raised while
            performing the request (also recorded as a failure).
    """
    check_circuit_breaker(config)
    breaker = get_circuit_breaker(config)

    def _note_failure() -> None:
        # The breaker is optional; only report when one exists.
        if breaker is not None:
            breaker.record_failure()

    async def _fetch(client: aiohttp.ClientSession) -> dict[str, Any]:
        started_at = time.time()
        try:
            async with client.get(url, params=params, headers=headers) as response:
                elapsed = time.time() - started_at

                # Only a 200 response is decoded; anything else is validated
                # with an empty body.
                payload = None
                if response.status == 200:
                    payload = await response.json()

                validate_response(
                    response,
                    elapsed,
                    payload,
                    config.slow_response_threshold,
                    expect_data=expect_data,
                )

                if response.status != 200:
                    raise Exception(
                        f"{error_context}Failed due to: {await response.text()}"
                    )

                if breaker is not None:
                    breaker.record_success()
                return payload  # type: ignore
        except ThrottlingError:
            _note_failure()
            raise
        except Exception as exc:
            _note_failure()
            raise exc

    if session:
        return await _fetch(session)

    # No caller-supplied session: open one scoped to this single request.
    async with aiohttp.ClientSession(trust_env=True) as owned_session:
        return await _fetch(owned_session)
|
||||
|
||||
|
||||
@retry(
|
||||
retry=retry_if_exception_type(ThrottlingError),
|
||||
wait=wait_exponential(multiplier=2, min=2, max=120),
|
||||
|
|
@ -156,54 +337,21 @@ async def detail_query(
|
|||
if config is None:
|
||||
config = ScraperConfig.from_env()
|
||||
|
||||
check_circuit_breaker(config)
|
||||
cb = get_circuit_breaker(config)
|
||||
|
||||
params = {
|
||||
"apiApplication": "ANDROID",
|
||||
"appVersion": "3.70.0",
|
||||
"appVersion": ANDROID_APP_VERSION,
|
||||
}
|
||||
url = f"https://api.rightmove.co.uk/api/property/{detail_id}"
|
||||
url = f"{RIGHTMOVE_API_BASE}/property/{detail_id}"
|
||||
|
||||
async def do_request(s: aiohttp.ClientSession) -> dict[str, Any]:
|
||||
start_time = time.time()
|
||||
try:
|
||||
async with s.get(url, params=params, headers=DEFAULT_HEADERS) as response:
|
||||
response_time = time.time() - start_time
|
||||
body = await response.json() if response.status == 200 else None
|
||||
|
||||
# Validate response for throttling
|
||||
validate_response(
|
||||
response,
|
||||
response_time,
|
||||
body,
|
||||
config.slow_response_threshold,
|
||||
expect_data=True,
|
||||
)
|
||||
|
||||
if response.status != 200:
|
||||
raise Exception(
|
||||
f"""id: {detail_id}. Status Code: {response.status}."""
|
||||
f"""Failed due to: {await response.text()}"""
|
||||
)
|
||||
|
||||
if cb is not None:
|
||||
cb.record_success()
|
||||
return body # type: ignore
|
||||
except ThrottlingError:
|
||||
if cb is not None:
|
||||
cb.record_failure()
|
||||
raise
|
||||
except Exception as e:
|
||||
if cb is not None:
|
||||
cb.record_failure()
|
||||
raise e
|
||||
|
||||
if session:
|
||||
return await do_request(session)
|
||||
else:
|
||||
async with aiohttp.ClientSession(trust_env=True) as new_session:
|
||||
return await do_request(new_session)
|
||||
return await _execute_api_request(
|
||||
url=url,
|
||||
params=params,
|
||||
headers=DEFAULT_HEADERS,
|
||||
session=session,
|
||||
config=config,
|
||||
expect_data=True,
|
||||
error_context=f"id: {detail_id}. Status Code: ",
|
||||
)
|
||||
|
||||
|
||||
@retry(
|
||||
|
|
@ -223,9 +371,9 @@ async def listing_query(
|
|||
district: str, # = "STATION^5168", # kings cross station
|
||||
mustNewHome: bool = False,
|
||||
max_days_since_added: int = 30,
|
||||
property_type: list[PropertyType] = [],
|
||||
property_type: list[PropertyType] | None = None,
|
||||
page_size: int = 25,
|
||||
furnish_types: list[FurnishType] = [],
|
||||
furnish_types: list[FurnishType] | None = None,
|
||||
session: aiohttp.ClientSession | None = None,
|
||||
config: ScraperConfig | None = None,
|
||||
) -> dict[str, Any]:
|
||||
|
|
@ -257,94 +405,35 @@ async def listing_query(
|
|||
"""
|
||||
if config is None:
|
||||
config = ScraperConfig.from_env()
|
||||
if property_type is None:
|
||||
property_type = []
|
||||
if furnish_types is None:
|
||||
furnish_types = []
|
||||
|
||||
check_circuit_breaker(config)
|
||||
cb = get_circuit_breaker(config)
|
||||
params = _build_listing_params(
|
||||
page=page,
|
||||
channel=channel,
|
||||
min_bedrooms=min_bedrooms,
|
||||
max_bedrooms=max_bedrooms,
|
||||
radius=radius,
|
||||
min_price=min_price,
|
||||
max_price=max_price,
|
||||
district=district,
|
||||
mustNewHome=mustNewHome,
|
||||
max_days_since_added=max_days_since_added,
|
||||
property_type=property_type,
|
||||
page_size=page_size,
|
||||
furnish_types=furnish_types,
|
||||
)
|
||||
|
||||
params: dict[str, str] = {
|
||||
"locationIdentifier": districts.get_districts()[district],
|
||||
"channel": str(channel).upper(),
|
||||
"page": str(page),
|
||||
"numberOfPropertiesPerPage": str(page_size),
|
||||
"radius": str(radius),
|
||||
"sortBy": "distance",
|
||||
"includeUnavailableProperties": "false",
|
||||
"minPrice": str(min_price),
|
||||
"maxPrice": str(max_price),
|
||||
"minBedrooms": str(min_bedrooms),
|
||||
"maxBedrooms": str(max_bedrooms),
|
||||
"apiApplication": "ANDROID",
|
||||
"appVersion": "4.28.0",
|
||||
}
|
||||
if channel is ListingType.BUY:
|
||||
params["dontShow"] = "sharedOwnership,retirement"
|
||||
if len(property_type) > 0:
|
||||
params["propertyTypes"] = ",".join(property_type)
|
||||
if max_days_since_added is not None and max_days_since_added not in [
|
||||
1,
|
||||
3,
|
||||
7,
|
||||
14,
|
||||
]:
|
||||
raise Exception(
|
||||
f"Invalid max days - {max_days_since_added} Can only be got",
|
||||
[1, 3, 7, 14],
|
||||
)
|
||||
params["maxDaysSinceAdded"] = str(max_days_since_added)
|
||||
|
||||
if mustNewHome:
|
||||
params["mustHave"] = "newHome"
|
||||
if channel is ListingType.RENT:
|
||||
if furnish_types:
|
||||
params["furnishTypes"] = ",".join(furnish_types)
|
||||
|
||||
request_headers = {
|
||||
"Host": "api.rightmove.co.uk",
|
||||
"Accept-Encoding": "gzip, deflate, br",
|
||||
"User-Agent": "okhttp/4.12.0",
|
||||
"Connection": "keep-alive",
|
||||
}
|
||||
|
||||
async def do_request(s: aiohttp.ClientSession) -> dict[str, Any]:
|
||||
start_time = time.time()
|
||||
try:
|
||||
async with s.get(
|
||||
"https://api.rightmove.co.uk/api/property-listing",
|
||||
params=params,
|
||||
headers=request_headers,
|
||||
) as response:
|
||||
response_time = time.time() - start_time
|
||||
body = await response.json() if response.status == 200 else None
|
||||
|
||||
# Validate response for throttling
|
||||
validate_response(
|
||||
response,
|
||||
response_time,
|
||||
body,
|
||||
config.slow_response_threshold,
|
||||
expect_data=(page == 1), # Only expect data on first page
|
||||
)
|
||||
|
||||
if response.status != 200:
|
||||
raise Exception(f"Failed due to: {await response.text()}")
|
||||
|
||||
if cb is not None:
|
||||
cb.record_success()
|
||||
return body # type: ignore
|
||||
except ThrottlingError:
|
||||
if cb is not None:
|
||||
cb.record_failure()
|
||||
raise
|
||||
except Exception as e:
|
||||
if cb is not None:
|
||||
cb.record_failure()
|
||||
raise e
|
||||
|
||||
if session:
|
||||
return await do_request(session)
|
||||
else:
|
||||
async with aiohttp.ClientSession(trust_env=True) as new_session:
|
||||
return await do_request(new_session)
|
||||
return await _execute_api_request(
|
||||
url=PROPERTY_LISTING_ENDPOINT,
|
||||
params=params,
|
||||
headers=LISTING_HEADERS,
|
||||
session=session,
|
||||
config=config,
|
||||
expect_data=(page == 1),
|
||||
)
|
||||
|
||||
|
||||
@retry(
|
||||
|
|
@ -363,7 +452,7 @@ async def probe_query(
|
|||
max_price: int,
|
||||
district: str,
|
||||
max_days_since_added: int = 30,
|
||||
furnish_types: list[FurnishType] = [],
|
||||
furnish_types: list[FurnishType] | None = None,
|
||||
config: ScraperConfig | None = None,
|
||||
) -> dict[str, Any]:
|
||||
"""Probe the API to get result count without fetching full results.
|
||||
|
|
@ -392,77 +481,27 @@ async def probe_query(
|
|||
"""
|
||||
if config is None:
|
||||
config = ScraperConfig.from_env()
|
||||
if furnish_types is None:
|
||||
furnish_types = []
|
||||
|
||||
check_circuit_breaker(config)
|
||||
cb = get_circuit_breaker(config)
|
||||
params = _build_probe_params(
|
||||
channel=channel,
|
||||
min_bedrooms=min_bedrooms,
|
||||
max_bedrooms=max_bedrooms,
|
||||
radius=radius,
|
||||
min_price=min_price,
|
||||
max_price=max_price,
|
||||
district=district,
|
||||
max_days_since_added=max_days_since_added,
|
||||
furnish_types=furnish_types,
|
||||
)
|
||||
|
||||
params: dict[str, str] = {
|
||||
"locationIdentifier": districts.get_districts()[district],
|
||||
"channel": str(channel).upper(),
|
||||
"page": "1",
|
||||
"numberOfPropertiesPerPage": "1", # Minimal page size for probing
|
||||
"radius": str(radius),
|
||||
"sortBy": "distance",
|
||||
"includeUnavailableProperties": "false",
|
||||
"minPrice": str(min_price),
|
||||
"maxPrice": str(max_price),
|
||||
"minBedrooms": str(min_bedrooms),
|
||||
"maxBedrooms": str(max_bedrooms),
|
||||
"apiApplication": "ANDROID",
|
||||
"appVersion": "4.28.0",
|
||||
}
|
||||
|
||||
if channel is ListingType.BUY:
|
||||
params["dontShow"] = "sharedOwnership,retirement"
|
||||
if max_days_since_added is not None and max_days_since_added in [
|
||||
1,
|
||||
3,
|
||||
7,
|
||||
14,
|
||||
]:
|
||||
params["maxDaysSinceAdded"] = str(max_days_since_added)
|
||||
|
||||
if channel is ListingType.RENT:
|
||||
if furnish_types:
|
||||
params["furnishTypes"] = ",".join(furnish_types)
|
||||
|
||||
request_headers = {
|
||||
"Host": "api.rightmove.co.uk",
|
||||
"Accept-Encoding": "gzip, deflate, br",
|
||||
"User-Agent": "okhttp/4.12.0",
|
||||
"Connection": "keep-alive",
|
||||
}
|
||||
|
||||
start_time = time.time()
|
||||
try:
|
||||
async with session.get(
|
||||
"https://api.rightmove.co.uk/api/property-listing",
|
||||
params=params,
|
||||
headers=request_headers,
|
||||
) as response:
|
||||
response_time = time.time() - start_time
|
||||
body = await response.json() if response.status == 200 else None
|
||||
|
||||
# Validate response for throttling
|
||||
validate_response(
|
||||
response,
|
||||
response_time,
|
||||
body,
|
||||
config.slow_response_threshold,
|
||||
expect_data=False, # Probe doesn't need data, just count
|
||||
)
|
||||
|
||||
if response.status != 200:
|
||||
raise Exception(f"Probe failed: {await response.text()}")
|
||||
|
||||
if cb is not None:
|
||||
cb.record_success()
|
||||
return body # type: ignore
|
||||
except ThrottlingError:
|
||||
if cb is not None:
|
||||
cb.record_failure()
|
||||
raise
|
||||
except Exception as e:
|
||||
if cb is not None:
|
||||
cb.record_failure()
|
||||
raise e
|
||||
return await _execute_api_request(
|
||||
url=PROPERTY_LISTING_ENDPOINT,
|
||||
params=params,
|
||||
headers=LISTING_HEADERS,
|
||||
session=session,
|
||||
config=config,
|
||||
expect_data=False,
|
||||
error_context="Probe failed: ",
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue