wrongmove/crawler/rec/query.py

282 lines
8.8 KiB
Python
Raw Normal View History

2024-03-25 20:58:35 +00:00
import enum
from typing import Any
from contextlib import asynccontextmanager
from collections.abc import AsyncIterator
import aiohttp
from models.listing import FurnishType, ListingType
from rec import districts
from tenacity import retry, stop_after_attempt, wait_random
from config.scraper_config import ScraperConfig
# Baseline HTTP headers for requests to the Rightmove API host.
# create_session() installs them as session-wide defaults and
# detail_query() passes them explicitly on each request.
DEFAULT_HEADERS: dict[str, str] = {
    "Host": "api.rightmove.co.uk",
    "User-Agent": "okhttp/4.12.0",
    "Connection": "keep-alive",
}
class PropertyType(enum.StrEnum):
BUNGALOW = "bungalow"
DETACHED = "detached"
FLAT = "flat"
LAND = "land"
PARK_HOME = "park-home"
SEMI_DETACHED = "semi-detached"
TERRACED = "terraced"
2024-03-25 20:58:35 +00:00
2023-11-18 12:30:04 +02:00
@asynccontextmanager
async def create_session(
    config: ScraperConfig | None = None,
) -> AsyncIterator[aiohttp.ClientSession]:
    """Create an aiohttp session with optional proxy support.

    The session is created with ``trust_env=True`` and ``DEFAULT_HEADERS``
    as its default headers, and is closed when the context exits.

    Args:
        config: Scraper configuration. Loads from environment if not provided.

    Yields:
        Configured aiohttp ClientSession.

    Raises:
        ImportError: If ``config.proxy_url`` is set but the ``aiohttp_socks``
            package cannot be imported.
    """
    if config is None:
        config = ScraperConfig.from_env()

    connector = None
    if config.proxy_url:
        # Imported lazily so aiohttp-socks is only required when a proxy is
        # actually configured. Only the import is guarded, so errors raised
        # while building the connector are not misreported as a missing
        # package.
        try:
            from aiohttp_socks import ProxyConnector
        except ImportError as exc:
            raise ImportError(
                "aiohttp-socks is required for proxy support. "
                "Install with: pip install aiohttp-socks"
            ) from exc
        connector = ProxyConnector.from_url(config.proxy_url)

    # ``async with`` closes the session on exit, including when the caller's
    # body raises.
    async with aiohttp.ClientSession(
        trust_env=True,
        connector=connector,
        headers=DEFAULT_HEADERS,
    ) as session:
        yield session
@retry(wait=wait_random(min=1, max=2), stop=stop_after_attempt(3))
async def detail_query(
    detail_id: int,
    session: aiohttp.ClientSession | None = None,
) -> dict[str, Any]:
    """Fetch detailed property information.

    The call is wrapped by tenacity's ``retry``: any exception triggers a new
    attempt after a random 1-2 second wait, for at most 3 attempts.

    Args:
        detail_id: The property identifier.
        session: Optional aiohttp session. Creates new one if not provided.

    Returns:
        Property details as a dictionary (the decoded JSON response body).

    Raises:
        Exception: Inside each attempt, when the API answers with a status
            other than 200. NOTE(review): ``reraise`` is not set on the
            decorator, so after the last attempt callers presumably see
            tenacity's ``RetryError`` wrapping this - confirm.
    """
    params = {
        "apiApplication": "ANDROID",
        "appVersion": "3.70.0",
    }
    url = f"https://api.rightmove.co.uk/api/property/{detail_id}"

    async def do_request(s: aiohttp.ClientSession) -> dict[str, Any]:
        async with s.get(url, params=params, headers=DEFAULT_HEADERS) as response:
            if response.status != 200:
                # The two f-strings are concatenated; the trailing space keeps
                # "Status Code: N." and "Failed due to:" from running together.
                raise Exception(
                    f"id: {detail_id}. Status Code: {response.status}. "
                    f"Failed due to: {await response.text()}"
                )
            return await response.json()

    if session is not None:
        return await do_request(session)

    # No session supplied: use a throwaway one that is closed after the call.
    async with aiohttp.ClientSession(trust_env=True) as new_session:
        return await do_request(new_session)
@retry(wait=wait_random(min=1, max=60), stop=stop_after_attempt(3))
async def listing_query(
    *,
    page: int,
    channel: ListingType,
    min_bedrooms: int,
    max_bedrooms: int,
    radius: float,
    min_price: int,
    max_price: int,
    district: str,
    mustNewHome: bool = False,
    max_days_since_added: int = 30,
    property_type: list[PropertyType] = [],
    page_size: int = 25,
    furnish_types: list[FurnishType] = [],
    session: aiohttp.ClientSession | None = None,
) -> dict[str, Any]:
    """Execute a listing search query.

    The call is wrapped by tenacity's ``retry``: any exception (including the
    argument validation below) triggers a new attempt after a random 1-60
    second wait, for at most 3 attempts.

    Args:
        page: Page number to fetch (1-indexed).
        channel: Listing type (BUY or RENT).
        min_bedrooms: Minimum number of bedrooms.
        max_bedrooms: Maximum number of bedrooms.
        radius: Search radius.
        min_price: Minimum price.
        max_price: Maximum price.
        district: Key into ``districts.get_districts()``; the mapped value is
            sent as ``locationIdentifier``.
        mustNewHome: Filter for new homes only (BUY only).
        max_days_since_added: Maximum days since listing was added (BUY only).
            For BUY it must be one of 1, 3, 7, 14, or ``None`` to omit the
            filter. The default of 30 is therefore rejected for BUY, so BUY
            callers have to pass an explicit value.
        property_type: List of property types to filter (BUY only).
        page_size: Number of results per page (default 25).
        furnish_types: List of furnish types to filter (RENT only).
        session: Optional aiohttp session. Creates new one if not provided.

    Returns:
        API response as a dictionary.

    Raises:
        ValueError: For BUY, when ``max_days_since_added`` is neither ``None``
            nor one of 1, 3, 7, 14.
        KeyError: When ``district`` is not a key of
            ``districts.get_districts()``.
        Exception: When the API answers with a status other than 200.
    """
    # The list defaults above are only read, never mutated, so sharing them
    # between calls is harmless.
    allowed_days = (1, 3, 7, 14)

    params: dict[str, str] = {
        "locationIdentifier": districts.get_districts()[district],
        "channel": str(channel).upper(),
        "page": str(page),
        "numberOfPropertiesPerPage": str(page_size),
        "radius": str(radius),
        "sortBy": "distance",
        "includeUnavailableProperties": "false",
        "minPrice": str(min_price),
        "maxPrice": str(max_price),
        "minBedrooms": str(min_bedrooms),
        "maxBedrooms": str(max_bedrooms),
        "apiApplication": "ANDROID",
        "appVersion": "4.28.0",
    }

    if channel is ListingType.BUY:
        params["dontShow"] = "sharedOwnership,retirement"
        if property_type:
            params["propertyTypes"] = ",".join(property_type)
        # None means "no age filter": skip the parameter instead of sending
        # the literal string "None".
        if max_days_since_added is not None:
            # NOTE(review): probe_query silently omits unsupported values
            # (such as the default 30) while this function rejects them -
            # confirm which behaviour is intended.
            if max_days_since_added not in allowed_days:
                raise ValueError(
                    f"Invalid max_days_since_added {max_days_since_added}; "
                    f"must be one of {list(allowed_days)}"
                )
            params["maxDaysSinceAdded"] = str(max_days_since_added)
        if mustNewHome:
            params["mustHave"] = "newHome"

    if channel is ListingType.RENT and furnish_types:
        params["furnishTypes"] = ",".join(furnish_types)

    request_headers = {
        "Host": "api.rightmove.co.uk",
        "Accept-Encoding": "gzip, deflate, br",
        "User-Agent": "okhttp/4.12.0",
        "Connection": "keep-alive",
    }

    async def do_request(s: aiohttp.ClientSession) -> dict[str, Any]:
        async with s.get(
            "https://api.rightmove.co.uk/api/property-listing",
            params=params,
            headers=request_headers,
        ) as response:
            if response.status != 200:
                raise Exception(f"Failed due to: {await response.text()}")
            return await response.json()

    if session is not None:
        return await do_request(session)

    # No session supplied: use a throwaway one that is closed after the call.
    async with aiohttp.ClientSession(trust_env=True) as new_session:
        return await do_request(new_session)
@retry(wait=wait_random(min=1, max=10), stop=stop_after_attempt(3))
async def probe_query(
    *,
    session: aiohttp.ClientSession,
    channel: ListingType,
    min_bedrooms: int,
    max_bedrooms: int,
    radius: float,
    min_price: int,
    max_price: int,
    district: str,
    max_days_since_added: int = 30,
    furnish_types: list[FurnishType] = [],
) -> dict[str, Any]:
    """Ask the listing endpoint for a single result to learn the total count.

    Sends the same search as a normal listing request but fixed to page 1
    with one property per page, so the response is as small as possible.

    Args:
        session: aiohttp session used to send the request.
        channel: Listing type (BUY or RENT).
        min_bedrooms: Lower bound on bedrooms.
        max_bedrooms: Upper bound on bedrooms.
        radius: Search radius.
        min_price: Lower bound on price.
        max_price: Upper bound on price.
        district: Key into ``districts.get_districts()``.
        max_days_since_added: Age filter (BUY only). Sent only when it is
            one of 1, 3, 7, 14; any other value is left out of the request.
        furnish_types: Furnish types to filter by (RENT only).

    Returns:
        API response containing totalAvailableResults.

    Raises:
        Exception: When the API answers with a status other than 200.
    """
    endpoint = "https://api.rightmove.co.uk/api/property-listing"

    query: dict[str, str] = {
        "locationIdentifier": districts.get_districts()[district],
        "channel": str(channel).upper(),
        "page": "1",
        # One property per page keeps the probe response minimal.
        "numberOfPropertiesPerPage": "1",
        "radius": str(radius),
        "sortBy": "distance",
        "includeUnavailableProperties": "false",
        "minPrice": str(min_price),
        "maxPrice": str(max_price),
        "minBedrooms": str(min_bedrooms),
        "maxBedrooms": str(max_bedrooms),
        "apiApplication": "ANDROID",
        "appVersion": "4.28.0",
    }

    if channel is ListingType.BUY:
        query["dontShow"] = "sharedOwnership,retirement"
        # None is not a member of the tuple, so no separate None check is needed.
        if max_days_since_added in (1, 3, 7, 14):
            query["maxDaysSinceAdded"] = str(max_days_since_added)

    if channel is ListingType.RENT and furnish_types:
        query["furnishTypes"] = ",".join(furnish_types)

    headers = {
        "Host": "api.rightmove.co.uk",
        "Accept-Encoding": "gzip, deflate, br",
        "User-Agent": "okhttp/4.12.0",
        "Connection": "keep-alive",
    }

    async with session.get(endpoint, params=query, headers=headers) as resp:
        if resp.status == 200:
            return await resp.json()
        raise Exception(f"Probe failed: {await resp.text()}")