Limit the number of concurrent requests when dumping listings, as Rightmove blocks us

This commit is contained in:
Viktor Barzin 2025-06-01 00:27:12 +00:00
parent 24bf44caf9
commit 9735db72a0
No known key found for this signature in database
GPG key ID: 4056458DBDBF8863
2 changed files with 17 additions and 9 deletions

View file

@ -39,6 +39,7 @@ async def dump_listings(
print("Valid districts to scrape:", districts.keys()) print("Valid districts to scrape:", districts.keys())
listings = [] listings = []
semaphore = asyncio.Semaphore(5) # if too high, rightmove drops connections
json_responses = await asyncio.gather( json_responses = await asyncio.gather(
*[ *[
listing_query( listing_query(
@ -53,6 +54,7 @@ async def dump_listings(
page_size=parameters.page_size, page_size=parameters.page_size,
max_days_since_added=parameters.max_days_since_added, max_days_since_added=parameters.max_days_since_added,
furnish_types=parameters.furnish_types or [], furnish_types=parameters.furnish_types or [],
semaphore=semaphore,
) )
for locid in districts.values() for locid in districts.values()
for i in [1, 2] for i in [1, 2]

View file

@ -1,4 +1,5 @@
# from diskcache import Cache # from diskcache import Cache
import asyncio
import enum import enum
from typing import Any from typing import Any
import aiohttp import aiohttp
@ -64,6 +65,7 @@ async def listing_query(
property_type: list[PropertyType] = [], property_type: list[PropertyType] = [],
page_size: int = 25, page_size: int = 25,
furnish_types: list[FurnishType] = [], furnish_types: list[FurnishType] = [],
semaphore: asyncio.Semaphore | None = None,
) -> dict[str, Any]: ) -> dict[str, Any]:
params: dict[str, str] = { params: dict[str, str] = {
"locationIdentifier": location_id, "locationIdentifier": location_id,
@ -106,12 +108,16 @@ async def listing_query(
"Connection": "keep-alive", "Connection": "keep-alive",
} }
async with aiohttp.ClientSession(trust_env=True) as session: if semaphore is None:
async with session.get( semaphore = asyncio.Semaphore(1)
"https://api.rightmove.co.uk/api/property-listing",
params=params, async with semaphore:
headers=headers, async with aiohttp.ClientSession(trust_env=True) as session:
) as response: async with session.get(
if response.status != 200: "https://api.rightmove.co.uk/api/property-listing",
raise Exception(f"Failed due to: {await response.text()}") params=params,
return await response.json() headers=headers,
) as response:
if response.status != 200:
raise Exception(f"Failed due to: {await response.text()}")
return await response.json()