From 9735db72a09f5f9c78f246ac510a3981dd6f9da0 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sun, 1 Jun 2025 00:27:12 +0000 Subject: [PATCH] limit the number of concurrent requests when dumping listings as Rightmove blocks us --- crawler/1_dump_listings.py | 2 ++ crawler/rec/query.py | 24 +++++++++++++++--------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/crawler/1_dump_listings.py b/crawler/1_dump_listings.py index 2d71f03..484556f 100644 --- a/crawler/1_dump_listings.py +++ b/crawler/1_dump_listings.py @@ -39,6 +39,7 @@ async def dump_listings( print("Valid districts to scrape:", districts.keys()) listings = [] + semaphore = asyncio.Semaphore(5) # if too high, rightmove drops connections json_responses = await asyncio.gather( *[ listing_query( @@ -53,6 +54,7 @@ async def dump_listings( page_size=parameters.page_size, max_days_since_added=parameters.max_days_since_added, furnish_types=parameters.furnish_types or [], + semaphore=semaphore, ) for locid in districts.values() for i in [1, 2] diff --git a/crawler/rec/query.py b/crawler/rec/query.py index edeed2c..6afb0e3 100644 --- a/crawler/rec/query.py +++ b/crawler/rec/query.py @@ -1,4 +1,5 @@ # from diskcache import Cache +import asyncio import enum from typing import Any import aiohttp @@ -64,6 +65,7 @@ async def listing_query( property_type: list[PropertyType] = [], page_size: int = 25, furnish_types: list[FurnishType] = [], + semaphore: asyncio.Semaphore | None = None, ) -> dict[str, Any]: params: dict[str, str] = { "locationIdentifier": location_id, @@ -106,12 +108,16 @@ async def listing_query( "Connection": "keep-alive", } - async with aiohttp.ClientSession(trust_env=True) as session: - async with session.get( - "https://api.rightmove.co.uk/api/property-listing", - params=params, - headers=headers, - ) as response: - if response.status != 200: - raise Exception(f"Failed due to: {await response.text()}") - return await response.json() + if semaphore is None: + semaphore = 
asyncio.Semaphore(1) + + async with semaphore: + async with aiohttp.ClientSession(trust_env=True) as session: + async with session.get( + "https://api.rightmove.co.uk/api/property-listing", + params=params, + headers=headers, + ) as response: + if response.status != 200: + raise Exception(f"Failed due to: {await response.text()}") + return await response.json()