diff --git a/crawler/1_dump_listings.py b/crawler/1_dump_listings.py
index c96338e..aacb787 100644
--- a/crawler/1_dump_listings.py
+++ b/crawler/1_dump_listings.py
@@ -108,30 +108,56 @@ async def _fetch_listings_with_semaphore(
     district: str,
 ) -> list[dict[str, Any]]:
     result = []
-    # we don't know how many pages we have but we stop as soon as there's no more
-    for page_id in range(1, 3):
-        logger.debug(f"Processing {page_id=} for {district=}")
-        # seems like all searches stop at 1500 entries (page_id * page_size)
-        async with semaphore:
-            try:
-                listing_query_result = await listing_query(
-                    page=page_id,
-                    channel=parameters.listing_type,
-                    min_bedrooms=parameters.min_bedrooms,
-                    max_bedrooms=parameters.max_bedrooms,
-                    radius=parameters.radius,
-                    min_price=parameters.min_price,
-                    max_price=parameters.max_price,
-                    district=district,
-                    page_size=parameters.page_size,
-                    max_days_since_added=parameters.max_days_since_added,
-                    furnish_types=parameters.furnish_types or [],
-                )
-
-            except Exception as e:
-                if "GENERIC_ERROR" in str(e):  # Too big page id
-                    logger.debug(f"Max page id for {district=}: {page_id-1}")
-                    break
-                raise e
-            result.append(listing_query_result)
+    # Split the requested price range into N bands to dodge rightmove's ~1.5k
+    # result cap: many narrow queries instead of one broad one, so each query
+    # is far less likely to hit the cap and silently drop listings.
+    number_of_steps = 10
+    # Band the span between min_price and max_price (not 0..max_price), and
+    # guard against a zero-width step when the span is smaller than N.
+    price_step = max(
+        1, (parameters.max_price - parameters.min_price) // number_of_steps
+    )
+
+    for step in range(number_of_steps):
+        min_price = parameters.min_price + step * price_step
+        # Clamp the last band to the requested max so integer division cannot
+        # truncate away the top of the price range.
+        max_price = (
+            parameters.max_price
+            if step == number_of_steps - 1
+            else parameters.min_price + (step + 1) * price_step
+        )
+        logger.debug(
+            f"Step {step} of {number_of_steps} with {min_price=} and {max_price=}"
+        )
+
+        # Query one bedroom count at a time for the same cap-avoidance reason.
+        for num_bedrooms in range(parameters.min_bedrooms, parameters.max_bedrooms + 1):
+            for page_id in range(
+                1,
+                3,  # seems like all searches stop at 1500 entries (page_id * page_size)
+            ):
+                logger.debug(f"Processing {page_id=} for {district=}")
+
+                async with semaphore:
+                    try:
+                        listing_query_result = await listing_query(
+                            page=page_id,
+                            channel=parameters.listing_type,
+                            min_bedrooms=num_bedrooms,
+                            max_bedrooms=num_bedrooms,
+                            radius=parameters.radius,
+                            min_price=min_price,
+                            max_price=max_price,
+                            district=district,
+                            page_size=parameters.page_size,
+                            max_days_since_added=parameters.max_days_since_added,
+                            furnish_types=parameters.furnish_types or [],
+                        )
+                    except Exception as e:
+                        if "GENERIC_ERROR" in str(e):  # Too big page id
+                            logger.debug(f"Max page id for {district=}: {page_id-1}")
+                            break
+                        raise  # bare raise keeps the original traceback intact
+                    result.append(listing_query_result)
     return result