From ad879f2d4f131ff2e5595164d77dedf5a0e2c9a2 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 17 May 2025 21:55:42 +0000 Subject: [PATCH] convert listings dump to asyncio --- crawler/1_dump_listings.py | 62 +++++++++++++++++++------------------- crawler/main.py | 5 +-- crawler/rec/query.py | 2 +- 3 files changed, 35 insertions(+), 34 deletions(-) diff --git a/crawler/1_dump_listings.py b/crawler/1_dump_listings.py index 60b6a1b..188f5aa 100644 --- a/crawler/1_dump_listings.py +++ b/crawler/1_dump_listings.py @@ -1,3 +1,4 @@ +import asyncio from dataclasses import dataclass import pathlib from rec.query import ListingType, listing_query @@ -18,7 +19,7 @@ class QueryParameters: max_days_since_added: int | None = None -def dump_listings( +async def dump_listings( parameters: QueryParameters, data_dir: pathlib.Path = pathlib.Path("data/rs/"), ) -> list[Listing]: @@ -29,38 +30,37 @@ def dump_listings( } print("Valid districts to scrape:", districts.keys()) listings = [] - for district, locid in districts.items(): - print("#### District:", district) - for i in [1, 2]: - try: - response_json = listing_query( - page=i, - channel=parameters.listing_type, - min_bedrooms=parameters.min_bedrooms, - max_bedrooms=parameters.max_bedrooms, - radius=parameters.radius, - min_price=parameters.min_price, - max_price=parameters.max_price, - location_id=locid, - page_size=parameters.page_size, - max_days_since_added=parameters.max_days_since_added, - ) - except Exception as e: - print(e) - break - if i == 1: - print("totalAvailableResults: ", response_json["totalAvailableResults"]) - if len(response_json["properties"]) == 0: - break - print(f"page {i}", end=", ", flush=True) - for property in response_json["properties"]: - identifier = property["identifier"] + json_responses = await asyncio.gather( + *[ + listing_query( + page=i, + channel=parameters.listing_type, + min_bedrooms=parameters.min_bedrooms, + max_bedrooms=parameters.max_bedrooms, + radius=parameters.radius, + min_price=parameters.min_price, + max_price=parameters.max_price, + location_id=locid, + page_size=parameters.page_size, + max_days_since_added=parameters.max_days_since_added, + ) for locid in districts.values() for i in [1, 2] + ] + ) + listings = [] + for response_json in json_responses: + if response_json["totalAvailableResults"] == 0: + print("No results found") + continue + if response_json["totalAvailableResults"] > 0: + print("totalAvailableResults: ", response_json["totalAvailableResults"]) + for property in response_json["properties"]: + identifier = property["identifier"] + + listing = Listing(identifier, data_dir=data_dir) + listing.dump_listing(property) + listings.append(listing) - listing = Listing(identifier, data_dir=data_dir) - listing.dump_listing(property) - listings.append(listing) - print() # break line as we used end=, above. return listings diff --git a/crawler/main.py b/crawler/main.py index be5c93c..3658001 100644 --- a/crawler/main.py +++ b/crawler/main.py @@ -1,3 +1,4 @@ +import asyncio import pathlib import click import importlib @@ -107,12 +108,12 @@ def dump_listings( f'{query_parameters}' ) data_dir_path = pathlib.Path(data_dir) - dump_listings_module.dump_listings(query_parameters, data_dir_path) + asyncio.run(dump_listings_module.dump_listings(query_parameters, data_dir_path)) @cli.command() @click.pass_context -def dump_detail(ctx: click.core.Context): +def dump_details(ctx: click.core.Context): data_dir = ctx.obj['data_dir'] click.echo(f'Running dump_detail for listings stored in {data_dir}') listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json"))) diff --git a/crawler/rec/query.py b/crawler/rec/query.py index 304e9b9..06be85b 100644 --- a/crawler/rec/query.py +++ b/crawler/rec/query.py @@ -46,7 +46,7 @@ def detail_query(detail_id: int): return response.json() -def listing_query( +async def listing_query( page: int, channel: ListingType, min_bedrooms: int,