diff --git a/crawler/2_dump_detail.py b/crawler/2_dump_detail.py
index bf9696a..e58beef 100644
--- a/crawler/2_dump_detail.py
+++ b/crawler/2_dump_detail.py
@@ -1,35 +1,43 @@
+import asyncio
 import json
 import pathlib
 
 from rec.query import detail_query
-from tqdm import tqdm
+from tqdm.asyncio import tqdm
 
 from data_access import Listing
 
 
-def dump_detail(listing_paths: list[str]):
-    incremental = True
+async def dump_detail(listing_paths: list[str]):
     listings = Listing.get_all_listings(listing_paths)
-    filtered_listings = []
-    for listing in listings:
-        # We introduced last_seen later, so not all entries have it.
-        # If it doesnt exist then its on the platform anymore. So skip
-        last_seen = listing.last_seen
-        if last_seen is None:
-            continue
+    filtered_listings = await tqdm.gather(
+        *[_dump_detail_for_listing(listing) for listing in listings]
+    )
+    return filtered_listings
 
-        if not incremental and last_seen <= 1:
-            filtered_listings.append(listing)
-        if incremental and not listing.path_detail_json().exists():
-            filtered_listings.append(listing)
 
-    for listing in tqdm(filtered_listings):
-        try:
-            d = detail_query(listing.identifier)
-            with open(listing.path_detail_json(), "w") as f:
-                json.dump(d, f)
-        except Exception as e:
-            print(e)
+async def _dump_detail_for_listing(listing: Listing):
+    incremental = True
+    # We introduced last_seen later, so not all entries have it.
+    # If it doesn't exist then it's not on the platform anymore, so skip.
+    last_seen = listing.last_seen
+    if last_seen is None:
+        return
+
+    if not incremental and last_seen > 1:
+        return
+
+    if incremental and listing.path_detail_json().exists():
+        return
+    print('fetching', listing.identifier)
+
+    # Best-effort, like the old sequential loop: one failure must not cancel the gather.
+    try:
+        d = await detail_query(listing.identifier)
+        with open(listing.path_detail_json(), "w") as f:
+            json.dump(d, f)
+    except Exception as e:
+        print(e)
 
 
 def main():
diff --git a/crawler/main.py b/crawler/main.py
index 3658001..733fbee 100644
--- a/crawler/main.py
+++ b/crawler/main.py
@@ -117,7 +117,7 @@ def dump_details(ctx: click.core.Context):
     data_dir = ctx.obj['data_dir']
     click.echo(f'Running dump_detail for listings stored in {data_dir}')
     listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json")))
-    dump_detail_module.dump_detail(listing_paths)
+    asyncio.run(dump_detail_module.dump_detail(listing_paths))
 
 
 @cli.command()
diff --git a/crawler/rec/query.py b/crawler/rec/query.py
index 06be85b..50ea314 100644
--- a/crawler/rec/query.py
+++ b/crawler/rec/query.py
@@ -30,7 +30,7 @@ class PropertyType(enum.StrEnum):
     TERRACED = "terraced"
 
 
-def detail_query(detail_id: int):
+async def detail_query(detail_id: int):
     params = {
         "apiApplication": "ANDROID",
         "appVersion": "3.70.0",