wrongmove/crawler/2_dump_detail.py
2025-05-17 22:34:27 +00:00

39 lines
1,021 B
Python

import asyncio
import json
import pathlib
from rec.query import detail_query
from tqdm.asyncio import tqdm
from data_access import Listing
# Upper bound on the number of detail queries allowed in flight at once.
# Setting this too high either crashes rightmove or gets us blocked.
# NOTE(review): created at import time; on Python < 3.10 asyncio primitives
# bind to an event loop when constructed -- confirm the target version.
semaphore = asyncio.Semaphore(10)
async def dump_detail(listing_paths: list[str]):
    """Run the per-listing detail dump for every listing behind ``listing_paths``.

    The paths are handed to ``Listing.get_all_listings``; one
    ``_dump_detail_for_listing`` coroutine is created per resulting listing and
    all of them are awaited together through ``tqdm.gather``, which shows a
    progress bar while they complete.

    Returns:
        The list produced by ``tqdm.gather``: one entry per listing, holding
        whatever ``_dump_detail_for_listing`` returned for it.
    """
    all_listings = Listing.get_all_listings(listing_paths)
    pending = map(_dump_detail_for_listing, all_listings)
    return await tqdm.gather(*pending)
async def _dump_detail_for_listing(listing: Listing, *, incremental: bool = True):
    """Fetch the detail payload for one listing and write it to its detail JSON file.

    Args:
        listing: Listing whose ``identifier`` is passed to ``detail_query`` and
            whose ``path_detail_json()`` gives the output location.
        incremental: When true (the default), a listing whose detail file is
            already on disk is skipped, so repeated runs only fetch what is
            still missing. Pass ``False`` to re-fetch and overwrite.

    Returns:
        None.
    """
    detail_path = listing.path_detail_json()

    # Incremental mode means "do not redo finished work": skip listings that
    # already have a detail file and fetch only the ones that lack it.
    if incremental and detail_path.exists():
        return

    # Only the remote query is throttled; the semaphore caps concurrent requests.
    async with semaphore:
        detail = await detail_query(listing.identifier)

    # Serialize before opening the file so a serialization error cannot leave
    # behind an empty/truncated file that a later incremental run would skip.
    payload = json.dumps(detail)
    with open(detail_path, "w") as f:
        f.write(payload)
def main():
    """Dump detail JSON for every listing found under ``data/rs``.

    Collects every ``data/rs/*/listing.json`` path in sorted order and runs the
    ``dump_detail`` coroutine to completion.
    """
    listing_paths = sorted(pathlib.Path("data/rs").glob("*/listing.json"))
    # dump_detail is a coroutine function: calling it only creates a coroutine
    # object, so it must be driven by an event loop to do any work.
    asyncio.run(dump_detail(listing_paths))


if __name__ == "__main__":
    main()