dump images concurrently using aiohttp

This commit is contained in:
Viktor Barzin 2025-05-17 22:42:37 +00:00
parent 01ac24b4b7
commit 68cc70bd11
No known key found for this signature in database
GPG key ID: 4056458DBDBF8863
2 changed files with 30 additions and 28 deletions

View file

@ -19,8 +19,7 @@ async def dump_detail(listing_paths: list[str]):
async def _dump_detail_for_listing(listing: Listing): async def _dump_detail_for_listing(listing: Listing):
incremental = True if listing.path_detail_json().exists():
if incremental and not listing.path_detail_json().exists():
return return
# for listing in tqdm(filtered_listings): # for listing in tqdm(filtered_listings):

View file

@ -1,37 +1,40 @@
import asyncio
import json import json
import pathlib import pathlib
from urllib.request import urlretrieve import aiohttp
from tqdm import tqdm from tqdm import tqdm
from data_access import Listing from data_access import Listing
# Setting this too high either crashes rightmove or gets us blocked
semaphore = asyncio.Semaphore(10)
def dump_images(listing_paths: list[str]):
for listing in tqdm(Listing.get_all_listings(listing_paths)):
with open(listing.path_detail_json()) as f:
detail = json.load(f)
# for photo in detail["property"]["photos"]: async def dump_images(listing_paths: list[str]):
# url = photo["maxSizeUrl"] listings = Listing.get_all_listings(listing_paths)
# picname = url.split("/")[-1] await tqdm.gather(*[dump_images_for_listing(listing) for listing in listings])
# order = photo["order"]
# p = listing.path_pic_file(order, picname)
# if p.exists():
# continue
# tqdm.write(str(p))
# urlretrieve(url, p)
for photo in detail["property"]["floorplans"]:
url = photo["url"] async def dump_images_for_listing(listing: Listing):
picname = url.split("/")[-1] with open(listing.path_detail_json()) as f:
order = photo["order"] detail = json.load(f)
p = listing.path_floorplan_file(order, picname)
if p.exists(): for photo in detail["property"]["floorplans"]:
continue url = photo["url"]
tqdm.write(str(p)) picname = url.split("/")[-1]
try: order = photo["order"]
urlretrieve(url, p) p = listing.path_floorplan_file(order, picname)
except Exception as e: if p.exists():
tqdm.write(f"Error for {url}: {e}") continue
try:
async with aiohttp.ClientSession() as session:
async with semaphore:
async with session.get(url) as response:
if response.status != 200:
raise Exception(f"Error for {url}: {response.status}")
with open(p, "wb") as f:
f.write(await response.read())
except Exception as e:
tqdm.write(f"Error for {url}: {e}")
def main(): def main():