wrongmove/crawler/3_dump_images.py

52 lines
2 KiB
Python

import asyncio
import json
from pathlib import Path
import aiohttp
from repositories import ListingRepository
from tenacity import retry, stop_after_attempt, wait_random
from tqdm.asyncio import tqdm
from models import Listing
# Upper bound on concurrent image requests: each download in
# dump_images_for_listing acquires this semaphore before issuing its GET.
# Setting this too high either crashes rightmove or gets us blocked
semaphore = asyncio.Semaphore(5)
async def dump_images(
    repository: ListingRepository,
    image_base_path: Path = Path("data/rs/"),
):
    """Run the floorplan download for every listing and persist the results.

    All listings are read from ``repository``, one
    ``dump_images_for_listing`` coroutine is created per listing, and the
    coroutines are awaited together through ``tqdm.gather``. Listings for
    which that coroutine returned ``None`` are left out of the final upsert.

    Args:
        repository: Source of the listings and target of the upsert.
        image_base_path: Root directory handed to each per-listing download.
    """
    stored = await repository.get_listings()
    jobs = [dump_images_for_listing(entry, image_base_path) for entry in stored]
    outcomes = await tqdm.gather(*jobs)

    # Only listings that came back from the download step are written back.
    changed = []
    for outcome in outcomes:
        if outcome is not None:
            changed.append(outcome)
    await repository.upsert_listings(changed)
@retry(wait=wait_random(min=1, max=2), stop=stop_after_attempt(3))
async def dump_images_for_listing(listing: Listing, base_path: Path) -> Listing | None:
    """Download every floorplan image of ``listing`` that is not on disk yet.

    Images are stored at
    ``<base_path>/<listing.id>/floorplans/<last URL segment>`` and each newly
    written path is appended to ``listing.floorplan_image_paths``. Files that
    already exist are skipped, and a URL that answers 404 is skipped as well
    so the remaining floorplans of the listing are still fetched.

    Args:
        listing: Listing whose ``additional_info["property"]["floorplans"]``
            entries each carry a ``"url"`` key.
        base_path: Root directory under which the images are stored.

    Returns:
        ``listing`` when at least one image was written during this attempt,
        otherwise ``None``.

    Raises:
        Exception: On a status other than 200/404, or on any download or
            write failure. The ``retry`` decorator re-runs the whole function
            (three attempts in total) before giving up; images saved by an
            earlier attempt are skipped because their files already exist.
    """
    # Work out what is missing first, so a listing with nothing to fetch never
    # opens an HTTP session. Keyed by target path so two URLs that map to the
    # same file name are fetched only once.
    pending: dict[Path, str] = {}
    for floorplan in listing.additional_info["property"]["floorplans"]:
        url = floorplan["url"]
        picname = url.split("/")[-1]
        floorplan_path = Path(base_path, str(listing.id), "floorplans", picname)
        if not floorplan_path.exists():
            pending.setdefault(floorplan_path, url)
    if not pending:
        return None

    updated = False
    # One session for the whole listing lets aiohttp reuse connections.
    async with aiohttp.ClientSession() as session:
        for floorplan_path, url in pending.items():
            try:
                # Only the request itself counts against the concurrency cap.
                async with semaphore:
                    async with session.get(url) as response:
                        if response.status == 404:
                            continue
                        if response.status != 200:
                            raise Exception(f"Error for {url}: {response.status}")
                        image_bytes = await response.read()
                # Create the file only after the body has been read in full:
                # a failed download must not leave an empty file behind, or
                # the exists() check would skip it on the next attempt.
                floorplan_path.parent.mkdir(parents=True, exist_ok=True)
                floorplan_path.write_bytes(image_bytes)
            except Exception as e:
                tqdm.write(f"Error for {url}: {e}")
                raise  # re-raise so that the retry decorator tries again
            listing.floorplan_image_paths.append(str(floorplan_path))
            updated = True
    return listing if updated else None