2025-05-17 22:42:37 +00:00
|
|
|
import asyncio
|
2024-03-10 18:49:39 +00:00
|
|
|
import json
|
2025-06-07 13:56:00 +00:00
|
|
|
from pathlib import Path
|
2025-05-17 22:42:37 +00:00
|
|
|
import aiohttp
|
2025-06-07 13:56:00 +00:00
|
|
|
from repositories import ListingRepository
|
2025-07-01 16:12:06 +00:00
|
|
|
from tenacity import retry, stop_after_attempt, wait_random
|
2025-05-17 23:14:00 +00:00
|
|
|
from tqdm.asyncio import tqdm
|
2025-06-07 13:56:00 +00:00
|
|
|
|
|
|
|
|
from models import Listing
|
2024-03-10 18:49:39 +00:00
|
|
|
|
2025-05-17 22:42:37 +00:00
|
|
|
# Setting this too high either crashes rightmove or gets us blocked
|
2025-07-01 16:12:06 +00:00
|
|
|
# Module-wide limiter: download code wraps each HTTP request in
# `async with semaphore`, so at most 5 requests are in flight at once.
semaphore = asyncio.Semaphore(5)
|
2025-05-17 22:42:37 +00:00
|
|
|
|
|
|
|
|
|
2025-06-22 21:15:50 +00:00
|
|
|
async def dump_images(
    repository: ListingRepository,
    image_base_path: Path = Path("data/rs/"),
):
    """Download floorplan images for all stored listings and save the results.

    Reads every listing from ``repository``, runs ``dump_images_for_listing``
    for each of them through ``tqdm.gather``, and upserts the listings that
    came back (``None`` results are dropped).

    Args:
        repository: Source of the listings and target of the upsert.
        image_base_path: Directory passed to ``dump_images_for_listing`` as
            the root for image files.
    """
    stored_listings = await repository.get_listings()

    # One coroutine per listing; they are all handed to tqdm.gather at once.
    pending_downloads = [
        dump_images_for_listing(entry, image_base_path) for entry in stored_listings
    ]
    outcomes = await tqdm.gather(*pending_downloads)

    # A None outcome means the helper had nothing to report for that listing.
    listings_to_save = [outcome for outcome in outcomes if outcome is not None]
    await repository.upsert_listings(listings_to_save)
|
2025-05-17 22:42:37 +00:00
|
|
|
|
|
|
|
|
|
2025-07-01 16:12:06 +00:00
|
|
|
@retry(wait=wait_random(min=1, max=2), stop=stop_after_attempt(3))
async def dump_images_for_listing(listing: Listing, base_path: Path) -> Listing | None:
    """Download every floorplan image of ``listing`` that is not on disk yet.

    Each image is stored at ``<base_path>/<listing.id>/floorplans/<name>``,
    where ``<name>`` is the last ``/``-separated segment of the floorplan URL.
    The path of every image saved by this call is appended to
    ``listing.floorplan_image_paths``.

    Args:
        listing: Listing to process. Its
            ``additional_info["property"]["floorplans"]`` entries are read,
            and each entry's ``"url"`` is fetched.
        base_path: Root directory for the downloaded images.

    Returns:
        The (mutated) listing if at least one image was saved during this
        call; ``None`` if nothing was saved (all files already existed or the
        remaining URLs answered 404).

    Raises:
        Exception: For any response status other than 200/404 and for any
            download or write failure. The error is logged and re-raised so
            the ``@retry`` decorator re-runs the function (up to 3 attempts
            with a random 1-2 second wait between them).
    """
    all_floorplans = listing.additional_info["property"]["floorplans"]
    saved_any = False

    for floorplan in all_floorplans:
        url = floorplan["url"]
        picname = url.split("/")[-1]
        floorplan_path = Path(base_path, str(listing.id), "floorplans", picname)

        # Already downloaded (possibly by an earlier attempt); skip it.
        if floorplan_path.exists():
            continue

        try:
            # Hold the shared semaphore only for the network request itself.
            async with semaphore:
                async with aiohttp.ClientSession() as session:
                    async with session.get(url) as response:
                        if response.status == 404:
                            # A missing image will not appear on retry, so
                            # skip it instead of raising; keep processing the
                            # remaining floorplans.
                            continue
                        if response.status != 200:
                            raise Exception(f"Error for {url}: {response.status}")
                        # Read the whole body BEFORE touching the filesystem:
                        # a failed read must not leave an empty file behind
                        # that the exists() check above would later accept.
                        image_bytes = await response.read()

            floorplan_path.parent.mkdir(parents=True, exist_ok=True)
            floorplan_path.write_bytes(image_bytes)
            listing.floorplan_image_paths.append(str(floorplan_path))
            saved_any = True
        except Exception as e:
            tqdm.write(f"Error for {url}: {e}")
            raise  # re-raise so that the retry decorator tries again

    # Returned only after ALL floorplans were handled, not after the first.
    return listing if saved_any else None
|