wrongmove/crawler/3_dump_images.py
Kadir 508aa02812 Real crawling scripts and floorplan detection
1. get all listings
2. get all detail jsons
3. get all images
4. get all floorplans
5. detecting floorplans

Also updating dependencies for huggingface etc.
2024-03-10 18:49:39 +00:00

42 lines
No EOL
1.3 KiB
Python

import pathlib
import json
from urllib.request import urlretrieve
from tqdm import tqdm
# Root directory that holds one sub-folder per crawled listing.
folder = pathlib.Path('data/rs/')
# Every detail.json found exactly one directory below the root.
details = folder.glob('*/detail.json')


def _download_images(items, url_key, target_dir):
    """Download each image described in *items* into *target_dir*.

    Args:
        items: Iterable of dicts; each must provide ``url_key`` (the image
            URL) and ``'order'`` (used as the filename prefix).
        url_key: Name of the dict key that holds the image URL.
        target_dir: ``pathlib.Path`` of the directory the files are saved in;
            it is created on demand.

    Files that already exist at their final path are skipped, so the script
    can be re-run to resume an interrupted crawl. Download errors raised by
    ``urlretrieve`` are not caught here and propagate to the caller.
    """
    for item in items:
        url = item[url_key]
        picname = url.split('/')[-1]
        order = item['order']
        filename = f'{order}_{picname}'
        fullpicpath = target_dir / filename
        if fullpicpath.exists():
            continue
        fullpicpath.parent.mkdir(parents=True, exist_ok=True)  # create the target folder
        tqdm.write(str(fullpicpath))
        # Download to a temporary sibling first and rename only on success.
        # Writing straight to the final path would leave a truncated file
        # behind when a download is interrupted, and the exists() check above
        # would then skip that broken file on every later run.
        partial = fullpicpath.with_name(fullpicpath.name + '.part')
        try:
            urlretrieve(url, partial)
            partial.replace(fullpicpath)
        finally:
            # After a successful rename the temporary file is gone; after a
            # failure this removes the incomplete download.
            if partial.exists():
                partial.unlink()


# list() materialises the glob so tqdm knows the total and can show progress.
for detail_path in tqdm(list(details)):
    with open(detail_path) as f:
        detail = json.load(f)
    identifier = detail['property']['identifier']
    # Images are stored under the root in a folder named after the identifier.
    rsfolder = folder / str(identifier)
    _download_images(detail['property']['photos'], 'maxSizeUrl', rsfolder / 'pics')
    _download_images(detail['property']['floorplans'], 'url', rsfolder / 'floorplans')