Real crawling scripts and floorplan detection

1. get all listings
2. get all detail jsons
3. get all images
4. get all floorplans
5. detect floorplans

Also updates dependencies (Hugging Face, etc.).
This commit is contained in:
Kadir 2024-03-10 18:49:39 +00:00
parent 46bb641026
commit 508aa02812
12 changed files with 1531 additions and 170 deletions

42
crawler/3_dump_images.py Normal file
View file

@ -0,0 +1,42 @@
import pathlib
import json
from urllib.request import urlretrieve
from tqdm import tqdm
# Root of the crawl output; each listing lives in '<folder>/<identifier>/'.
folder = pathlib.Path('data/rs/')

# One row per kind of image stored in a detail JSON:
# (key under detail['property'], key holding the URL, destination sub-folder).
IMAGE_SECTIONS = (
    ('photos', 'maxSizeUrl', 'pics'),
    ('floorplans', 'url', 'floorplans'),
)


def fetch(url, dest):
    """Download *url* to *dest* without ever leaving a truncated file at *dest*.

    ``urlretrieve`` writes directly into the file it is given, so an
    interrupted transfer would leave a partial image that the ``exists()``
    check in ``dump_listing_images`` would treat as already downloaded. The
    data is therefore written to a sibling ``<name>.part`` file and moved
    into place only once the transfer has completed.

    Args:
        url: Address of the image to download.
        dest: ``pathlib.Path`` of the final file; its parent directory is
            created when missing.

    Raises:
        OSError: Whatever ``urlretrieve`` or the rename raises (this includes
            ``urllib.error.URLError`` and its subclasses).
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    partial = dest.with_name(dest.name + '.part')
    try:
        urlretrieve(url, partial)
        partial.replace(dest)
    finally:
        # Only still present when the download or the rename failed.
        if partial.exists():
            partial.unlink()


def dump_listing_images(detail, root):
    """Download every not-yet-present photo and floorplan of one listing.

    Files are stored as ``<root>/<identifier>/<sub-folder>/<order>_<name>``,
    where ``<name>`` is the last ``/``-separated component of the URL.

    Args:
        detail: Parsed content of a listing's ``detail.json``.
        root: ``pathlib.Path`` of the directory holding the listing folders.

    Returns:
        The number of downloads that failed; failures are reported through
        ``tqdm.write`` and do not stop the remaining downloads.
    """
    prop = detail['property']
    listing_dir = root / str(prop['identifier'])
    failures = 0
    for section, url_key, subfolder in IMAGE_SECTIONS:
        # A listing without photos/floorplans may lack the key or hold null.
        for item in prop.get(section) or []:
            url = item[url_key]
            picname = url.split('/')[-1]
            dest = listing_dir / subfolder / f"{item['order']}_{picname}"
            if dest.exists():
                continue
            tqdm.write(str(dest))
            try:
                fetch(url, dest)
            except OSError as err:
                # Keep crawling; the file is retried on the next run because
                # nothing was left at `dest`.
                failures += 1
                tqdm.write(f'FAILED {url}: {err}')
    return failures


def main():
    """Walk all ``*/detail.json`` files under ``folder`` and dump their images.

    Raises:
        SystemExit: With a summary message (non-zero exit status) when at
            least one download failed.
    """
    failures = 0
    # list() so that tqdm knows the total and can render a real progress bar.
    for detail_path in tqdm(list(folder.glob('*/detail.json'))):
        with open(detail_path, encoding='utf-8') as f:
            detail = json.load(f)
        failures += dump_listing_images(detail, folder)
    if failures:
        raise SystemExit(f'{failures} download(s) failed; re-run to retry them')


if __name__ == '__main__':
    main()