Real crawling scripts and floorplan detection
1. Get all listings. 2. Get all detail JSONs. 3. Get all images. 4. Get all floorplans. 5. Detect floorplans. Also updates dependencies for Hugging Face etc.
This commit is contained in:
parent
46bb641026
commit
508aa02812
12 changed files with 1531 additions and 170 deletions
42
crawler/3_dump_images.py
Normal file
42
crawler/3_dump_images.py
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
import pathlib
|
||||
import json
|
||||
from urllib.request import urlretrieve
|
||||
from tqdm import tqdm
|
||||
|
||||
# Root of the crawl output; each listing lives in its own sub-folder that
# contains a detail.json file.
folder = pathlib.Path('data/rs/')


def fetch_to(url, dest):
    """Download *url* to the path *dest* without ever leaving a partial file.

    The payload is first written to a sibling ``<name>.part`` file and only
    renamed to *dest* once the transfer finished.  Callers use
    ``dest.exists()`` to decide whether a download can be skipped, so a
    truncated file at *dest* would never be repaired on a later run.

    Args:
        url: Address of the resource to download.
        dest: ``pathlib.Path`` of the final file; its parent folder is
            created when missing.

    Raises:
        OSError: Whatever ``urlretrieve`` raises (``URLError``, ``HTTPError``
            and ``ContentTooShortError`` are all ``OSError`` subclasses) or a
            file-system error while writing/renaming.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    partial = dest.with_name(dest.name + '.part')
    try:
        urlretrieve(url, partial)
        partial.replace(dest)
    finally:
        # After a successful rename the .part file is gone; on any failure
        # (including Ctrl-C) remove the leftover so it cannot pile up.
        if partial.exists():
            partial.unlink()


def dump_media(items, url_key, target_dir):
    """Download every entry of *items* into *target_dir*.

    Each entry is a dict read from detail.json; the code reads
    ``entry[url_key]`` and ``entry['order']`` and stores the file as
    ``<order>_<last URL path segment>``.  Files that already exist are
    skipped so the script can be re-run to resume a crawl.

    A failed download is reported through ``tqdm.write`` and skipped instead
    of aborting the whole run; the file is retried on the next invocation
    because nothing is left at its final path.

    Args:
        items: Iterable of media dicts.
        url_key: Key holding the download URL ('maxSizeUrl' for photos,
            'url' for floorplans).
        target_dir: ``pathlib.Path`` of the folder receiving the files.
    """
    for item in items:
        url = item[url_key]
        picname = url.split('/')[-1]
        fullpicpath = target_dir / f"{item['order']}_{picname}"
        if fullpicpath.exists():
            continue
        tqdm.write(str(fullpicpath))
        try:
            fetch_to(url, fullpicpath)
        except OSError as err:
            tqdm.write(f'failed to download {url}: {err}')


def main():
    """Download the photos and floorplans referenced by every detail.json."""
    # Materialise the glob so tqdm knows the total and can show progress.
    for detail_path in tqdm(list(folder.glob('*/detail.json'))):
        with open(detail_path, encoding='utf-8') as f:
            prop = json.load(f)['property']

        rsfolder = folder / str(prop['identifier'])

        # NOTE(review): assumes a listing may lack either media list (key
        # missing or null); such listings are treated as having no media.
        dump_media(prop.get('photos') or [], 'maxSizeUrl', rsfolder / 'pics')
        dump_media(prop.get('floorplans') or [], 'url', rsfolder / 'floorplans')


if __name__ == '__main__':
    main()
|
||||
|
||||
|
||||
Loading…
Add table
Add a link
Reference in a new issue