crawling for 3 and refactoring to allow incremental crawls
This commit is contained in:
parent
de2639f9c3
commit
36258d877f
5 changed files with 310 additions and 167 deletions
|
|
@@ -2,41 +2,31 @@ import pathlib
|
|||
import json
|
||||
from urllib.request import urlretrieve
|
||||
from tqdm import tqdm
|
||||
from data_access import Listing
|
||||
|
||||
folder = pathlib.Path('data/rs/')
|
||||
details = folder.glob('*/detail.json')
|
||||
|
||||
for detail_path in tqdm(list(details)):
|
||||
|
||||
with open(detail_path) as f:
|
||||
for listing in tqdm(Listing.get_all_listings()):
|
||||
with open(listing.path_detail_json()) as f:
|
||||
detail = json.load(f)
|
||||
|
||||
identifier = detail['property']['identifier']
|
||||
rsfolder = folder / str(identifier)
|
||||
|
||||
|
||||
for photo in detail['property']['photos']:
|
||||
url = photo['maxSizeUrl']
|
||||
picname = url.split('/')[-1]
|
||||
order = photo['order']
|
||||
filename = f'{order}_{picname}'
|
||||
fullpicpath = rsfolder / 'pics' / filename
|
||||
if fullpicpath.exists():
|
||||
p = listing.path_pic_file(order, picname)
|
||||
if p.exists():
|
||||
continue
|
||||
fullpicpath.parent.mkdir(parents=True, exist_ok=True) # create the 'pics' folder
|
||||
tqdm.write(str(fullpicpath))
|
||||
urlretrieve(url, fullpicpath)
|
||||
tqdm.write(str(p))
|
||||
urlretrieve(url, p)
|
||||
|
||||
for photo in detail['property']['floorplans']:
|
||||
url = photo['url']
|
||||
picname = url.split('/')[-1]
|
||||
order = photo['order']
|
||||
filename = f'{order}_{picname}'
|
||||
fullpicpath = rsfolder / 'floorplans' / filename
|
||||
if fullpicpath.exists():
|
||||
p = listing.path_floorplan_file(order, picname)
|
||||
if p.exists():
|
||||
continue
|
||||
fullpicpath.parent.mkdir(parents=True, exist_ok=True) # create the 'floorplans' folder
|
||||
tqdm.write(str(fullpicpath))
|
||||
urlretrieve(url, fullpicpath)
|
||||
tqdm.write(str(p))
|
||||
urlretrieve(url, p)
|
||||
|
||||
|
||||
Loading…
Add table
Add a link
Reference in a new issue