crawling for 3 and refactoring to allow incremental crawls

This commit is contained in:
Kadir 2024-03-11 14:43:53 +00:00
parent de2639f9c3
commit 36258d877f
5 changed files with 310 additions and 167 deletions

View file

@ -1,25 +1,17 @@
# NOTE(review): this span looks like a rendered diff (the hunk header just
# before it reads "-1,25 +1,17") whose +/- markers and indentation were lost.
# Lines from the version before and after the commit therefore appear
# interleaved, and the text is not runnable as written -- confirm against the
# real file before relying on it.
#
# What the visible lines do: for each listing, call detail_query() with the
# listing's identifier and dump the returned object as JSON into that
# listing's detail file.
import pathlib
import json
from rec.query import detail_query
from tqdm import tqdm
# Presumably the pre-commit discovery step: glob every */listing.json below
# data/rs/.
folder = pathlib.Path('data/rs/')
listings = folder.glob('*/listing.json')
from data_access import Listing
# Presumably the pre-commit loop header: load each listing.json and read its
# 'identifier' key.
for listing_path in listings:
with open(listing_path) as f:
listing = json.load(f)
identifier = listing['identifier']
# Presumably the post-commit loop header: iterate Listing.get_all_listings()
# wrapped in a tqdm progress bar, and skip any listing whose detail JSON path
# already exists (this skip is what makes a re-run incremental).
for listing in tqdm(Listing.get_all_listings()):
if listing.path_detail_json().exists():
continue
try:
# The next two lines look like the old/new pair of the same statement
# (plain `identifier` vs. `listing.identifier`).
d = detail_query(identifier)
d = detail_query(listing.identifier)
# NOTE(review): open(..., 'w') creates the file before json.dump runs; if the
# dump fails or is interrupted, a partial file may remain and would then be
# skipped by the exists() check on the next run -- TODO confirm and consider
# writing to a temporary file first.
with open(listing.path_detail_json(), 'w') as f:
json.dump(d, f)
# Bare except followed by `raise`: report which identifier failed, then
# re-raise so the run stops at the first failure.
except:
print('Failed at: ', identifier)
print('Failed at: ', listing.identifier)
raise
# Presumably the pre-commit tail: print the identifier and write the result
# to data/rs/<identifier>/detail.json.
print(identifier)
detail_path = pathlib.Path(f'data/rs/{identifier}/detail.json')
with open(detail_path, 'w') as f:
json.dump(d, f)