diff --git a/crawler/2_dump_detail.py b/crawler/2_dump_detail.py index dde735f..e29e995 100644 --- a/crawler/2_dump_detail.py +++ b/crawler/2_dump_detail.py @@ -4,28 +4,36 @@ from tqdm import tqdm from data_access import Listing -incremental = True + +def dump_detail(): + incremental = True + listings = Listing.get_all_listings() + filtered_listings = [] + for listing in listings: + # We introduced last_seen later, so not all entries have it. + # If it doesn't exist then it's not on the platform anymore. So skip + last_seen = listing.last_seen + if last_seen is None: + continue + + if not incremental and last_seen <= 1: + filtered_listings.append(listing) + + if incremental and not listing.path_detail_json().exists(): + filtered_listings.append(listing) + + for listing in tqdm(filtered_listings): + try: + d = detail_query(listing.identifier) + with open(listing.path_detail_json(), "w") as f: + json.dump(d, f) + except Exception as e: + print(e) -listings = Listing.get_all_listings() -filtered_listings = [] -for listing in listings: - # We introduced last_seen later, so not all entries have it. - # If it doesnt exist then its on the platform anymore. 
So skip - last_seen = listing.last_seen - if last_seen is None: - continue +def main(): + dump_detail() - if not incremental and last_seen <= 1: - filtered_listings.append(listing) - if incremental and not listing.path_detail_json().exists(): - filtered_listings.append(listing) - -for listing in tqdm(filtered_listings): - try: - d = detail_query(listing.identifier) - with open(listing.path_detail_json(), "w") as f: - json.dump(d, f) - except Exception as e: - print(e) +if __name__ == "__main__": + main() diff --git a/crawler/main.py b/crawler/main.py index 6a0a1b0..b7767c3 100644 --- a/crawler/main.py +++ b/crawler/main.py @@ -2,9 +2,11 @@ import click import importlib dump_listings_module = importlib.import_module('1_dump_listings') +dump_detail_module = importlib.import_module('2_dump_detail') steps_to_handlers = { 'dump_listings': dump_listings_module.dump_listings, + 'dump_detail': dump_detail_module.dump_detail, }