From 4b6b8628c2c5249176c37882486b292da5ee2715 Mon Sep 17 00:00:00 2001
From: Kadir
Date: Sat, 23 Nov 2024 22:57:22 +0000
Subject: [PATCH] add runall script, update parameters to 4 bed etc and allow incremental updating

---
Reviewer notes (this area is ignored when the patch is applied):
  * 1_dump_listings.py: the listing query now uses max_bedrooms=4 (was 3),
    max_price=1000000 (was 800000) and additionally passes
    max_days_since_added=7.
  * 2_dump_detail.py: adds a module-level `incremental` flag (set to True).
    When it is True, only listings whose detail JSON path does not exist are
    selected; when it is False, every listing with a falsy `isRemoved` is
    selected.
  * runall.sh (new, mode 100755): runs scripts 1-5 in order under
    `set -euxo pipefail`, appending each script's stdout to /tmp/re/<n>.log
    and to /tmp/re/log.log. The scripts are invoked by bare file name, so it
    presumably has to be started from the crawler/ directory - confirm.

 crawler/1_dump_listings.py |  5 +++--
 crawler/2_dump_detail.py   |  8 +++++++-
 crawler/runall.sh          | 10 ++++++++++
 3 files changed, 20 insertions(+), 3 deletions(-)
 create mode 100755 crawler/runall.sh

diff --git a/crawler/1_dump_listings.py b/crawler/1_dump_listings.py
index 0d05d6b..7046b48 100644
--- a/crawler/1_dump_listings.py
+++ b/crawler/1_dump_listings.py
@@ -14,12 +14,13 @@ for district, locid in districts.items():
             d = listing_query(
                 page=i,
                 min_bedrooms=1,
-                max_bedrooms=3,
+                max_bedrooms=4,
                 radius=0,
                 min_price=0,
-                max_price=800000,
+                max_price=1000000,
                 location_id=locid,
                 page_size=500,
+                max_days_since_added=7,
             )
         except Exception as e:
             print(e)
diff --git a/crawler/2_dump_detail.py b/crawler/2_dump_detail.py
index 4a1c295..a7b0e18 100644
--- a/crawler/2_dump_detail.py
+++ b/crawler/2_dump_detail.py
@@ -4,10 +4,16 @@ from tqdm import tqdm
 
 from data_access import Listing
 
+incremental = True
+
+
 listings = Listing.get_all_listings()
 filtered_listings = []
 for listing in listings:
-    if not listing.path_detail_json().exists():
+    if not incremental and not listing.isRemoved:
+        filtered_listings.append(listing)
+
+    if incremental and not listing.path_detail_json().exists():
         filtered_listings.append(listing)
 
 for listing in tqdm(filtered_listings):
diff --git a/crawler/runall.sh b/crawler/runall.sh
new file mode 100755
index 0000000..f5311cc
--- /dev/null
+++ b/crawler/runall.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+
+set -euxo pipefail
+
+mkdir -p /tmp/re/
+python 1_dump_listings.py | tee -a /tmp/re/1.log | tee -a /tmp/re/log.log
+python 2_dump_detail.py | tee -a /tmp/re/2.log | tee -a /tmp/re/log.log
+python 3_dump_images.py | tee -a /tmp/re/3.log | tee -a /tmp/re/log.log
+python 4_detect_floorplan.py | tee -a /tmp/re/4.log | tee -a /tmp/re/log.log
+python 5_routing.py | tee -a /tmp/re/5.log | tee -a /tmp/re/log.log