Add runall script, update search parameters (max bedrooms 3 -> 4, max price 800000 -> 1000000), and allow incremental updating

This commit is contained in:
Kadir 2024-11-23 22:57:22 +00:00
parent dbf72e42e3
commit 4b6b8628c2
3 changed files with 20 additions and 3 deletions

View file

@ -14,12 +14,13 @@ for district, locid in districts.items():
d = listing_query(
page=i,
min_bedrooms=1,
max_bedrooms=3,
max_bedrooms=4,
radius=0,
min_price=0,
max_price=800000,
max_price=1000000,
location_id=locid,
page_size=500,
max_days_since_added=7,
)
except Exception as e:
print(e)

View file

@ -4,10 +4,16 @@ from tqdm import tqdm
from data_access import Listing
incremental = True
listings = Listing.get_all_listings()
filtered_listings = []
for listing in listings:
if not listing.path_detail_json().exists():
if not incremental and not listing.isRemoved:
filtered_listings.append(listing)
if incremental and not listing.path_detail_json().exists():
filtered_listings.append(listing)
for listing in tqdm(filtered_listings):

10
crawler/runall.sh Executable file
View file

@ -0,0 +1,10 @@
#!/usr/bin/env bash
# Run the numbered pipeline scripts (1 through 5) one after another, logging
# the standard output of each step.
#
# The Python scripts are referenced by bare relative name, so they are
# resolved against the current working directory of whoever invokes this
# script.

# -e: abort on the first failing command; -u: treat unset variables as errors;
# -x: trace each command before it runs; -o pipefail: a pipeline fails if any
# stage fails (so a failing `python` step is not masked by a succeeding `tee`).
# Together, -e and pipefail mean a failed step stops all later steps.
set -euxo pipefail
# Make sure the log directory exists; -p makes this a no-op if it already does.
mkdir -p /tmp/re/
# Each step appends its stdout to its own per-step log (N.log) and to the
# combined log (log.log); the final `tee` still echoes it to this script's
# stdout. Logs are appended (-a), so they grow across runs.
# NOTE(review): stderr is not redirected, so Python tracebacks and the `set -x`
# trace are not written to these log files — confirm this is intended.
python 1_dump_listings.py | tee -a /tmp/re/1.log | tee -a /tmp/re/log.log
python 2_dump_detail.py | tee -a /tmp/re/2.log | tee -a /tmp/re/log.log
python 3_dump_images.py | tee -a /tmp/re/3.log | tee -a /tmp/re/log.log
python 4_detect_floorplan.py | tee -a /tmp/re/4.log | tee -a /tmp/re/log.log
python 5_routing.py | tee -a /tmp/re/5.log | tee -a /tmp/re/log.log