diff --git a/crawler/3_dump_images.py b/crawler/3_dump_images.py
index 8e84217..5792b5b 100644
--- a/crawler/3_dump_images.py
+++ b/crawler/3_dump_images.py
@@ -25,4 +25,12 @@ for listing in tqdm(Listing.get_all_listings()):
         if p.exists():
             continue
         tqdm.write(str(p))
-        urlretrieve(url, p)
+        try:
+            urlretrieve(url, p)
+        except OSError as err:
+            # urllib's HTTPError/URLError derive from OSError; report the
+            # real cause instead of labelling every failure a 404.
+            tqdm.write(f"download failed for {url}: {err}")
+            # Drop any partial file so the exists() check retries it.
+            if p.exists():
+                p.unlink()
diff --git a/crawler/data_access.py b/crawler/data_access.py
index 9e985eb..9ed4e7b 100644
--- a/crawler/data_access.py
+++ b/crawler/data_access.py
@@ -4,6 +4,8 @@ import pathlib
 from typing import List, Dict
 from rec import floorplan, routing
 import re
+import datetime
+
 
 _DATA_DIR = pathlib.Path("data/rs/")
 
@@ -193,6 +195,20 @@ class Listing:
         if len(matches):
             return float(matches[0])
         return None
+
+    @property
+    def updateDaysAgo(self) -> int:
+        """Return the number of whole days since the listing was updated.
+
+        ``updateDate`` is divided by 1000 before conversion, i.e. it is
+        treated as a Unix timestamp in milliseconds.
+        """
+        ts = self.detailobject["property"]["updateDate"] / 1000
+        # Compare timezone-aware UTC instants so a DST shift in the local
+        # zone cannot skew the elapsed-day count.
+        utc = datetime.timezone.utc
+        updated = datetime.datetime.fromtimestamp(ts, tz=utc)
+        return (datetime.datetime.now(tz=utc) - updated).days
 
     @property
     def serviceCharge(self) -> float:
@@ -233,6 +249,7 @@ class Listing:
             "service_charge": self.serviceCharge,
             "development": self.development,
             "tenure_type": self.tenure_type,
+            "updated_days": self.updateDaysAgo,
         }
 
 