Track when each listing was last seen by the crawler.

Summary of the change set:
  * crawler/1_dump_listings.py no longer writes the listing JSON itself; it
    calls Listing.dump_listing(), and the query no longer limits results by
    max_days_since_added (7 -> None).
  * crawler/data_access.py gains Listing.path_last_seen_listing(),
    Listing.dump_listing() (writes the listing JSON plus a last_seen.json
    holding the current time as an ISO-8601 string) and a Listing.last_seen
    property that reads that timestamp back; last_seen is also added to the
    dict built near the end of the class.

Review fix applied to this patch:
  * Listing.last_seen called datetime.fromisoformat(...). The surrounding code
    uses datetime.datetime.now() / datetime.datetime.fromtimestamp(), i.e.
    "datetime" is the module, so that call raises AttributeError as soon as
    last_seen.json exists. It now calls datetime.datetime.fromisoformat(...).

NOTE(review): the patch text had been collapsed onto two physical lines. Line
breaks were restored from the hunk headers; indentation of context and added
lines is reconstructed conventionally and must be confirmed against the
target files before applying (or apply with whitespace tolerance).
NOTE(review): the header of the last hunk promises two more trailing context
lines than are visible here (presumably blank lines, or content beyond this
excerpt); they have not been invented.
NOTE(review): last_seen is annotated as returning datetime.datetime but
returns None when last_seen.json is missing; consumers of the "last_seen"
dict entry should expect None (and a non-JSON-serialisable datetime).
NOTE(review): data_access.py uses json here; this patch adds no import for it,
so confirm the module already imports json.

diff --git a/crawler/1_dump_listings.py b/crawler/1_dump_listings.py
index 7046b48..3f8eb74 100644
--- a/crawler/1_dump_listings.py
+++ b/crawler/1_dump_listings.py
@@ -1,7 +1,6 @@
 from rec.query import listing_query
 from rec.districts import get_districts
 import pathlib
-import json
 from data_access import Listing
 
 folder = pathlib.Path("data/rs/")
@@ -20,7 +19,7 @@ for district, locid in districts.items():
             max_price=1000000,
             location_id=locid,
             page_size=500,
-            max_days_since_added=7,
+            max_days_since_added=None,
         )
     except Exception as e:
         print(e)
@@ -35,6 +34,5 @@ for district, locid in districts.items():
         identifier = property["identifier"]
 
         listing = Listing(identifier)
-        with open(listing.path_listing_json(), "w") as f:
-            json.dump(property, f)
+        listing.dump_listing(property)
     print() # break line as we used end=, above.
diff --git a/crawler/data_access.py b/crawler/data_access.py
index 3e37e52..0bad232 100644
--- a/crawler/data_access.py
+++ b/crawler/data_access.py
@@ -59,6 +59,16 @@ class Listing:
     def path_floorplan_file(self, order, name) -> pathlib.Path:
         self.path_floorplan_folder().mkdir(parents=True, exist_ok=True)
         return self.path_floorplan_folder() / f"{order}_{name}"
+
+    def path_last_seen_listing(self) -> pathlib.Path:
+        return self.path_listing() / "last_seen.json"
+
+    def dump_listing(self, d: dict):
+        with open(self.path_listing_json(), "w") as f:
+            json.dump(d, f)
+        with open(self.path_last_seen_listing(), "w") as f:
+            dt = datetime.datetime.now().isoformat()
+            json.dump(dt, f)
 
     def list_floorplans(self):
         images = list(self.path_floorplan_folder().glob("*"))
@@ -203,6 +213,15 @@ class Listing:
         ds = datetime.datetime.fromtimestamp(ts)
         return (now - ds).days
 
+    @property
+    def last_seen(self) -> datetime.datetime:
+        if not self.path_last_seen_listing().exists():
+            return None
+
+        with open(self.path_last_seen_listing(), 'r') as f:
+            datetime_str = json.load(f)
+            return datetime.datetime.fromisoformat(datetime_str)
+
     @property
     def serviceCharge(self) -> float:
         ds = self.detailobject["property"].get("tenureInfo", {}).get("content", [])
@@ -255,6 +274,7 @@ class Listing:
             "tenure_type": self.tenure_type,
             "updated_days": self.updateDaysAgo,
             "status": self.status,
+            "last_seen": self.last_seen,
         }