# Patch summary (leading text; patch tools skip everything before the first file header).
#
# crawler/1_dump_listings.py
#   - The initial listing_query(...) call now passes max_days_since_added=7.
#   - The per-page listing_query(...) call changes max_days_since_added from 3 to 1.
#   NOTE(review): the paging loop stops on a bare `except:` (unchanged context);
#   any error, not only "no more pages", ends the crawl silently.
#
# crawler/9_recalculate_regex_squaremeter.py (new file)
#   - For every listing from Listing.get_all_listings(), loads the JSON at
#     listing.path_floorplan_ocr_json(), recomputes each entry's 'estimated_sqm'
#     via extract_total_sqm(entry['text']), and writes the JSON back to the same path.
#   NOTE(review): `floorplans = json.dump(...)` rebinds the name to None
#   (json.dump returns None); the assignment has no effect on the written file.
#
# crawler/data_access.py
#   - Listing.price_per_sqm now prints self.identifier when sqm_ocr == 0 and
#     returns None when sqm_ocr is None or 0, avoiding the division by zero.
#   NOTE(review): the property is annotated `-> float` but can return None.
#
# crawler/rec/floorplan.py
#   - extract_total_sqm: unit alternation changes from (sqm|sq.m|sq m|m) to
#     (sq ?m|sq. ?m); a bare "m" is no longer accepted as a unit.
#   NOTE(review): the `.` in `sq. ?m` is unescaped, so it matches any character.
#   NOTE(review): this patch text was recovered from a whitespace-collapsed copy.
#   Blank context lines and Python indentation were re-derived from the hunk
#   line counts, and the removed sqmregex line was rejoined with a single space;
#   confirm against the pre-image before applying.
#
diff --git a/crawler/1_dump_listings.py b/crawler/1_dump_listings.py
index cc2aa53..1f364b7 100644
--- a/crawler/1_dump_listings.py
+++ b/crawler/1_dump_listings.py
@@ -3,13 +3,13 @@ import pathlib
 import json
 from data_access import Listing
 
-d = listing_query(1, 3, 3, 15, 0, 800000)
+d = listing_query(1, 3, 3, 15, 0, 800000, max_days_since_added=7)
 folder = pathlib.Path("data/rs/")
 
 for i in range(1, 10000):
     try:
         print(f"page {i}")
-        d = listing_query(i, 3, 3, 15, 0, 800000, max_days_since_added=3)
+        d = listing_query(i, 3, 3, 15, 0, 800000, max_days_since_added=1)
     except:
         break
 
diff --git a/crawler/9_recalculate_regex_squaremeter.py b/crawler/9_recalculate_regex_squaremeter.py
new file mode 100644
index 0000000..d226df5
--- /dev/null
+++ b/crawler/9_recalculate_regex_squaremeter.py
@@ -0,0 +1,15 @@
+# recalculate regex from sqm from already previously ocr'ed text
+import json
+from rec.floorplan import extract_total_sqm
+from tqdm import tqdm
+from data_access import Listing
+
+for listing in tqdm(list(Listing.get_all_listings())):
+    with open(listing.path_floorplan_ocr_json()) as f:
+        floorplans = json.load(f)
+
+    for floorplan in floorplans:
+        floorplan['estimated_sqm'] = extract_total_sqm(floorplan['text'])
+
+    with open(listing.path_floorplan_ocr_json(), 'w') as f:
+        floorplans = json.dump(floorplans, f)
diff --git a/crawler/data_access.py b/crawler/data_access.py
index 5d809e5..4467fd7 100644
--- a/crawler/data_access.py
+++ b/crawler/data_access.py
@@ -154,7 +154,9 @@ class Listing():
 
     @property
     def price_per_sqm(self) -> float:
-        if self.sqm_ocr is None:
+        if self.sqm_ocr == 0:
+            print(self.identifier)
+        if self.sqm_ocr is None or self.sqm_ocr == 0:
             return None
         return self.price / self.sqm_ocr
 
diff --git a/crawler/rec/floorplan.py b/crawler/rec/floorplan.py
index 17f9a98..fb9703b 100644
--- a/crawler/rec/floorplan.py
+++ b/crawler/rec/floorplan.py
@@ -17,7 +17,7 @@ def inference(image_path):
 
 
 def extract_total_sqm(deplot_input_str):
-    sqmregex = r'(\d+\.\d*) ?(sqm|sq.m|sq m|m)'
+    sqmregex = r'(\d+\.\d*) ?(sq ?m|sq. ?m)'
     matches = re.findall(sqmregex, deplot_input_str.lower())
     if len(matches) == 0:
         return None