fixing floorplan detection and adding recalculation method
This commit is contained in:
parent
335adc0856
commit
4dea766a12
4 changed files with 21 additions and 4 deletions
|
|
@ -3,13 +3,13 @@ import pathlib
|
||||||
import json
|
import json
|
||||||
from data_access import Listing
|
from data_access import Listing
|
||||||
|
|
||||||
d = listing_query(1, 3, 3, 15, 0, 800000)
|
d = listing_query(1, 3, 3, 15, 0, 800000, max_days_since_added=7)
|
||||||
folder = pathlib.Path("data/rs/")
|
folder = pathlib.Path("data/rs/")
|
||||||
|
|
||||||
for i in range(1, 10000):
|
for i in range(1, 10000):
|
||||||
try:
|
try:
|
||||||
print(f"page {i}")
|
print(f"page {i}")
|
||||||
d = listing_query(i, 3, 3, 15, 0, 800000, max_days_since_added=3)
|
d = listing_query(i, 3, 3, 15, 0, 800000, max_days_since_added=1)
|
||||||
except:
|
except:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|
|
||||||
15
crawler/9_recalculate_regex_squaremeter.py
Normal file
15
crawler/9_recalculate_regex_squaremeter.py
Normal file
|
|
@ -0,0 +1,15 @@
|
||||||
|
# recalculate the regex-based sqm estimate from previously OCR'ed floorplan text
|
||||||
|
import json
|
||||||
|
from rec.floorplan import extract_total_sqm
|
||||||
|
from tqdm import tqdm
|
||||||
|
from data_access import Listing
|
||||||
|
|
||||||
|
for listing in tqdm(list(Listing.get_all_listings())):
|
||||||
|
with open(listing.path_floorplan_ocr_json()) as f:
|
||||||
|
floorplans = json.load(f)
|
||||||
|
|
||||||
|
for floorplan in floorplans:
|
||||||
|
floorplan['estimated_sqm'] = extract_total_sqm(floorplan['text'])
|
||||||
|
|
||||||
|
with open(listing.path_floorplan_ocr_json(), 'w') as f:
|
||||||
|
floorplans = json.dump(floorplans, f)
|
||||||
|
|
@ -154,7 +154,9 @@ class Listing():
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def price_per_sqm(self) -> float:
|
def price_per_sqm(self) -> float:
|
||||||
if self.sqm_ocr is None:
|
if self.sqm_ocr == 0:
|
||||||
|
print(self.identifier)
|
||||||
|
if self.sqm_ocr is None or self.sqm_ocr == 0:
|
||||||
return None
|
return None
|
||||||
return self.price / self.sqm_ocr
|
return self.price / self.sqm_ocr
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -17,7 +17,7 @@ def inference(image_path):
|
||||||
|
|
||||||
|
|
||||||
def extract_total_sqm(deplot_input_str):
|
def extract_total_sqm(deplot_input_str):
|
||||||
sqmregex = r'(\d+\.\d*) ?(sqm|sq.m|sq m|m)'
|
sqmregex = r'(\d+\.\d*) ?(sq ?m|sq. ?m)'
|
||||||
matches = re.findall(sqmregex, deplot_input_str.lower())
|
matches = re.findall(sqmregex, deplot_input_str.lower())
|
||||||
if len(matches) == 0:
|
if len(matches) == 0:
|
||||||
return None
|
return None
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue