crawling for 3 and refactoring to allow incremental crawls

This commit is contained in:
Kadir 2024-03-11 14:43:53 +00:00
parent de2639f9c3
commit 36258d877f
5 changed files with 310 additions and 167 deletions

View file

@ -10,6 +10,7 @@ _DATA_DIR = pathlib.Path('data/rs/')
class Listing():
identifier: int
_cached: Dict = None
@staticmethod
def get_all_listings() -> List['Listing']:
@ -23,7 +24,9 @@ class Listing():
return identifiers
def path_listing(self) -> pathlib.Path:
    """Return this listing's data directory, creating it if it is missing.

    The directory is ``_DATA_DIR / str(self.identifier)``. It is created
    (including parents) on every call so callers can write into it
    without checking for its existence first.
    """
    listing_dir = _DATA_DIR / str(self.identifier)
    # exist_ok=True keeps repeated calls from raising once the directory exists.
    listing_dir.mkdir(parents=True, exist_ok=True)
    return listing_dir
def path_listing_json(self) -> pathlib.Path:
    """Return the path of ``listing.json`` inside the listing directory."""
    listing_dir = self.path_listing()
    return listing_dir.joinpath('listing.json')
@ -40,15 +43,16 @@ class Listing():
def path_pic_folder(self) -> pathlib.Path:
    """Return the ``pics`` sub-folder of the listing directory."""
    base = self.path_listing()
    pics = base / 'pics'
    return pics
def path_pic_file(self, order, name) -> pathlib.Path:
    """Return the path for one picture, creating the pics folder if needed.

    Args:
        order: Value placed before the underscore in the file name.
        name: Value placed after the underscore in the file name.

    Returns:
        ``<pics folder>/<order>_<name>``. Only the folder is created; the
        file itself is not touched.
    """
    # Resolve the folder once and reuse it for both mkdir and the result.
    pics_dir = self.path_pic_folder()
    pics_dir.mkdir(parents=True, exist_ok=True)
    return pics_dir / f'{order}_{name}'
def path_floorplan_folder(self) -> pathlib.Path:
    """Return the ``floorplans`` sub-folder of the listing directory."""
    return self.path_listing().joinpath('floorplans')
def path_floorplan_file(self, order, name) -> pathlib.Path:
    """Return the path for one floorplan image, creating its folder if needed.

    Args:
        order: Value placed before the underscore in the file name.
        name: Value placed after the underscore in the file name.

    Returns:
        ``<floorplans folder>/<order>_<name>``. Only the folder is created;
        the file itself is not touched.
    """
    # Floorplans live in their own folder, not in the pics folder.
    floorplan_dir = self.path_floorplan_folder()
    floorplan_dir.mkdir(parents=True, exist_ok=True)
    return floorplan_dir / f'{order}_{name}'
def list_floorplans(self):
images = list(self.path_floorplan_folder().glob('*'))
@ -80,7 +84,10 @@ class Listing():
max_sqm = max([o['estimated_sqm'] for o in objs if o is None]) # filter out Nones
return max_sqm
def calculate_sqm_ocr(self):
def calculate_sqm_ocr(self, recalculate=True):
if not recalculate and self.path_floorplan_ocr_json().exists():
return
objs = []
for floorplan_path in self.list_floorplans():
estimated_sqm, model_output = floorplan.calculate_ocr(floorplan_path)
@ -129,12 +136,17 @@ class Listing():
return None
return self.price / self.sqm_ocr
@property
def bedrooms(self) -> int:
    """Bedroom count read from ``detailobject['property']['bedrooms']``."""
    property_details = self.detailobject['property']
    return property_details['bedrooms']
def dict_nicely(self):
    """Return the listing's summary attributes as a plain dict.

    Keys, in order: ``sqm_ocr``, ``price``, ``price_per_sqm``, ``url``,
    ``bedrooms``; each value is the attribute of the same name on ``self``.
    """
    field_names = ('sqm_ocr', 'price', 'price_per_sqm', 'url', 'bedrooms')
    summary = {}
    for field in field_names:
        summary[field] = getattr(self, field)
    return summary