crawling for 3 and refactoring to allow incremental crawls
This commit is contained in:
parent
de2639f9c3
commit
36258d877f
5 changed files with 310 additions and 167 deletions
|
|
@@ -10,6 +10,7 @@ _DATA_DIR = pathlib.Path('data/rs/')
|
|||
class Listing():
|
||||
identifier: int
|
||||
_cached: Dict = None
|
||||
|
||||
|
||||
@staticmethod
|
||||
def get_all_listings() -> List['Listing']:
|
||||
|
|
@@ -23,7 +24,9 @@ class Listing():
|
|||
return identifiers
|
||||
|
||||
def path_listing(self) -> pathlib.Path:
|
||||
return _DATA_DIR / str(self.identifier)
|
||||
p = _DATA_DIR / str(self.identifier)
|
||||
p.mkdir(parents=True, exist_ok=True)
|
||||
return p
|
||||
|
||||
def path_listing_json(self) -> pathlib.Path:
|
||||
return self.path_listing() / 'listing.json'
|
||||
|
|
@@ -40,15 +43,16 @@ class Listing():
|
|||
def path_pic_folder(self) -> pathlib.Path:
|
||||
return self.path_listing() / 'pics'
|
||||
|
||||
def path_pic_folder(self, order, name) -> pathlib.Path:
|
||||
def path_pic_file(self, order, name) -> pathlib.Path:
|
||||
self.path_pic_folder().mkdir(parents=True, exist_ok=True)
|
||||
return self.path_pic_folder() / f'{order}_{name}'
|
||||
|
||||
|
||||
def path_floorplan_folder(self) -> pathlib.Path:
|
||||
return self.path_listing() / 'floorplans'
|
||||
|
||||
def path_floorplan_file(self, order, name) -> pathlib.Path:
|
||||
return self.path_pic_folder() / f'{order}_{name}'
|
||||
self.path_floorplan_folder().mkdir(parents=True, exist_ok=True)
|
||||
return self.path_floorplan_folder() / f'{order}_{name}'
|
||||
|
||||
def list_floorplans(self):
|
||||
images = list(self.path_floorplan_folder().glob('*'))
|
||||
|
|
@@ -80,7 +84,10 @@ class Listing():
|
|||
max_sqm = max([o['estimated_sqm'] for o in objs if o is None]) # filter out Nones
|
||||
return max_sqm
|
||||
|
||||
def calculate_sqm_ocr(self):
|
||||
def calculate_sqm_ocr(self, recalculate=True):
|
||||
if not recalculate and self.path_floorplan_ocr_json().exists():
|
||||
return
|
||||
|
||||
objs = []
|
||||
for floorplan_path in self.list_floorplans():
|
||||
estimated_sqm, model_output = floorplan.calculate_ocr(floorplan_path)
|
||||
|
|
@@ -129,12 +136,17 @@ class Listing():
|
|||
return None
|
||||
return self.price / self.sqm_ocr
|
||||
|
||||
@property
|
||||
def bedrooms(self) -> int:
|
||||
return self.detailobject['property']['bedrooms']
|
||||
|
||||
def dict_nicely(self):
|
||||
return {
|
||||
'sqm_ocr': self.sqm_ocr,
|
||||
'price': self.price,
|
||||
'price_per_sqm': self.price_per_sqm,
|
||||
'url': self.url,
|
||||
'bedrooms': self.bedrooms,
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue