wrongmove/crawler/data_access.py
Kadir 508aa02812 Real crawling scripts and floorplan detection
1. get all listings
2. get all detail jsons
3. get all images
4. get all floorplans
5. detecting floorplans

Also updating dependencies for huggingface etc.
2024-03-10 18:49:39 +00:00

86 lines
No EOL
2.8 KiB
Python

from dataclasses import dataclass
import json
import pathlib
from typing import List
from rec import floorplan
# Root folder of the crawled data; each listing lives in ``_DATA_DIR/<identifier>/``.
_DATA_DIR = pathlib.Path('data/rs/')


@dataclass()
class Listing():
    """Handle to one crawled listing and the files stored for it on disk.

    Every ``path_*`` method only builds a path below
    ``_DATA_DIR / str(identifier)``; none of them touches the filesystem.
    """

    identifier: int

    @staticmethod
    def get_all_listings() -> List['Listing']:
        """Return a ``Listing`` for every ``*/listing.json`` below ``_DATA_DIR``.

        The result follows the sorted order of the ``listing.json`` paths.
        The identifier is read from the ``'identifier'`` key of each file,
        not from the folder name.
        """
        listings = []
        for listing_path in sorted(_DATA_DIR.glob('*/listing.json')):
            with open(listing_path) as f:
                d = json.load(f)
            listings.append(Listing(d['identifier']))
        return listings

    def path_listing(self) -> pathlib.Path:
        """Return the folder that holds all files of this listing."""
        return _DATA_DIR / str(self.identifier)

    def path_listing_json(self) -> pathlib.Path:
        """Return the path of this listing's ``listing.json``."""
        return self.path_listing() / 'listing.json'

    def path_detail_json(self) -> pathlib.Path:
        """Return the path of this listing's ``detail.json``."""
        return self.path_listing() / 'detail.json'

    def path_floorplan_json(self) -> pathlib.Path:
        """Return the path of ``floorplan.json`` (written by ``calculate_sqm``)."""
        return self.path_listing() / 'floorplan.json'

    def path_pic_folder(self) -> pathlib.Path:
        """Return the ``pics`` folder of this listing."""
        return self.path_listing() / 'pics'

    def path_pic_file(self, order, name) -> pathlib.Path:
        """Return the path ``pics/<order>_<name>`` for a single picture.

        This method was previously a second ``path_pic_folder`` definition,
        which shadowed the zero-argument one and then called itself without
        arguments, so neither form could be called successfully.
        """
        return self.path_pic_folder() / f'{order}_{name}'

    def path_floorplan_folder(self) -> pathlib.Path:
        """Return the ``floorplans`` folder of this listing."""
        return self.path_listing() / 'floorplans'

    def path_floorplan_file(self, order, name) -> pathlib.Path:
        """Return the path ``floorplans/<order>_<name>`` for a single floorplan."""
        # Must live in the floorplans folder so that list_floorplans() finds it.
        return self.path_floorplan_folder() / f'{order}_{name}'

    def list_floorplans(self) -> List[pathlib.Path]:
        """Return every entry of the floorplans folder, sorted by path."""
        # todo add check if return is image
        return sorted(self.path_floorplan_folder().glob('*'))

    @staticmethod
    def _max_sqm(objs):
        """Return the largest non-``None`` ``'estimated_sqm'`` in *objs*.

        Returns ``None`` when *objs* is empty or no entry has an estimate.
        """
        estimates = (
            o['estimated_sqm'] for o in objs if o['estimated_sqm'] is not None
        )
        return max(estimates, default=None)

    def calculate_sqm(self):
        """Run ``floorplan.calculate`` on every floorplan and store the results.

        One record per floorplan is written to ``floorplan.json``.

        Returns:
            The largest estimated square-metre value, or ``None`` when there
            is no floorplan or no floorplan produced an estimate.
        """
        objs = []
        for floorplan_path in self.list_floorplans():
            estimated_sqm, model_output, predictions = floorplan.calculate(floorplan_path)
            objs.append({
                # pathlib.Path is not JSON serializable, so store it as text.
                'floorplan_path': str(floorplan_path),
                'estimated_sqm': estimated_sqm,
                # NOTE(review): assumes model_output is JSON serializable -- confirm.
                'model_output': model_output,
                # cant serialize the predictions itself since its a tensor
                'no_predictions': len(predictions),
            })
        # Serialize before opening the file so that a serialization error
        # cannot leave a truncated floorplan.json behind.
        payload = json.dumps(objs)
        with open(self.path_floorplan_json(), 'w') as f:
            f.write(payload)
        return self._max_sqm(objs)

    @property
    def sqm(self, recalculate=True):
        """Largest estimated square-metre value stored in ``floorplan.json``.

        If ``floorplan.json`` does not exist yet it is created first via
        ``calculate_sqm``. Returns ``None`` when the file holds no estimate.

        A property getter is only ever called with ``self``, so
        ``recalculate`` always keeps its default of ``True``.
        """
        if recalculate and not self.path_floorplan_json().exists():
            self.calculate_sqm()
        with open(self.path_floorplan_json()) as f:
            objs = json.load(f)
        return self._max_sqm(objs)
if __name__ == '__main__':
    # Manual smoke test: print the floorplan files of the first listing on disk.
    listings = Listing.get_all_listings()
    if not listings:
        # Without this guard an empty data folder surfaces as a bare IndexError.
        raise SystemExit(f'No listings found under {_DATA_DIR}')
    print(listings[0].list_floorplans())