wrongmove/crawler/data_access.py

206 lines
6.7 KiB
Python

import json
import pathlib
import re
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional

from rec import floorplan, routing
# Root directory of the crawled data, relative to the current working directory.
# Each listing lives in a sub-directory named after its identifier, and
# Listing.get_all_listings() discovers listings via '*/listing.json' below it.
_DATA_DIR = pathlib.Path('data/rs/')
@dataclass()
class Listing():
    """A crawled property listing whose files live in ``_DATA_DIR/<identifier>/``.

    The object itself only carries the identifier; everything else (detail,
    floorplan estimates, routing results) is read from / written to JSON files
    inside the listing directory.
    """

    identifier: int
    # Lazily loaded content of detail.json (see ``detailobject``). Excluded from
    # repr/equality so two listings with the same identifier compare equal
    # whether or not their detail has been loaded, and repr stays short.
    _cached: Optional[Dict[str, Any]] = field(default=None, repr=False, compare=False)

    # ------------------------------------------------------------------
    # small private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _read_json(path: pathlib.Path):
        """Load and return the JSON document stored at *path*."""
        with open(path) as f:
            return json.load(f)

    @staticmethod
    def _write_json(path: pathlib.Path, obj) -> None:
        """Serialize *obj* as JSON into *path*, overwriting any existing file."""
        with open(path, 'w') as f:
            json.dump(obj, f)

    @staticmethod
    def _max_estimated_sqm(objs) -> Optional[float]:
        """Return the largest non-None ``'estimated_sqm'`` in *objs*, or None if there is none."""
        sqms = [o['estimated_sqm'] for o in objs if o['estimated_sqm'] is not None]
        return max(sqms) if sqms else None

    # ------------------------------------------------------------------
    # discovery
    # ------------------------------------------------------------------
    @staticmethod
    def get_all_listings() -> List['Listing']:
        """Return one Listing per ``*/listing.json`` under ``_DATA_DIR``, ordered by file path."""
        listings = []
        for listing_path in sorted(_DATA_DIR.glob('*/listing.json')):
            d = Listing._read_json(listing_path)
            listings.append(Listing(d['identifier']))
        return listings

    # ------------------------------------------------------------------
    # file-system layout
    # ------------------------------------------------------------------
    def path_listing(self) -> pathlib.Path:
        """Return this listing's directory, creating it if it does not exist yet."""
        p = _DATA_DIR / str(self.identifier)
        p.mkdir(parents=True, exist_ok=True)
        return p

    def path_listing_json(self) -> pathlib.Path:
        """Return the path of ``listing.json`` inside the listing directory."""
        return self.path_listing() / 'listing.json'

    def path_detail_json(self) -> pathlib.Path:
        """Return the path of ``detail.json`` inside the listing directory."""
        return self.path_listing() / 'detail.json'

    def path_routing_json(self) -> pathlib.Path:
        """Return the path of ``routing.json`` inside the listing directory."""
        return self.path_listing() / 'routing.json'

    def path_floorplan_model_json(self) -> pathlib.Path:
        """Return the path of ``floorplan_model.json`` inside the listing directory."""
        return self.path_listing() / 'floorplan_model.json'

    def path_floorplan_ocr_json(self) -> pathlib.Path:
        """Return the path of ``floorplan_ocr.json`` inside the listing directory."""
        return self.path_listing() / 'floorplan_ocr.json'

    def path_pic_folder(self) -> pathlib.Path:
        """Return the ``pics`` folder path (the folder itself is not created here)."""
        return self.path_listing() / 'pics'

    def path_pic_file(self, order, name) -> pathlib.Path:
        """Return ``pics/<order>_<name>``, creating the ``pics`` folder if needed."""
        folder = self.path_pic_folder()
        folder.mkdir(parents=True, exist_ok=True)
        return folder / f'{order}_{name}'

    def path_floorplan_folder(self) -> pathlib.Path:
        """Return the ``floorplans`` folder path (the folder itself is not created here)."""
        return self.path_listing() / 'floorplans'

    def path_floorplan_file(self, order, name) -> pathlib.Path:
        """Return ``floorplans/<order>_<name>``, creating the ``floorplans`` folder if needed."""
        folder = self.path_floorplan_folder()
        folder.mkdir(parents=True, exist_ok=True)
        return folder / f'{order}_{name}'

    def list_floorplans(self) -> List[pathlib.Path]:
        """Return every entry of the ``floorplans`` folder, sorted by path.

        Returns an empty list when the folder does not exist.
        """
        folder = self.path_floorplan_folder()
        if not folder.is_dir():
            return []
        # todo add check if return is image
        # Sorted so that the stored JSON and any output are deterministic.
        return sorted(folder.glob('*'))

    # ------------------------------------------------------------------
    # floor area estimation
    # ------------------------------------------------------------------
    def calculate_sqm_model(self) -> None:
        """Run ``floorplan.calculate_model`` on every floorplan and store the results.

        Writes a list with one entry per floorplan to ``floorplan_model.json``.
        """
        objs = []
        for floorplan_path in self.list_floorplans():
            estimated_sqm, model_output, predictions = floorplan.calculate_model(floorplan_path)
            objs.append({
                'floorplan_path': str(floorplan_path),
                'estimated_sqm': estimated_sqm,
                'model_output': model_output,
                'no_predictions': len(predictions)  # cant serialize the predictions itself since its a tensor
            })
        self._write_json(self.path_floorplan_model_json(), objs)

    @property
    def sqm_model(self, recalculate=True) -> Optional[float]:
        """Largest model-estimated floor area over all floorplans, or None if there is none.

        NOTE(review): because this is a property, ``recalculate`` can never be
        passed by a caller and its default (True) always applies, i.e. the model
        is re-run on every access. ``sqm_ocr`` defaults to False - confirm which
        behaviour is intended here.
        """
        if recalculate or not self.path_floorplan_model_json().exists():
            self.calculate_sqm_model()
        objs = self._read_json(self.path_floorplan_model_json())
        return self._max_estimated_sqm(objs)

    def calculate_sqm_ocr(self, recalculate=True) -> None:
        """Run ``floorplan.calculate_ocr`` on every floorplan and store the results.

        Writes a list with one entry per floorplan to ``floorplan_ocr.json``.
        Does nothing when ``recalculate`` is false and that file already exists.
        """
        if not recalculate and self.path_floorplan_ocr_json().exists():
            return
        objs = []
        for floorplan_path in self.list_floorplans():
            estimated_sqm, model_output = floorplan.calculate_ocr(floorplan_path)
            objs.append({
                'floorplan_path': str(floorplan_path),
                'estimated_sqm': estimated_sqm,
                'text': model_output,
            })
        self._write_json(self.path_floorplan_ocr_json(), objs)

    @property
    def sqm_ocr(self, recalculate=False) -> Optional[float]:
        """Largest OCR-estimated floor area over all floorplans, or None if there is none.

        The OCR pass is only run when ``floorplan_ocr.json`` does not exist yet
        (``recalculate`` cannot be passed through property access, so its
        default of False always applies).
        """
        if recalculate or not self.path_floorplan_ocr_json().exists():
            self.calculate_sqm_ocr()
        objs = self._read_json(self.path_floorplan_ocr_json())
        return self._max_estimated_sqm(objs)

    # ------------------------------------------------------------------
    # routing
    # ------------------------------------------------------------------
    def calculate_route(self, dest_lat: float, dest_lon: float, recalculate=False) -> None:
        """Store the result of ``routing.transit_route`` from this listing to the destination.

        The result is written to ``routing.json``. Does nothing when that file
        already exists and ``recalculate`` is false.
        """
        if self.path_routing_json().exists() and not recalculate:
            return
        result = routing.transit_route(self.latitude, self.longitude, dest_lat, dest_lon)
        self._write_json(self.path_routing_json(), result)

    @property
    def travel_time(self) -> List:
        """Return ``routing.extract_time`` applied to the stored routing result.

        Returns an empty list when ``routing.json`` does not exist.
        """
        if not self.path_routing_json().exists():
            return []
        return routing.extract_time(self._read_json(self.path_routing_json()))

    # ------------------------------------------------------------------
    # values taken from detail.json
    # ------------------------------------------------------------------
    @property
    def url(self) -> str:
        """Public URL of the listing, built from its identifier."""
        return f'https://www.rightmove.co.uk/properties/{self.identifier}'

    @property
    def detailobject(self):
        """Parsed content of ``detail.json``, loaded on first access and then cached."""
        if self._cached is None:
            self._cached = self._read_json(self.path_detail_json())
        return self._cached

    @property
    def price(self) -> float:
        """The ``property.price`` value of the detail document."""
        return self.detailobject['property']['price']

    @property
    def price_per_sqm(self) -> Optional[float]:
        """Price divided by ``sqm_ocr``, or None when the area is unknown or zero."""
        # Read the property once: every access hits the disk (and may run OCR).
        sqm = self.sqm_ocr
        if sqm is None or sqm == 0:
            return None
        return self.price / sqm

    @property
    def bedrooms(self) -> int:
        """The ``property.bedrooms`` value of the detail document."""
        return self.detailobject['property']['bedrooms']

    @property
    def latitude(self) -> float:
        """The ``property.latitude`` value of the detail document."""
        return self.detailobject['property']['latitude']

    @property
    def longitude(self) -> float:
        """The ``property.longitude`` value of the detail document."""
        return self.detailobject['property']['longitude']

    @property
    def leaseLeft(self) -> Optional[float]:
        """First number found in a ``lengthOfLease`` tenure entry, or None.

        Scans ``property.tenureInfo.content``; entries of other types, and
        ``lengthOfLease`` entries without any number in their value, are skipped.
        Presumably the number is the remaining lease in years - confirm against
        the crawled data.
        """
        tenure_info = self.detailobject['property'].get('tenureInfo') or {}
        for entry in tenure_info.get('content') or []:
            if entry.get('type') != 'lengthOfLease':
                continue
            matches = re.findall(r'(\d+\.?\d*)', str(entry.get('value', '')))
            if matches:
                return float(matches[0])
        return None

    @property
    def development(self) -> bool:
        """The ``property.development`` value of the detail document."""
        # aka new home
        return self.detailobject['property']['development']

    def dict_nicely(self) -> Dict[str, Any]:
        """Return a flat summary dict of this listing.

        ``travel_time_fastest`` / ``travel_time_second`` are the first and
        second entries of ``travel_time`` and are None when not available.
        """
        # Read once: each access of the property re-reads routing.json.
        times = self.travel_time
        return {
            'identifier': self.identifier,
            'sqm_ocr': self.sqm_ocr,
            'price': self.price,
            'price_per_sqm': self.price_per_sqm,
            'url': self.url,
            'bedrooms': self.bedrooms,
            'travel_time_fastest': times[0] if len(times) > 0 else None,
            'travel_time_second': times[1] if len(times) > 1 else None,
            'lease_left': self.leaseLeft,
            'development': self.development,
        }
if __name__ == '__main__':
    # Smoke test: show the floorplan files of the first listing found on disk.
    listings = Listing.get_all_listings()
    if listings:
        print(listings[0].list_floorplans())
    else:
        # Avoid an IndexError when nothing has been crawled yet.
        print(f'No listings found under {_DATA_DIR}')