add routing, incremental crawling, travel time, lease and development
This commit is contained in:
parent
d4f87bed76
commit
335adc0856
5 changed files with 2652 additions and 70 deletions
File diff suppressed because it is too large
Load diff
|
|
@ -9,7 +9,7 @@ folder = pathlib.Path("data/rs/")
|
|||
for i in range(1, 10000):
|
||||
try:
|
||||
print(f"page {i}")
|
||||
d = listing_query(i, 3, 3, 15, 0, 800000)
|
||||
d = listing_query(i, 3, 3, 15, 0, 800000, max_days_since_added=3)
|
||||
except:
|
||||
break
|
||||
|
||||
|
|
|
|||
|
|
@ -6,4 +6,4 @@ listings = Listing.get_all_listings()
|
|||
for listing in tqdm(listings):
|
||||
tqdm.write(str(listing.identifier))
|
||||
# listing.calculate_sqm_model() # using google/deplot model. Too slow, rather use tesseract
|
||||
listing.calculate_sqm_ocr()
|
||||
listing.calculate_sqm_ocr(recalculate=False)
|
||||
|
|
|
|||
|
|
@ -2,7 +2,8 @@ from dataclasses import dataclass
|
|||
import json
|
||||
import pathlib
|
||||
from typing import List, Dict
|
||||
from rec import floorplan
|
||||
from rec import floorplan, routing
|
||||
import re
|
||||
|
||||
_DATA_DIR = pathlib.Path('data/rs/')
|
||||
|
||||
|
|
@ -34,6 +35,9 @@ class Listing():
|
|||
def path_detail_json(self) -> pathlib.Path:
    """Return the `detail.json` path under the path given by `path_listing()`."""
    listing_dir = self.path_listing()
    return listing_dir / 'detail.json'
|
||||
|
||||
def path_routing_json(self) -> pathlib.Path:
    """Return the `routing.json` path under the path given by `path_listing()`."""
    listing_dir = self.path_listing()
    return listing_dir / 'routing.json'
|
||||
|
||||
def path_floorplan_model_json(self) -> pathlib.Path:
    """Return the `floorplan_model.json` path under the path given by `path_listing()`."""
    listing_dir = self.path_listing()
    return listing_dir / 'floorplan_model.json'
|
||||
|
||||
|
|
@ -115,6 +119,24 @@ class Listing():
|
|||
max_sqm = max(sqms)
|
||||
return max_sqm
|
||||
|
||||
def calculate_route(self, dest_lat: float, dest_lon: float, recalculate=False):
    """Store the result of `routing.transit_route` for this listing in `routing.json`.

    The route is requested from this listing's `latitude`/`longitude` to
    (`dest_lat`, `dest_lon`). If the routing file already exists the call is
    a no-op, unless `recalculate` is true, in which case the file is
    overwritten.
    """
    target = self.path_routing_json()
    already_cached = target.exists()
    if already_cached and not recalculate:
        return None

    origin_lat = self.latitude
    origin_lon = self.longitude
    route = routing.transit_route(origin_lat, origin_lon, dest_lat, dest_lon)
    with target.open('w') as handle:
        json.dump(route, handle)
|
||||
|
||||
@property
def travel_time(self) -> List:
    """Return `routing.extract_time` applied to the stored routing JSON.

    Returns an empty list when no `routing.json` file exists for this listing.
    """
    cache_file = self.path_routing_json()
    if cache_file.exists():
        with cache_file.open() as handle:
            payload = json.load(handle)
        return routing.extract_time(payload)
    return []
|
||||
|
||||
|
||||
@property
def url(self):
    """Return the rightmove.co.uk property page URL built from `identifier`."""
    listing_id = self.identifier
    return f'https://www.rightmove.co.uk/properties/{listing_id}'
|
||||
|
|
@ -140,6 +162,29 @@ class Listing():
|
|||
def bedrooms(self) -> int:
    """Return the `bedrooms` field of the `property` record in `detailobject`."""
    property_details = self.detailobject['property']
    return property_details['bedrooms']
|
||||
|
||||
@property
def latitude(self) -> float:
    """Return the `latitude` field of the `property` record in `detailobject`."""
    property_details = self.detailobject['property']
    return property_details['latitude']
|
||||
|
||||
@property
def longitude(self) -> float:
    """Return the `longitude` field of the `property` record in `detailobject`."""
    property_details = self.detailobject['property']
    return property_details['longitude']
|
||||
|
||||
@property
def leaseLeft(self) -> "float | None":
    """Return the lease length parsed from the listing's tenure info.

    Scans `detailobject['property']['tenureInfo']['content']` for entries
    whose `type` is `'lengthOfLease'` and returns the first number found in
    such an entry's `value`, as a float. Entries without a number are
    skipped. Returns None when the tenure info is missing or null, or when
    no lease entry contains a number.
    """
    # `tenureInfo` / `content` may be absent or explicitly null; `or` covers
    # both, whereas a `.get(key, default)` only covers the absent case.
    tenure_info = self.detailobject['property'].get('tenureInfo') or {}
    for entry in tenure_info.get('content') or []:
        if entry.get('type') != 'lengthOfLease':
            continue
        # First integer or decimal number appearing in the value text.
        match = re.search(r'\d+\.?\d*', str(entry.get('value', '')))
        if match:
            return float(match.group())
    return None
|
||||
|
||||
@property
def development(self) -> bool:
    """Return the `development` field of the `property` record (aka new home)."""
    property_details = self.detailobject['property']
    return property_details['development']
|
||||
|
||||
def dict_nicely(self):
|
||||
return {
|
||||
'sqm_ocr': self.sqm_ocr,
|
||||
|
|
@ -147,6 +192,10 @@ class Listing():
|
|||
'price_per_sqm': self.price_per_sqm,
|
||||
'url': self.url,
|
||||
'bedrooms': self.bedrooms,
|
||||
'travel_time_fastest': self.travel_time[0],
|
||||
'travel_time_second': None if len(self.travel_time) < 2 else self.travel_time[1],
|
||||
'lease_left': self.leaseLeft,
|
||||
'development': self.development,
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue