add routing, incremental crawling, travel time, lease and development

This commit is contained in:
Kadir 2024-03-13 16:24:57 +00:00
parent d4f87bed76
commit 335adc0856
5 changed files with 2652 additions and 70 deletions

File diff suppressed because it is too large Load diff

View file

@ -9,7 +9,7 @@ folder = pathlib.Path("data/rs/")
for i in range(1, 10000): for i in range(1, 10000):
try: try:
print(f"page {i}") print(f"page {i}")
d = listing_query(i, 3, 3, 15, 0, 800000) d = listing_query(i, 3, 3, 15, 0, 800000, max_days_since_added=3)
except: except:
break break

View file

@ -6,4 +6,4 @@ listings = Listing.get_all_listings()
for listing in tqdm(listings): for listing in tqdm(listings):
tqdm.write(str(listing.identifier)) tqdm.write(str(listing.identifier))
# listing.calculate_sqm_model() # using google/deplot model. Too slow, rather use tesseract # listing.calculate_sqm_model() # using google/deplot model. Too slow, rather use tesseract
listing.calculate_sqm_ocr() listing.calculate_sqm_ocr(recalculate=False)

View file

@ -2,7 +2,8 @@ from dataclasses import dataclass
import json import json
import pathlib import pathlib
from typing import List, Dict from typing import List, Dict
from rec import floorplan from rec import floorplan, routing
import re
_DATA_DIR = pathlib.Path('data/rs/') _DATA_DIR = pathlib.Path('data/rs/')
@ -34,6 +35,9 @@ class Listing():
def path_detail_json(self) -> pathlib.Path: def path_detail_json(self) -> pathlib.Path:
return self.path_listing() / 'detail.json' return self.path_listing() / 'detail.json'
def path_routing_json(self) -> pathlib.Path:
    """Return the path of this listing's cached routing result.

    The file is called ``routing.json`` and sits directly inside the
    directory returned by ``path_listing()``. The file is not created or
    checked for existence here.
    """
    listing_dir = self.path_listing()
    return listing_dir / 'routing.json'
def path_floorplan_model_json(self) -> pathlib.Path: def path_floorplan_model_json(self) -> pathlib.Path:
return self.path_listing() / 'floorplan_model.json' return self.path_listing() / 'floorplan_model.json'
@ -115,6 +119,24 @@ class Listing():
max_sqm = max(sqms) max_sqm = max(sqms)
return max_sqm return max_sqm
def calculate_route(self, dest_lat: float, dest_lon: float, recalculate=False):
    """Compute a route from this listing to a destination and cache it.

    Calls ``routing.transit_route`` with the listing's latitude/longitude
    and the given destination coordinates, then dumps the result as JSON
    to ``path_routing_json()``.

    Args:
        dest_lat: Destination latitude.
        dest_lon: Destination longitude.
        recalculate: When false (the default), an already existing
            routing file is kept and no new route is requested.

    NOTE(review): the cache check only looks at whether the file exists;
    it does not compare the destination, so a file written for a different
    destination is reused unless ``recalculate=True`` is passed.
    """
    cache_file = self.path_routing_json()
    if not recalculate and cache_file.exists():
        # A cached result is already on disk; nothing to do.
        return

    route = routing.transit_route(self.latitude, self.longitude, dest_lat, dest_lon)
    with open(cache_file, 'w') as out:
        json.dump(route, out)
@property
def travel_time(self) -> List:
    """Travel times extracted from the cached routing result.

    Loads the JSON stored at ``path_routing_json()`` and passes it to
    ``routing.extract_time``, returning whatever that helper produces.
    When no routing file exists the result is an empty list, so callers
    must not index into it unconditionally.

    NOTE(review): the element type and ordering of the returned list are
    decided by ``routing.extract_time`` and are not visible here.
    """
    cache_file = self.path_routing_json()
    if cache_file.exists():
        with open(cache_file) as src:
            payload = json.load(src)
        return routing.extract_time(payload)
    # No route has been calculated for this listing yet.
    return []
@property @property
def url(self): def url(self):
return f'https://www.rightmove.co.uk/properties/{self.identifier}' return f'https://www.rightmove.co.uk/properties/{self.identifier}'
@ -140,6 +162,29 @@ class Listing():
def bedrooms(self) -> int: def bedrooms(self) -> int:
return self.detailobject['property']['bedrooms'] return self.detailobject['property']['bedrooms']
@property
def latitude(self) -> float:
    """Latitude of the listing, read from ``detailobject['property']['latitude']``.

    Raises ``KeyError`` if either key is absent from the detail data.
    """
    property_info = self.detailobject['property']
    return property_info['latitude']
@property
def longitude(self) -> float:
    """Longitude of the listing, read from ``detailobject['property']['longitude']``.

    Raises ``KeyError`` if either key is absent from the detail data.
    """
    property_info = self.detailobject['property']
    return property_info['longitude']
@property
def leaseLeft(self) -> 'float | None':
    """Lease length parsed from the listing's tenure information.

    Scans ``detailobject['property']['tenureInfo']['content']`` for entries
    whose ``type`` is ``'lengthOfLease'`` and returns the first number found
    in such an entry's ``value`` text, as a float.

    Returns:
        The parsed number, or ``None`` when there is no tenure information,
        no ``lengthOfLease`` entry, or no number in any such entry.

    NOTE(review): the unit of the number is not established by this code
    (presumably years) - confirm against the detail payload.
    """
    # ``dict.get(key, default)`` only falls back when the key is missing,
    # not when it is present with a null value, so normalise with ``or``.
    tenure = self.detailobject['property'].get('tenureInfo') or {}
    for entry in tenure.get('content') or []:
        if entry.get('type') != 'lengthOfLease':
            continue
        # str() tolerates a missing or non-string value instead of raising.
        found = re.search(r'\d+\.?\d*', str(entry.get('value', '')))
        if found:
            return float(found.group())
    return None
@property
def development(self) -> bool:
    """Value of ``detailobject['property']['development']``.

    According to the original author's note this flag marks a new home.
    Raises ``KeyError`` if either key is absent from the detail data.
    """
    property_info = self.detailobject['property']
    return property_info['development']
def dict_nicely(self): def dict_nicely(self):
return { return {
'sqm_ocr': self.sqm_ocr, 'sqm_ocr': self.sqm_ocr,
@ -147,6 +192,10 @@ class Listing():
'price_per_sqm': self.price_per_sqm, 'price_per_sqm': self.price_per_sqm,
'url': self.url, 'url': self.url,
'bedrooms': self.bedrooms, 'bedrooms': self.bedrooms,
'travel_time_fastest': self.travel_time[0],
'travel_time_second': None if len(self.travel_time) < 2 else self.travel_time[1],
'lease_left': self.leaseLeft,
'development': self.development,
} }

File diff suppressed because it is too large Load diff