add routing, incremental crawling, travel time, lease and development

This commit is contained in:
Kadir 2024-03-13 16:24:57 +00:00
parent d4f87bed76
commit 335adc0856
5 changed files with 2652 additions and 70 deletions

File diff suppressed because it is too large Load diff

View file

@ -9,7 +9,7 @@ folder = pathlib.Path("data/rs/")
for i in range(1, 10000):
try:
print(f"page {i}")
d = listing_query(i, 3, 3, 15, 0, 800000)
d = listing_query(i, 3, 3, 15, 0, 800000, max_days_since_added=3)
except:
break

View file

@ -6,4 +6,4 @@ listings = Listing.get_all_listings()
for listing in tqdm(listings):
tqdm.write(str(listing.identifier))
# listing.calculate_sqm_model() # using google/deplot model. Too slow, rather use tesseract
listing.calculate_sqm_ocr()
listing.calculate_sqm_ocr(recalculate=False)

View file

@ -2,7 +2,8 @@ from dataclasses import dataclass
import json
import pathlib
from typing import List, Dict
from rec import floorplan
from rec import floorplan, routing
import re
_DATA_DIR = pathlib.Path('data/rs/')
@ -34,6 +35,9 @@ class Listing():
def path_detail_json(self) -> pathlib.Path:
    """Return the path of ``detail.json`` inside this listing's directory."""
    listing_dir = self.path_listing()
    return listing_dir / 'detail.json'
def path_routing_json(self) -> pathlib.Path:
    """Return the path of ``routing.json`` inside this listing's directory."""
    listing_dir = self.path_listing()
    return listing_dir / 'routing.json'
def path_floorplan_model_json(self) -> pathlib.Path:
    """Return the path of ``floorplan_model.json`` inside this listing's directory."""
    listing_dir = self.path_listing()
    return listing_dir / 'floorplan_model.json'
@ -115,6 +119,24 @@ class Listing():
max_sqm = max(sqms)
return max_sqm
def calculate_route(self, dest_lat: float, dest_lon: float, recalculate=False):
    """Fetch a transit route to a destination and cache it as ``routing.json``.

    The route is requested from ``routing.transit_route`` using this
    listing's ``latitude``/``longitude`` as the origin. If the cache file
    already exists the call is a no-op unless ``recalculate`` is true.

    Args:
        dest_lat: Latitude of the destination.
        dest_lon: Longitude of the destination.
        recalculate: When true, query again and overwrite an existing cache
            file.

    Returns:
        None. The result is only written to disk.
    """
    cache_path = self.path_routing_json()
    if cache_path.exists() and not recalculate:
        return
    result = routing.transit_route(self.latitude, self.longitude, dest_lat, dest_lon)
    # Serialise before opening the file: opening with 'w' truncates it, so a
    # serialisation error after that point would leave an empty/partial
    # routing.json that later calls would mistake for a valid cached result.
    payload = json.dumps(result)
    with open(cache_path, 'w') as f:
        f.write(payload)
@property
def travel_time(self) -> List:
    """Return the times extracted from the cached routing result.

    Reads ``routing.json`` and passes the decoded JSON to
    ``routing.extract_time``. Returns an empty list when no routing file
    has been written for this listing yet.
    """
    if self.path_routing_json().exists():
        with open(self.path_routing_json()) as handle:
            stored_route = json.load(handle)
        return routing.extract_time(stored_route)
    return []
@property
def url(self):
    """Return the rightmove.co.uk property page URL built from the identifier."""
    listing_id = self.identifier
    return f'https://www.rightmove.co.uk/properties/{listing_id}'
@ -140,6 +162,29 @@ class Listing():
def bedrooms(self) -> int:
    """Return the ``bedrooms`` value from the listing's detail data."""
    property_data = self.detailobject['property']
    return property_data['bedrooms']
@property
def latitude(self) -> float:
    """Return the ``latitude`` value from the listing's detail data."""
    property_data = self.detailobject['property']
    return property_data['latitude']
@property
def longitude(self) -> float:
    """Return the ``longitude`` value from the listing's detail data."""
    property_data = self.detailobject['property']
    return property_data['longitude']
@property
def leaseLeft(self) -> "float | None":
    """Return the first number found in the listing's ``lengthOfLease`` entry.

    Scans ``property.tenureInfo.content`` in the detail data for entries
    whose ``type`` is ``'lengthOfLease'`` and parses the first decimal
    number out of the entry's ``value``.
    NOTE(review): the number is presumably a count of years — confirm
    against the upstream data.

    Returns:
        The parsed number as a float, or ``None`` when there is no tenure
        info, no lease-length entry, or no number in its value.
    """
    # `or {}` / `or []` also cover keys that are present but null, which
    # `.get(key, default)` alone would pass through as None.
    tenure_info = self.detailobject['property'].get('tenureInfo') or {}
    for entry in tenure_info.get('content') or []:
        if entry.get('type') != 'lengthOfLease':
            continue
        # Tolerate a missing or non-string value instead of raising.
        match = re.search(r'\d+\.?\d*', str(entry.get('value') or ''))
        if match:
            return float(match.group())
    return None
@property
def development(self) -> bool:
    """Return the ``development`` flag from the detail data (aka new home)."""
    property_data = self.detailobject['property']
    return property_data['development']
def dict_nicely(self):
return {
'sqm_ocr': self.sqm_ocr,
@ -147,6 +192,10 @@ class Listing():
'price_per_sqm': self.price_per_sqm,
'url': self.url,
'bedrooms': self.bedrooms,
'travel_time_fastest': self.travel_time[0],
'travel_time_second': None if len(self.travel_time) < 2 else self.travel_time[1],
'lease_left': self.leaseLeft,
'development': self.development,
}

File diff suppressed because it is too large Load diff