2025-05-17 22:58:35 +00:00
|
|
|
import asyncio
|
2024-03-10 18:49:39 +00:00
|
|
|
from dataclasses import dataclass
|
|
|
|
|
import json
|
|
|
|
|
import pathlib
|
2025-05-18 12:27:26 +00:00
|
|
|
from typing import Any, List, Dict
|
2024-03-13 16:24:57 +00:00
|
|
|
from rec import floorplan, routing
|
|
|
|
|
import re
|
2024-08-11 19:36:25 +01:00
|
|
|
import datetime
|
|
|
|
|
|
2024-03-10 18:49:39 +00:00
|
|
|
|
|
|
|
|
@dataclass()
|
2024-03-25 20:48:48 +00:00
|
|
|
class Listing:
|
2024-03-10 18:49:39 +00:00
|
|
|
identifier: int
|
2025-05-18 12:27:26 +00:00
|
|
|
_cached: Dict | None = None
|
2025-05-14 20:19:08 +00:00
|
|
|
data_dir: pathlib.Path = pathlib.Path("data/rs/")
|
2025-05-17 20:13:28 +00:00
|
|
|
ALL_COLUMNS = [
|
|
|
|
|
"identifier",
|
|
|
|
|
"sqm_ocr",
|
|
|
|
|
"price",
|
|
|
|
|
"price_per_sqm",
|
|
|
|
|
"url",
|
|
|
|
|
"bedrooms",
|
|
|
|
|
"travel_time_fastest",
|
|
|
|
|
"travel_time_second",
|
|
|
|
|
"lease_left",
|
|
|
|
|
"service_charge",
|
|
|
|
|
"development",
|
|
|
|
|
"tenure_type",
|
|
|
|
|
"updated_days",
|
|
|
|
|
"status",
|
|
|
|
|
"last_seen",
|
|
|
|
|
]
|
2024-03-25 20:48:48 +00:00
|
|
|
|
2025-05-14 20:32:37 +00:00
|
|
|
@staticmethod
|
2025-05-17 20:13:28 +00:00
|
|
|
def get_all_listings(
|
|
|
|
|
listing_paths: list[str],
|
|
|
|
|
seen_in_the_last_n_days: int = 30,
|
|
|
|
|
) -> List["Listing"]:
|
2024-03-10 18:49:39 +00:00
|
|
|
identifiers = []
|
|
|
|
|
for listing_path in listing_paths:
|
|
|
|
|
with open(listing_path) as f:
|
|
|
|
|
d = json.load(f)
|
2025-05-14 21:01:58 +00:00
|
|
|
|
|
|
|
|
# data_dir is the first directory before the listing_path
|
|
|
|
|
data_dir = pathlib.Path(listing_path)
|
|
|
|
|
while str(d['identifier']) in str(data_dir.resolve().absolute()):
|
|
|
|
|
data_dir = data_dir.parent
|
2025-05-17 20:13:28 +00:00
|
|
|
listing = Listing(d["identifier"], data_dir=data_dir)
|
2025-05-18 12:27:26 +00:00
|
|
|
if (listing.last_seen is not None
|
|
|
|
|
and listing.last_seen < seen_in_the_last_n_days):
|
2025-05-17 20:13:28 +00:00
|
|
|
identifiers.append(listing)
|
2024-03-25 20:48:48 +00:00
|
|
|
|
2024-03-10 18:49:39 +00:00
|
|
|
return identifiers
|
|
|
|
|
|
|
|
|
|
def path_listing(self) -> pathlib.Path:
|
2025-05-14 20:19:08 +00:00
|
|
|
p = self.data_dir / str(self.identifier)
|
2024-03-11 14:43:53 +00:00
|
|
|
p.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
return p
|
2024-03-25 20:48:48 +00:00
|
|
|
|
2024-03-10 18:49:39 +00:00
|
|
|
def path_listing_json(self) -> pathlib.Path:
|
2024-03-25 20:48:48 +00:00
|
|
|
return self.path_listing() / "listing.json"
|
|
|
|
|
|
2024-03-10 18:49:39 +00:00
|
|
|
def path_detail_json(self) -> pathlib.Path:
|
2024-03-25 20:48:48 +00:00
|
|
|
return self.path_listing() / "detail.json"
|
|
|
|
|
|
2024-03-13 16:24:57 +00:00
|
|
|
def path_routing_json(self) -> pathlib.Path:
|
2024-03-25 20:48:48 +00:00
|
|
|
return self.path_listing() / "routing.json"
|
|
|
|
|
|
2024-03-10 22:32:34 +00:00
|
|
|
def path_floorplan_model_json(self) -> pathlib.Path:
|
2024-03-25 20:48:48 +00:00
|
|
|
return self.path_listing() / "floorplan_model.json"
|
|
|
|
|
|
2024-03-10 22:32:34 +00:00
|
|
|
def path_floorplan_ocr_json(self) -> pathlib.Path:
|
2024-03-25 20:48:48 +00:00
|
|
|
return self.path_listing() / "floorplan_ocr.json"
|
|
|
|
|
|
2024-03-10 18:49:39 +00:00
|
|
|
def path_pic_folder(self) -> pathlib.Path:
|
2024-03-25 20:48:48 +00:00
|
|
|
return self.path_listing() / "pics"
|
|
|
|
|
|
2024-03-11 14:43:53 +00:00
|
|
|
def path_pic_file(self, order, name) -> pathlib.Path:
|
|
|
|
|
self.path_pic_folder().mkdir(parents=True, exist_ok=True)
|
2024-03-25 20:48:48 +00:00
|
|
|
return self.path_pic_folder() / f"{order}_{name}"
|
|
|
|
|
|
2024-03-10 18:49:39 +00:00
|
|
|
def path_floorplan_folder(self) -> pathlib.Path:
|
2024-03-25 20:48:48 +00:00
|
|
|
return self.path_listing() / "floorplans"
|
|
|
|
|
|
2024-03-10 18:49:39 +00:00
|
|
|
def path_floorplan_file(self, order, name) -> pathlib.Path:
|
2024-03-11 14:43:53 +00:00
|
|
|
self.path_floorplan_folder().mkdir(parents=True, exist_ok=True)
|
2024-03-25 20:48:48 +00:00
|
|
|
return self.path_floorplan_folder() / f"{order}_{name}"
|
2025-05-07 21:25:40 +00:00
|
|
|
|
2025-01-26 21:39:51 +00:00
|
|
|
def path_last_seen_listing(self) -> pathlib.Path:
|
|
|
|
|
return self.path_listing() / "last_seen.json"
|
2025-05-07 21:25:40 +00:00
|
|
|
|
2025-01-26 21:39:51 +00:00
|
|
|
def dump_listing(self, d: dict):
|
|
|
|
|
with open(self.path_listing_json(), "w") as f:
|
|
|
|
|
json.dump(d, f)
|
|
|
|
|
with open(self.path_last_seen_listing(), "w") as f:
|
|
|
|
|
dt = datetime.datetime.now().isoformat()
|
|
|
|
|
json.dump(dt, f)
|
2024-03-25 20:48:48 +00:00
|
|
|
|
2024-03-10 18:49:39 +00:00
|
|
|
def list_floorplans(self):
|
2024-03-25 20:48:48 +00:00
|
|
|
images = list(self.path_floorplan_folder().glob("*"))
|
2024-03-10 18:49:39 +00:00
|
|
|
# todo add check if return is image
|
|
|
|
|
return images
|
2024-03-25 20:48:48 +00:00
|
|
|
|
2024-03-10 22:32:34 +00:00
|
|
|
def calculate_sqm_model(self):
|
2024-03-10 18:49:39 +00:00
|
|
|
objs = []
|
|
|
|
|
for floorplan_path in self.list_floorplans():
|
2024-03-25 20:48:48 +00:00
|
|
|
estimated_sqm, model_output, predictions = floorplan.calculate_model(
|
2025-05-18 12:27:26 +00:00
|
|
|
floorplan_path)
|
|
|
|
|
objs.append({
|
|
|
|
|
"floorplan_path": str(floorplan_path),
|
|
|
|
|
"estimated_sqm": estimated_sqm,
|
|
|
|
|
"model_output": model_output,
|
|
|
|
|
"no_predictions": len(
|
|
|
|
|
predictions
|
|
|
|
|
), # cant serialize the predictions itself since its a tensor
|
|
|
|
|
})
|
2024-03-25 20:48:48 +00:00
|
|
|
|
|
|
|
|
with open(self.path_floorplan_model_json(), "w") as f:
|
2024-03-10 18:49:39 +00:00
|
|
|
json.dump(objs, f)
|
2024-03-25 20:48:48 +00:00
|
|
|
|
2024-03-10 18:49:39 +00:00
|
|
|
@property
|
2024-03-10 22:32:34 +00:00
|
|
|
def sqm_model(self, recalculate=True):
|
2024-03-11 09:44:37 +00:00
|
|
|
if not self.path_floorplan_model_json().exists() or recalculate:
|
2024-03-10 22:32:34 +00:00
|
|
|
self.calculate_sqm_model()
|
2024-03-25 20:48:48 +00:00
|
|
|
|
2024-03-10 18:49:39 +00:00
|
|
|
with open(self.path_floorplan_json()) as f:
|
|
|
|
|
objs = json.load(f)
|
2024-03-25 20:48:48 +00:00
|
|
|
|
2025-05-18 12:27:26 +00:00
|
|
|
max_sqm = max([o["estimated_sqm"] for o in objs
|
|
|
|
|
if o is None]) # filter out Nones
|
2024-03-10 18:49:39 +00:00
|
|
|
return max_sqm
|
2024-03-25 20:48:48 +00:00
|
|
|
|
2025-05-17 22:58:35 +00:00
|
|
|
async def calculate_sqm_ocr(self, recalculate=True):
|
2024-03-11 14:43:53 +00:00
|
|
|
if not recalculate and self.path_floorplan_ocr_json().exists():
|
|
|
|
|
return
|
2024-03-25 20:48:48 +00:00
|
|
|
|
2024-03-10 22:32:34 +00:00
|
|
|
objs = []
|
|
|
|
|
for floorplan_path in self.list_floorplans():
|
2025-05-17 22:58:35 +00:00
|
|
|
estimated_sqm, model_output = await asyncio.to_thread(
|
2025-05-18 12:27:26 +00:00
|
|
|
floorplan.calculate_ocr, floorplan_path)
|
|
|
|
|
objs.append({
|
|
|
|
|
"floorplan_path": str(floorplan_path),
|
|
|
|
|
"estimated_sqm": estimated_sqm,
|
|
|
|
|
"text": model_output,
|
|
|
|
|
})
|
2024-03-25 20:48:48 +00:00
|
|
|
|
|
|
|
|
with open(self.path_floorplan_ocr_json(), "w") as f:
|
2024-03-10 22:32:34 +00:00
|
|
|
json.dump(objs, f)
|
2024-03-25 20:48:48 +00:00
|
|
|
|
2024-03-10 22:32:34 +00:00
|
|
|
@property
|
2024-03-11 09:44:37 +00:00
|
|
|
def sqm_ocr(self, recalculate=False):
|
|
|
|
|
if not self.path_floorplan_ocr_json().exists() or recalculate:
|
2024-03-10 22:32:34 +00:00
|
|
|
self.calculate_sqm_ocr()
|
2024-03-25 20:48:48 +00:00
|
|
|
|
2024-03-10 22:32:34 +00:00
|
|
|
with open(self.path_floorplan_ocr_json()) as f:
|
|
|
|
|
objs = json.load(f)
|
2024-03-25 20:48:48 +00:00
|
|
|
|
2025-05-18 12:27:26 +00:00
|
|
|
sqms = [
|
|
|
|
|
o["estimated_sqm"] for o in objs if o["estimated_sqm"] is not None
|
|
|
|
|
]
|
2024-03-11 09:44:37 +00:00
|
|
|
if len(sqms) == 0:
|
|
|
|
|
return None
|
|
|
|
|
max_sqm = max(sqms)
|
2024-03-10 22:32:34 +00:00
|
|
|
return max_sqm
|
2024-03-25 20:48:48 +00:00
|
|
|
|
2025-05-18 12:27:26 +00:00
|
|
|
def calculate_route(self,
|
|
|
|
|
dest_lat: float,
|
|
|
|
|
dest_lon: float,
|
|
|
|
|
recalculate=False):
|
2024-03-13 16:24:57 +00:00
|
|
|
if self.path_routing_json().exists() and not recalculate:
|
|
|
|
|
return
|
2024-03-25 20:48:48 +00:00
|
|
|
|
2025-05-18 12:27:26 +00:00
|
|
|
result = routing.transit_route(self.latitude, self.longitude, dest_lat,
|
|
|
|
|
dest_lon)
|
2024-03-25 20:48:48 +00:00
|
|
|
with open(self.path_routing_json(), "w") as f:
|
2024-03-13 16:24:57 +00:00
|
|
|
json.dump(result, f)
|
2024-03-25 20:48:48 +00:00
|
|
|
|
2024-03-13 16:24:57 +00:00
|
|
|
@property
|
|
|
|
|
def travel_time(self) -> List:
|
|
|
|
|
if not self.path_routing_json().exists():
|
|
|
|
|
return []
|
|
|
|
|
with open(self.path_routing_json()) as f:
|
|
|
|
|
d = json.load(f)
|
2024-03-25 20:48:48 +00:00
|
|
|
|
2024-03-13 16:24:57 +00:00
|
|
|
return routing.extract_time(d)
|
2024-03-25 20:48:48 +00:00
|
|
|
|
2024-03-11 09:44:37 +00:00
|
|
|
@property
|
|
|
|
|
def url(self):
|
2024-03-25 20:48:48 +00:00
|
|
|
return f"https://www.rightmove.co.uk/properties/{self.identifier}"
|
|
|
|
|
|
2025-02-16 03:02:21 +00:00
|
|
|
@property
|
|
|
|
|
def listingobject(self):
|
|
|
|
|
if self._cached is None:
|
|
|
|
|
with open(self.path_listing_json()) as f:
|
|
|
|
|
return json.load(f)
|
2025-05-07 21:25:40 +00:00
|
|
|
|
2024-03-11 09:44:37 +00:00
|
|
|
@property
|
2025-05-18 12:27:26 +00:00
|
|
|
def detailobject(self) -> dict[str, Any]:
|
2024-03-11 09:44:37 +00:00
|
|
|
if self._cached is None:
|
|
|
|
|
with open(self.path_detail_json()) as f:
|
|
|
|
|
self._cached = json.load(f)
|
2025-05-18 12:27:26 +00:00
|
|
|
return self._cached # type: ignore
|
2024-03-25 20:48:48 +00:00
|
|
|
|
2024-03-11 09:44:37 +00:00
|
|
|
@property
|
|
|
|
|
def price(self) -> float:
|
2024-03-25 20:48:48 +00:00
|
|
|
return self.detailobject["property"]["price"]
|
2025-05-07 21:25:40 +00:00
|
|
|
|
2024-05-06 18:54:55 +01:00
|
|
|
@property
|
|
|
|
|
def tenure_type(self) -> str:
|
|
|
|
|
return self.detailobject["property"]["tenureType"]
|
2024-03-25 20:48:48 +00:00
|
|
|
|
2024-03-11 09:44:37 +00:00
|
|
|
@property
|
|
|
|
|
def price_per_sqm(self) -> float:
|
2024-03-18 00:56:39 +00:00
|
|
|
if self.sqm_ocr is None or self.sqm_ocr == 0:
|
2025-05-18 12:27:26 +00:00
|
|
|
return -1
|
2024-03-11 09:44:37 +00:00
|
|
|
return self.price / self.sqm_ocr
|
2024-03-25 20:48:48 +00:00
|
|
|
|
2024-03-11 14:43:53 +00:00
|
|
|
@property
|
|
|
|
|
def bedrooms(self) -> int:
|
2024-03-25 20:48:48 +00:00
|
|
|
return self.detailobject["property"]["bedrooms"]
|
|
|
|
|
|
2024-03-13 16:24:57 +00:00
|
|
|
@property
|
|
|
|
|
def latitude(self) -> float:
|
2024-03-25 20:48:48 +00:00
|
|
|
return self.detailobject["property"]["latitude"]
|
|
|
|
|
|
2024-03-13 16:24:57 +00:00
|
|
|
@property
|
|
|
|
|
def longitude(self) -> float:
|
2024-03-25 20:48:48 +00:00
|
|
|
return self.detailobject["property"]["longitude"]
|
|
|
|
|
|
2024-03-13 16:24:57 +00:00
|
|
|
@property
|
2025-05-18 12:27:26 +00:00
|
|
|
def leaseLeft(self) -> float | None:
|
|
|
|
|
ds = self.detailobject["property"].get("tenureInfo",
|
|
|
|
|
{}).get("content", [])
|
2024-03-13 16:24:57 +00:00
|
|
|
for d in ds:
|
2024-03-25 20:48:48 +00:00
|
|
|
if d["type"] == "lengthOfLease":
|
|
|
|
|
matches = re.findall(r"(\d+\.?\d*)", d["value"])
|
2024-03-13 16:24:57 +00:00
|
|
|
if len(matches):
|
|
|
|
|
return float(matches[0])
|
|
|
|
|
return None
|
2025-05-07 21:25:40 +00:00
|
|
|
|
2024-08-11 19:36:25 +01:00
|
|
|
@property
|
|
|
|
|
def updateDaysAgo(self) -> int:
|
|
|
|
|
ts = self.detailobject["property"]["updateDate"] / 1000
|
|
|
|
|
now = datetime.datetime.now()
|
|
|
|
|
ds = datetime.datetime.fromtimestamp(ts)
|
|
|
|
|
return (now - ds).days
|
2024-03-25 20:48:48 +00:00
|
|
|
|
2025-01-26 21:39:51 +00:00
|
|
|
@property
|
2025-05-18 12:27:26 +00:00
|
|
|
def last_seen(self) -> int | None:
|
2025-01-26 21:39:51 +00:00
|
|
|
if not self.path_last_seen_listing().exists():
|
|
|
|
|
return None
|
2025-05-07 21:25:40 +00:00
|
|
|
|
2025-01-26 21:39:51 +00:00
|
|
|
with open(self.path_last_seen_listing(), 'r') as f:
|
|
|
|
|
datetime_str = json.load(f)
|
2025-02-14 21:21:50 +00:00
|
|
|
dt = datetime.datetime.fromisoformat(datetime_str)
|
|
|
|
|
return (datetime.datetime.now() - dt).days
|
2025-01-26 21:39:51 +00:00
|
|
|
|
2024-04-05 11:37:03 +01:00
|
|
|
@property
|
2025-05-18 12:27:26 +00:00
|
|
|
def serviceCharge(self) -> float | None:
|
|
|
|
|
ds = self.detailobject["property"].get("tenureInfo",
|
|
|
|
|
{}).get("content", [])
|
2024-04-05 11:37:03 +01:00
|
|
|
for d in ds:
|
|
|
|
|
if d["type"] == "annualServiceCharge":
|
|
|
|
|
matches = re.findall(r"([\d,.]+)", d["value"])
|
|
|
|
|
if len(matches):
|
|
|
|
|
# remove separators (e.g. 6,395.76)
|
|
|
|
|
match = matches[0].replace(",", "")
|
|
|
|
|
return float(match)
|
|
|
|
|
return None
|
|
|
|
|
|
2024-03-13 16:24:57 +00:00
|
|
|
@property
|
|
|
|
|
def development(self) -> bool:
|
|
|
|
|
# aka new home
|
2024-04-05 11:37:03 +01:00
|
|
|
try:
|
|
|
|
|
return self.detailobject["property"]["development"]
|
2025-05-18 12:27:26 +00:00
|
|
|
except Exception:
|
2024-04-05 11:37:03 +01:00
|
|
|
return False
|
2025-05-07 21:25:40 +00:00
|
|
|
|
2024-09-22 11:31:32 +01:00
|
|
|
@property
|
|
|
|
|
def isRemoved(self) -> bool:
|
2025-02-16 04:44:42 +00:00
|
|
|
return not self.detailobject["property"]["visible"]
|
2025-05-07 21:25:40 +00:00
|
|
|
|
2024-09-22 11:31:32 +01:00
|
|
|
@property
|
|
|
|
|
def status(self) -> str:
|
|
|
|
|
if self.isRemoved:
|
|
|
|
|
return 'removed'
|
|
|
|
|
status = self.detailobject["property"]["status"]
|
|
|
|
|
return status
|
2024-03-25 20:48:48 +00:00
|
|
|
|
2024-03-11 09:44:37 +00:00
|
|
|
def dict_nicely(self):
|
|
|
|
|
return {
|
2025-05-07 21:25:40 +00:00
|
|
|
"identifier":
|
2025-05-18 12:27:26 +00:00
|
|
|
self.identifier,
|
2025-05-07 21:25:40 +00:00
|
|
|
"sqm_ocr":
|
2025-05-18 12:27:26 +00:00
|
|
|
self.sqm_ocr,
|
2025-05-07 21:25:40 +00:00
|
|
|
"price":
|
2025-05-18 12:27:26 +00:00
|
|
|
self.price,
|
2025-05-07 21:25:40 +00:00
|
|
|
"price_per_sqm":
|
2025-05-18 12:27:26 +00:00
|
|
|
self.price_per_sqm,
|
2025-05-07 21:25:40 +00:00
|
|
|
"url":
|
2025-05-18 12:27:26 +00:00
|
|
|
self.url,
|
2025-05-07 21:25:40 +00:00
|
|
|
"bedrooms":
|
2025-05-18 12:27:26 +00:00
|
|
|
self.bedrooms,
|
2025-05-07 21:25:40 +00:00
|
|
|
"travel_time_fastest":
|
2025-05-18 12:27:26 +00:00
|
|
|
None if len(self.travel_time) == 0 else self.travel_time[0],
|
2025-05-07 21:25:40 +00:00
|
|
|
"travel_time_second":
|
2025-05-18 12:27:26 +00:00
|
|
|
None if len(self.travel_time) < 2 else self.travel_time[1],
|
2025-05-07 21:25:40 +00:00
|
|
|
"lease_left":
|
2025-05-18 12:27:26 +00:00
|
|
|
self.leaseLeft,
|
2025-05-07 21:25:40 +00:00
|
|
|
"service_charge":
|
2025-05-18 12:27:26 +00:00
|
|
|
self.serviceCharge,
|
2025-05-07 21:25:40 +00:00
|
|
|
"development":
|
2025-05-18 12:27:26 +00:00
|
|
|
self.development,
|
2025-05-07 21:25:40 +00:00
|
|
|
"tenure_type":
|
2025-05-18 12:27:26 +00:00
|
|
|
self.tenure_type,
|
2025-05-07 21:25:40 +00:00
|
|
|
"updated_days":
|
2025-05-18 12:27:26 +00:00
|
|
|
self.updateDaysAgo,
|
2025-05-07 21:25:40 +00:00
|
|
|
"status":
|
2025-05-18 12:27:26 +00:00
|
|
|
self.status,
|
2025-05-07 21:25:40 +00:00
|
|
|
"last_seen":
|
2025-05-18 12:27:26 +00:00
|
|
|
self.last_seen,
|
2024-03-11 09:44:37 +00:00
|
|
|
}
|