From 835494d29f5c584ed8a3c0a5a7806a975154beca Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Wed, 7 May 2025 21:25:40 +0000 Subject: [PATCH] reformat most things --- crawler/5_routing.py | 13 ++++-- crawler/91_recalculate_floorplan.py | 2 +- crawler/data_access.py | 68 +++++++++++++++++------------ crawler/rec/db.py | 1 - crawler/rec/floorplan.py | 8 +++- crawler/rec/query.py | 8 ++-- crawler/rec/routing.py | 35 ++++++++++----- 7 files changed, 85 insertions(+), 50 deletions(-) diff --git a/crawler/5_routing.py b/crawler/5_routing.py index 16ee395..a9931c7 100644 --- a/crawler/5_routing.py +++ b/crawler/5_routing.py @@ -16,19 +16,24 @@ for listing in listings: log.info(f"Removed-Skip: Skipping {listing.identifier} is already removed.") continue if miles > 7: - log.info(f"Miles-Skip: Skipping {listing.identifier} as it is {miles} miles away") + log.info( + f"Miles-Skip: Skipping {listing.identifier} as it is {miles} miles away" + ) continue if listing.path_routing_json().exists(): - log.info(f"Path-Skip: Skipping {listing.identifier} as path routing already exists") + log.info( + f"Path-Skip: Skipping {listing.identifier} as path routing already exists" + ) continue if listing.sqm_ocr is None or listing.sqm_ocr < 30 or listing.sqm_ocr > 200: - log.info(f"Floorplan-Skip: Skipping {listing.identifier} as sqm_ocr is {listing.sqm_ocr}") + log.info( + f"Floorplan-Skip: Skipping {listing.identifier} as sqm_ocr is {listing.sqm_ocr}" + ) continue filtered_listings.append(listing) print(f"Filtered listings from {len(listings)} to {len(filtered_listings)}") - for listing in tqdm(filtered_listings): lat, long = BROCK_STREET_LAT_LONG listing.calculate_route(lat, long, recalculate=False) diff --git a/crawler/91_recalculate_floorplan.py b/crawler/91_recalculate_floorplan.py index 71432b1..28ad776 100644 --- a/crawler/91_recalculate_floorplan.py +++ b/crawler/91_recalculate_floorplan.py @@ -10,4 +10,4 @@ for listing in listings: recalculate_listings.append(listing) for listing in tqdm(recalculate_listings): - listing.calculate_sqm_ocr(recalculate=True) \ No newline at end of file + listing.calculate_sqm_ocr(recalculate=True) diff --git a/crawler/data_access.py b/crawler/data_access.py index 11b8bd3..52e4157 100644 --- a/crawler/data_access.py +++ b/crawler/data_access.py @@ -6,7 +6,6 @@ from rec import floorplan, routing import re import datetime - _DATA_DIR = pathlib.Path("data/rs/") @@ -59,10 +58,10 @@ class Listing: def path_floorplan_file(self, order, name) -> pathlib.Path: self.path_floorplan_folder().mkdir(parents=True, exist_ok=True) return self.path_floorplan_folder() / f"{order}_{name}" - + def path_last_seen_listing(self) -> pathlib.Path: return self.path_listing() / "last_seen.json" - + def dump_listing(self, d: dict): with open(self.path_listing_json(), "w") as f: json.dump(d, f) @@ -121,7 +120,7 @@ class Listing: "estimated_sqm": estimated_sqm, "text": model_output, } - ) + ) with open(self.path_floorplan_ocr_json(), "w") as f: json.dump(objs, f) @@ -168,7 +167,7 @@ class Listing: if self._cached is None: with open(self.path_listing_json()) as f: return json.load(f) - + @property def detailobject(self): if self._cached is None: @@ -179,7 +178,7 @@ class Listing: @property def price(self) -> float: return self.detailobject["property"]["price"] - + @property def tenure_type(self) -> str: return self.detailobject["property"]["tenureType"] @@ -211,7 +210,7 @@ class Listing: if len(matches): return float(matches[0]) return None - + @property def updateDaysAgo(self) -> int: ts = self.detailobject["property"]["updateDate"] / 1000 @@ -223,7 +222,7 @@ class Listing: def last_seen(self) -> int: if not self.path_last_seen_listing().exists(): return None - + with open(self.path_last_seen_listing(), 'r') as f: datetime_str = json.load(f) dt = datetime.datetime.fromisoformat(datetime_str) @@ -249,11 +248,11 @@ class Listing: except: print(self.identifier) return False - + @property def isRemoved(self) -> bool: return not self.detailobject["property"]["visible"] - + @property def status(self) -> str: if self.isRemoved: @@ -263,25 +262,36 @@ class Listing: def dict_nicely(self): return { - "identifier": self.identifier, - "sqm_ocr": self.sqm_ocr, - "price": self.price, - "price_per_sqm": self.price_per_sqm, - "url": self.url, - "bedrooms": self.bedrooms, - "travel_time_fastest": None - if len(self.travel_time) == 0 - else self.travel_time[0], - "travel_time_second": None - if len(self.travel_time) < 2 - else self.travel_time[1], - "lease_left": self.leaseLeft, - "service_charge": self.serviceCharge, - "development": self.development, - "tenure_type": self.tenure_type, - "updated_days": self.updateDaysAgo, - "status": self.status, - "last_seen": self.last_seen, + "identifier": + self.identifier, + "sqm_ocr": + self.sqm_ocr, + "price": + self.price, + "price_per_sqm": + self.price_per_sqm, + "url": + self.url, + "bedrooms": + self.bedrooms, + "travel_time_fastest": + None if len(self.travel_time) == 0 else self.travel_time[0], + "travel_time_second": + None if len(self.travel_time) < 2 else self.travel_time[1], + "lease_left": + self.leaseLeft, + "service_charge": + self.serviceCharge, + "development": + self.development, + "tenure_type": + self.tenure_type, + "updated_days": + self.updateDaysAgo, + "status": + self.status, + "last_seen": + self.last_seen, } diff --git a/crawler/rec/db.py b/crawler/rec/db.py index 66f7362..045e90e 100644 --- a/crawler/rec/db.py +++ b/crawler/rec/db.py @@ -6,7 +6,6 @@ from sqlalchemy.orm import declarative_base engine = create_engine("sqlite:///sqlite.db", echo=True) session = Session(engine) - Base = declarative_base() diff --git a/crawler/rec/floorplan.py b/crawler/rec/floorplan.py index 8bce80c..0544d83 100644 --- a/crawler/rec/floorplan.py +++ b/crawler/rec/floorplan.py @@ -34,12 +34,16 @@ def calculate_model(image_path): estimated_sqm = extract_total_sqm(output) return estimated_sqm, output, predictions_tensor + def improve_img_for_ocr(img: Image): img2 = np.array(img.convert('L')) cv2.resize(img2, None, fx=1.2, fy=1.2, interpolation=cv2.INTER_CUBIC) - thresh = cv2.adaptiveThreshold(img2,255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,11,2) + thresh = cv2.adaptiveThreshold( + img2, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2 + ) return Image.fromarray(thresh) + def calculate_ocr(image_path): img = Image.open(image_path) text = pytesseract.image_to_string(img) @@ -51,5 +55,5 @@ def calculate_ocr(image_path): with open("recalculating.log", "a") as f: f.write(f"before: {estimated_sqm} after: {estimated_sqm2} - {image_path}\n") return estimated_sqm2, text2 - + return estimated_sqm, text diff --git a/crawler/rec/query.py b/crawler/rec/query.py index fb67f98..cb7bbc3 100644 --- a/crawler/rec/query.py +++ b/crawler/rec/query.py @@ -2,8 +2,7 @@ import enum from typing import List import requests -# from rec.db import RightmoveListing - +from rec.db import RightmoveListing import urllib3 urllib3.disable_warnings() @@ -36,7 +35,10 @@ def detail_query(detail_id: int): url = f"https://api.rightmove.co.uk/api/property/{detail_id}" response = requests.get(url, params=params, headers=headers, verify=False) if response.status_code != 200: - raise Exception(f"id: {detail_id}. Status Code: {response.status_code}. Failed due to: {response.text}") + raise Exception( + f"""id: {detail_id}. Status Code: {response.status_code}.""" + f"""Failed due to: {response.text}""" + ) return response.json() diff --git a/crawler/rec/routing.py b/crawler/rec/routing.py index 22fb388..413c26c 100644 --- a/crawler/rec/routing.py +++ b/crawler/rec/routing.py @@ -16,18 +16,34 @@ def transit_route( monday9am = nextMonday() header = { - "X-Goog-Api-Key": API_KEY, - "Content-Type": "application/json", - "X-Goog-FieldMask": "routes.distanceMeters,routes.duration,routes.staticDuration,routes.legs.steps.distanceMeters,routes.legs.steps.staticDuration,routes.legs.steps.travelMode", + "X-Goog-Api-Key": + API_KEY, + "Content-Type": + "application/json", + "X-Goog-FieldMask": + "routes.distanceMeters,routes.duration,routes.staticDuration,routes.legs.steps.distanceMeters,routes.legs.steps.staticDuration,routes.legs.steps.travelMode", } body = { - "origin": { - "location": {"latLng": {"latitude": origin_lat, "longitude": origin_lon}} - }, - "destination": { - "location": {"latLng": {"latitude": dest_lat, "longitude": dest_lon}} - }, + "origin": + { + "location": + { + "latLng": { + "latitude": origin_lat, + "longitude": origin_lon + } + } + }, + "destination": + { + "location": { + "latLng": { + "latitude": dest_lat, + "longitude": dest_lon + } + } + }, "travelMode": "TRANSIT", # "2023-10-15T15:01:23.045123456Z" "departureTime": monday9am.strftime("%Y-%m-%dT%H:%M:%S.%fZ"), @@ -97,7 +113,6 @@ if __name__ == "__main__": extract_time(d) - # if __name__ == "__main__": # origin = 51.5635664310333, -0.1107173751570373 # home # dest = 51.50475678313417, 0.04915321000190009 # london city airport