From d777558b346fad4a010fb7252a284dcc7c4da2b3 Mon Sep 17 00:00:00 2001 From: Kadir Date: Mon, 25 Mar 2024 20:48:48 +0000 Subject: [PATCH] ruff format --- crawler/1_dump_listings.py | 10 +- crawler/2_dump_detail.py | 6 +- crawler/3_dump_images.py | 23 +- crawler/5_routing.py | 4 +- crawler/9_recalculate_regex_squaremeter.py | 4 +- crawler/data_access.py | 203 +++++++++--------- crawler/main_tmp.py | 34 +-- crawler/proof_of_concept/image.py | 8 +- crawler/proof_of_concept/listings.py | 88 ++++---- .../routing_distancematrix.py | 19 +- crawler/proof_of_concept/routing_routing.py | 129 ++++++----- crawler/proof_of_concept/single-query.py | 17 +- crawler/rec/floorplan.py | 13 +- crawler/rec/query.py | 36 +++- crawler/rec/routing.py | 172 +++++++-------- crawler/rec/utils.py | 8 +- crawler/testing.py | 5 +- 17 files changed, 411 insertions(+), 368 deletions(-) diff --git a/crawler/1_dump_listings.py b/crawler/1_dump_listings.py index 1f364b7..8d5517a 100644 --- a/crawler/1_dump_listings.py +++ b/crawler/1_dump_listings.py @@ -12,10 +12,10 @@ for i in range(1, 10000): d = listing_query(i, 3, 3, 15, 0, 800000, max_days_since_added=1) except: break - - for property in d['properties']: - identifier = property['identifier'] - + + for property in d["properties"]: + identifier = property["identifier"] + listing = Listing(identifier) - with open(listing.path_listing_json(), 'w') as f: + with open(listing.path_listing_json(), "w") as f: json.dump(property, f) diff --git a/crawler/2_dump_detail.py b/crawler/2_dump_detail.py index 1bd9247..13644bf 100644 --- a/crawler/2_dump_detail.py +++ b/crawler/2_dump_detail.py @@ -7,11 +7,11 @@ from data_access import Listing for listing in tqdm(Listing.get_all_listings()): if listing.path_detail_json().exists(): continue - + try: d = detail_query(listing.identifier) - with open(listing.path_detail_json(), 'w') as f: + with open(listing.path_detail_json(), "w") as f: json.dump(d, f) except: - print('Failed at: ', listing.identifier) + print("Failed at: ", listing.identifier) raise diff --git a/crawler/3_dump_images.py b/crawler/3_dump_images.py index 0d831e0..17244c0 100644 --- a/crawler/3_dump_images.py +++ b/crawler/3_dump_images.py @@ -6,26 +6,23 @@ from data_access import Listing for listing in tqdm(Listing.get_all_listings()): with open(listing.path_detail_json()) as f: detail = json.load(f) - - - for photo in detail['property']['photos']: - url = photo['maxSizeUrl'] - picname = url.split('/')[-1] - order = photo['order'] + + for photo in detail["property"]["photos"]: + url = photo["maxSizeUrl"] + picname = url.split("/")[-1] + order = photo["order"] p = listing.path_pic_file(order, picname) if p.exists(): continue tqdm.write(str(p)) urlretrieve(url, p) - - for photo in detail['property']['floorplans']: - url = photo['url'] - picname = url.split('/')[-1] - order = photo['order'] + + for photo in detail["property"]["floorplans"]: + url = photo["url"] + picname = url.split("/")[-1] + order = photo["order"] p = listing.path_floorplan_file(order, picname) if p.exists(): continue tqdm.write(str(p)) urlretrieve(url, p) - - \ No newline at end of file diff --git a/crawler/5_routing.py b/crawler/5_routing.py index 56fba6c..88263ca 100644 --- a/crawler/5_routing.py +++ b/crawler/5_routing.py @@ -8,6 +8,6 @@ for listing in tqdm(listings): lat, long = BROCK_STREET_LAT_LONG listing.calculate_route(lat, long, recalculate=False) traveltime = listing.travel_time[0] - duration_minutes = traveltime['duration'] / 60. - + duration_minutes = traveltime["duration"] / 60.0 + tqdm.write(f"{listing.identifier} {duration_minutes}") diff --git a/crawler/9_recalculate_regex_squaremeter.py b/crawler/9_recalculate_regex_squaremeter.py index d226df5..70d1c3b 100644 --- a/crawler/9_recalculate_regex_squaremeter.py +++ b/crawler/9_recalculate_regex_squaremeter.py @@ -9,7 +9,7 @@ for listing in tqdm(list(Listing.get_all_listings())): floorplans = json.load(f) for floorplan in floorplans: - floorplan['estimated_sqm'] = extract_total_sqm(floorplan['text']) + floorplan["estimated_sqm"] = extract_total_sqm(floorplan["text"]) - with open(listing.path_floorplan_ocr_json(), 'w') as f: + with open(listing.path_floorplan_ocr_json(), "w") as f: floorplans = json.dump(floorplans, f) diff --git a/crawler/data_access.py b/crawler/data_access.py index 7ed9be4..ad9d512 100644 --- a/crawler/data_access.py +++ b/crawler/data_access.py @@ -5,202 +5,213 @@ from typing import List, Dict from rec import floorplan, routing import re -_DATA_DIR = pathlib.Path('data/rs/') +_DATA_DIR = pathlib.Path("data/rs/") + @dataclass() -class Listing(): +class Listing: identifier: int _cached: Dict = None - + @staticmethod - def get_all_listings() -> List['Listing']: - listing_paths = sorted(list(_DATA_DIR.glob('*/listing.json'))) + def get_all_listings() -> List["Listing"]: + listing_paths = sorted(list(_DATA_DIR.glob("*/listing.json"))) identifiers = [] for listing_path in listing_paths: with open(listing_path) as f: d = json.load(f) - identifiers.append(Listing(d['identifier'])) - + identifiers.append(Listing(d["identifier"])) + return identifiers def path_listing(self) -> pathlib.Path: p = _DATA_DIR / str(self.identifier) p.mkdir(parents=True, exist_ok=True) return p - + def path_listing_json(self) -> pathlib.Path: - return self.path_listing() / 'listing.json' - + return self.path_listing() / "listing.json" + def path_detail_json(self) -> pathlib.Path: - return self.path_listing() / 'detail.json' - + return self.path_listing() / "detail.json" + def path_routing_json(self) -> pathlib.Path: - return self.path_listing() / 'routing.json' - + return self.path_listing() / "routing.json" + def path_floorplan_model_json(self) -> pathlib.Path: - return self.path_listing() / 'floorplan_model.json' - + return self.path_listing() / "floorplan_model.json" + def path_floorplan_ocr_json(self) -> pathlib.Path: - return self.path_listing() / 'floorplan_ocr.json' - + return self.path_listing() / "floorplan_ocr.json" + def path_pic_folder(self) -> pathlib.Path: - return self.path_listing() / 'pics' - + return self.path_listing() / "pics" + def path_pic_file(self, order, name) -> pathlib.Path: self.path_pic_folder().mkdir(parents=True, exist_ok=True) - return self.path_pic_folder() / f'{order}_{name}' - + return self.path_pic_folder() / f"{order}_{name}" + def path_floorplan_folder(self) -> pathlib.Path: - return self.path_listing() / 'floorplans' - + return self.path_listing() / "floorplans" + def path_floorplan_file(self, order, name) -> pathlib.Path: self.path_floorplan_folder().mkdir(parents=True, exist_ok=True) - return self.path_floorplan_folder() / f'{order}_{name}' - + return self.path_floorplan_folder() / f"{order}_{name}" + def list_floorplans(self): - images = list(self.path_floorplan_folder().glob('*')) + images = list(self.path_floorplan_folder().glob("*")) # todo add check if return is image return images - + def calculate_sqm_model(self): objs = [] for floorplan_path in self.list_floorplans(): - estimated_sqm, model_output, predictions = floorplan.calculate_model(floorplan_path) - objs.append({ - 'floorplan_path': str(floorplan_path), - 'estimated_sqm': estimated_sqm, - 'model_output': model_output, - 'no_predictions': len(predictions) # cant serialize the predictions itself since its a tensor - }) - - with open(self.path_floorplan_model_json(), 'w') as f: + estimated_sqm, model_output, predictions = floorplan.calculate_model( + floorplan_path + ) + objs.append( + { + "floorplan_path": str(floorplan_path), + "estimated_sqm": estimated_sqm, + "model_output": model_output, + "no_predictions": len( + predictions + ), # cant serialize the predictions itself since its a tensor + } + ) + + with open(self.path_floorplan_model_json(), "w") as f: json.dump(objs, f) - + @property def sqm_model(self, recalculate=True): if not self.path_floorplan_model_json().exists() or recalculate: self.calculate_sqm_model() - + with open(self.path_floorplan_json()) as f: objs = json.load(f) - - max_sqm = max([o['estimated_sqm'] for o in objs if o is None]) # filter out Nones + + max_sqm = max( + [o["estimated_sqm"] for o in objs if o is None] + ) # filter out Nones return max_sqm - + def calculate_sqm_ocr(self, recalculate=True): if not recalculate and self.path_floorplan_ocr_json().exists(): return - + objs = [] for floorplan_path in self.list_floorplans(): estimated_sqm, model_output = floorplan.calculate_ocr(floorplan_path) - objs.append({ - 'floorplan_path': str(floorplan_path), - 'estimated_sqm': estimated_sqm, - 'text': model_output, - }) - - with open(self.path_floorplan_ocr_json(), 'w') as f: + objs.append( + { + "floorplan_path": str(floorplan_path), + "estimated_sqm": estimated_sqm, + "text": model_output, + } + ) + + with open(self.path_floorplan_ocr_json(), "w") as f: json.dump(objs, f) - + @property def sqm_ocr(self, recalculate=False): if not self.path_floorplan_ocr_json().exists() or recalculate: self.calculate_sqm_ocr() - + with open(self.path_floorplan_ocr_json()) as f: objs = json.load(f) - - - sqms = [o['estimated_sqm'] for o in objs if o['estimated_sqm'] is not None] + + sqms = [o["estimated_sqm"] for o in objs if o["estimated_sqm"] is not None] if len(sqms) == 0: return None max_sqm = max(sqms) return max_sqm - + def calculate_route(self, dest_lat: float, dest_lon: float, recalculate=False): if self.path_routing_json().exists() and not recalculate: return - - result = routing.transit_route(self.latitude, self.longitude, dest_lat, dest_lon) - with open(self.path_routing_json(), 'w') as f: + + result = routing.transit_route( + self.latitude, self.longitude, dest_lat, dest_lon + ) + with open(self.path_routing_json(), "w") as f: json.dump(result, f) - + @property def travel_time(self) -> List: if not self.path_routing_json().exists(): return [] with open(self.path_routing_json()) as f: d = json.load(f) - + return routing.extract_time(d) - - + @property def url(self): - return f'https://www.rightmove.co.uk/properties/{self.identifier}' - + return f"https://www.rightmove.co.uk/properties/{self.identifier}" + @property def detailobject(self): if self._cached is None: with open(self.path_detail_json()) as f: self._cached = json.load(f) return self._cached - + @property def price(self) -> float: - return self.detailobject['property']['price'] - + return self.detailobject["property"]["price"] + @property def price_per_sqm(self) -> float: if self.sqm_ocr is None or self.sqm_ocr == 0: return None return self.price / self.sqm_ocr - + @property def bedrooms(self) -> int: - return self.detailobject['property']['bedrooms'] - + return self.detailobject["property"]["bedrooms"] + @property def latitude(self) -> float: - return self.detailobject['property']['latitude'] - + return self.detailobject["property"]["latitude"] + @property def longitude(self) -> float: - return self.detailobject['property']['longitude'] - + return self.detailobject["property"]["longitude"] + @property def leaseLeft(self) -> int: - ds = self.detailobject['property'].get('tenureInfo', {}).get('content', []) + ds = self.detailobject["property"].get("tenureInfo", {}).get("content", []) for d in ds: - if d['type'] == 'lengthOfLease': - matches = re.findall(r'(\d+\.?\d*)', d['value']) + if d["type"] == "lengthOfLease": + matches = re.findall(r"(\d+\.?\d*)", d["value"]) if len(matches): return float(matches[0]) return None - + @property def development(self) -> bool: # aka new home - return self.detailobject['property']['development'] - + return self.detailobject["property"]["development"] + def dict_nicely(self): return { - 'identifier': self.identifier, - 'sqm_ocr': self.sqm_ocr, - 'price': self.price, - 'price_per_sqm': self.price_per_sqm, - 'url': self.url, - 'bedrooms': self.bedrooms, - 'travel_time_fastest': self.travel_time[0], - 'travel_time_second': None if len(self.travel_time) < 2 else self.travel_time[1], - 'lease_left': self.leaseLeft, - 'development': self.development, + "identifier": self.identifier, + "sqm_ocr": self.sqm_ocr, + "price": self.price, + "price_per_sqm": self.price_per_sqm, + "url": self.url, + "bedrooms": self.bedrooms, + "travel_time_fastest": self.travel_time[0], + "travel_time_second": None + if len(self.travel_time) < 2 + else self.travel_time[1], + "lease_left": self.leaseLeft, + "development": self.development, } - - - - -if __name__ == '__main__': + + +if __name__ == "__main__": listings = Listing.get_all_listings() print(listings[0].list_floorplans()) diff --git a/crawler/main_tmp.py b/crawler/main_tmp.py index 4e967d9..0741966 100644 --- a/crawler/main_tmp.py +++ b/crawler/main_tmp.py @@ -1,32 +1,40 @@ def record(): from rec.query import listing_query, detail_query import json - + page = 1 listing = listing_query(page, 2, 2, 5, 200000, 500000) - with open(f'/Users/kadir/code/realestate/crawler/code/json/queries/listing{page}.json', 'w') as f: + with open( + f"/Users/kadir/code/realestate/crawler/code/json/queries/listing{page}.json", + "w", + ) as f: json.dump(listing, f) - for prop in listing['properties']: - identifier = prop['identifier'] + for prop in listing["properties"]: + identifier = prop["identifier"] resp = detail_query(identifier) # print(identifier, resp.status_code) - with open(f'/Users/kadir/code/realestate/crawler/code/json/queries/detail_{identifier}.json', 'w') as f: + with open( + f"/Users/kadir/code/realestate/crawler/code/json/queries/detail_{identifier}.json", + "w", + ) as f: json.dump(resp, f) - + + def process(): import json import pathlib - path = pathlib.Path('/Users/kadir/code/realestate/crawler/code/json/queries/') - detailjsons = list(path.glob('detail_*json')) + path = pathlib.Path("/Users/kadir/code/realestate/crawler/code/json/queries/") + + detailjsons = list(path.glob("detail_*json")) for file in detailjsons: - with open(file) as f: js = json.load(f) - - for floorplan in js['property']['floorplans']: - print(floorplan['url']) + + for floorplan in js["property"]["floorplans"]: + print(floorplan["url"]) + # record() -process() \ No newline at end of file +process() diff --git a/crawler/proof_of_concept/image.py b/crawler/proof_of_concept/image.py index 00ddbd1..ce8c4b5 100644 --- a/crawler/proof_of_concept/image.py +++ b/crawler/proof_of_concept/image.py @@ -1,13 +1,13 @@ import requests headers = { - 'Host': 'media.rightmove.co.uk', + "Host": "media.rightmove.co.uk", # 'Accept-Encoding': 'gzip, deflate, br', - 'User-Agent': 'okhttp/4.10.0', + "User-Agent": "okhttp/4.10.0", } response = requests.get( - 'https://media.rightmove.co.uk/47k/46001/138680705/46001_32532509_IMG_00_0000.jpeg', + "https://media.rightmove.co.uk/47k/46001/138680705/46001_32532509_IMG_00_0000.jpeg", headers=headers, verify=False, -) \ No newline at end of file +) diff --git a/crawler/proof_of_concept/listings.py b/crawler/proof_of_concept/listings.py index 60b47da..ab36150 100644 --- a/crawler/proof_of_concept/listings.py +++ b/crawler/proof_of_concept/listings.py @@ -1,58 +1,68 @@ import requests headers = { - 'Host': 'api.rightmove.co.uk', + "Host": "api.rightmove.co.uk", # 'Accept-Encoding': 'gzip, deflate, br', - 'User-Agent': 'okhttp/4.10.0', - 'Connection': 'close', + "User-Agent": "okhttp/4.10.0", + "Connection": "close", } params = { - 'locationIdentifier': 'POSTCODE^4228216', - 'channel': 'BUY', - 'page': '1', - 'numberOfPropertiesPerPage': '25', - 'radius': '3.0', - 'sortBy': 'distance', - 'includeUnavailableProperties': 'false', - 'propertyTypes': 'flat', - 'mustHave': 'newHome', # added manually later - 'dontShow': 'sharedOwnership,retirement', - 'minPrice': '150000', - 'maxPrice': '500000', - 'minBedrooms': '2', - 'maxBedrooms': '2', - 'apiApplication': 'ANDROID', - 'appVersion': '3.70.0', + "locationIdentifier": "POSTCODE^4228216", + "channel": "BUY", + "page": "1", + "numberOfPropertiesPerPage": "25", + "radius": "3.0", + "sortBy": "distance", + "includeUnavailableProperties": "false", + "propertyTypes": "flat", + "mustHave": "newHome", # added manually later + "dontShow": "sharedOwnership,retirement", + "minPrice": "150000", + "maxPrice": "500000", + "minBedrooms": "2", + "maxBedrooms": "2", + "apiApplication": "ANDROID", + "appVersion": "3.70.0", } -response = requests.get('https://api.rightmove.co.uk/api/property-listing', params=params, headers=headers, verify=False) +response = requests.get( + "https://api.rightmove.co.uk/api/property-listing", + params=params, + headers=headers, + verify=False, +) import requests headers = { - 'Host': 'api.rightmove.co.uk', + "Host": "api.rightmove.co.uk", # 'Accept-Encoding': 'gzip, deflate, br', - 'User-Agent': 'okhttp/4.10.0', - 'Connection': 'close', + "User-Agent": "okhttp/4.10.0", + "Connection": "close", } params = { - 'locationIdentifier': 'POSTCODE^4228216', - 'channel': 'BUY', - 'page': '2', - 'numberOfPropertiesPerPage': '25', - 'radius': '3.0', - 'sortBy': 'distance', - 'includeUnavailableProperties': 'false', - 'propertyTypes': 'flat', - 'dontShow': 'sharedOwnership,retirement', - 'minPrice': '150000', - 'maxPrice': '600000', - 'minBedrooms': '2', - 'maxBedrooms': '2', - 'apiApplication': 'ANDROID', - 'appVersion': '3.70.0', + "locationIdentifier": "POSTCODE^4228216", + "channel": "BUY", + "page": "2", + "numberOfPropertiesPerPage": "25", + "radius": "3.0", + "sortBy": "distance", + "includeUnavailableProperties": "false", + "propertyTypes": "flat", + "dontShow": "sharedOwnership,retirement", + "minPrice": "150000", + "maxPrice": "600000", + "minBedrooms": "2", + "maxBedrooms": "2", + "apiApplication": "ANDROID", + "appVersion": "3.70.0", } -response = requests.get('https://api.rightmove.co.uk/api/property-listing', params=params, headers=headers, verify=False) \ No newline at end of file +response = requests.get( + "https://api.rightmove.co.uk/api/property-listing", + params=params, + headers=headers, + verify=False, +) diff --git a/crawler/proof_of_concept/routing_distancematrix.py b/crawler/proof_of_concept/routing_distancematrix.py index acc52d2..42057da 100644 --- a/crawler/proof_of_concept/routing_distancematrix.py +++ b/crawler/proof_of_concept/routing_distancematrix.py @@ -1,16 +1,16 @@ import requests -API_KEY = 'AIzaSyBoBHzeQFgR7O-NlNsuHXQcC1B7ccEHpl8' +API_KEY = "AIzaSyBoBHzeQFgR7O-NlNsuHXQcC1B7ccEHpl8" url = "https://maps.googleapis.com/maps/api/distancematrix/json" -origin = '51.5636306598907,-0.11061106079085892' +origin = "51.5636306598907,-0.11061106079085892" dest = "51.53836609846008,-0.12743940233824352" params = { - "origins": origin, - "destinations": dest, - "key": API_KEY, - "departure_time": "", # timstamp, optional - "mode": "transit", + "origins": origin, + "destinations": dest, + "key": API_KEY, + "departure_time": "", # timstamp, optional + "mode": "transit", } r = requests.get(url, params=params) @@ -18,6 +18,5 @@ print(r.status_code) print(r.json()) -with open('code/json/routing_distancematrix.json', 'w') as f: - f.write(r.text) - +with open("code/json/routing_distancematrix.json", "w") as f: + f.write(r.text) diff --git a/crawler/proof_of_concept/routing_routing.py b/crawler/proof_of_concept/routing_routing.py index 32aad97..6f043c5 100644 --- a/crawler/proof_of_concept/routing_routing.py +++ b/crawler/proof_of_concept/routing_routing.py @@ -2,83 +2,77 @@ import requests from utils import nextMonday from collections import defaultdict -API_KEY = 'AIzaSyBoBHzeQFgR7O-NlNsuHXQcC1B7ccEHpl8' +API_KEY = "AIzaSyBoBHzeQFgR7O-NlNsuHXQcC1B7ccEHpl8" url = "https://routes.googleapis.com/directions/v2:computeRoutes" -def travel_time(origin_lat:float, origin_lon:float, dest_lat:float, dest_lon:float): - monday9am = nextMonday() +def travel_time(origin_lat: float, origin_lon: float, dest_lat: float, dest_lon: float): + monday9am = nextMonday() - header = { - "X-Goog-Api-Key": API_KEY, - "Content-Type": "application/json", - "X-Goog-FieldMask": "routes.distanceMeters,routes.duration,routes.staticDuration,routes.legs.steps.distanceMeters,routes.legs.steps.staticDuration,routes.legs.steps.travelMode", - } + header = { + "X-Goog-Api-Key": API_KEY, + "Content-Type": "application/json", + "X-Goog-FieldMask": "routes.distanceMeters,routes.duration,routes.staticDuration,routes.legs.steps.distanceMeters,routes.legs.steps.staticDuration,routes.legs.steps.travelMode", + } - body = { - "origin":{ - "location":{ - "latLng":{ - "latitude": origin_lat, - "longitude": origin_lon - } - } - }, - "destination":{ - "location":{ - "latLng":{ - "latitude": dest_lat, - "longitude": dest_lon - } - } - }, - "travelMode": "TRANSIT", - # "2023-10-15T15:01:23.045123456Z" - "departureTime": monday9am.strftime("%Y-%m-%dT%H:%M:%S.%fZ"), - "computeAlternativeRoutes": False, - # "routeModifiers": { - # "avoidTolls": false, - # "avoidHighways": false, - # "avoidFerries": false - # }, - "languageCode": "en-US", - "units": "METRIC" - } + body = { + "origin": { + "location": {"latLng": {"latitude": origin_lat, "longitude": origin_lon}} + }, + "destination": { + "location": {"latLng": {"latitude": dest_lat, "longitude": dest_lon}} + }, + "travelMode": "TRANSIT", + # "2023-10-15T15:01:23.045123456Z" + "departureTime": monday9am.strftime("%Y-%m-%dT%H:%M:%S.%fZ"), + "computeAlternativeRoutes": False, + # "routeModifiers": { + # "avoidTolls": false, + # "avoidHighways": false, + # "avoidFerries": false + # }, + "languageCode": "en-US", + "units": "METRIC", + } + + r = requests.post(url, json=body, headers=header) + if r.status_code == 200: + return r.json() + + raise Exception(r.json()) - r = requests.post(url, json=body, headers=header) - if r.status_code == 200: - return r.json() - - raise Exception(r.json()) def extract_time(d): - r = d['routes'][0] - print(r.keys()) - distance = r['distanceMeters'] - duration = r['duration'] - duration_static = r['staticDuration'] - - steps = r['legs'][0]['steps'] - # print(steps) - duration_per_transit = defaultdict(lambda: 0) - distance_per_transit = defaultdict(lambda: 0) - - for step in steps: - duration_per_transit[step['travelMode']] += int(step['staticDuration'].strip('s')) - distance_per_transit[step['travelMode']] += step.get('distanceMeters', 0) - - - print(f"dis {distance}, dur {duration}, duration per transit {dict(duration_per_transit)}, distance per transit {dict(distance_per_transit)}") - + r = d["routes"][0] + print(r.keys()) + distance = r["distanceMeters"] + duration = r["duration"] + duration_static = r["staticDuration"] + + steps = r["legs"][0]["steps"] + # print(steps) + duration_per_transit = defaultdict(lambda: 0) + distance_per_transit = defaultdict(lambda: 0) + + for step in steps: + duration_per_transit[step["travelMode"]] += int( + step["staticDuration"].strip("s") + ) + distance_per_transit[step["travelMode"]] += step.get("distanceMeters", 0) + + print( + f"dis {distance}, dur {duration}, duration per transit {dict(duration_per_transit)}, distance per transit {dict(distance_per_transit)}" + ) + if __name__ == "__main__": - import json - with open('code/json/routing_routeapi.json', 'r') as f: - d = json.load(f) - - extract_time(d) - - + import json + + with open("code/json/routing_routeapi.json", "r") as f: + d = json.load(f) + + extract_time(d) + # if __name__ == "__main__": # origin = 51.5635664310333, -0.1107173751570373 # home @@ -87,4 +81,3 @@ if __name__ == "__main__": # import json # with open('code/json/routing_routeapi.json', 'w') as f: # json.dump(d, f) - diff --git a/crawler/proof_of_concept/single-query.py b/crawler/proof_of_concept/single-query.py index 232a545..4879fd3 100644 --- a/crawler/proof_of_concept/single-query.py +++ b/crawler/proof_of_concept/single-query.py @@ -1,15 +1,20 @@ import requests headers = { - 'Host': 'api.rightmove.co.uk', + "Host": "api.rightmove.co.uk", # 'Accept-Encoding': 'gzip, deflate, br', - 'User-Agent': 'okhttp/4.10.0', - 'Connection': 'close', + "User-Agent": "okhttp/4.10.0", + "Connection": "close", } params = { - 'apiApplication': 'ANDROID', - 'appVersion': '3.70.0', + "apiApplication": "ANDROID", + "appVersion": "3.70.0", } -response = requests.get('https://api.rightmove.co.uk/api/property/119578451', params=params, headers=headers, verify=False) \ No newline at end of file +response = requests.get( + "https://api.rightmove.co.uk/api/property/119578451", + params=params, + headers=headers, + verify=False, +) diff --git a/crawler/rec/floorplan.py b/crawler/rec/floorplan.py index fb9703b..b942c76 100644 --- a/crawler/rec/floorplan.py +++ b/crawler/rec/floorplan.py @@ -3,21 +3,22 @@ from PIL import Image from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration import pytesseract + def inference(image_path): image = Image.open(image_path) - question = "How many living rooms are displayed on this floor plan?" # not sure if it even has an effect - processor = Pix2StructProcessor.from_pretrained('google/deplot') - model = Pix2StructForConditionalGeneration.from_pretrained('google/deplot') + question = "How many living rooms are displayed on this floor plan?" # not sure if it even has an effect + processor = Pix2StructProcessor.from_pretrained("google/deplot") + model = Pix2StructForConditionalGeneration.from_pretrained("google/deplot") inputs = processor(images=image, text=question, return_tensors="pt") predictions = model.generate(**inputs, max_new_tokens=512) output = processor.decode(predictions[0], skip_special_tokens=True) - + return output, predictions - + def extract_total_sqm(deplot_input_str): - sqmregex = r'(\d+\.\d*) ?(sq ?m|sq. ?m)' + sqmregex = r"(\d+\.\d*) ?(sq ?m|sq. ?m)" matches = re.findall(sqmregex, deplot_input_str.lower()) if len(matches) == 0: return None diff --git a/crawler/rec/query.py b/crawler/rec/query.py index 410de18..2aa45b6 100644 --- a/crawler/rec/query.py +++ b/crawler/rec/query.py @@ -18,10 +18,10 @@ headers = { def detail_query(detail_id: int): params = { - 'apiApplication': 'ANDROID', - 'appVersion': '3.70.0', + "apiApplication": "ANDROID", + "appVersion": "3.70.0", } - url = f'https://api.rightmove.co.uk/api/property/{detail_id}' + url = f"https://api.rightmove.co.uk/api/property/{detail_id}" response = requests.get(url, params=params, headers=headers, verify=False) if response.status_code != 200: raise Exception("Failed due to: ", response.text) @@ -30,7 +30,16 @@ def detail_query(detail_id: int): # @cache.memoize() -def listing_query(page: int, min_bedrooms: int, max_bedrooms: int, radius: float, min_price: int, max_price: int, mustNewHome: bool = False, max_days_since_added: int = None) -> dict: +def listing_query( + page: int, + min_bedrooms: int, + max_bedrooms: int, + radius: float, + min_price: int, + max_price: int, + mustNewHome: bool = False, + max_days_since_added: int = None, +) -> dict: params = { "locationIdentifier": "POSTCODE^4228216", "channel": "BUY", @@ -49,12 +58,12 @@ def listing_query(page: int, min_bedrooms: int, max_bedrooms: int, radius: float "appVersion": "3.70.0", } if max_days_since_added: - if max_days_since_added not in [1,3,7,14]: - raise Exception("Invalid max days. Can only be", [1,3,7,14]) - params['maxDaysSinceAdded'] = max_days_since_added - + if max_days_since_added not in [1, 3, 7, 14]: + raise Exception("Invalid max days. Can only be", [1, 3, 7, 14]) + params["maxDaysSinceAdded"] = max_days_since_added + if mustNewHome: - params['mustHave'] = 'newHome' + params["mustHave"] = "newHome" response = requests.get( "https://api.rightmove.co.uk/api/property-listing", @@ -69,7 +78,14 @@ def listing_query(page: int, min_bedrooms: int, max_bedrooms: int, radius: float if __name__ == "__main__": - response = listing_query(page=1, min_bedrooms=2, max_bedrooms=2, radius=5.0, min_price=150000, max_price=700000) + response = listing_query( + page=1, + min_bedrooms=2, + max_bedrooms=2, + radius=5.0, + min_price=150000, + max_price=700000, + ) resp = response for d in resp["properties"]: rl = RightmoveListing( diff --git a/crawler/rec/routing.py b/crawler/rec/routing.py index 390b61c..22fb388 100644 --- a/crawler/rec/routing.py +++ b/crawler/rec/routing.py @@ -2,100 +2,101 @@ import requests from rec.utils import nextMonday from collections import defaultdict -API_KEY = 'AIzaSyBoBHzeQFgR7O-NlNsuHXQcC1B7ccEHpl8' +API_KEY = "AIzaSyBoBHzeQFgR7O-NlNsuHXQcC1B7ccEHpl8" url = "https://routes.googleapis.com/directions/v2:computeRoutes" -def transit_route(origin_lat:float, origin_lon:float, dest_lat:float, dest_lon:float, compute_alternative_routes=True): - monday9am = nextMonday() +def transit_route( + origin_lat: float, + origin_lon: float, + dest_lat: float, + dest_lon: float, + compute_alternative_routes=True, +): + monday9am = nextMonday() - header = { - "X-Goog-Api-Key": API_KEY, - "Content-Type": "application/json", - "X-Goog-FieldMask": "routes.distanceMeters,routes.duration,routes.staticDuration,routes.legs.steps.distanceMeters,routes.legs.steps.staticDuration,routes.legs.steps.travelMode", - } + header = { + "X-Goog-Api-Key": API_KEY, + "Content-Type": "application/json", + "X-Goog-FieldMask": "routes.distanceMeters,routes.duration,routes.staticDuration,routes.legs.steps.distanceMeters,routes.legs.steps.staticDuration,routes.legs.steps.travelMode", + } - body = { - "origin":{ - "location":{ - "latLng":{ - "latitude": origin_lat, - "longitude": origin_lon - } - } - }, - "destination":{ - "location":{ - "latLng":{ - "latitude": dest_lat, - "longitude": dest_lon - } - } - }, - "travelMode": "TRANSIT", - # "2023-10-15T15:01:23.045123456Z" - "departureTime": monday9am.strftime("%Y-%m-%dT%H:%M:%S.%fZ"), - "computeAlternativeRoutes": compute_alternative_routes, - # "routeModifiers": { - # "avoidTolls": false, - # "avoidHighways": false, - # "avoidFerries": false - # }, - "languageCode": "en-US", - "units": "METRIC" - } + body = { + "origin": { + "location": {"latLng": {"latitude": origin_lat, "longitude": origin_lon}} + }, + "destination": { + "location": {"latLng": {"latitude": dest_lat, "longitude": dest_lon}} + }, + "travelMode": "TRANSIT", + # "2023-10-15T15:01:23.045123456Z" + "departureTime": monday9am.strftime("%Y-%m-%dT%H:%M:%S.%fZ"), + "computeAlternativeRoutes": compute_alternative_routes, + # "routeModifiers": { + # "avoidTolls": false, + # "avoidHighways": false, + # "avoidFerries": false + # }, + "languageCode": "en-US", + "units": "METRIC", + } - r = requests.post(url, json=body, headers=header) - if r.status_code == 200: - return r.json() - - raise Exception(r.json()) + r = requests.post(url, json=body, headers=header) + if r.status_code == 200: + return r.json() + + raise Exception(r.json()) + + +def extract_time(d, limit: int = 2): + res = [] + for route in d["routes"]: + distance = route["distanceMeters"] + duration = int(route["duration"].strip("s")) + duration_static = int(route["staticDuration"].strip("s")) + + steps = route["legs"][0]["steps"] + initial_walk_duration = 0 + used_transit = False + duration_per_transit = defaultdict(lambda: 0) + distance_per_transit = defaultdict(lambda: 0) + number_of_transit_stops = 0 + + for step in steps: + if used_transit == False and step["travelMode"] == "WALK": + initial_walk_duration += int(step["staticDuration"].strip("s")) + else: + used_transit = True + duration_per_transit[step["travelMode"]] += int( + step["staticDuration"].strip("s") + ) + distance_per_transit[step["travelMode"]] += step.get("distanceMeters", 0) + if step["travelMode"] == "TRANSIT": + number_of_transit_stops += 1 + + res.append( + { + "duration": duration, + "distance": distance, + "duration_static": duration_static, + "initial_walk_duration": initial_walk_duration, + "duration_per_transit": dict(duration_per_transit), + "distance_per_transit": dict(distance_per_transit), + "number_of_transit_stops": number_of_transit_stops, + } + ) + + return res[:limit] -def extract_time(d, limit:int=2): - res = [] - for route in d['routes']: - distance = route['distanceMeters'] - duration = int(route['duration'].strip('s')) - duration_static = int(route['staticDuration'].strip('s')) - - steps = route['legs'][0]['steps'] - initial_walk_duration = 0 - used_transit = False - duration_per_transit = defaultdict(lambda: 0) - distance_per_transit = defaultdict(lambda: 0) - number_of_transit_stops = 0 - - for step in steps: - if used_transit == False and step['travelMode'] == 'WALK': - initial_walk_duration += int(step['staticDuration'].strip('s')) - else: - used_transit = True - duration_per_transit[step['travelMode']] += int(step['staticDuration'].strip('s')) - distance_per_transit[step['travelMode']] += step.get('distanceMeters', 0) - if step['travelMode'] == 'TRANSIT': - number_of_transit_stops += 1 - - res.append({ - 'duration': duration, - 'distance': distance, - 'duration_static': duration_static, - 'initial_walk_duration': initial_walk_duration, - 'duration_per_transit': dict(duration_per_transit), - 'distance_per_transit': dict(distance_per_transit), - 'number_of_transit_stops': number_of_transit_stops, - }) - - return res[:limit] - if __name__ == "__main__": - import json - with open('code/json/routing_routeapi.json', 'r') as f: - d = json.load(f) - - extract_time(d) - - + import json + + with open("code/json/routing_routeapi.json", "r") as f: + d = json.load(f) + + extract_time(d) + # if __name__ == "__main__": # origin = 51.5635664310333, -0.1107173751570373 # home @@ -104,4 +105,3 @@ if __name__ == "__main__": # import json # with open('code/json/routing_routeapi.json', 'w') as f: # json.dump(d, f) - diff --git a/crawler/rec/utils.py b/crawler/rec/utils.py index 8140720..74140a5 100644 --- a/crawler/rec/utils.py +++ b/crawler/rec/utils.py @@ -1,5 +1,6 @@ from datetime import datetime, timedelta, timezone + def nextMonday(): """ I think this function doesnt work when the day is monday itself. @@ -10,8 +11,11 @@ def nextMonday(): now = datetime.now(timezone.utc) days_until_monday = (0 - now.weekday() + 7) % 7 monday = now + timedelta(days=days_until_monday) - monday_9am = monday.replace(hour=9, minute=0, second=0, microsecond=0, tzinfo=timezone.utc) + monday_9am = monday.replace( + hour=9, minute=0, second=0, microsecond=0, tzinfo=timezone.utc + ) return monday_9am -if __name__ == '__main__': + +if __name__ == "__main__": print(nextMonday()) diff --git a/crawler/testing.py b/crawler/testing.py index 4288538..1aa10e6 100644 --- a/crawler/testing.py +++ b/crawler/testing.py @@ -1,10 +1,9 @@ from rec.db import RightmoveListing, session from sqlalchemy import select -if __name__ == '__main__': +if __name__ == "__main__": print("x") - x = select(RightmoveListing).where(RightmoveListing.price <600000) + x = select(RightmoveListing).where(RightmoveListing.price < 600000) print("y") d = list(session.execute(x)) print(d) -