diff --git a/crawler/1_dump_listings.py b/crawler/1_dump_listings.py index 43c2512..2d71f03 100644 --- a/crawler/1_dump_listings.py +++ b/crawler/1_dump_listings.py @@ -25,8 +25,8 @@ class QueryParameters: async def dump_listings( - parameters: QueryParameters, - data_dir: pathlib.Path = pathlib.Path("data/rs/"), + parameters: QueryParameters, + data_dir: pathlib.Path = pathlib.Path("data/rs/"), ) -> list[Listing]: if parameters.district_names: districts = { @@ -39,29 +39,32 @@ async def dump_listings( print("Valid districts to scrape:", districts.keys()) listings = [] - json_responses = await asyncio.gather(*[ - listing_query( - page=i, - channel=parameters.listing_type, - min_bedrooms=parameters.min_bedrooms, - max_bedrooms=parameters.max_bedrooms, - radius=parameters.radius, - min_price=parameters.min_price, - max_price=parameters.max_price, - location_id=locid, - page_size=parameters.page_size, - max_days_since_added=parameters.max_days_since_added, - furnish_types=parameters.furnish_types or [], - ) for locid in districts.values() for i in [1, 2] - ]) + json_responses = await asyncio.gather( + *[ + listing_query( + page=i, + channel=parameters.listing_type, + min_bedrooms=parameters.min_bedrooms, + max_bedrooms=parameters.max_bedrooms, + radius=parameters.radius, + min_price=parameters.min_price, + max_price=parameters.max_price, + location_id=locid, + page_size=parameters.page_size, + max_days_since_added=parameters.max_days_since_added, + furnish_types=parameters.furnish_types or [], + ) + for locid in districts.values() + for i in [1, 2] + ] + ) listings = [] for response_json in json_responses: if response_json["totalAvailableResults"] == 0: print("No results found") continue if response_json["totalAvailableResults"] > 0: - print("totalAvailableResults: ", - response_json["totalAvailableResults"]) + print("totalAvailableResults: ", response_json["totalAvailableResults"]) for property in response_json["properties"]: identifier = property["identifier"] diff --git a/crawler/2_dump_detail.py b/crawler/2_dump_detail.py index ba87135..dfc2e27 100644 --- a/crawler/2_dump_detail.py +++ b/crawler/2_dump_detail.py @@ -12,7 +12,8 @@ semaphore = asyncio.Semaphore(10) async def dump_detail(listing_paths: list[str]): listings = Listing.get_all_listings(listing_paths) filtered_listings = await tqdm.gather( - *[_dump_detail_for_listing(listing) for listing in listings]) + *[_dump_detail_for_listing(listing) for listing in listings] + ) return filtered_listings diff --git a/crawler/3_dump_images.py b/crawler/3_dump_images.py index c11e5eb..bb14855 100644 --- a/crawler/3_dump_images.py +++ b/crawler/3_dump_images.py @@ -10,8 +10,7 @@ semaphore = asyncio.Semaphore(10) async def dump_images(listing_paths: list[str]): listings = Listing.get_all_listings(listing_paths) - await tqdm.gather( - *[dump_images_for_listing(listing) for listing in listings]) + await tqdm.gather(*[dump_images_for_listing(listing) for listing in listings]) async def dump_images_for_listing(listing: Listing): @@ -30,8 +29,7 @@ async def dump_images_for_listing(listing: Listing): async with semaphore: async with session.get(url) as response: if response.status != 200: - raise Exception( - f"Error for {url}: {response.status}") + raise Exception(f"Error for {url}: {response.status}") with open(p, "wb") as f: f.write(await response.read()) except Exception as e: diff --git a/crawler/4_detect_floorplan.py b/crawler/4_detect_floorplan.py index 199b601..09078a7 100644 --- a/crawler/4_detect_floorplan.py +++ b/crawler/4_detect_floorplan.py @@ -9,13 +9,13 @@ async def detect_floorplan(listing_paths: list[str]): cpu_count = multiprocessing.cpu_count() // 4 semaphore = asyncio.Semaphore(cpu_count) - await tqdm.gather(*[ - _detect_floorplan_with_semaphore(listing, semaphore) - for listing in listings - ]) + await tqdm.gather( + *[_detect_floorplan_with_semaphore(listing, semaphore) for listing in listings] + ) -async def _detect_floorplan_with_semaphore(listing: Listing, - semaphore: asyncio.Semaphore): +async def _detect_floorplan_with_semaphore( + listing: Listing, semaphore: asyncio.Semaphore +): async with semaphore: return await listing.calculate_sqm_ocr(recalculate=False) diff --git a/crawler/5_routing.py b/crawler/5_routing.py index 6226a53..2717d81 100644 --- a/crawler/5_routing.py +++ b/crawler/5_routing.py @@ -14,20 +14,22 @@ async def calculate_route( # reduce listings to everything within 7 miles filtered_listings = [] for listing in listings: - print(f'Processing {listing.identifier}') + print(f"Processing {listing.identifier}") if listing.isRemoved: - print(f"Removed-Skip: Skipping {listing.identifier} " - "is already removed.") + print(f"Removed-Skip: Skipping {listing.identifier} " "is already removed.") continue sqm_ocr = await listing.sqm_ocr() - if (sqm_ocr is None or sqm_ocr < 30 or sqm_ocr > 200): - print((f"Floorplan-Skip: Skipping {listing.identifier} as " - f"sqm_ocr is {sqm_ocr}")) + if sqm_ocr is None or sqm_ocr < 30 or sqm_ocr > 200: + print( + ( + f"Floorplan-Skip: Skipping {listing.identifier} as " + f"sqm_ocr is {sqm_ocr}" + ) + ) continue filtered_listings.append(listing) - print( - f"Filtered listings from {len(listings)} to {len(filtered_listings)}") + print(f"Filtered listings from {len(listings)} to {len(filtered_listings)}") for listing in tqdm(filtered_listings): listing.calculate_route( diff --git a/crawler/csv_exporter.py b/crawler/csv_exporter.py index 57130be..a8b364c 100644 --- a/crawler/csv_exporter.py +++ b/crawler/csv_exporter.py @@ -12,9 +12,9 @@ async def export_to_csv( ds = await asyncio.gather(*[listing.dict_nicely() for listing in listings]) df = pd.DataFrame(ds) # read decisions on file - decisions_path = 'data/decisions.json' + decisions_path = "data/decisions.json" decisions = pd.read_json(decisions_path) - df.loc[:, 'decision'] = df.identifier.apply(lambda x: decisions.get(x)) + df.loc[:, "decision"] = df.identifier.apply(lambda x: decisions.get(x)) # remove all entries where we didnt calculate transit time (probably due to a too far distance) # df2 = df[df.travel_time_fastest.notna()] @@ -26,9 +26,9 @@ async def export_to_csv( # s1 = df2 # fill in gap values for service charge and lease left. This is for excel so we can use filters better there - df2.loc[:, 'service_charge'] = df2.service_charge.fillna(-1) - df2.loc[:, 'lease_left'] = df2.lease_left.fillna(-1) - df2.loc[:, 'sqm_ocr'] = df2.sqm_ocr.fillna(-1) + df2.loc[:, "service_charge"] = df2.service_charge.fillna(-1) + df2.loc[:, "lease_left"] = df2.lease_left.fillna(-1) + df2.loc[:, "sqm_ocr"] = df2.sqm_ocr.fillna(-1) df3 = df2 # df3 = pd.concat([df2.drop(['travel_time_fastest', 'travel_time_second'], axis=1), s1], axis=1) @@ -37,5 +37,5 @@ async def export_to_csv( df4 = df3 df5 = df4[columns] - df6 = df5.sort_values(by=['price_per_sqm'], ascending=True) + df6 = df5.sort_values(by=["price_per_sqm"], ascending=True) df6.to_csv(str(output_file), index=False) diff --git a/crawler/data_access.py b/crawler/data_access.py index a2cd939..dd3b561 100644 --- a/crawler/data_access.py +++ b/crawler/data_access.py @@ -46,11 +46,13 @@ class Listing: # data_dir is the first directory before the listing_path data_dir = pathlib.Path(listing_path) - while str(d['identifier']) in str(data_dir.resolve().absolute()): + while str(d["identifier"]) in str(data_dir.resolve().absolute()): data_dir = data_dir.parent listing = Listing(d["identifier"], data_dir=data_dir) - if (listing.last_seen is not None - and listing.last_seen < seen_in_the_last_n_days): + if ( + listing.last_seen is not None + and listing.last_seen < seen_in_the_last_n_days + ): identifiers.append(listing) return identifiers @@ -107,16 +109,19 @@ class Listing: def calculate_sqm_model(self): objs = [] for floorplan_path in self.list_floorplans(): - estimated_sqm, model_output, predictions = ( - floorplan.calculate_model(floorplan_path)) - objs.append({ - "floorplan_path": str(floorplan_path), - "estimated_sqm": estimated_sqm, - "model_output": model_output, - "no_predictions": len( - predictions - ), # cant serialize the predictions itself since its a tensor - }) + estimated_sqm, model_output, predictions = floorplan.calculate_model( + floorplan_path + ) + objs.append( + { + "floorplan_path": str(floorplan_path), + "estimated_sqm": estimated_sqm, + "model_output": model_output, + "no_predictions": len( + predictions + ), # cant serialize the predictions itself since its a tensor + } + ) with open(self.path_floorplan_model_json(), "w") as f: json.dump(objs, f) @@ -129,8 +134,9 @@ class Listing: with open(self.path_floorplan_json()) as f: objs = json.load(f) - max_sqm = max([o["estimated_sqm"] for o in objs - if o is None]) # filter out Nones + max_sqm = max( + [o["estimated_sqm"] for o in objs if o is None] + ) # filter out Nones return max_sqm async def calculate_sqm_ocr(self, recalculate=True): @@ -143,12 +149,15 @@ class Listing: for floorplan_path in self.list_floorplans(): estimated_sqm, model_output = await asyncio.to_thread( - floorplan.calculate_ocr, floorplan_path) - objs.append({ - "floorplan_path": str(floorplan_path), - "estimated_sqm": estimated_sqm, - "text": model_output, - }) + floorplan.calculate_ocr, floorplan_path + ) + objs.append( + { + "floorplan_path": str(floorplan_path), + "estimated_sqm": estimated_sqm, + "text": model_output, + } + ) with open(self.path_floorplan_ocr_json(), "w") as f: json.dump(objs, f) @@ -160,22 +169,20 @@ class Listing: with open(self.path_floorplan_ocr_json()) as f: objs = json.load(f) - sqms = [ - o["estimated_sqm"] for o in objs if o["estimated_sqm"] is not None - ] + sqms = [o["estimated_sqm"] for o in objs if o["estimated_sqm"] is not None] if len(sqms) == 0: return None max_sqm = max(sqms) return max_sqm - def calculate_route(self, - dest_address: str, - travel_mode: routing.TravelMode, - recalculate=False) -> dict[str, Any]: + def calculate_route( + self, dest_address: str, travel_mode: routing.TravelMode, recalculate=False + ) -> dict[str, Any]: routing_cache = self.__get_routing_cache() cache_key = self.__routing_cache_key(dest_address, travel_mode) - if (route_cache := - routing_cache.get(cache_key)) is not None and not recalculate: + if ( + route_cache := routing_cache.get(cache_key) + ) is not None and not recalculate: return {cache_key: route_cache} result = routing.transit_route( @@ -185,8 +192,12 @@ class Listing: travel_mode, ) if not result: - raise Exception((f"Error calculating route from {self.identifier} " - f"to '{dest_address}' by {travel_mode}")) + raise Exception( + ( + f"Error calculating route from {self.identifier} " + f"to '{dest_address}' by {travel_mode}" + ) + ) result = {**{cache_key: result}, **routing_cache} with open(self.path_routing_json(), "w") as f: json.dump(result, f) @@ -198,8 +209,7 @@ class Listing: travel_mode: routing.TravelMode, ) -> list[dict[str, Any]]: data = self.calculate_route(destination_address, travel_mode) - return self.__extract_travel_times(data, destination_address, - travel_mode) + return self.__extract_travel_times(data, destination_address, travel_mode) @property def url(self): @@ -246,8 +256,7 @@ class Listing: @property def leaseLeft(self) -> float | None: - ds = self.detailobject["property"].get("tenureInfo", - {}).get("content", []) + ds = self.detailobject["property"].get("tenureInfo", {}).get("content", []) for d in ds: if d["type"] == "lengthOfLease": matches = re.findall(r"(\d+\.?\d*)", d["value"]) @@ -267,15 +276,14 @@ class Listing: if not self.path_last_seen_listing().exists(): return None - with open(self.path_last_seen_listing(), 'r') as f: + with open(self.path_last_seen_listing(), "r") as f: datetime_str = json.load(f) dt = datetime.datetime.fromisoformat(datetime_str) return (datetime.datetime.now() - dt).days @property def serviceCharge(self) -> float | None: - ds = self.detailobject["property"].get("tenureInfo", - {}).get("content", []) + ds = self.detailobject["property"].get("tenureInfo", {}).get("content", []) for d in ds: if d["type"] == "annualServiceCharge": matches = re.findall(r"([\d,.]+)", d["value"]) @@ -300,25 +308,24 @@ class Listing: @property def status(self) -> str: if self.isRemoved: - return 'removed' + return "removed" status = self.detailobject["property"]["status"] return status @property def agency(self) -> str: - return self.detailobject['property']["branch"]["brandName"] + return self.detailobject["property"]["branch"]["brandName"] @property def councilTaxBand(self) -> str: - return self.detailobject['property']["councilTaxInfo"]["content"][0][ - "value"] + return self.detailobject["property"]["councilTaxInfo"]["content"][0]["value"] @property def photoThumbnail(self) -> str | None: # options are: 'url', 'thumbnailUrl', 'maxSizeUrl' - photos = self.detailobject['property']['photos'] + photos = self.detailobject["property"]["photos"] if len(photos) > 0: - return photos[0]['url'] + return photos[0]["url"] return None async def dict_nicely(self): @@ -328,57 +335,48 @@ class Listing: with open(self.path_routing_json(), "r") as f: travel_times = json.load(f) for destination_mode in travel_times.keys(): - destination_mode_clean = destination_mode.replace(" ", - "_").replace( - ",", "_") + destination_mode_clean = destination_mode.replace(" ", "_").replace( + ",", "_" + ) destination, travel_mode = self.__from_routing_cache_key( - destination_mode) + destination_mode + ) travel_time_fastest[destination_mode_clean] = self.travel_time( - destination, travel_mode)[0]['duration'] + destination, travel_mode + )[0]["duration"] travel_time_second[destination_mode_clean] = self.travel_time( - destination, travel_mode)[1]['duration'] + destination, travel_mode + )[1]["duration"] return { - "identifier": - self.identifier, - "sqm_ocr": - await self.sqm_ocr(), - "price": - self.price, - "price_per_sqm": - await self.price_per_sqm(), - "url": - self.url, - "bedrooms": - self.bedrooms, - "travel_time_fastest": - ":".join( - sorted(f'{dest} in {travel_mode//60}min' - for dest, travel_mode in travel_time_fastest.items())), - "travel_time_second": - ":".join( - sorted(f'{dest} in {travel_mode//60}min' - for dest, travel_mode in travel_time_second.items())), - "lease_left": - self.leaseLeft, - "service_charge": - self.serviceCharge, - "development": - self.development, - "tenure_type": - self.tenure_type, - "updated_days": - self.updateDaysAgo, - "status": - self.status, - "last_seen": - self.last_seen, - "agency": - self.agency, - "council_tax_band": - self.councilTaxBand, - "photo_thumbnail": - self.photoThumbnail, + "identifier": self.identifier, + "sqm_ocr": await self.sqm_ocr(), + "price": self.price, + "price_per_sqm": await self.price_per_sqm(), + "url": self.url, + "bedrooms": self.bedrooms, + "travel_time_fastest": ":".join( + sorted( + f"{dest} in {travel_mode//60}min" + for dest, travel_mode in travel_time_fastest.items() + ) + ), + "travel_time_second": ":".join( + sorted( + f"{dest} in {travel_mode//60}min" + for dest, travel_mode in travel_time_second.items() + ) + ), + "lease_left": self.leaseLeft, + "service_charge": self.serviceCharge, + "development": self.development, + "tenure_type": self.tenure_type, + "updated_days": self.updateDaysAgo, + "status": self.status, + "last_seen": self.last_seen, + "agency": self.agency, + "council_tax_band": self.councilTaxBand, + "photo_thumbnail": self.photoThumbnail, } def __routing_cache_key( @@ -420,35 +418,38 @@ class Listing: for step in steps: if not used_transit and step["travelMode"] == "WALK": - initial_walk_duration += int( - step["staticDuration"].strip("s")) + initial_walk_duration += int(step["staticDuration"].strip("s")) else: used_transit = True duration_per_transit[step["travelMode"]] += int( - step["staticDuration"].strip("s")) + step["staticDuration"].strip("s") + ) distance_per_transit[step["travelMode"]] += step.get( - "distanceMeters", 0) + "distanceMeters", 0 + ) if step["travelMode"] == "TRANSIT": number_of_transit_stops += 1 - res.append({ - "duration": duration, - "distance": distance, - "duration_static": duration_static, - "initial_walk_duration": initial_walk_duration, - "duration_per_transit": dict(duration_per_transit), - "distance_per_transit": dict(distance_per_transit), - "number_of_transit_stops": number_of_transit_stops, - }) + res.append( + { + "duration": duration, + "distance": distance, + "duration_static": duration_static, + "initial_walk_duration": initial_walk_duration, + "duration_per_transit": dict(duration_per_transit), + "distance_per_transit": dict(distance_per_transit), + "number_of_transit_stops": number_of_transit_stops, + } + ) return res[:limit] def __get_routing_cache(self) -> dict[str, Any]: try: - with open(self.path_routing_json(), 'x') as f: + with open(self.path_routing_json(), "x") as f: json.dump({}, f) return {} except FileExistsError: pass - with open(self.path_routing_json(), 'r') as f: + with open(self.path_routing_json(), "r") as f: return json.load(f) diff --git a/crawler/logger.py b/crawler/logger.py index a0676fb..532d337 100644 --- a/crawler/logger.py +++ b/crawler/logger.py @@ -1,12 +1,10 @@ import logging + def createLogger(name): logging.basicConfig( level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', - handlers=[ - logging.FileHandler('app.log'), - logging.StreamHandler() - ] + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + handlers=[logging.FileHandler("app.log"), logging.StreamHandler()], ) return logging.getLogger(name) diff --git a/crawler/main.py b/crawler/main.py index ef0dcfd..9de9d39 100644 --- a/crawler/main.py +++ b/crawler/main.py @@ -12,18 +12,18 @@ from rec.query import ListingType, FurnishType from rec.routing import API_KEY_ENVIRONMENT_VARIABLE, TravelMode from ui_exporter import export_immoweb as export_immoweb_ui -dump_listings_module = importlib.import_module('1_dump_listings') -dump_detail_module = importlib.import_module('2_dump_detail') -dump_images_module = importlib.import_module('3_dump_images') -detect_floorplan_module = importlib.import_module('4_detect_floorplan') -routing_module = importlib.import_module('5_routing') +dump_listings_module = importlib.import_module("1_dump_listings") +dump_detail_module = importlib.import_module("2_dump_detail") +dump_images_module = importlib.import_module("3_dump_images") +detect_floorplan_module = importlib.import_module("4_detect_floorplan") +routing_module = importlib.import_module("5_routing") @click.group() @click.option( - '--data-dir', + "--data-dir", default=pathlib.Path("data/rs/"), - help='Districts to scrape', + help="Districts to scrape", type=click.Path( writable=True, file_okay=False, @@ -34,15 +34,15 @@ routing_module = importlib.import_module('5_routing') @click.pass_context def cli(ctx, data_dir: str): ctx.ensure_object(dict) - ctx.obj['data_dir'] = data_dir + ctx.obj["data_dir"] = data_dir pass @cli.command() @click.option( - '--type', - '-t', - help='Type of listing to scrape', + "--type", + "-t", + help="Type of listing to scrape", type=click.Choice( ListingType.__members__.keys(), case_sensitive=False, @@ -50,45 +50,42 @@ def cli(ctx, data_dir: str): required=True, ) @click.option( - '--min-bedrooms', + "--min-bedrooms", default=1, - help='Minimum number of bedrooms', + help="Minimum number of bedrooms", type=click.IntRange(min=1), ) @click.option( - '--max-bedrooms', + "--max-bedrooms", default=5, - help='Maximum number of bedrooms', + help="Maximum number of bedrooms", type=click.IntRange(min=1), ) @click.option( - '--min-price', + "--min-price", default=0, - help='Minimum price', + help="Minimum price", type=click.IntRange(min=0), ) @click.option( - '--max-price', + "--max-price", default=1000000, - help='Maximum price', + help="Maximum price", type=click.IntRange(min=0), ) @click.option( - '--district', + "--district", default=None, - help='Districts to scrape', + help="Districts to scrape", type=click.Choice(get_districts().keys(), case_sensitive=False), multiple=True, ) @click.option( - '--furnish-types', - '-f', - help='Furnish types for rented listings', + "--furnish-types", + "-f", + help="Furnish types for rented listings", type=click.Choice( - [ - furnish_type.name - for furnish_type in FurnishType.__members__.values() - ], + [furnish_type.name for furnish_type in FurnishType.__members__.values()], case_sensitive=False, ), multiple=True, @@ -104,7 +101,7 @@ def dump_listings( type: str, furnish_types: list[str], ): - data_dir: str = ctx.obj['data_dir'] + data_dir: str = ctx.obj["data_dir"] query_parameters = dump_listings_module.QueryParameters( listing_type=ListingType[type], district_names=set(district), @@ -112,23 +109,21 @@ def dump_listings( max_bedrooms=max_bedrooms, min_price=min_price, max_price=max_price, - furnish_types=[ - FurnishType[furnish_type] for furnish_type in furnish_types - ], + furnish_types=[FurnishType[furnish_type] for furnish_type in furnish_types], ) click.echo( - f'Running dump_listings for districts {district}, data dir {data_dir} and parameters: ' - f'{query_parameters}') + f"Running dump_listings for districts {district}, data dir {data_dir} and parameters: " + f"{query_parameters}" + ) data_dir_path = pathlib.Path(data_dir) - asyncio.run( - dump_listings_module.dump_listings(query_parameters, data_dir_path)) + asyncio.run(dump_listings_module.dump_listings(query_parameters, data_dir_path)) @cli.command() @click.pass_context def dump_details(ctx: click.core.Context): - data_dir = ctx.obj['data_dir'] - click.echo(f'Running dump_detail for listings stored in {data_dir}') + data_dir = ctx.obj["data_dir"] + click.echo(f"Running dump_detail for listings stored in {data_dir}") listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json"))) asyncio.run(dump_detail_module.dump_detail(listing_paths)) @@ -136,8 +131,8 @@ def dump_details(ctx: click.core.Context): @cli.command() @click.pass_context def dump_images(ctx: click.core.Context): - data_dir = ctx.obj['data_dir'] - click.echo(f'Running dump_images stored in {data_dir}') + data_dir = ctx.obj["data_dir"] + click.echo(f"Running dump_images stored in {data_dir}") listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json"))) asyncio.run(dump_images_module.dump_images(listing_paths)) @@ -145,24 +140,24 @@ def dump_images(ctx: click.core.Context): @cli.command() @click.pass_context def detect_floorplan(ctx: click.core.Context): - data_dir = ctx.obj['data_dir'] - click.echo(f'Running detect_floorplan in {data_dir}') + data_dir = ctx.obj["data_dir"] + click.echo(f"Running detect_floorplan in {data_dir}") listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json"))) asyncio.run(detect_floorplan_module.detect_floorplan(listing_paths)) @cli.command() @click.option( - '--destination-address', - '-d', - help='Destination address for routing', + "--destination-address", + "-d", + help="Destination address for routing", required=True, type=click.STRING, ) @click.option( - '--travel-mode', - '-m', - help='Travel mode for routing', + "--travel-mode", + "-m", + help="Travel mode for routing", type=click.Choice( TravelMode.__members__.keys(), case_sensitive=False, @@ -170,23 +165,25 @@ def detect_floorplan(ctx: click.core.Context): required=True, ) @click.option( - '--limit', - '-l', - help='Limit the number of listings to process', + "--limit", + "-l", + help="Limit the number of listings to process", type=click.IntRange(min=1), default=1, # by default limit to 1 to avoid accidental API usage ) @click.pass_context -def routing(ctx: click.core.Context, destination_address: str, - travel_mode: str, limit: int): - data_dir = ctx.obj['data_dir'] - click.echo(f'Running routing for the first {limit} listings in {data_dir}') +def routing( + ctx: click.core.Context, destination_address: str, travel_mode: str, limit: int +): + data_dir = ctx.obj["data_dir"] + click.echo(f"Running routing for the first {limit} listings in {data_dir}") listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json"))) listing_paths = listing_paths[:limit] if os.environ.get(API_KEY_ENVIRONMENT_VARIABLE) is None: raise click.exceptions.MissingParameter( - f'{API_KEY_ENVIRONMENT_VARIABLE} environment variable is not set. ' - 'Please set it to your API key for the routing service.') + f"{API_KEY_ENVIRONMENT_VARIABLE} environment variable is not set. " + "Please set it to your API key for the routing service." + ) asyncio.run( routing_module.calculate_route( @@ -194,14 +191,15 @@ def routing(ctx: click.core.Context, destination_address: str, destination_address, # destination_address_coordinates, TravelMode[travel_mode], - )) + ) + ) @cli.command() @click.option( - '--columns', - '-C', - help='Columns to include in the CSV file', + "--columns", + "-C", + help="Columns to include in the CSV file", type=click.Choice( Listing.ALL_COLUMNS, case_sensitive=False, @@ -210,9 +208,9 @@ def routing(ctx: click.core.Context, destination_address: str, default=Listing.ALL_COLUMNS, ) @click.option( - '--output-file', - '-O', - help='Path to the output CSV file', + "--output-file", + "-O", + help="Path to the output CSV file", required=True, type=click.Path( writable=True, @@ -223,20 +221,21 @@ def routing(ctx: click.core.Context, destination_address: str, ) @click.pass_context def export_csv(ctx: click.core.Context, output_file: str, columns: tuple[str]): - data_dir = ctx.obj['data_dir'] - click.echo(f'Exporting data to {output_file} using {data_dir=}') + data_dir = ctx.obj["data_dir"] + click.echo(f"Exporting data to {output_file} using {data_dir=}") output_file_path = pathlib.Path(output_file) listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json"))) listings = Listing.get_all_listings([str(path) for path in listing_paths]) asyncio.run( - csv_exporter.export_to_csv(listings, output_file_path, - list(columns)), ) - + csv_exporter.export_to_csv(listings, output_file_path, list(columns)), + ) + + @cli.command() @click.option( - '--output-file', - '-O', - help='Path to the output immoweb file', + "--output-file", + "-O", + help="Path to the output immoweb file", required=True, type=click.Path( writable=True, @@ -247,10 +246,9 @@ def export_csv(ctx: click.core.Context, output_file: str, columns: tuple[str]): ) @click.pass_context def export_immoweb(ctx, output_file: str): - click.echo(f'Exporting data to {output_file}') + click.echo(f"Exporting data to {output_file}") asyncio.run(export_immoweb_ui(ctx, output_file)) - -if __name__ == '__main__': +if __name__ == "__main__": cli() diff --git a/crawler/rec/floorplan.py b/crawler/rec/floorplan.py index 7afd84f..bf9aa66 100644 --- a/crawler/rec/floorplan.py +++ b/crawler/rec/floorplan.py @@ -6,6 +6,7 @@ import numpy as np def inference(image_path): from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration + image = Image.open(image_path) question = "How many living rooms are displayed on this floor plan?" # not sure if it even has an effect processor = Pix2StructProcessor.from_pretrained("google/deplot") @@ -35,15 +36,17 @@ def calculate_model(image_path): def improve_img_for_ocr(img: Image): - img2 = np.array(img.convert('L')) + img2 = np.array(img.convert("L")) cv2.resize(img2, None, fx=1.2, fy=1.2, interpolation=cv2.INTER_CUBIC) - thresh = cv2.adaptiveThreshold(img2, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, - cv2.THRESH_BINARY, 11, 2) + thresh = cv2.adaptiveThreshold( + img2, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2 + ) return Image.fromarray(thresh) def calculate_ocr(image_path): import pytesseract + img = Image.open(image_path) text = pytesseract.image_to_string(img) estimated_sqm = extract_total_sqm(text) @@ -52,9 +55,7 @@ def calculate_ocr(image_path): text2 = pytesseract.image_to_string(improved_img) estimated_sqm2 = extract_total_sqm(text2) with open("recalculating.log", "a") as f: - f.write( - f"before: {estimated_sqm} after: {estimated_sqm2} - {image_path}\n" - ) + f.write(f"before: {estimated_sqm} after: {estimated_sqm2} - {image_path}\n") return estimated_sqm2, text2 return estimated_sqm, text diff --git a/crawler/rec/routing.py b/crawler/rec/routing.py index bafb8e3..319c4a7 100644 --- a/crawler/rec/routing.py +++ b/crawler/rec/routing.py @@ -30,19 +30,13 @@ def transit_route( header = { "X-Goog-Api-Key": api_key, "Content-Type": "application/json", - "X-Goog-FieldMask": # "routes.*", - "routes.distanceMeters,routes.duration,routes.staticDuration,routes.legs.steps.distanceMeters,routes.legs.steps.staticDuration,routes.legs.steps.travelMode", + "X-Goog-FieldMask": "routes.distanceMeters,routes.duration,routes.staticDuration,routes.legs.steps.distanceMeters,routes.legs.steps.staticDuration,routes.legs.steps.travelMode", # "routes.*", } body = { "origin": { # "address": origin_address - "location": { - "latLng": { - "latitude": origin_lat, - "longitude": origin_lon - } - } + "location": {"latLng": {"latitude": origin_lat, "longitude": origin_lon}} }, "destination": { "address": dest_address