reformat with black; looks better
This commit is contained in:
parent
1122f5a96f
commit
0b9d50af47
11 changed files with 240 additions and 244 deletions
|
|
@ -25,8 +25,8 @@ class QueryParameters:
|
||||||
|
|
||||||
|
|
||||||
async def dump_listings(
|
async def dump_listings(
|
||||||
parameters: QueryParameters,
|
parameters: QueryParameters,
|
||||||
data_dir: pathlib.Path = pathlib.Path("data/rs/"),
|
data_dir: pathlib.Path = pathlib.Path("data/rs/"),
|
||||||
) -> list[Listing]:
|
) -> list[Listing]:
|
||||||
if parameters.district_names:
|
if parameters.district_names:
|
||||||
districts = {
|
districts = {
|
||||||
|
|
@ -39,29 +39,32 @@ async def dump_listings(
|
||||||
print("Valid districts to scrape:", districts.keys())
|
print("Valid districts to scrape:", districts.keys())
|
||||||
listings = []
|
listings = []
|
||||||
|
|
||||||
json_responses = await asyncio.gather(*[
|
json_responses = await asyncio.gather(
|
||||||
listing_query(
|
*[
|
||||||
page=i,
|
listing_query(
|
||||||
channel=parameters.listing_type,
|
page=i,
|
||||||
min_bedrooms=parameters.min_bedrooms,
|
channel=parameters.listing_type,
|
||||||
max_bedrooms=parameters.max_bedrooms,
|
min_bedrooms=parameters.min_bedrooms,
|
||||||
radius=parameters.radius,
|
max_bedrooms=parameters.max_bedrooms,
|
||||||
min_price=parameters.min_price,
|
radius=parameters.radius,
|
||||||
max_price=parameters.max_price,
|
min_price=parameters.min_price,
|
||||||
location_id=locid,
|
max_price=parameters.max_price,
|
||||||
page_size=parameters.page_size,
|
location_id=locid,
|
||||||
max_days_since_added=parameters.max_days_since_added,
|
page_size=parameters.page_size,
|
||||||
furnish_types=parameters.furnish_types or [],
|
max_days_since_added=parameters.max_days_since_added,
|
||||||
) for locid in districts.values() for i in [1, 2]
|
furnish_types=parameters.furnish_types or [],
|
||||||
])
|
)
|
||||||
|
for locid in districts.values()
|
||||||
|
for i in [1, 2]
|
||||||
|
]
|
||||||
|
)
|
||||||
listings = []
|
listings = []
|
||||||
for response_json in json_responses:
|
for response_json in json_responses:
|
||||||
if response_json["totalAvailableResults"] == 0:
|
if response_json["totalAvailableResults"] == 0:
|
||||||
print("No results found")
|
print("No results found")
|
||||||
continue
|
continue
|
||||||
if response_json["totalAvailableResults"] > 0:
|
if response_json["totalAvailableResults"] > 0:
|
||||||
print("totalAvailableResults: ",
|
print("totalAvailableResults: ", response_json["totalAvailableResults"])
|
||||||
response_json["totalAvailableResults"])
|
|
||||||
for property in response_json["properties"]:
|
for property in response_json["properties"]:
|
||||||
identifier = property["identifier"]
|
identifier = property["identifier"]
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -12,7 +12,8 @@ semaphore = asyncio.Semaphore(10)
|
||||||
async def dump_detail(listing_paths: list[str]):
|
async def dump_detail(listing_paths: list[str]):
|
||||||
listings = Listing.get_all_listings(listing_paths)
|
listings = Listing.get_all_listings(listing_paths)
|
||||||
filtered_listings = await tqdm.gather(
|
filtered_listings = await tqdm.gather(
|
||||||
*[_dump_detail_for_listing(listing) for listing in listings])
|
*[_dump_detail_for_listing(listing) for listing in listings]
|
||||||
|
)
|
||||||
return filtered_listings
|
return filtered_listings
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -10,8 +10,7 @@ semaphore = asyncio.Semaphore(10)
|
||||||
|
|
||||||
async def dump_images(listing_paths: list[str]):
|
async def dump_images(listing_paths: list[str]):
|
||||||
listings = Listing.get_all_listings(listing_paths)
|
listings = Listing.get_all_listings(listing_paths)
|
||||||
await tqdm.gather(
|
await tqdm.gather(*[dump_images_for_listing(listing) for listing in listings])
|
||||||
*[dump_images_for_listing(listing) for listing in listings])
|
|
||||||
|
|
||||||
|
|
||||||
async def dump_images_for_listing(listing: Listing):
|
async def dump_images_for_listing(listing: Listing):
|
||||||
|
|
@ -30,8 +29,7 @@ async def dump_images_for_listing(listing: Listing):
|
||||||
async with semaphore:
|
async with semaphore:
|
||||||
async with session.get(url) as response:
|
async with session.get(url) as response:
|
||||||
if response.status != 200:
|
if response.status != 200:
|
||||||
raise Exception(
|
raise Exception(f"Error for {url}: {response.status}")
|
||||||
f"Error for {url}: {response.status}")
|
|
||||||
with open(p, "wb") as f:
|
with open(p, "wb") as f:
|
||||||
f.write(await response.read())
|
f.write(await response.read())
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
|
||||||
|
|
@ -9,13 +9,13 @@ async def detect_floorplan(listing_paths: list[str]):
|
||||||
cpu_count = multiprocessing.cpu_count() // 4
|
cpu_count = multiprocessing.cpu_count() // 4
|
||||||
semaphore = asyncio.Semaphore(cpu_count)
|
semaphore = asyncio.Semaphore(cpu_count)
|
||||||
|
|
||||||
await tqdm.gather(*[
|
await tqdm.gather(
|
||||||
_detect_floorplan_with_semaphore(listing, semaphore)
|
*[_detect_floorplan_with_semaphore(listing, semaphore) for listing in listings]
|
||||||
for listing in listings
|
)
|
||||||
])
|
|
||||||
|
|
||||||
|
|
||||||
async def _detect_floorplan_with_semaphore(listing: Listing,
|
async def _detect_floorplan_with_semaphore(
|
||||||
semaphore: asyncio.Semaphore):
|
listing: Listing, semaphore: asyncio.Semaphore
|
||||||
|
):
|
||||||
async with semaphore:
|
async with semaphore:
|
||||||
return await listing.calculate_sqm_ocr(recalculate=False)
|
return await listing.calculate_sqm_ocr(recalculate=False)
|
||||||
|
|
|
||||||
|
|
@ -14,20 +14,22 @@ async def calculate_route(
|
||||||
# reduce listings to everything within 7 miles
|
# reduce listings to everything within 7 miles
|
||||||
filtered_listings = []
|
filtered_listings = []
|
||||||
for listing in listings:
|
for listing in listings:
|
||||||
print(f'Processing {listing.identifier}')
|
print(f"Processing {listing.identifier}")
|
||||||
if listing.isRemoved:
|
if listing.isRemoved:
|
||||||
print(f"Removed-Skip: Skipping {listing.identifier} "
|
print(f"Removed-Skip: Skipping {listing.identifier} " "is already removed.")
|
||||||
"is already removed.")
|
|
||||||
continue
|
continue
|
||||||
sqm_ocr = await listing.sqm_ocr()
|
sqm_ocr = await listing.sqm_ocr()
|
||||||
if (sqm_ocr is None or sqm_ocr < 30 or sqm_ocr > 200):
|
if sqm_ocr is None or sqm_ocr < 30 or sqm_ocr > 200:
|
||||||
print((f"Floorplan-Skip: Skipping {listing.identifier} as "
|
print(
|
||||||
f"sqm_ocr is {sqm_ocr}"))
|
(
|
||||||
|
f"Floorplan-Skip: Skipping {listing.identifier} as "
|
||||||
|
f"sqm_ocr is {sqm_ocr}"
|
||||||
|
)
|
||||||
|
)
|
||||||
continue
|
continue
|
||||||
filtered_listings.append(listing)
|
filtered_listings.append(listing)
|
||||||
|
|
||||||
print(
|
print(f"Filtered listings from {len(listings)} to {len(filtered_listings)}")
|
||||||
f"Filtered listings from {len(listings)} to {len(filtered_listings)}")
|
|
||||||
|
|
||||||
for listing in tqdm(filtered_listings):
|
for listing in tqdm(filtered_listings):
|
||||||
listing.calculate_route(
|
listing.calculate_route(
|
||||||
|
|
|
||||||
|
|
@ -12,9 +12,9 @@ async def export_to_csv(
|
||||||
ds = await asyncio.gather(*[listing.dict_nicely() for listing in listings])
|
ds = await asyncio.gather(*[listing.dict_nicely() for listing in listings])
|
||||||
df = pd.DataFrame(ds)
|
df = pd.DataFrame(ds)
|
||||||
# read decisions on file
|
# read decisions on file
|
||||||
decisions_path = 'data/decisions.json'
|
decisions_path = "data/decisions.json"
|
||||||
decisions = pd.read_json(decisions_path)
|
decisions = pd.read_json(decisions_path)
|
||||||
df.loc[:, 'decision'] = df.identifier.apply(lambda x: decisions.get(x))
|
df.loc[:, "decision"] = df.identifier.apply(lambda x: decisions.get(x))
|
||||||
|
|
||||||
# remove all entries where we didnt calculate transit time (probably due to a too far distance)
|
# remove all entries where we didnt calculate transit time (probably due to a too far distance)
|
||||||
# df2 = df[df.travel_time_fastest.notna()]
|
# df2 = df[df.travel_time_fastest.notna()]
|
||||||
|
|
@ -26,9 +26,9 @@ async def export_to_csv(
|
||||||
# s1 = df2
|
# s1 = df2
|
||||||
|
|
||||||
# fill in gap values for service charge and lease left. This is for excel so we can use filters better there
|
# fill in gap values for service charge and lease left. This is for excel so we can use filters better there
|
||||||
df2.loc[:, 'service_charge'] = df2.service_charge.fillna(-1)
|
df2.loc[:, "service_charge"] = df2.service_charge.fillna(-1)
|
||||||
df2.loc[:, 'lease_left'] = df2.lease_left.fillna(-1)
|
df2.loc[:, "lease_left"] = df2.lease_left.fillna(-1)
|
||||||
df2.loc[:, 'sqm_ocr'] = df2.sqm_ocr.fillna(-1)
|
df2.loc[:, "sqm_ocr"] = df2.sqm_ocr.fillna(-1)
|
||||||
|
|
||||||
df3 = df2
|
df3 = df2
|
||||||
# df3 = pd.concat([df2.drop(['travel_time_fastest', 'travel_time_second'], axis=1), s1], axis=1)
|
# df3 = pd.concat([df2.drop(['travel_time_fastest', 'travel_time_second'], axis=1), s1], axis=1)
|
||||||
|
|
@ -37,5 +37,5 @@ async def export_to_csv(
|
||||||
df4 = df3
|
df4 = df3
|
||||||
|
|
||||||
df5 = df4[columns]
|
df5 = df4[columns]
|
||||||
df6 = df5.sort_values(by=['price_per_sqm'], ascending=True)
|
df6 = df5.sort_values(by=["price_per_sqm"], ascending=True)
|
||||||
df6.to_csv(str(output_file), index=False)
|
df6.to_csv(str(output_file), index=False)
|
||||||
|
|
|
||||||
|
|
@ -46,11 +46,13 @@ class Listing:
|
||||||
|
|
||||||
# data_dir is the first directory before the listing_path
|
# data_dir is the first directory before the listing_path
|
||||||
data_dir = pathlib.Path(listing_path)
|
data_dir = pathlib.Path(listing_path)
|
||||||
while str(d['identifier']) in str(data_dir.resolve().absolute()):
|
while str(d["identifier"]) in str(data_dir.resolve().absolute()):
|
||||||
data_dir = data_dir.parent
|
data_dir = data_dir.parent
|
||||||
listing = Listing(d["identifier"], data_dir=data_dir)
|
listing = Listing(d["identifier"], data_dir=data_dir)
|
||||||
if (listing.last_seen is not None
|
if (
|
||||||
and listing.last_seen < seen_in_the_last_n_days):
|
listing.last_seen is not None
|
||||||
|
and listing.last_seen < seen_in_the_last_n_days
|
||||||
|
):
|
||||||
identifiers.append(listing)
|
identifiers.append(listing)
|
||||||
|
|
||||||
return identifiers
|
return identifiers
|
||||||
|
|
@ -107,16 +109,19 @@ class Listing:
|
||||||
def calculate_sqm_model(self):
|
def calculate_sqm_model(self):
|
||||||
objs = []
|
objs = []
|
||||||
for floorplan_path in self.list_floorplans():
|
for floorplan_path in self.list_floorplans():
|
||||||
estimated_sqm, model_output, predictions = (
|
estimated_sqm, model_output, predictions = floorplan.calculate_model(
|
||||||
floorplan.calculate_model(floorplan_path))
|
floorplan_path
|
||||||
objs.append({
|
)
|
||||||
"floorplan_path": str(floorplan_path),
|
objs.append(
|
||||||
"estimated_sqm": estimated_sqm,
|
{
|
||||||
"model_output": model_output,
|
"floorplan_path": str(floorplan_path),
|
||||||
"no_predictions": len(
|
"estimated_sqm": estimated_sqm,
|
||||||
predictions
|
"model_output": model_output,
|
||||||
), # cant serialize the predictions itself since its a tensor
|
"no_predictions": len(
|
||||||
})
|
predictions
|
||||||
|
), # cant serialize the predictions itself since its a tensor
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
with open(self.path_floorplan_model_json(), "w") as f:
|
with open(self.path_floorplan_model_json(), "w") as f:
|
||||||
json.dump(objs, f)
|
json.dump(objs, f)
|
||||||
|
|
@ -129,8 +134,9 @@ class Listing:
|
||||||
with open(self.path_floorplan_json()) as f:
|
with open(self.path_floorplan_json()) as f:
|
||||||
objs = json.load(f)
|
objs = json.load(f)
|
||||||
|
|
||||||
max_sqm = max([o["estimated_sqm"] for o in objs
|
max_sqm = max(
|
||||||
if o is None]) # filter out Nones
|
[o["estimated_sqm"] for o in objs if o is None]
|
||||||
|
) # filter out Nones
|
||||||
return max_sqm
|
return max_sqm
|
||||||
|
|
||||||
async def calculate_sqm_ocr(self, recalculate=True):
|
async def calculate_sqm_ocr(self, recalculate=True):
|
||||||
|
|
@ -143,12 +149,15 @@ class Listing:
|
||||||
|
|
||||||
for floorplan_path in self.list_floorplans():
|
for floorplan_path in self.list_floorplans():
|
||||||
estimated_sqm, model_output = await asyncio.to_thread(
|
estimated_sqm, model_output = await asyncio.to_thread(
|
||||||
floorplan.calculate_ocr, floorplan_path)
|
floorplan.calculate_ocr, floorplan_path
|
||||||
objs.append({
|
)
|
||||||
"floorplan_path": str(floorplan_path),
|
objs.append(
|
||||||
"estimated_sqm": estimated_sqm,
|
{
|
||||||
"text": model_output,
|
"floorplan_path": str(floorplan_path),
|
||||||
})
|
"estimated_sqm": estimated_sqm,
|
||||||
|
"text": model_output,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
with open(self.path_floorplan_ocr_json(), "w") as f:
|
with open(self.path_floorplan_ocr_json(), "w") as f:
|
||||||
json.dump(objs, f)
|
json.dump(objs, f)
|
||||||
|
|
@ -160,22 +169,20 @@ class Listing:
|
||||||
with open(self.path_floorplan_ocr_json()) as f:
|
with open(self.path_floorplan_ocr_json()) as f:
|
||||||
objs = json.load(f)
|
objs = json.load(f)
|
||||||
|
|
||||||
sqms = [
|
sqms = [o["estimated_sqm"] for o in objs if o["estimated_sqm"] is not None]
|
||||||
o["estimated_sqm"] for o in objs if o["estimated_sqm"] is not None
|
|
||||||
]
|
|
||||||
if len(sqms) == 0:
|
if len(sqms) == 0:
|
||||||
return None
|
return None
|
||||||
max_sqm = max(sqms)
|
max_sqm = max(sqms)
|
||||||
return max_sqm
|
return max_sqm
|
||||||
|
|
||||||
def calculate_route(self,
|
def calculate_route(
|
||||||
dest_address: str,
|
self, dest_address: str, travel_mode: routing.TravelMode, recalculate=False
|
||||||
travel_mode: routing.TravelMode,
|
) -> dict[str, Any]:
|
||||||
recalculate=False) -> dict[str, Any]:
|
|
||||||
routing_cache = self.__get_routing_cache()
|
routing_cache = self.__get_routing_cache()
|
||||||
cache_key = self.__routing_cache_key(dest_address, travel_mode)
|
cache_key = self.__routing_cache_key(dest_address, travel_mode)
|
||||||
if (route_cache :=
|
if (
|
||||||
routing_cache.get(cache_key)) is not None and not recalculate:
|
route_cache := routing_cache.get(cache_key)
|
||||||
|
) is not None and not recalculate:
|
||||||
return {cache_key: route_cache}
|
return {cache_key: route_cache}
|
||||||
|
|
||||||
result = routing.transit_route(
|
result = routing.transit_route(
|
||||||
|
|
@ -185,8 +192,12 @@ class Listing:
|
||||||
travel_mode,
|
travel_mode,
|
||||||
)
|
)
|
||||||
if not result:
|
if not result:
|
||||||
raise Exception((f"Error calculating route from {self.identifier} "
|
raise Exception(
|
||||||
f"to '{dest_address}' by {travel_mode}"))
|
(
|
||||||
|
f"Error calculating route from {self.identifier} "
|
||||||
|
f"to '{dest_address}' by {travel_mode}"
|
||||||
|
)
|
||||||
|
)
|
||||||
result = {**{cache_key: result}, **routing_cache}
|
result = {**{cache_key: result}, **routing_cache}
|
||||||
with open(self.path_routing_json(), "w") as f:
|
with open(self.path_routing_json(), "w") as f:
|
||||||
json.dump(result, f)
|
json.dump(result, f)
|
||||||
|
|
@ -198,8 +209,7 @@ class Listing:
|
||||||
travel_mode: routing.TravelMode,
|
travel_mode: routing.TravelMode,
|
||||||
) -> list[dict[str, Any]]:
|
) -> list[dict[str, Any]]:
|
||||||
data = self.calculate_route(destination_address, travel_mode)
|
data = self.calculate_route(destination_address, travel_mode)
|
||||||
return self.__extract_travel_times(data, destination_address,
|
return self.__extract_travel_times(data, destination_address, travel_mode)
|
||||||
travel_mode)
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def url(self):
|
def url(self):
|
||||||
|
|
@ -246,8 +256,7 @@ class Listing:
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def leaseLeft(self) -> float | None:
|
def leaseLeft(self) -> float | None:
|
||||||
ds = self.detailobject["property"].get("tenureInfo",
|
ds = self.detailobject["property"].get("tenureInfo", {}).get("content", [])
|
||||||
{}).get("content", [])
|
|
||||||
for d in ds:
|
for d in ds:
|
||||||
if d["type"] == "lengthOfLease":
|
if d["type"] == "lengthOfLease":
|
||||||
matches = re.findall(r"(\d+\.?\d*)", d["value"])
|
matches = re.findall(r"(\d+\.?\d*)", d["value"])
|
||||||
|
|
@ -267,15 +276,14 @@ class Listing:
|
||||||
if not self.path_last_seen_listing().exists():
|
if not self.path_last_seen_listing().exists():
|
||||||
return None
|
return None
|
||||||
|
|
||||||
with open(self.path_last_seen_listing(), 'r') as f:
|
with open(self.path_last_seen_listing(), "r") as f:
|
||||||
datetime_str = json.load(f)
|
datetime_str = json.load(f)
|
||||||
dt = datetime.datetime.fromisoformat(datetime_str)
|
dt = datetime.datetime.fromisoformat(datetime_str)
|
||||||
return (datetime.datetime.now() - dt).days
|
return (datetime.datetime.now() - dt).days
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def serviceCharge(self) -> float | None:
|
def serviceCharge(self) -> float | None:
|
||||||
ds = self.detailobject["property"].get("tenureInfo",
|
ds = self.detailobject["property"].get("tenureInfo", {}).get("content", [])
|
||||||
{}).get("content", [])
|
|
||||||
for d in ds:
|
for d in ds:
|
||||||
if d["type"] == "annualServiceCharge":
|
if d["type"] == "annualServiceCharge":
|
||||||
matches = re.findall(r"([\d,.]+)", d["value"])
|
matches = re.findall(r"([\d,.]+)", d["value"])
|
||||||
|
|
@ -300,25 +308,24 @@ class Listing:
|
||||||
@property
|
@property
|
||||||
def status(self) -> str:
|
def status(self) -> str:
|
||||||
if self.isRemoved:
|
if self.isRemoved:
|
||||||
return 'removed'
|
return "removed"
|
||||||
status = self.detailobject["property"]["status"]
|
status = self.detailobject["property"]["status"]
|
||||||
return status
|
return status
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def agency(self) -> str:
|
def agency(self) -> str:
|
||||||
return self.detailobject['property']["branch"]["brandName"]
|
return self.detailobject["property"]["branch"]["brandName"]
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def councilTaxBand(self) -> str:
|
def councilTaxBand(self) -> str:
|
||||||
return self.detailobject['property']["councilTaxInfo"]["content"][0][
|
return self.detailobject["property"]["councilTaxInfo"]["content"][0]["value"]
|
||||||
"value"]
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def photoThumbnail(self) -> str | None:
|
def photoThumbnail(self) -> str | None:
|
||||||
# options are: 'url', 'thumbnailUrl', 'maxSizeUrl'
|
# options are: 'url', 'thumbnailUrl', 'maxSizeUrl'
|
||||||
photos = self.detailobject['property']['photos']
|
photos = self.detailobject["property"]["photos"]
|
||||||
if len(photos) > 0:
|
if len(photos) > 0:
|
||||||
return photos[0]['url']
|
return photos[0]["url"]
|
||||||
return None
|
return None
|
||||||
|
|
||||||
async def dict_nicely(self):
|
async def dict_nicely(self):
|
||||||
|
|
@ -328,57 +335,48 @@ class Listing:
|
||||||
with open(self.path_routing_json(), "r") as f:
|
with open(self.path_routing_json(), "r") as f:
|
||||||
travel_times = json.load(f)
|
travel_times = json.load(f)
|
||||||
for destination_mode in travel_times.keys():
|
for destination_mode in travel_times.keys():
|
||||||
destination_mode_clean = destination_mode.replace(" ",
|
destination_mode_clean = destination_mode.replace(" ", "_").replace(
|
||||||
"_").replace(
|
",", "_"
|
||||||
",", "_")
|
)
|
||||||
destination, travel_mode = self.__from_routing_cache_key(
|
destination, travel_mode = self.__from_routing_cache_key(
|
||||||
destination_mode)
|
destination_mode
|
||||||
|
)
|
||||||
travel_time_fastest[destination_mode_clean] = self.travel_time(
|
travel_time_fastest[destination_mode_clean] = self.travel_time(
|
||||||
destination, travel_mode)[0]['duration']
|
destination, travel_mode
|
||||||
|
)[0]["duration"]
|
||||||
travel_time_second[destination_mode_clean] = self.travel_time(
|
travel_time_second[destination_mode_clean] = self.travel_time(
|
||||||
destination, travel_mode)[1]['duration']
|
destination, travel_mode
|
||||||
|
)[1]["duration"]
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"identifier":
|
"identifier": self.identifier,
|
||||||
self.identifier,
|
"sqm_ocr": await self.sqm_ocr(),
|
||||||
"sqm_ocr":
|
"price": self.price,
|
||||||
await self.sqm_ocr(),
|
"price_per_sqm": await self.price_per_sqm(),
|
||||||
"price":
|
"url": self.url,
|
||||||
self.price,
|
"bedrooms": self.bedrooms,
|
||||||
"price_per_sqm":
|
"travel_time_fastest": ":".join(
|
||||||
await self.price_per_sqm(),
|
sorted(
|
||||||
"url":
|
f"{dest} in {travel_mode//60}min"
|
||||||
self.url,
|
for dest, travel_mode in travel_time_fastest.items()
|
||||||
"bedrooms":
|
)
|
||||||
self.bedrooms,
|
),
|
||||||
"travel_time_fastest":
|
"travel_time_second": ":".join(
|
||||||
":".join(
|
sorted(
|
||||||
sorted(f'{dest} in {travel_mode//60}min'
|
f"{dest} in {travel_mode//60}min"
|
||||||
for dest, travel_mode in travel_time_fastest.items())),
|
for dest, travel_mode in travel_time_second.items()
|
||||||
"travel_time_second":
|
)
|
||||||
":".join(
|
),
|
||||||
sorted(f'{dest} in {travel_mode//60}min'
|
"lease_left": self.leaseLeft,
|
||||||
for dest, travel_mode in travel_time_second.items())),
|
"service_charge": self.serviceCharge,
|
||||||
"lease_left":
|
"development": self.development,
|
||||||
self.leaseLeft,
|
"tenure_type": self.tenure_type,
|
||||||
"service_charge":
|
"updated_days": self.updateDaysAgo,
|
||||||
self.serviceCharge,
|
"status": self.status,
|
||||||
"development":
|
"last_seen": self.last_seen,
|
||||||
self.development,
|
"agency": self.agency,
|
||||||
"tenure_type":
|
"council_tax_band": self.councilTaxBand,
|
||||||
self.tenure_type,
|
"photo_thumbnail": self.photoThumbnail,
|
||||||
"updated_days":
|
|
||||||
self.updateDaysAgo,
|
|
||||||
"status":
|
|
||||||
self.status,
|
|
||||||
"last_seen":
|
|
||||||
self.last_seen,
|
|
||||||
"agency":
|
|
||||||
self.agency,
|
|
||||||
"council_tax_band":
|
|
||||||
self.councilTaxBand,
|
|
||||||
"photo_thumbnail":
|
|
||||||
self.photoThumbnail,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def __routing_cache_key(
|
def __routing_cache_key(
|
||||||
|
|
@ -420,35 +418,38 @@ class Listing:
|
||||||
|
|
||||||
for step in steps:
|
for step in steps:
|
||||||
if not used_transit and step["travelMode"] == "WALK":
|
if not used_transit and step["travelMode"] == "WALK":
|
||||||
initial_walk_duration += int(
|
initial_walk_duration += int(step["staticDuration"].strip("s"))
|
||||||
step["staticDuration"].strip("s"))
|
|
||||||
else:
|
else:
|
||||||
used_transit = True
|
used_transit = True
|
||||||
duration_per_transit[step["travelMode"]] += int(
|
duration_per_transit[step["travelMode"]] += int(
|
||||||
step["staticDuration"].strip("s"))
|
step["staticDuration"].strip("s")
|
||||||
|
)
|
||||||
distance_per_transit[step["travelMode"]] += step.get(
|
distance_per_transit[step["travelMode"]] += step.get(
|
||||||
"distanceMeters", 0)
|
"distanceMeters", 0
|
||||||
|
)
|
||||||
if step["travelMode"] == "TRANSIT":
|
if step["travelMode"] == "TRANSIT":
|
||||||
number_of_transit_stops += 1
|
number_of_transit_stops += 1
|
||||||
|
|
||||||
res.append({
|
res.append(
|
||||||
"duration": duration,
|
{
|
||||||
"distance": distance,
|
"duration": duration,
|
||||||
"duration_static": duration_static,
|
"distance": distance,
|
||||||
"initial_walk_duration": initial_walk_duration,
|
"duration_static": duration_static,
|
||||||
"duration_per_transit": dict(duration_per_transit),
|
"initial_walk_duration": initial_walk_duration,
|
||||||
"distance_per_transit": dict(distance_per_transit),
|
"duration_per_transit": dict(duration_per_transit),
|
||||||
"number_of_transit_stops": number_of_transit_stops,
|
"distance_per_transit": dict(distance_per_transit),
|
||||||
})
|
"number_of_transit_stops": number_of_transit_stops,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
return res[:limit]
|
return res[:limit]
|
||||||
|
|
||||||
def __get_routing_cache(self) -> dict[str, Any]:
|
def __get_routing_cache(self) -> dict[str, Any]:
|
||||||
try:
|
try:
|
||||||
with open(self.path_routing_json(), 'x') as f:
|
with open(self.path_routing_json(), "x") as f:
|
||||||
json.dump({}, f)
|
json.dump({}, f)
|
||||||
return {}
|
return {}
|
||||||
except FileExistsError:
|
except FileExistsError:
|
||||||
pass
|
pass
|
||||||
with open(self.path_routing_json(), 'r') as f:
|
with open(self.path_routing_json(), "r") as f:
|
||||||
return json.load(f)
|
return json.load(f)
|
||||||
|
|
|
||||||
|
|
@ -1,12 +1,10 @@
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
|
|
||||||
def createLogger(name):
|
def createLogger(name):
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
level=logging.INFO,
|
level=logging.INFO,
|
||||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
||||||
handlers=[
|
handlers=[logging.FileHandler("app.log"), logging.StreamHandler()],
|
||||||
logging.FileHandler('app.log'),
|
|
||||||
logging.StreamHandler()
|
|
||||||
]
|
|
||||||
)
|
)
|
||||||
return logging.getLogger(name)
|
return logging.getLogger(name)
|
||||||
|
|
|
||||||
148
crawler/main.py
148
crawler/main.py
|
|
@ -12,18 +12,18 @@ from rec.query import ListingType, FurnishType
|
||||||
from rec.routing import API_KEY_ENVIRONMENT_VARIABLE, TravelMode
|
from rec.routing import API_KEY_ENVIRONMENT_VARIABLE, TravelMode
|
||||||
from ui_exporter import export_immoweb as export_immoweb_ui
|
from ui_exporter import export_immoweb as export_immoweb_ui
|
||||||
|
|
||||||
dump_listings_module = importlib.import_module('1_dump_listings')
|
dump_listings_module = importlib.import_module("1_dump_listings")
|
||||||
dump_detail_module = importlib.import_module('2_dump_detail')
|
dump_detail_module = importlib.import_module("2_dump_detail")
|
||||||
dump_images_module = importlib.import_module('3_dump_images')
|
dump_images_module = importlib.import_module("3_dump_images")
|
||||||
detect_floorplan_module = importlib.import_module('4_detect_floorplan')
|
detect_floorplan_module = importlib.import_module("4_detect_floorplan")
|
||||||
routing_module = importlib.import_module('5_routing')
|
routing_module = importlib.import_module("5_routing")
|
||||||
|
|
||||||
|
|
||||||
@click.group()
|
@click.group()
|
||||||
@click.option(
|
@click.option(
|
||||||
'--data-dir',
|
"--data-dir",
|
||||||
default=pathlib.Path("data/rs/"),
|
default=pathlib.Path("data/rs/"),
|
||||||
help='Districts to scrape',
|
help="Districts to scrape",
|
||||||
type=click.Path(
|
type=click.Path(
|
||||||
writable=True,
|
writable=True,
|
||||||
file_okay=False,
|
file_okay=False,
|
||||||
|
|
@ -34,15 +34,15 @@ routing_module = importlib.import_module('5_routing')
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
def cli(ctx, data_dir: str):
|
def cli(ctx, data_dir: str):
|
||||||
ctx.ensure_object(dict)
|
ctx.ensure_object(dict)
|
||||||
ctx.obj['data_dir'] = data_dir
|
ctx.obj["data_dir"] = data_dir
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
@cli.command()
|
@cli.command()
|
||||||
@click.option(
|
@click.option(
|
||||||
'--type',
|
"--type",
|
||||||
'-t',
|
"-t",
|
||||||
help='Type of listing to scrape',
|
help="Type of listing to scrape",
|
||||||
type=click.Choice(
|
type=click.Choice(
|
||||||
ListingType.__members__.keys(),
|
ListingType.__members__.keys(),
|
||||||
case_sensitive=False,
|
case_sensitive=False,
|
||||||
|
|
@ -50,45 +50,42 @@ def cli(ctx, data_dir: str):
|
||||||
required=True,
|
required=True,
|
||||||
)
|
)
|
||||||
@click.option(
|
@click.option(
|
||||||
'--min-bedrooms',
|
"--min-bedrooms",
|
||||||
default=1,
|
default=1,
|
||||||
help='Minimum number of bedrooms',
|
help="Minimum number of bedrooms",
|
||||||
type=click.IntRange(min=1),
|
type=click.IntRange(min=1),
|
||||||
)
|
)
|
||||||
@click.option(
|
@click.option(
|
||||||
'--max-bedrooms',
|
"--max-bedrooms",
|
||||||
default=5,
|
default=5,
|
||||||
help='Maximum number of bedrooms',
|
help="Maximum number of bedrooms",
|
||||||
type=click.IntRange(min=1),
|
type=click.IntRange(min=1),
|
||||||
)
|
)
|
||||||
@click.option(
|
@click.option(
|
||||||
'--min-price',
|
"--min-price",
|
||||||
default=0,
|
default=0,
|
||||||
help='Minimum price',
|
help="Minimum price",
|
||||||
type=click.IntRange(min=0),
|
type=click.IntRange(min=0),
|
||||||
)
|
)
|
||||||
@click.option(
|
@click.option(
|
||||||
'--max-price',
|
"--max-price",
|
||||||
default=1000000,
|
default=1000000,
|
||||||
help='Maximum price',
|
help="Maximum price",
|
||||||
type=click.IntRange(min=0),
|
type=click.IntRange(min=0),
|
||||||
)
|
)
|
||||||
@click.option(
|
@click.option(
|
||||||
'--district',
|
"--district",
|
||||||
default=None,
|
default=None,
|
||||||
help='Districts to scrape',
|
help="Districts to scrape",
|
||||||
type=click.Choice(get_districts().keys(), case_sensitive=False),
|
type=click.Choice(get_districts().keys(), case_sensitive=False),
|
||||||
multiple=True,
|
multiple=True,
|
||||||
)
|
)
|
||||||
@click.option(
|
@click.option(
|
||||||
'--furnish-types',
|
"--furnish-types",
|
||||||
'-f',
|
"-f",
|
||||||
help='Furnish types for rented listings',
|
help="Furnish types for rented listings",
|
||||||
type=click.Choice(
|
type=click.Choice(
|
||||||
[
|
[furnish_type.name for furnish_type in FurnishType.__members__.values()],
|
||||||
furnish_type.name
|
|
||||||
for furnish_type in FurnishType.__members__.values()
|
|
||||||
],
|
|
||||||
case_sensitive=False,
|
case_sensitive=False,
|
||||||
),
|
),
|
||||||
multiple=True,
|
multiple=True,
|
||||||
|
|
@ -104,7 +101,7 @@ def dump_listings(
|
||||||
type: str,
|
type: str,
|
||||||
furnish_types: list[str],
|
furnish_types: list[str],
|
||||||
):
|
):
|
||||||
data_dir: str = ctx.obj['data_dir']
|
data_dir: str = ctx.obj["data_dir"]
|
||||||
query_parameters = dump_listings_module.QueryParameters(
|
query_parameters = dump_listings_module.QueryParameters(
|
||||||
listing_type=ListingType[type],
|
listing_type=ListingType[type],
|
||||||
district_names=set(district),
|
district_names=set(district),
|
||||||
|
|
@ -112,23 +109,21 @@ def dump_listings(
|
||||||
max_bedrooms=max_bedrooms,
|
max_bedrooms=max_bedrooms,
|
||||||
min_price=min_price,
|
min_price=min_price,
|
||||||
max_price=max_price,
|
max_price=max_price,
|
||||||
furnish_types=[
|
furnish_types=[FurnishType[furnish_type] for furnish_type in furnish_types],
|
||||||
FurnishType[furnish_type] for furnish_type in furnish_types
|
|
||||||
],
|
|
||||||
)
|
)
|
||||||
click.echo(
|
click.echo(
|
||||||
f'Running dump_listings for districts {district}, data dir {data_dir} and parameters: '
|
f"Running dump_listings for districts {district}, data dir {data_dir} and parameters: "
|
||||||
f'{query_parameters}')
|
f"{query_parameters}"
|
||||||
|
)
|
||||||
data_dir_path = pathlib.Path(data_dir)
|
data_dir_path = pathlib.Path(data_dir)
|
||||||
asyncio.run(
|
asyncio.run(dump_listings_module.dump_listings(query_parameters, data_dir_path))
|
||||||
dump_listings_module.dump_listings(query_parameters, data_dir_path))
|
|
||||||
|
|
||||||
|
|
||||||
@cli.command()
|
@cli.command()
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
def dump_details(ctx: click.core.Context):
|
def dump_details(ctx: click.core.Context):
|
||||||
data_dir = ctx.obj['data_dir']
|
data_dir = ctx.obj["data_dir"]
|
||||||
click.echo(f'Running dump_detail for listings stored in {data_dir}')
|
click.echo(f"Running dump_detail for listings stored in {data_dir}")
|
||||||
listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json")))
|
listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json")))
|
||||||
asyncio.run(dump_detail_module.dump_detail(listing_paths))
|
asyncio.run(dump_detail_module.dump_detail(listing_paths))
|
||||||
|
|
||||||
|
|
@ -136,8 +131,8 @@ def dump_details(ctx: click.core.Context):
|
||||||
@cli.command()
|
@cli.command()
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
def dump_images(ctx: click.core.Context):
|
def dump_images(ctx: click.core.Context):
|
||||||
data_dir = ctx.obj['data_dir']
|
data_dir = ctx.obj["data_dir"]
|
||||||
click.echo(f'Running dump_images stored in {data_dir}')
|
click.echo(f"Running dump_images stored in {data_dir}")
|
||||||
listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json")))
|
listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json")))
|
||||||
asyncio.run(dump_images_module.dump_images(listing_paths))
|
asyncio.run(dump_images_module.dump_images(listing_paths))
|
||||||
|
|
||||||
|
|
@ -145,24 +140,24 @@ def dump_images(ctx: click.core.Context):
|
||||||
@cli.command()
|
@cli.command()
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
def detect_floorplan(ctx: click.core.Context):
|
def detect_floorplan(ctx: click.core.Context):
|
||||||
data_dir = ctx.obj['data_dir']
|
data_dir = ctx.obj["data_dir"]
|
||||||
click.echo(f'Running detect_floorplan in {data_dir}')
|
click.echo(f"Running detect_floorplan in {data_dir}")
|
||||||
listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json")))
|
listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json")))
|
||||||
asyncio.run(detect_floorplan_module.detect_floorplan(listing_paths))
|
asyncio.run(detect_floorplan_module.detect_floorplan(listing_paths))
|
||||||
|
|
||||||
|
|
||||||
@cli.command()
|
@cli.command()
|
||||||
@click.option(
|
@click.option(
|
||||||
'--destination-address',
|
"--destination-address",
|
||||||
'-d',
|
"-d",
|
||||||
help='Destination address for routing',
|
help="Destination address for routing",
|
||||||
required=True,
|
required=True,
|
||||||
type=click.STRING,
|
type=click.STRING,
|
||||||
)
|
)
|
||||||
@click.option(
|
@click.option(
|
||||||
'--travel-mode',
|
"--travel-mode",
|
||||||
'-m',
|
"-m",
|
||||||
help='Travel mode for routing',
|
help="Travel mode for routing",
|
||||||
type=click.Choice(
|
type=click.Choice(
|
||||||
TravelMode.__members__.keys(),
|
TravelMode.__members__.keys(),
|
||||||
case_sensitive=False,
|
case_sensitive=False,
|
||||||
|
|
@ -170,23 +165,25 @@ def detect_floorplan(ctx: click.core.Context):
|
||||||
required=True,
|
required=True,
|
||||||
)
|
)
|
||||||
@click.option(
|
@click.option(
|
||||||
'--limit',
|
"--limit",
|
||||||
'-l',
|
"-l",
|
||||||
help='Limit the number of listings to process',
|
help="Limit the number of listings to process",
|
||||||
type=click.IntRange(min=1),
|
type=click.IntRange(min=1),
|
||||||
default=1, # by default limit to 1 to avoid accidental API usage
|
default=1, # by default limit to 1 to avoid accidental API usage
|
||||||
)
|
)
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
def routing(ctx: click.core.Context, destination_address: str,
|
def routing(
|
||||||
travel_mode: str, limit: int):
|
ctx: click.core.Context, destination_address: str, travel_mode: str, limit: int
|
||||||
data_dir = ctx.obj['data_dir']
|
):
|
||||||
click.echo(f'Running routing for the first {limit} listings in {data_dir}')
|
data_dir = ctx.obj["data_dir"]
|
||||||
|
click.echo(f"Running routing for the first {limit} listings in {data_dir}")
|
||||||
listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json")))
|
listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json")))
|
||||||
listing_paths = listing_paths[:limit]
|
listing_paths = listing_paths[:limit]
|
||||||
if os.environ.get(API_KEY_ENVIRONMENT_VARIABLE) is None:
|
if os.environ.get(API_KEY_ENVIRONMENT_VARIABLE) is None:
|
||||||
raise click.exceptions.MissingParameter(
|
raise click.exceptions.MissingParameter(
|
||||||
f'{API_KEY_ENVIRONMENT_VARIABLE} environment variable is not set. '
|
f"{API_KEY_ENVIRONMENT_VARIABLE} environment variable is not set. "
|
||||||
'Please set it to your API key for the routing service.')
|
"Please set it to your API key for the routing service."
|
||||||
|
)
|
||||||
|
|
||||||
asyncio.run(
|
asyncio.run(
|
||||||
routing_module.calculate_route(
|
routing_module.calculate_route(
|
||||||
|
|
@ -194,14 +191,15 @@ def routing(ctx: click.core.Context, destination_address: str,
|
||||||
destination_address,
|
destination_address,
|
||||||
# destination_address_coordinates,
|
# destination_address_coordinates,
|
||||||
TravelMode[travel_mode],
|
TravelMode[travel_mode],
|
||||||
))
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@cli.command()
|
@cli.command()
|
||||||
@click.option(
|
@click.option(
|
||||||
'--columns',
|
"--columns",
|
||||||
'-C',
|
"-C",
|
||||||
help='Columns to include in the CSV file',
|
help="Columns to include in the CSV file",
|
||||||
type=click.Choice(
|
type=click.Choice(
|
||||||
Listing.ALL_COLUMNS,
|
Listing.ALL_COLUMNS,
|
||||||
case_sensitive=False,
|
case_sensitive=False,
|
||||||
|
|
@ -210,9 +208,9 @@ def routing(ctx: click.core.Context, destination_address: str,
|
||||||
default=Listing.ALL_COLUMNS,
|
default=Listing.ALL_COLUMNS,
|
||||||
)
|
)
|
||||||
@click.option(
|
@click.option(
|
||||||
'--output-file',
|
"--output-file",
|
||||||
'-O',
|
"-O",
|
||||||
help='Path to the output CSV file',
|
help="Path to the output CSV file",
|
||||||
required=True,
|
required=True,
|
||||||
type=click.Path(
|
type=click.Path(
|
||||||
writable=True,
|
writable=True,
|
||||||
|
|
@ -223,20 +221,21 @@ def routing(ctx: click.core.Context, destination_address: str,
|
||||||
)
|
)
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
def export_csv(ctx: click.core.Context, output_file: str, columns: tuple[str]):
|
def export_csv(ctx: click.core.Context, output_file: str, columns: tuple[str]):
|
||||||
data_dir = ctx.obj['data_dir']
|
data_dir = ctx.obj["data_dir"]
|
||||||
click.echo(f'Exporting data to {output_file} using {data_dir=}')
|
click.echo(f"Exporting data to {output_file} using {data_dir=}")
|
||||||
output_file_path = pathlib.Path(output_file)
|
output_file_path = pathlib.Path(output_file)
|
||||||
listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json")))
|
listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json")))
|
||||||
listings = Listing.get_all_listings([str(path) for path in listing_paths])
|
listings = Listing.get_all_listings([str(path) for path in listing_paths])
|
||||||
asyncio.run(
|
asyncio.run(
|
||||||
csv_exporter.export_to_csv(listings, output_file_path,
|
csv_exporter.export_to_csv(listings, output_file_path, list(columns)),
|
||||||
list(columns)), )
|
)
|
||||||
|
|
||||||
|
|
||||||
@cli.command()
|
@cli.command()
|
||||||
@click.option(
|
@click.option(
|
||||||
'--output-file',
|
"--output-file",
|
||||||
'-O',
|
"-O",
|
||||||
help='Path to the output immoweb file',
|
help="Path to the output immoweb file",
|
||||||
required=True,
|
required=True,
|
||||||
type=click.Path(
|
type=click.Path(
|
||||||
writable=True,
|
writable=True,
|
||||||
|
|
@ -247,10 +246,9 @@ def export_csv(ctx: click.core.Context, output_file: str, columns: tuple[str]):
|
||||||
)
|
)
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
def export_immoweb(ctx, output_file: str):
|
def export_immoweb(ctx, output_file: str):
|
||||||
click.echo(f'Exporting data to {output_file}')
|
click.echo(f"Exporting data to {output_file}")
|
||||||
asyncio.run(export_immoweb_ui(ctx, output_file))
|
asyncio.run(export_immoweb_ui(ctx, output_file))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
if __name__ == '__main__':
|
|
||||||
cli()
|
cli()
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,7 @@ import numpy as np
|
||||||
|
|
||||||
def inference(image_path):
|
def inference(image_path):
|
||||||
from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration
|
from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration
|
||||||
|
|
||||||
image = Image.open(image_path)
|
image = Image.open(image_path)
|
||||||
question = "How many living rooms are displayed on this floor plan?" # not sure if it even has an effect
|
question = "How many living rooms are displayed on this floor plan?" # not sure if it even has an effect
|
||||||
processor = Pix2StructProcessor.from_pretrained("google/deplot")
|
processor = Pix2StructProcessor.from_pretrained("google/deplot")
|
||||||
|
|
@ -35,15 +36,17 @@ def calculate_model(image_path):
|
||||||
|
|
||||||
|
|
||||||
def improve_img_for_ocr(img: Image):
|
def improve_img_for_ocr(img: Image):
|
||||||
img2 = np.array(img.convert('L'))
|
img2 = np.array(img.convert("L"))
|
||||||
cv2.resize(img2, None, fx=1.2, fy=1.2, interpolation=cv2.INTER_CUBIC)
|
cv2.resize(img2, None, fx=1.2, fy=1.2, interpolation=cv2.INTER_CUBIC)
|
||||||
thresh = cv2.adaptiveThreshold(img2, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
|
thresh = cv2.adaptiveThreshold(
|
||||||
cv2.THRESH_BINARY, 11, 2)
|
img2, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
|
||||||
|
)
|
||||||
return Image.fromarray(thresh)
|
return Image.fromarray(thresh)
|
||||||
|
|
||||||
|
|
||||||
def calculate_ocr(image_path):
|
def calculate_ocr(image_path):
|
||||||
import pytesseract
|
import pytesseract
|
||||||
|
|
||||||
img = Image.open(image_path)
|
img = Image.open(image_path)
|
||||||
text = pytesseract.image_to_string(img)
|
text = pytesseract.image_to_string(img)
|
||||||
estimated_sqm = extract_total_sqm(text)
|
estimated_sqm = extract_total_sqm(text)
|
||||||
|
|
@ -52,9 +55,7 @@ def calculate_ocr(image_path):
|
||||||
text2 = pytesseract.image_to_string(improved_img)
|
text2 = pytesseract.image_to_string(improved_img)
|
||||||
estimated_sqm2 = extract_total_sqm(text2)
|
estimated_sqm2 = extract_total_sqm(text2)
|
||||||
with open("recalculating.log", "a") as f:
|
with open("recalculating.log", "a") as f:
|
||||||
f.write(
|
f.write(f"before: {estimated_sqm} after: {estimated_sqm2} - {image_path}\n")
|
||||||
f"before: {estimated_sqm} after: {estimated_sqm2} - {image_path}\n"
|
|
||||||
)
|
|
||||||
return estimated_sqm2, text2
|
return estimated_sqm2, text2
|
||||||
|
|
||||||
return estimated_sqm, text
|
return estimated_sqm, text
|
||||||
|
|
|
||||||
|
|
@ -30,19 +30,13 @@ def transit_route(
|
||||||
header = {
|
header = {
|
||||||
"X-Goog-Api-Key": api_key,
|
"X-Goog-Api-Key": api_key,
|
||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
"X-Goog-FieldMask": # "routes.*",
|
"X-Goog-FieldMask": "routes.distanceMeters,routes.duration,routes.staticDuration,routes.legs.steps.distanceMeters,routes.legs.steps.staticDuration,routes.legs.steps.travelMode", # "routes.*",
|
||||||
"routes.distanceMeters,routes.duration,routes.staticDuration,routes.legs.steps.distanceMeters,routes.legs.steps.staticDuration,routes.legs.steps.travelMode",
|
|
||||||
}
|
}
|
||||||
|
|
||||||
body = {
|
body = {
|
||||||
"origin": {
|
"origin": {
|
||||||
# "address": origin_address
|
# "address": origin_address
|
||||||
"location": {
|
"location": {"latLng": {"latitude": origin_lat, "longitude": origin_lon}}
|
||||||
"latLng": {
|
|
||||||
"latitude": origin_lat,
|
|
||||||
"longitude": origin_lon
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
"destination": {
|
"destination": {
|
||||||
"address": dest_address
|
"address": dest_address
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue