reformat with black; looks better

This commit is contained in:
Viktor Barzin 2025-05-31 23:50:43 +00:00
parent 1122f5a96f
commit 0b9d50af47
No known key found for this signature in database
GPG key ID: 4056458DBDBF8863
11 changed files with 240 additions and 244 deletions

View file

@ -25,8 +25,8 @@ class QueryParameters:
async def dump_listings( async def dump_listings(
parameters: QueryParameters, parameters: QueryParameters,
data_dir: pathlib.Path = pathlib.Path("data/rs/"), data_dir: pathlib.Path = pathlib.Path("data/rs/"),
) -> list[Listing]: ) -> list[Listing]:
if parameters.district_names: if parameters.district_names:
districts = { districts = {
@ -39,29 +39,32 @@ async def dump_listings(
print("Valid districts to scrape:", districts.keys()) print("Valid districts to scrape:", districts.keys())
listings = [] listings = []
json_responses = await asyncio.gather(*[ json_responses = await asyncio.gather(
listing_query( *[
page=i, listing_query(
channel=parameters.listing_type, page=i,
min_bedrooms=parameters.min_bedrooms, channel=parameters.listing_type,
max_bedrooms=parameters.max_bedrooms, min_bedrooms=parameters.min_bedrooms,
radius=parameters.radius, max_bedrooms=parameters.max_bedrooms,
min_price=parameters.min_price, radius=parameters.radius,
max_price=parameters.max_price, min_price=parameters.min_price,
location_id=locid, max_price=parameters.max_price,
page_size=parameters.page_size, location_id=locid,
max_days_since_added=parameters.max_days_since_added, page_size=parameters.page_size,
furnish_types=parameters.furnish_types or [], max_days_since_added=parameters.max_days_since_added,
) for locid in districts.values() for i in [1, 2] furnish_types=parameters.furnish_types or [],
]) )
for locid in districts.values()
for i in [1, 2]
]
)
listings = [] listings = []
for response_json in json_responses: for response_json in json_responses:
if response_json["totalAvailableResults"] == 0: if response_json["totalAvailableResults"] == 0:
print("No results found") print("No results found")
continue continue
if response_json["totalAvailableResults"] > 0: if response_json["totalAvailableResults"] > 0:
print("totalAvailableResults: ", print("totalAvailableResults: ", response_json["totalAvailableResults"])
response_json["totalAvailableResults"])
for property in response_json["properties"]: for property in response_json["properties"]:
identifier = property["identifier"] identifier = property["identifier"]

View file

@ -12,7 +12,8 @@ semaphore = asyncio.Semaphore(10)
async def dump_detail(listing_paths: list[str]): async def dump_detail(listing_paths: list[str]):
listings = Listing.get_all_listings(listing_paths) listings = Listing.get_all_listings(listing_paths)
filtered_listings = await tqdm.gather( filtered_listings = await tqdm.gather(
*[_dump_detail_for_listing(listing) for listing in listings]) *[_dump_detail_for_listing(listing) for listing in listings]
)
return filtered_listings return filtered_listings

View file

@ -10,8 +10,7 @@ semaphore = asyncio.Semaphore(10)
async def dump_images(listing_paths: list[str]): async def dump_images(listing_paths: list[str]):
listings = Listing.get_all_listings(listing_paths) listings = Listing.get_all_listings(listing_paths)
await tqdm.gather( await tqdm.gather(*[dump_images_for_listing(listing) for listing in listings])
*[dump_images_for_listing(listing) for listing in listings])
async def dump_images_for_listing(listing: Listing): async def dump_images_for_listing(listing: Listing):
@ -30,8 +29,7 @@ async def dump_images_for_listing(listing: Listing):
async with semaphore: async with semaphore:
async with session.get(url) as response: async with session.get(url) as response:
if response.status != 200: if response.status != 200:
raise Exception( raise Exception(f"Error for {url}: {response.status}")
f"Error for {url}: {response.status}")
with open(p, "wb") as f: with open(p, "wb") as f:
f.write(await response.read()) f.write(await response.read())
except Exception as e: except Exception as e:

View file

@ -9,13 +9,13 @@ async def detect_floorplan(listing_paths: list[str]):
cpu_count = multiprocessing.cpu_count() // 4 cpu_count = multiprocessing.cpu_count() // 4
semaphore = asyncio.Semaphore(cpu_count) semaphore = asyncio.Semaphore(cpu_count)
await tqdm.gather(*[ await tqdm.gather(
_detect_floorplan_with_semaphore(listing, semaphore) *[_detect_floorplan_with_semaphore(listing, semaphore) for listing in listings]
for listing in listings )
])
async def _detect_floorplan_with_semaphore(listing: Listing, async def _detect_floorplan_with_semaphore(
semaphore: asyncio.Semaphore): listing: Listing, semaphore: asyncio.Semaphore
):
async with semaphore: async with semaphore:
return await listing.calculate_sqm_ocr(recalculate=False) return await listing.calculate_sqm_ocr(recalculate=False)

View file

@ -14,20 +14,22 @@ async def calculate_route(
# reduce listings to everything within 7 miles # reduce listings to everything within 7 miles
filtered_listings = [] filtered_listings = []
for listing in listings: for listing in listings:
print(f'Processing {listing.identifier}') print(f"Processing {listing.identifier}")
if listing.isRemoved: if listing.isRemoved:
print(f"Removed-Skip: Skipping {listing.identifier} " print(f"Removed-Skip: Skipping {listing.identifier} " "is already removed.")
"is already removed.")
continue continue
sqm_ocr = await listing.sqm_ocr() sqm_ocr = await listing.sqm_ocr()
if (sqm_ocr is None or sqm_ocr < 30 or sqm_ocr > 200): if sqm_ocr is None or sqm_ocr < 30 or sqm_ocr > 200:
print((f"Floorplan-Skip: Skipping {listing.identifier} as " print(
f"sqm_ocr is {sqm_ocr}")) (
f"Floorplan-Skip: Skipping {listing.identifier} as "
f"sqm_ocr is {sqm_ocr}"
)
)
continue continue
filtered_listings.append(listing) filtered_listings.append(listing)
print( print(f"Filtered listings from {len(listings)} to {len(filtered_listings)}")
f"Filtered listings from {len(listings)} to {len(filtered_listings)}")
for listing in tqdm(filtered_listings): for listing in tqdm(filtered_listings):
listing.calculate_route( listing.calculate_route(

View file

@ -12,9 +12,9 @@ async def export_to_csv(
ds = await asyncio.gather(*[listing.dict_nicely() for listing in listings]) ds = await asyncio.gather(*[listing.dict_nicely() for listing in listings])
df = pd.DataFrame(ds) df = pd.DataFrame(ds)
# read decisions on file # read decisions on file
decisions_path = 'data/decisions.json' decisions_path = "data/decisions.json"
decisions = pd.read_json(decisions_path) decisions = pd.read_json(decisions_path)
df.loc[:, 'decision'] = df.identifier.apply(lambda x: decisions.get(x)) df.loc[:, "decision"] = df.identifier.apply(lambda x: decisions.get(x))
# remove all entries where we didnt calculate transit time (probably due to a too far distance) # remove all entries where we didnt calculate transit time (probably due to a too far distance)
# df2 = df[df.travel_time_fastest.notna()] # df2 = df[df.travel_time_fastest.notna()]
@ -26,9 +26,9 @@ async def export_to_csv(
# s1 = df2 # s1 = df2
# fill in gap values for service charge and lease left. This is for excel so we can use filters better there # fill in gap values for service charge and lease left. This is for excel so we can use filters better there
df2.loc[:, 'service_charge'] = df2.service_charge.fillna(-1) df2.loc[:, "service_charge"] = df2.service_charge.fillna(-1)
df2.loc[:, 'lease_left'] = df2.lease_left.fillna(-1) df2.loc[:, "lease_left"] = df2.lease_left.fillna(-1)
df2.loc[:, 'sqm_ocr'] = df2.sqm_ocr.fillna(-1) df2.loc[:, "sqm_ocr"] = df2.sqm_ocr.fillna(-1)
df3 = df2 df3 = df2
# df3 = pd.concat([df2.drop(['travel_time_fastest', 'travel_time_second'], axis=1), s1], axis=1) # df3 = pd.concat([df2.drop(['travel_time_fastest', 'travel_time_second'], axis=1), s1], axis=1)
@ -37,5 +37,5 @@ async def export_to_csv(
df4 = df3 df4 = df3
df5 = df4[columns] df5 = df4[columns]
df6 = df5.sort_values(by=['price_per_sqm'], ascending=True) df6 = df5.sort_values(by=["price_per_sqm"], ascending=True)
df6.to_csv(str(output_file), index=False) df6.to_csv(str(output_file), index=False)

View file

@ -46,11 +46,13 @@ class Listing:
# data_dir is the first directory before the listing_path # data_dir is the first directory before the listing_path
data_dir = pathlib.Path(listing_path) data_dir = pathlib.Path(listing_path)
while str(d['identifier']) in str(data_dir.resolve().absolute()): while str(d["identifier"]) in str(data_dir.resolve().absolute()):
data_dir = data_dir.parent data_dir = data_dir.parent
listing = Listing(d["identifier"], data_dir=data_dir) listing = Listing(d["identifier"], data_dir=data_dir)
if (listing.last_seen is not None if (
and listing.last_seen < seen_in_the_last_n_days): listing.last_seen is not None
and listing.last_seen < seen_in_the_last_n_days
):
identifiers.append(listing) identifiers.append(listing)
return identifiers return identifiers
@ -107,16 +109,19 @@ class Listing:
def calculate_sqm_model(self): def calculate_sqm_model(self):
objs = [] objs = []
for floorplan_path in self.list_floorplans(): for floorplan_path in self.list_floorplans():
estimated_sqm, model_output, predictions = ( estimated_sqm, model_output, predictions = floorplan.calculate_model(
floorplan.calculate_model(floorplan_path)) floorplan_path
objs.append({ )
"floorplan_path": str(floorplan_path), objs.append(
"estimated_sqm": estimated_sqm, {
"model_output": model_output, "floorplan_path": str(floorplan_path),
"no_predictions": len( "estimated_sqm": estimated_sqm,
predictions "model_output": model_output,
), # cant serialize the predictions itself since its a tensor "no_predictions": len(
}) predictions
), # cant serialize the predictions itself since its a tensor
}
)
with open(self.path_floorplan_model_json(), "w") as f: with open(self.path_floorplan_model_json(), "w") as f:
json.dump(objs, f) json.dump(objs, f)
@ -129,8 +134,9 @@ class Listing:
with open(self.path_floorplan_json()) as f: with open(self.path_floorplan_json()) as f:
objs = json.load(f) objs = json.load(f)
max_sqm = max([o["estimated_sqm"] for o in objs max_sqm = max(
if o is None]) # filter out Nones [o["estimated_sqm"] for o in objs if o is None]
) # filter out Nones
return max_sqm return max_sqm
async def calculate_sqm_ocr(self, recalculate=True): async def calculate_sqm_ocr(self, recalculate=True):
@ -143,12 +149,15 @@ class Listing:
for floorplan_path in self.list_floorplans(): for floorplan_path in self.list_floorplans():
estimated_sqm, model_output = await asyncio.to_thread( estimated_sqm, model_output = await asyncio.to_thread(
floorplan.calculate_ocr, floorplan_path) floorplan.calculate_ocr, floorplan_path
objs.append({ )
"floorplan_path": str(floorplan_path), objs.append(
"estimated_sqm": estimated_sqm, {
"text": model_output, "floorplan_path": str(floorplan_path),
}) "estimated_sqm": estimated_sqm,
"text": model_output,
}
)
with open(self.path_floorplan_ocr_json(), "w") as f: with open(self.path_floorplan_ocr_json(), "w") as f:
json.dump(objs, f) json.dump(objs, f)
@ -160,22 +169,20 @@ class Listing:
with open(self.path_floorplan_ocr_json()) as f: with open(self.path_floorplan_ocr_json()) as f:
objs = json.load(f) objs = json.load(f)
sqms = [ sqms = [o["estimated_sqm"] for o in objs if o["estimated_sqm"] is not None]
o["estimated_sqm"] for o in objs if o["estimated_sqm"] is not None
]
if len(sqms) == 0: if len(sqms) == 0:
return None return None
max_sqm = max(sqms) max_sqm = max(sqms)
return max_sqm return max_sqm
def calculate_route(self, def calculate_route(
dest_address: str, self, dest_address: str, travel_mode: routing.TravelMode, recalculate=False
travel_mode: routing.TravelMode, ) -> dict[str, Any]:
recalculate=False) -> dict[str, Any]:
routing_cache = self.__get_routing_cache() routing_cache = self.__get_routing_cache()
cache_key = self.__routing_cache_key(dest_address, travel_mode) cache_key = self.__routing_cache_key(dest_address, travel_mode)
if (route_cache := if (
routing_cache.get(cache_key)) is not None and not recalculate: route_cache := routing_cache.get(cache_key)
) is not None and not recalculate:
return {cache_key: route_cache} return {cache_key: route_cache}
result = routing.transit_route( result = routing.transit_route(
@ -185,8 +192,12 @@ class Listing:
travel_mode, travel_mode,
) )
if not result: if not result:
raise Exception((f"Error calculating route from {self.identifier} " raise Exception(
f"to '{dest_address}' by {travel_mode}")) (
f"Error calculating route from {self.identifier} "
f"to '{dest_address}' by {travel_mode}"
)
)
result = {**{cache_key: result}, **routing_cache} result = {**{cache_key: result}, **routing_cache}
with open(self.path_routing_json(), "w") as f: with open(self.path_routing_json(), "w") as f:
json.dump(result, f) json.dump(result, f)
@ -198,8 +209,7 @@ class Listing:
travel_mode: routing.TravelMode, travel_mode: routing.TravelMode,
) -> list[dict[str, Any]]: ) -> list[dict[str, Any]]:
data = self.calculate_route(destination_address, travel_mode) data = self.calculate_route(destination_address, travel_mode)
return self.__extract_travel_times(data, destination_address, return self.__extract_travel_times(data, destination_address, travel_mode)
travel_mode)
@property @property
def url(self): def url(self):
@ -246,8 +256,7 @@ class Listing:
@property @property
def leaseLeft(self) -> float | None: def leaseLeft(self) -> float | None:
ds = self.detailobject["property"].get("tenureInfo", ds = self.detailobject["property"].get("tenureInfo", {}).get("content", [])
{}).get("content", [])
for d in ds: for d in ds:
if d["type"] == "lengthOfLease": if d["type"] == "lengthOfLease":
matches = re.findall(r"(\d+\.?\d*)", d["value"]) matches = re.findall(r"(\d+\.?\d*)", d["value"])
@ -267,15 +276,14 @@ class Listing:
if not self.path_last_seen_listing().exists(): if not self.path_last_seen_listing().exists():
return None return None
with open(self.path_last_seen_listing(), 'r') as f: with open(self.path_last_seen_listing(), "r") as f:
datetime_str = json.load(f) datetime_str = json.load(f)
dt = datetime.datetime.fromisoformat(datetime_str) dt = datetime.datetime.fromisoformat(datetime_str)
return (datetime.datetime.now() - dt).days return (datetime.datetime.now() - dt).days
@property @property
def serviceCharge(self) -> float | None: def serviceCharge(self) -> float | None:
ds = self.detailobject["property"].get("tenureInfo", ds = self.detailobject["property"].get("tenureInfo", {}).get("content", [])
{}).get("content", [])
for d in ds: for d in ds:
if d["type"] == "annualServiceCharge": if d["type"] == "annualServiceCharge":
matches = re.findall(r"([\d,.]+)", d["value"]) matches = re.findall(r"([\d,.]+)", d["value"])
@ -300,25 +308,24 @@ class Listing:
@property @property
def status(self) -> str: def status(self) -> str:
if self.isRemoved: if self.isRemoved:
return 'removed' return "removed"
status = self.detailobject["property"]["status"] status = self.detailobject["property"]["status"]
return status return status
@property @property
def agency(self) -> str: def agency(self) -> str:
return self.detailobject['property']["branch"]["brandName"] return self.detailobject["property"]["branch"]["brandName"]
@property @property
def councilTaxBand(self) -> str: def councilTaxBand(self) -> str:
return self.detailobject['property']["councilTaxInfo"]["content"][0][ return self.detailobject["property"]["councilTaxInfo"]["content"][0]["value"]
"value"]
@property @property
def photoThumbnail(self) -> str | None: def photoThumbnail(self) -> str | None:
# options are: 'url', 'thumbnailUrl', 'maxSizeUrl' # options are: 'url', 'thumbnailUrl', 'maxSizeUrl'
photos = self.detailobject['property']['photos'] photos = self.detailobject["property"]["photos"]
if len(photos) > 0: if len(photos) > 0:
return photos[0]['url'] return photos[0]["url"]
return None return None
async def dict_nicely(self): async def dict_nicely(self):
@ -328,57 +335,48 @@ class Listing:
with open(self.path_routing_json(), "r") as f: with open(self.path_routing_json(), "r") as f:
travel_times = json.load(f) travel_times = json.load(f)
for destination_mode in travel_times.keys(): for destination_mode in travel_times.keys():
destination_mode_clean = destination_mode.replace(" ", destination_mode_clean = destination_mode.replace(" ", "_").replace(
"_").replace( ",", "_"
",", "_") )
destination, travel_mode = self.__from_routing_cache_key( destination, travel_mode = self.__from_routing_cache_key(
destination_mode) destination_mode
)
travel_time_fastest[destination_mode_clean] = self.travel_time( travel_time_fastest[destination_mode_clean] = self.travel_time(
destination, travel_mode)[0]['duration'] destination, travel_mode
)[0]["duration"]
travel_time_second[destination_mode_clean] = self.travel_time( travel_time_second[destination_mode_clean] = self.travel_time(
destination, travel_mode)[1]['duration'] destination, travel_mode
)[1]["duration"]
return { return {
"identifier": "identifier": self.identifier,
self.identifier, "sqm_ocr": await self.sqm_ocr(),
"sqm_ocr": "price": self.price,
await self.sqm_ocr(), "price_per_sqm": await self.price_per_sqm(),
"price": "url": self.url,
self.price, "bedrooms": self.bedrooms,
"price_per_sqm": "travel_time_fastest": ":".join(
await self.price_per_sqm(), sorted(
"url": f"{dest} in {travel_mode//60}min"
self.url, for dest, travel_mode in travel_time_fastest.items()
"bedrooms": )
self.bedrooms, ),
"travel_time_fastest": "travel_time_second": ":".join(
":".join( sorted(
sorted(f'{dest} in {travel_mode//60}min' f"{dest} in {travel_mode//60}min"
for dest, travel_mode in travel_time_fastest.items())), for dest, travel_mode in travel_time_second.items()
"travel_time_second": )
":".join( ),
sorted(f'{dest} in {travel_mode//60}min' "lease_left": self.leaseLeft,
for dest, travel_mode in travel_time_second.items())), "service_charge": self.serviceCharge,
"lease_left": "development": self.development,
self.leaseLeft, "tenure_type": self.tenure_type,
"service_charge": "updated_days": self.updateDaysAgo,
self.serviceCharge, "status": self.status,
"development": "last_seen": self.last_seen,
self.development, "agency": self.agency,
"tenure_type": "council_tax_band": self.councilTaxBand,
self.tenure_type, "photo_thumbnail": self.photoThumbnail,
"updated_days":
self.updateDaysAgo,
"status":
self.status,
"last_seen":
self.last_seen,
"agency":
self.agency,
"council_tax_band":
self.councilTaxBand,
"photo_thumbnail":
self.photoThumbnail,
} }
def __routing_cache_key( def __routing_cache_key(
@ -420,35 +418,38 @@ class Listing:
for step in steps: for step in steps:
if not used_transit and step["travelMode"] == "WALK": if not used_transit and step["travelMode"] == "WALK":
initial_walk_duration += int( initial_walk_duration += int(step["staticDuration"].strip("s"))
step["staticDuration"].strip("s"))
else: else:
used_transit = True used_transit = True
duration_per_transit[step["travelMode"]] += int( duration_per_transit[step["travelMode"]] += int(
step["staticDuration"].strip("s")) step["staticDuration"].strip("s")
)
distance_per_transit[step["travelMode"]] += step.get( distance_per_transit[step["travelMode"]] += step.get(
"distanceMeters", 0) "distanceMeters", 0
)
if step["travelMode"] == "TRANSIT": if step["travelMode"] == "TRANSIT":
number_of_transit_stops += 1 number_of_transit_stops += 1
res.append({ res.append(
"duration": duration, {
"distance": distance, "duration": duration,
"duration_static": duration_static, "distance": distance,
"initial_walk_duration": initial_walk_duration, "duration_static": duration_static,
"duration_per_transit": dict(duration_per_transit), "initial_walk_duration": initial_walk_duration,
"distance_per_transit": dict(distance_per_transit), "duration_per_transit": dict(duration_per_transit),
"number_of_transit_stops": number_of_transit_stops, "distance_per_transit": dict(distance_per_transit),
}) "number_of_transit_stops": number_of_transit_stops,
}
)
return res[:limit] return res[:limit]
def __get_routing_cache(self) -> dict[str, Any]: def __get_routing_cache(self) -> dict[str, Any]:
try: try:
with open(self.path_routing_json(), 'x') as f: with open(self.path_routing_json(), "x") as f:
json.dump({}, f) json.dump({}, f)
return {} return {}
except FileExistsError: except FileExistsError:
pass pass
with open(self.path_routing_json(), 'r') as f: with open(self.path_routing_json(), "r") as f:
return json.load(f) return json.load(f)

View file

@ -1,12 +1,10 @@
import logging import logging
def createLogger(name): def createLogger(name):
logging.basicConfig( logging.basicConfig(
level=logging.INFO, level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
handlers=[ handlers=[logging.FileHandler("app.log"), logging.StreamHandler()],
logging.FileHandler('app.log'),
logging.StreamHandler()
]
) )
return logging.getLogger(name) return logging.getLogger(name)

View file

@ -12,18 +12,18 @@ from rec.query import ListingType, FurnishType
from rec.routing import API_KEY_ENVIRONMENT_VARIABLE, TravelMode from rec.routing import API_KEY_ENVIRONMENT_VARIABLE, TravelMode
from ui_exporter import export_immoweb as export_immoweb_ui from ui_exporter import export_immoweb as export_immoweb_ui
dump_listings_module = importlib.import_module('1_dump_listings') dump_listings_module = importlib.import_module("1_dump_listings")
dump_detail_module = importlib.import_module('2_dump_detail') dump_detail_module = importlib.import_module("2_dump_detail")
dump_images_module = importlib.import_module('3_dump_images') dump_images_module = importlib.import_module("3_dump_images")
detect_floorplan_module = importlib.import_module('4_detect_floorplan') detect_floorplan_module = importlib.import_module("4_detect_floorplan")
routing_module = importlib.import_module('5_routing') routing_module = importlib.import_module("5_routing")
@click.group() @click.group()
@click.option( @click.option(
'--data-dir', "--data-dir",
default=pathlib.Path("data/rs/"), default=pathlib.Path("data/rs/"),
help='Districts to scrape', help="Districts to scrape",
type=click.Path( type=click.Path(
writable=True, writable=True,
file_okay=False, file_okay=False,
@ -34,15 +34,15 @@ routing_module = importlib.import_module('5_routing')
@click.pass_context @click.pass_context
def cli(ctx, data_dir: str): def cli(ctx, data_dir: str):
ctx.ensure_object(dict) ctx.ensure_object(dict)
ctx.obj['data_dir'] = data_dir ctx.obj["data_dir"] = data_dir
pass pass
@cli.command() @cli.command()
@click.option( @click.option(
'--type', "--type",
'-t', "-t",
help='Type of listing to scrape', help="Type of listing to scrape",
type=click.Choice( type=click.Choice(
ListingType.__members__.keys(), ListingType.__members__.keys(),
case_sensitive=False, case_sensitive=False,
@ -50,45 +50,42 @@ def cli(ctx, data_dir: str):
required=True, required=True,
) )
@click.option( @click.option(
'--min-bedrooms', "--min-bedrooms",
default=1, default=1,
help='Minimum number of bedrooms', help="Minimum number of bedrooms",
type=click.IntRange(min=1), type=click.IntRange(min=1),
) )
@click.option( @click.option(
'--max-bedrooms', "--max-bedrooms",
default=5, default=5,
help='Maximum number of bedrooms', help="Maximum number of bedrooms",
type=click.IntRange(min=1), type=click.IntRange(min=1),
) )
@click.option( @click.option(
'--min-price', "--min-price",
default=0, default=0,
help='Minimum price', help="Minimum price",
type=click.IntRange(min=0), type=click.IntRange(min=0),
) )
@click.option( @click.option(
'--max-price', "--max-price",
default=1000000, default=1000000,
help='Maximum price', help="Maximum price",
type=click.IntRange(min=0), type=click.IntRange(min=0),
) )
@click.option( @click.option(
'--district', "--district",
default=None, default=None,
help='Districts to scrape', help="Districts to scrape",
type=click.Choice(get_districts().keys(), case_sensitive=False), type=click.Choice(get_districts().keys(), case_sensitive=False),
multiple=True, multiple=True,
) )
@click.option( @click.option(
'--furnish-types', "--furnish-types",
'-f', "-f",
help='Furnish types for rented listings', help="Furnish types for rented listings",
type=click.Choice( type=click.Choice(
[ [furnish_type.name for furnish_type in FurnishType.__members__.values()],
furnish_type.name
for furnish_type in FurnishType.__members__.values()
],
case_sensitive=False, case_sensitive=False,
), ),
multiple=True, multiple=True,
@ -104,7 +101,7 @@ def dump_listings(
type: str, type: str,
furnish_types: list[str], furnish_types: list[str],
): ):
data_dir: str = ctx.obj['data_dir'] data_dir: str = ctx.obj["data_dir"]
query_parameters = dump_listings_module.QueryParameters( query_parameters = dump_listings_module.QueryParameters(
listing_type=ListingType[type], listing_type=ListingType[type],
district_names=set(district), district_names=set(district),
@ -112,23 +109,21 @@ def dump_listings(
max_bedrooms=max_bedrooms, max_bedrooms=max_bedrooms,
min_price=min_price, min_price=min_price,
max_price=max_price, max_price=max_price,
furnish_types=[ furnish_types=[FurnishType[furnish_type] for furnish_type in furnish_types],
FurnishType[furnish_type] for furnish_type in furnish_types
],
) )
click.echo( click.echo(
f'Running dump_listings for districts {district}, data dir {data_dir} and parameters: ' f"Running dump_listings for districts {district}, data dir {data_dir} and parameters: "
f'{query_parameters}') f"{query_parameters}"
)
data_dir_path = pathlib.Path(data_dir) data_dir_path = pathlib.Path(data_dir)
asyncio.run( asyncio.run(dump_listings_module.dump_listings(query_parameters, data_dir_path))
dump_listings_module.dump_listings(query_parameters, data_dir_path))
@cli.command() @cli.command()
@click.pass_context @click.pass_context
def dump_details(ctx: click.core.Context): def dump_details(ctx: click.core.Context):
data_dir = ctx.obj['data_dir'] data_dir = ctx.obj["data_dir"]
click.echo(f'Running dump_detail for listings stored in {data_dir}') click.echo(f"Running dump_detail for listings stored in {data_dir}")
listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json"))) listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json")))
asyncio.run(dump_detail_module.dump_detail(listing_paths)) asyncio.run(dump_detail_module.dump_detail(listing_paths))
@ -136,8 +131,8 @@ def dump_details(ctx: click.core.Context):
@cli.command() @cli.command()
@click.pass_context @click.pass_context
def dump_images(ctx: click.core.Context): def dump_images(ctx: click.core.Context):
data_dir = ctx.obj['data_dir'] data_dir = ctx.obj["data_dir"]
click.echo(f'Running dump_images stored in {data_dir}') click.echo(f"Running dump_images stored in {data_dir}")
listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json"))) listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json")))
asyncio.run(dump_images_module.dump_images(listing_paths)) asyncio.run(dump_images_module.dump_images(listing_paths))
@ -145,24 +140,24 @@ def dump_images(ctx: click.core.Context):
@cli.command() @cli.command()
@click.pass_context @click.pass_context
def detect_floorplan(ctx: click.core.Context): def detect_floorplan(ctx: click.core.Context):
data_dir = ctx.obj['data_dir'] data_dir = ctx.obj["data_dir"]
click.echo(f'Running detect_floorplan in {data_dir}') click.echo(f"Running detect_floorplan in {data_dir}")
listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json"))) listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json")))
asyncio.run(detect_floorplan_module.detect_floorplan(listing_paths)) asyncio.run(detect_floorplan_module.detect_floorplan(listing_paths))
@cli.command() @cli.command()
@click.option( @click.option(
'--destination-address', "--destination-address",
'-d', "-d",
help='Destination address for routing', help="Destination address for routing",
required=True, required=True,
type=click.STRING, type=click.STRING,
) )
@click.option( @click.option(
'--travel-mode', "--travel-mode",
'-m', "-m",
help='Travel mode for routing', help="Travel mode for routing",
type=click.Choice( type=click.Choice(
TravelMode.__members__.keys(), TravelMode.__members__.keys(),
case_sensitive=False, case_sensitive=False,
@ -170,23 +165,25 @@ def detect_floorplan(ctx: click.core.Context):
required=True, required=True,
) )
@click.option( @click.option(
'--limit', "--limit",
'-l', "-l",
help='Limit the number of listings to process', help="Limit the number of listings to process",
type=click.IntRange(min=1), type=click.IntRange(min=1),
default=1, # by default limit to 1 to avoid accidental API usage default=1, # by default limit to 1 to avoid accidental API usage
) )
@click.pass_context @click.pass_context
def routing(ctx: click.core.Context, destination_address: str, def routing(
travel_mode: str, limit: int): ctx: click.core.Context, destination_address: str, travel_mode: str, limit: int
data_dir = ctx.obj['data_dir'] ):
click.echo(f'Running routing for the first {limit} listings in {data_dir}') data_dir = ctx.obj["data_dir"]
click.echo(f"Running routing for the first {limit} listings in {data_dir}")
listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json"))) listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json")))
listing_paths = listing_paths[:limit] listing_paths = listing_paths[:limit]
if os.environ.get(API_KEY_ENVIRONMENT_VARIABLE) is None: if os.environ.get(API_KEY_ENVIRONMENT_VARIABLE) is None:
raise click.exceptions.MissingParameter( raise click.exceptions.MissingParameter(
f'{API_KEY_ENVIRONMENT_VARIABLE} environment variable is not set. ' f"{API_KEY_ENVIRONMENT_VARIABLE} environment variable is not set. "
'Please set it to your API key for the routing service.') "Please set it to your API key for the routing service."
)
asyncio.run( asyncio.run(
routing_module.calculate_route( routing_module.calculate_route(
@ -194,14 +191,15 @@ def routing(ctx: click.core.Context, destination_address: str,
destination_address, destination_address,
# destination_address_coordinates, # destination_address_coordinates,
TravelMode[travel_mode], TravelMode[travel_mode],
)) )
)
@cli.command() @cli.command()
@click.option( @click.option(
'--columns', "--columns",
'-C', "-C",
help='Columns to include in the CSV file', help="Columns to include in the CSV file",
type=click.Choice( type=click.Choice(
Listing.ALL_COLUMNS, Listing.ALL_COLUMNS,
case_sensitive=False, case_sensitive=False,
@ -210,9 +208,9 @@ def routing(ctx: click.core.Context, destination_address: str,
default=Listing.ALL_COLUMNS, default=Listing.ALL_COLUMNS,
) )
@click.option( @click.option(
'--output-file', "--output-file",
'-O', "-O",
help='Path to the output CSV file', help="Path to the output CSV file",
required=True, required=True,
type=click.Path( type=click.Path(
writable=True, writable=True,
@ -223,20 +221,21 @@ def routing(ctx: click.core.Context, destination_address: str,
) )
@click.pass_context @click.pass_context
def export_csv(ctx: click.core.Context, output_file: str, columns: tuple[str]): def export_csv(ctx: click.core.Context, output_file: str, columns: tuple[str]):
data_dir = ctx.obj['data_dir'] data_dir = ctx.obj["data_dir"]
click.echo(f'Exporting data to {output_file} using {data_dir=}') click.echo(f"Exporting data to {output_file} using {data_dir=}")
output_file_path = pathlib.Path(output_file) output_file_path = pathlib.Path(output_file)
listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json"))) listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json")))
listings = Listing.get_all_listings([str(path) for path in listing_paths]) listings = Listing.get_all_listings([str(path) for path in listing_paths])
asyncio.run( asyncio.run(
csv_exporter.export_to_csv(listings, output_file_path, csv_exporter.export_to_csv(listings, output_file_path, list(columns)),
list(columns)), ) )
@cli.command() @cli.command()
@click.option( @click.option(
'--output-file', "--output-file",
'-O', "-O",
help='Path to the output immoweb file', help="Path to the output immoweb file",
required=True, required=True,
type=click.Path( type=click.Path(
writable=True, writable=True,
@ -247,10 +246,9 @@ def export_csv(ctx: click.core.Context, output_file: str, columns: tuple[str]):
) )
@click.pass_context @click.pass_context
def export_immoweb(ctx, output_file: str): def export_immoweb(ctx, output_file: str):
click.echo(f'Exporting data to {output_file}') click.echo(f"Exporting data to {output_file}")
asyncio.run(export_immoweb_ui(ctx, output_file)) asyncio.run(export_immoweb_ui(ctx, output_file))
if __name__ == "__main__":
if __name__ == '__main__':
cli() cli()

View file

@ -6,6 +6,7 @@ import numpy as np
def inference(image_path): def inference(image_path):
from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration
image = Image.open(image_path) image = Image.open(image_path)
question = "How many living rooms are displayed on this floor plan?" # not sure if it even has an effect question = "How many living rooms are displayed on this floor plan?" # not sure if it even has an effect
processor = Pix2StructProcessor.from_pretrained("google/deplot") processor = Pix2StructProcessor.from_pretrained("google/deplot")
@ -35,15 +36,17 @@ def calculate_model(image_path):
def improve_img_for_ocr(img: Image): def improve_img_for_ocr(img: Image):
img2 = np.array(img.convert('L')) img2 = np.array(img.convert("L"))
cv2.resize(img2, None, fx=1.2, fy=1.2, interpolation=cv2.INTER_CUBIC) cv2.resize(img2, None, fx=1.2, fy=1.2, interpolation=cv2.INTER_CUBIC)
thresh = cv2.adaptiveThreshold(img2, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, thresh = cv2.adaptiveThreshold(
cv2.THRESH_BINARY, 11, 2) img2, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
)
return Image.fromarray(thresh) return Image.fromarray(thresh)
def calculate_ocr(image_path): def calculate_ocr(image_path):
import pytesseract import pytesseract
img = Image.open(image_path) img = Image.open(image_path)
text = pytesseract.image_to_string(img) text = pytesseract.image_to_string(img)
estimated_sqm = extract_total_sqm(text) estimated_sqm = extract_total_sqm(text)
@ -52,9 +55,7 @@ def calculate_ocr(image_path):
text2 = pytesseract.image_to_string(improved_img) text2 = pytesseract.image_to_string(improved_img)
estimated_sqm2 = extract_total_sqm(text2) estimated_sqm2 = extract_total_sqm(text2)
with open("recalculating.log", "a") as f: with open("recalculating.log", "a") as f:
f.write( f.write(f"before: {estimated_sqm} after: {estimated_sqm2} - {image_path}\n")
f"before: {estimated_sqm} after: {estimated_sqm2} - {image_path}\n"
)
return estimated_sqm2, text2 return estimated_sqm2, text2
return estimated_sqm, text return estimated_sqm, text

View file

@ -30,19 +30,13 @@ def transit_route(
header = { header = {
"X-Goog-Api-Key": api_key, "X-Goog-Api-Key": api_key,
"Content-Type": "application/json", "Content-Type": "application/json",
"X-Goog-FieldMask": # "routes.*", "X-Goog-FieldMask": "routes.distanceMeters,routes.duration,routes.staticDuration,routes.legs.steps.distanceMeters,routes.legs.steps.staticDuration,routes.legs.steps.travelMode", # "routes.*",
"routes.distanceMeters,routes.duration,routes.staticDuration,routes.legs.steps.distanceMeters,routes.legs.steps.staticDuration,routes.legs.steps.travelMode",
} }
body = { body = {
"origin": { "origin": {
# "address": origin_address # "address": origin_address
"location": { "location": {"latLng": {"latitude": origin_lat, "longitude": origin_lon}}
"latLng": {
"latitude": origin_lat,
"longitude": origin_lon
}
}
}, },
"destination": { "destination": {
"address": dest_address "address": dest_address