2025-05-17 20:13:28 +00:00
|
|
|
from pathlib import Path
|
|
|
|
|
import pandas as pd
|
2025-06-08 17:01:33 +00:00
|
|
|
from rec.query import QueryParameters
|
|
|
|
|
from repositories.listing_repository import ListingRepository
|
2025-05-17 20:13:28 +00:00
|
|
|
|
|
|
|
|
|
2025-05-20 21:58:08 +00:00
|
|
|
async def export_to_csv(
    repository: ListingRepository,
    output_file: Path,
    query_parameters: QueryParameters | None = None,
    decisions_path: str | Path = "data/decisions.json",
) -> None:
    """Export listings to a CSV file for manual review (e.g. in Excel).

    Fetches listings from *repository*, annotates each row with any decision
    recorded in *decisions_path*, fills gaps in filter-relevant columns with a
    ``-1`` sentinel (so spreadsheet filters keep the rows), adds a
    price-per-square-meter column, and writes the result sorted cheapest-first
    by price per sqm.

    Args:
        repository: Source of listings.
        output_file: Destination CSV path.
        query_parameters: Optional query filters forwarded to the repository.
        decisions_path: JSON file holding previously made decisions keyed by
            listing id. Defaults to ``"data/decisions.json"``.

    Raises:
        FileNotFoundError / ValueError: propagated from ``pandas.read_json``
            when *decisions_path* is missing or not valid JSON.
    """
    listings = await repository.get_listings(query_parameters=query_parameters)
    df = pd.DataFrame([listing.__dict__ for listing in listings])

    if df.empty:
        # Nothing to export — still produce the (empty) file so downstream
        # consumers find the output where they expect it.
        df.to_csv(output_file, index=False)
        return

    # Attach the previously recorded decision (if any) for each listing.
    # NOTE(review): ``pd.read_json`` yields a DataFrame, so ``decisions.get(x)``
    # looks up a *column* named after the listing id — confirm the file layout
    # actually maps id -> decision.
    decisions = pd.read_json(decisions_path)
    df.loc[:, "decision"] = df["id"].apply(lambda listing_id: decisions.get(listing_id))

    # Drop SQLAlchemy bookkeeping and bulky free-form data. ``errors="ignore"``
    # keeps the export working when a column happens to be absent.
    df = df.drop(columns=["_sa_instance_state", "additional_info"], errors="ignore")

    # Fill gaps with a -1 sentinel so Excel filters treat missing values
    # uniformly instead of hiding those rows.
    for column in ("service_charge", "lease_left", "square_meters"):
        if column not in df.columns:
            df.loc[:, column] = -1
        df.loc[:, column] = df[column].fillna(-1)

    # Derived convenience column. Rows carrying the -1 sentinel in
    # square_meters get a negative ratio and therefore sort to the top.
    df.loc[:, "price_per_sqm"] = df.price / df.square_meters

    df = df.sort_values(by=["price_per_sqm"], ascending=True)
    df.to_csv(output_file, index=False)
|