from pathlib import Path

import pandas as pd

from rec.query import QueryParameters
from repositories.listing_repository import ListingRepository

# Location of the decisions file, relative to the current working directory.
_DECISIONS_PATH = "data/decisions.json"

# Columns removed before export.
_DROPPED_COLUMNS = ["_sa_instance_state", "additional_info"]

# Columns whose gaps are replaced by the sentinel so spreadsheet filters
# can select "unknown" rows with a plain numeric comparison.
_GAP_FILLED_COLUMNS = ("service_charge", "lease_left", "square_meters")
_MISSING_VALUE = -1


async def export_to_csv(
    repository: ListingRepository,
    output_file: Path,
    query_parameters: QueryParameters | None = None,
) -> None:
    """Export the listings returned by *repository* to a CSV file.

    Each listing's ``__dict__`` becomes one row. A ``decision`` column is
    added from ``data/decisions.json``, the columns in ``_DROPPED_COLUMNS``
    are removed, gaps in ``service_charge``, ``lease_left`` and
    ``square_meters`` are filled with ``-1``, and a ``price_per_sqm`` column
    is appended. Rows are written sorted by ``price_per_sqm`` ascending,
    without the index.

    Args:
        repository: Source of the listings; ``get_listings`` is awaited on it.
        output_file: Destination path of the CSV file.
        query_parameters: Passed through to ``repository.get_listings``.

    Raises:
        Whatever ``pd.read_json`` raises when the decisions file is missing
        or malformed (only attempted when there is at least one listing).
    """
    listings = await repository.get_listings(query_parameters=query_parameters)
    records = [listing.__dict__ for listing in listings]

    if not records:
        # An empty frame has no "id"/"price" columns, so the steps below
        # would fail; emit an empty CSV instead of crashing.
        pd.DataFrame().to_csv(output_file, index=False)
        return

    frame = pd.DataFrame(records)

    # NOTE(review): `decisions` is a DataFrame, so `.get(x)` looks up a
    # *column* labelled `x` (and yields None when absent). Whether that
    # matches the layout of decisions.json cannot be confirmed from here.
    decisions = pd.read_json(_DECISIONS_PATH)
    frame["decision"] = frame["id"].apply(decisions.get)

    # Tolerate absent columns, consistent with the guarded columns below.
    frame = frame.drop(columns=_DROPPED_COLUMNS, errors="ignore")

    for column in _GAP_FILLED_COLUMNS:
        if column in frame.columns:
            frame[column] = frame[column].fillna(_MISSING_VALUE)
        else:
            frame[column] = _MISSING_VALUE

    # Only a positive area gives a meaningful ratio. Dividing by the -1
    # sentinel would produce a negative "price" that sorts to the top, so
    # those rows get the sentinel instead.
    area = frame["square_meters"]
    frame["price_per_sqm"] = (frame["price"] / area).where(
        area > 0, _MISSING_VALUE
    )

    frame.sort_values(by="price_per_sqm", ascending=True).to_csv(
        output_file, index=False
    )