wrongmove/crawler/csv_exporter.py
2025-05-17 23:14:18 +00:00

38 lines
1.4 KiB
Python

from pathlib import Path
from data_access import Listing
import pandas as pd
def export_to_csv(
    listings: list[Listing],
    output_file: Path,
    columns: list[str],
    *,
    decisions_path: Path = Path("data/decisions.json"),
) -> None:
    """Write listings to a CSV file, ordered by ascending price per sqm.

    Each listing is converted with ``dict_nicely()``, annotated with a
    ``decision`` column looked up from the decisions file, and then reduced
    to the requested ``columns`` before being written without the index.

    Args:
        listings: Listings to export. When empty, a header-only CSV with
            ``columns`` is written and the decisions file is not read.
        output_file: Destination path of the CSV file.
        columns: Column names to include in the output, in this order.
        decisions_path: Location of the JSON file holding the decisions.
            Defaults to ``data/decisions.json`` relative to the working
            directory.

    Raises:
        KeyError: If a required column (``identifier``, ``service_charge``,
            ``lease_left``, ``sqm_ocr``, ``price_per_sqm``) or one of
            ``columns`` is missing from the listing data.
    """
    if not listings:
        # An empty frame has no columns at all, so every lookup below would
        # fail; emit just the header so downstream tooling still gets a file.
        pd.DataFrame(columns=columns).to_csv(output_file, index=False)
        return

    df = pd.DataFrame([listing.dict_nicely() for listing in listings])

    # NOTE(review): pd.read_json returns a DataFrame by default, so
    # `.get(identifier)` looks up a *column label* and yields None when it is
    # absent -- confirm decisions.json is shaped so this is the intended
    # lookup.
    decisions = pd.read_json(decisions_path)
    df["decision"] = df["identifier"].apply(decisions.get)

    # Fill gaps with -1 so spreadsheet filters (e.g. in Excel) can treat
    # "unknown" as an ordinary numeric value.
    for column in ("service_charge", "lease_left", "sqm_ocr"):
        df[column] = df[column].fillna(-1)

    # Sort before projecting so the ordering works even when the caller does
    # not ask for `price_per_sqm` in the output.
    ordered = df.sort_values(by="price_per_sqm", ascending=True)
    ordered[columns].to_csv(output_file, index=False)