wrongmove/crawler/csv_exporter.py

41 lines
1.4 KiB
Python
Raw Normal View History

2025-05-17 20:13:28 +00:00
from pathlib import Path
import pandas as pd
from models.listing import QueryParameters
from repositories.listing_repository import ListingRepository
2025-05-17 20:13:28 +00:00
async def export_to_csv(
    repository: ListingRepository,
    output_file: Path,
    query_parameters: QueryParameters | None = None,
    *,
    decisions_path: Path | str = "data/decisions.json",
) -> None:
    """Export the listings returned by *repository* to a CSV file.

    The listings are loaded into a DataFrame, annotated with a ``decision``
    column looked up from the decisions JSON file, padded with ``-1``
    sentinels for missing numeric values, given a ``price_per_sqm`` column
    and written to *output_file* sorted by that column (cheapest first).

    Args:
        repository: Source of listings; ``get_listings`` is awaited on it.
        output_file: Destination path of the CSV file.
        query_parameters: Optional filter forwarded to ``get_listings``.
        decisions_path: JSON file read with ``pandas.read_json`` to look up a
            decision per listing id. Defaults to ``data/decisions.json``.

    Raises:
        Whatever ``pandas.read_json`` raises if *decisions_path* is missing
        or cannot be parsed.
    """
    listings = await repository.get_listings(query_parameters=query_parameters)
    df = pd.DataFrame([listing.__dict__ for listing in listings])

    # No listings means no columns at all (not even ``id``); write an empty
    # file instead of failing on the column accesses below.
    if df.empty:
        df.to_csv(str(output_file), index=False)
        return

    # NOTE(review): ``decisions`` is a DataFrame, so ``get`` looks the listing
    # id up among its column labels and yields None when absent. The schema
    # of the decisions file is not visible here -- confirm this is intended.
    decisions = pd.read_json(decisions_path)
    df["decision"] = df["id"].apply(lambda listing_id: decisions.get(listing_id))

    # Drop columns that should not be exported (``_sa_instance_state`` is
    # presumably ORM bookkeeping picked up via ``__dict__``). Tolerate their
    # absence so plain objects can be exported too.
    df = df.drop(columns=["_sa_instance_state", "additional_info"], errors="ignore")

    # Fill gaps with a -1 sentinel so the columns can be filtered in Excel.
    for column in ("service_charge", "lease_left", "square_meters"):
        if column not in df.columns:
            df[column] = -1
        df[column] = df[column].fillna(-1)

    # Only divide by a real, positive area. Dividing by the -1 sentinel (or
    # by 0) would yield negative/infinite prices that sort ahead of every
    # genuine listing.
    square_meters = df["square_meters"]
    df["price_per_sqm"] = df["price"] / square_meters.where(square_meters > 0)

    # Sort while unknown values are still NaN so they land at the bottom,
    # then apply the same -1 sentinel as the other columns.
    df = df.sort_values(by=["price_per_sqm"], ascending=True, na_position="last")
    df["price_per_sqm"] = df["price_per_sqm"].fillna(-1)

    df.to_csv(str(output_file), index=False)