add csv exporter command

This commit is contained in:
Viktor Barzin 2025-05-17 20:13:28 +00:00
parent ca5619976f
commit 96562c0895
No known key found for this signature in database
GPG key ID: 4056458DBDBF8863
3 changed files with 100 additions and 2 deletions

37
crawler/csv_exporter.py Normal file
View file

@ -0,0 +1,37 @@
from pathlib import Path
from data_access import Listing
import pandas as pd
def export_to_csv(
    listings: "list[Listing]",
    output_file: Path,
    columns: list[str],
    decisions_path: str = 'data/decisions.json',
) -> None:
    """Export *listings* to a CSV file containing the requested *columns*.

    Each listing is flattened via ``dict_nicely()`` into one row. A
    ``decision`` column is joined in from *decisions_path*, and gaps in
    ``service_charge`` / ``lease_left`` / ``sqm_ocr`` are filled with -1
    so spreadsheet filters treat them as plain numbers.

    Args:
        listings: Listings to export; each must provide ``dict_nicely()``.
        output_file: Destination path for the CSV (overwritten if present).
        columns: Column names to write, in the order they should appear.
        decisions_path: JSON file with manual decisions per listing;
            defaults to the crawler's data directory layout.

    Raises:
        Propagates pandas errors if *decisions_path* is missing or not
        valid JSON, and ``KeyError`` if *columns* names an absent column.
    """
    df = pd.DataFrame([listing.dict_nicely() for listing in listings])

    # Join in manual decisions keyed by listing identifier.
    # NOTE(review): decisions is a DataFrame, so .get() performs a *column*
    # lookup — this only matches when identifiers appear as column labels
    # in decisions.json; confirm that file's layout.
    decisions = pd.read_json(decisions_path)
    df.loc[:, 'decision'] = df.identifier.apply(lambda x: decisions.get(x))

    # Fill gap values for numeric columns. This is for Excel so we can
    # use filters better there.
    for col in ('service_charge', 'lease_left', 'sqm_ocr'):
        df.loc[:, col] = df[col].fillna(-1)

    df[columns].to_csv(output_file, index=False)

View file

@ -12,9 +12,29 @@ class Listing:
# Unique listing id as used by the upstream listings site.
identifier: int
# Presumably caches the listing's parsed data after first load — TODO confirm.
_cached: Dict = None
# Directory holding this listing's data files on disk.
data_dir: pathlib.Path = pathlib.Path("data/rs/")
# Every column the CSV exporter may emit; also used as the CLI's
# click.Choice values and default selection for --columns.
ALL_COLUMNS = [
    "identifier",
    "sqm_ocr",
    "price",
    "price_per_sqm",
    "url",
    "bedrooms",
    "travel_time_fastest",
    "travel_time_second",
    "lease_left",
    "service_charge",
    "development",
    "tenure_type",
    "updated_days",
    "status",
    "last_seen",
]
@staticmethod
def get_all_listings(listing_paths: list[str]) -> List["Listing"]:
def get_all_listings(
listing_paths: list[str],
seen_in_the_last_n_days: int = 30,
) -> List["Listing"]:
identifiers = []
for listing_path in listing_paths:
with open(listing_path) as f:
@ -24,7 +44,12 @@ class Listing:
data_dir = pathlib.Path(listing_path)
while str(d['identifier']) in str(data_dir.resolve().absolute()):
data_dir = data_dir.parent
identifiers.append(Listing(d["identifier"], data_dir=data_dir))
listing = Listing(d["identifier"], data_dir=data_dir)
if (
listing.last_seen is not None
and listing.last_seen < seen_in_the_last_n_days
):
identifiers.append(listing)
return identifiers

View file

@ -3,6 +3,8 @@ import click
import importlib
from rec.districts import get_districts
from data_access import Listing
import csv_exporter
dump_listings_module = importlib.import_module('1_dump_listings')
dump_detail_module = importlib.import_module('2_dump_detail')
@ -92,5 +94,39 @@ def routing(ctx: click.core.Context):
routing_module.calculate_route(listing_paths)
@cli.command()
@click.option(
    '--columns',
    '-C',
    help='Columns to include in the CSV file',
    type=click.Choice(
        Listing.ALL_COLUMNS,
        case_sensitive=False,
    ),
    multiple=True,
    default=Listing.ALL_COLUMNS,
)
@click.option(
    '--output-file',
    '-O',
    help='Path to the output CSV file',
    required=True,
    type=click.Path(
        writable=True,
        file_okay=True,
        dir_okay=False,
        resolve_path=True,
    ),
)
@click.pass_context
def export_csv(
    ctx: click.core.Context,
    output_file: str,
    # multiple=True options arrive as a variadic tuple: tuple[str, ...].
    columns: tuple[str, ...],
) -> None:
    """Export listings to a CSV file.

    Globs ``*/listing.json`` under the configured data directory, loads
    the listings via ``Listing.get_all_listings``, and writes the selected
    columns with ``csv_exporter.export_to_csv``.
    """
    data_dir = ctx.obj['data_dir']
    click.echo(f'Exporting data to {output_file} using {data_dir=}')
    output_file_path = pathlib.Path(output_file)
    # Sort so CSV row order is stable between runs; glob order is not.
    listing_paths = sorted(pathlib.Path(data_dir).glob("*/listing.json"))
    listings = Listing.get_all_listings(listing_paths)
    csv_exporter.export_to_csv(listings, output_file_path, list(columns))
# Script entry point: dispatch to the click command group.
if __name__ == '__main__':
    cli()