def export_to_csv(
    listings: "list[Listing]",
    output_file: Path,
    columns: list[str],
    decisions_path: str = "data/decisions.json",
) -> None:
    """Flatten *listings* into a DataFrame and write the selected columns to CSV.

    Args:
        listings: Listing objects; each must provide a ``dict_nicely()``
            method returning a flat dict of scalar fields.
        output_file: destination path for the CSV file.
        columns: names of the columns to keep, in output order (a subset of
            the flattened keys plus the derived ``decision`` column).
        decisions_path: JSON file with manually recorded decisions, keyed by
            listing identifier. Defaults to the previously hard-coded
            location, so existing callers are unaffected.
    """
    df = pd.DataFrame([listing.dict_nicely() for listing in listings])

    # Attach the manual decision (if any) recorded for each identifier.
    # NOTE(review): `decisions` is whatever pd.read_json yields for that file;
    # .get(identifier) looks the identifier up as a column — confirm the
    # decisions file shape against how it is written elsewhere.
    decisions = pd.read_json(decisions_path)
    df.loc[:, "decision"] = df.identifier.apply(lambda ident: decisions.get(ident))

    # Use a -1 placeholder instead of NaN on these fields so that Excel
    # column filters behave sensibly on the exported file.
    for col in ("service_charge", "lease_left", "sqm_ocr"):
        df.loc[:, col] = df[col].fillna(-1)

    df[columns].to_csv(str(output_file), index=False)
@cli.command()
@click.option(
    '--columns',
    '-C',
    help='Columns to include in the CSV file',
    type=click.Choice(
        Listing.ALL_COLUMNS,
        case_sensitive=False,
    ),
    multiple=True,
    default=Listing.ALL_COLUMNS,
)
@click.option(
    '--output-file',
    '-O',
    help='Path to the output CSV file',
    required=True,
    type=click.Path(
        writable=True,
        file_okay=True,
        dir_okay=False,
        resolve_path=True,
    ),
)
@click.pass_context
def export_csv(
    ctx: click.core.Context, output_file: str, columns: tuple[str, ...]
) -> None:
    """Export all recently-seen listings under the configured data dir to CSV.

    Args:
        ctx: click context; ``ctx.obj['data_dir']`` must hold the data root.
        output_file: resolved path of the CSV file to write.
        columns: column names to include (defaults to every known column).
    """
    data_dir = ctx.obj['data_dir']
    click.echo(f'Exporting data to {output_file} using {data_dir=}')
    # sorted() already returns a list, so no intermediate list() is needed.
    listing_paths = sorted(pathlib.Path(data_dir).glob("*/listing.json"))
    listings = Listing.get_all_listings(listing_paths)
    csv_exporter.export_to_csv(listings, pathlib.Path(output_file), list(columns))