add csv exporter command

2025-05-17 20:13:28 +00:00 · 2025-05-17 20:13:28 +00:00 · 96562c0895
commit 96562c0895
parent ca5619976f
3 changed files with 100 additions and 2 deletions
--- a/crawler/csv_exporter.py
+++ b/crawler/csv_exporter.py
@ -0,0 +1,37 @@
 from pathlib import Path
 from data_access import Listing
 import pandas as pd
 def export_to_csv(
     listings: list["Listing"],
     output_file: Path,
     columns: list[str],
     decisions_path: Path = Path('data/decisions.json'),
 ) -> None:
     """Write the selected ``columns`` of ``listings`` to a CSV file.

     Every listing is converted with ``dict_nicely()``, a ``decision`` column
     is added from the decisions JSON file, gaps in a few numeric columns are
     filled with ``-1`` and the result is written without the DataFrame index.

     Args:
         listings: Listings to export; each must provide ``dict_nicely()``.
         output_file: Destination path of the CSV file.
         columns: Names of the columns to write, in output order.
         decisions_path: JSON file with the recorded decisions. Defaults to
             ``data/decisions.json`` relative to the working directory.

     Raises:
         KeyError: If ``identifier``, one of the gap-filled columns or a
             requested column is missing from the listing data.
     """
     if not listings:
         # Nothing to export: still emit a valid, header-only CSV rather than
         # failing on columns that an empty DataFrame does not have.
         pd.DataFrame(columns=columns).to_csv(output_file, index=False)
         return

     df = pd.DataFrame([listing.dict_nicely() for listing in listings])

     # Attach the decision stored on file for each listing.
     # NOTE(review): ``decisions`` is a DataFrame, so ``.get`` looks up a
     # column labelled with the identifier (None when absent) - confirm this
     # matches the layout of the decisions file.
     decisions = pd.read_json(decisions_path)
     df.loc[:, 'decision'] = df['identifier'].apply(
         lambda identifier: decisions.get(identifier)
     )

     # Fill gaps with -1 so the columns can be filtered more easily in Excel.
     for column in ('service_charge', 'lease_left', 'sqm_ocr'):
         df.loc[:, column] = df[column].fillna(-1)

     df[columns].to_csv(output_file, index=False)
--- a/crawler/data_access.py
+++ b/crawler/data_access.py
@ -12,9 +12,29 @@ class Listing:
    identifier: int
    _cached: Dict = None
    data_dir: pathlib.Path = pathlib.Path("data/rs/")
    # Column names offered for CSV export: the export command uses this list
    # both as the allowed choices and as the default of its --columns option.
    ALL_COLUMNS = [
        "identifier",
        "sqm_ocr",
        "price",
        "price_per_sqm",
        "url",
        "bedrooms",
        "travel_time_fastest",
        "travel_time_second",
        "lease_left",
        "service_charge",
        "development",
        "tenure_type",
        "updated_days",
        "status",
        "last_seen",
    ]
    @staticmethod
-    def get_all_listings(listing_paths: list[str]) -> List["Listing"]:
+    def get_all_listings(
        listing_paths: list[str],
        seen_in_the_last_n_days: int = 30,
    ) -> List["Listing"]:
        identifiers = []
        for listing_path in listing_paths:
            with open(listing_path) as f:
@ -24,7 +44,12 @@ class Listing:
            data_dir = pathlib.Path(listing_path)
            while str(d['identifier']) in str(data_dir.resolve().absolute()):
                data_dir = data_dir.parent
-            identifiers.append(Listing(d["identifier"], data_dir=data_dir))
+            listing = Listing(d["identifier"], data_dir=data_dir)
            if (
                listing.last_seen is not None
                and listing.last_seen < seen_in_the_last_n_days
            ):
                identifiers.append(listing)
        return identifiers
--- a/crawler/main.py
+++ b/crawler/main.py
@ -3,6 +3,8 @@ import click
 import importlib
 from rec.districts import get_districts
 from data_access import Listing
 import csv_exporter
 dump_listings_module = importlib.import_module('1_dump_listings')
 dump_detail_module = importlib.import_module('2_dump_detail')
@ -92,5 +94,39 @@ def routing(ctx: click.core.Context):
    routing_module.calculate_route(listing_paths)
@cli.command()
@click.option(
    '--columns',
    '-C',
    help='Columns to include in the CSV file',
    type=click.Choice(
        Listing.ALL_COLUMNS,
        case_sensitive=False,
    ),
    multiple=True,
    default=Listing.ALL_COLUMNS,
)
@click.option(
    '--output-file',
    '-O',
    help='Path to the output CSV file',
    required=True,
    type=click.Path(
        writable=True,
        file_okay=True,
        dir_okay=False,
        resolve_path=True,
    ),
)
@click.pass_context
def export_csv(
    ctx: click.core.Context, output_file: str, columns: tuple[str, ...]
) -> None:
    """Export the listings found in the data directory to a CSV file."""
    # The docstring above doubles as the command's --help text in click.
    # ``columns`` arrives as a tuple because the option is declared with
    # ``multiple=True``; ``output_file`` is already resolved by click.Path.
    data_dir = ctx.obj['data_dir']
    click.echo(f'Exporting data to {output_file} using {data_dir=}')

    # One ``listing.json`` per sub-directory; sorted for a stable row order.
    listing_paths = sorted(pathlib.Path(data_dir).glob("*/listing.json"))
    listings = Listing.get_all_listings(listing_paths)

    csv_exporter.export_to_csv(
        listings, pathlib.Path(output_file), list(columns)
    )
 # Run the ``cli`` entry point only when this module is executed as a script,
 # not when it is imported.
 if __name__ == '__main__':
    cli()