add csv exporter command

This commit is contained in:
Viktor Barzin 2025-05-17 20:13:28 +00:00
parent ca5619976f
commit 96562c0895
No known key found for this signature in database
GPG key ID: 4056458DBDBF8863
3 changed files with 100 additions and 2 deletions

37
crawler/csv_exporter.py Normal file
View file

@ -0,0 +1,37 @@
from pathlib import Path
from data_access import Listing
import pandas as pd
def export_to_csv(
    listings: "list[Listing]",
    output_file: Path,
    columns: list[str],
    decisions_path: str = 'data/decisions.json',
) -> None:
    """Export *listings* to a CSV file containing the requested *columns*.

    Each listing is flattened via ``dict_nicely()`` into one row. A
    ``decision`` column is joined in from *decisions_path*, and gaps in
    ``service_charge`` / ``lease_left`` / ``sqm_ocr`` are filled with -1
    so spreadsheet filters treat them as plain numbers.

    Args:
        listings: Listings to export; each must provide ``dict_nicely()``.
        output_file: Destination path for the CSV (overwritten if present).
        columns: Column names to write, in the order they should appear.
        decisions_path: JSON file with manual decisions per listing;
            defaults to the crawler's data directory layout.

    Raises:
        Propagates pandas errors if *decisions_path* is missing or not
        valid JSON, and ``KeyError`` if *columns* names an absent column.
    """
    df = pd.DataFrame([listing.dict_nicely() for listing in listings])

    # Join in manual decisions keyed by listing identifier.
    # NOTE(review): decisions is a DataFrame, so .get() performs a *column*
    # lookup — this only matches when identifiers appear as column labels
    # in decisions.json; confirm that file's layout.
    decisions = pd.read_json(decisions_path)
    df.loc[:, 'decision'] = df.identifier.apply(lambda x: decisions.get(x))

    # Fill gap values for numeric columns. This is for Excel so we can
    # use filters better there.
    for col in ('service_charge', 'lease_left', 'sqm_ocr'):
        df.loc[:, col] = df[col].fillna(-1)

    df[columns].to_csv(output_file, index=False)

View file

@ -12,9 +12,29 @@ class Listing:
# Unique listing id as used by the upstream listings site.
identifier: int
# Presumably caches the listing's parsed data after first load — TODO confirm.
_cached: Dict = None
# Directory holding this listing's data files on disk.
data_dir: pathlib.Path = pathlib.Path("data/rs/")
# Every column the CSV exporter may emit; also used as the CLI's
# click.Choice values and default selection for --columns.
ALL_COLUMNS = [
    "identifier",
    "sqm_ocr",
    "price",
    "price_per_sqm",
    "url",
    "bedrooms",
    "travel_time_fastest",
    "travel_time_second",
    "lease_left",
    "service_charge",
    "development",
    "tenure_type",
    "updated_days",
    "status",
    "last_seen",
]
@staticmethod
def get_all_listings(listing_paths: list[str]) -> List["Listing"]:
def get_all_listings(
listing_paths: list[str],
seen_in_the_last_n_days: int = 30,
) -> List["Listing"]:
identifiers = []
for listing_path in listing_paths:
with open(listing_path) as f:
@ -24,7 +44,12 @@ class Listing:
data_dir = pathlib.Path(listing_path)
while str(d['identifier']) in str(data_dir.resolve().absolute()):
data_dir = data_dir.parent
identifiers.append(Listing(d["identifier"], data_dir=data_dir))
listing = Listing(d["identifier"], data_dir=data_dir)
if (
listing.last_seen is not None
and listing.last_seen < seen_in_the_last_n_days
):
identifiers.append(listing)
return identifiers

View file

@ -3,6 +3,8 @@ import click
import importlib
from rec.districts import get_districts
from data_access import Listing
import csv_exporter
dump_listings_module = importlib.import_module('1_dump_listings')
dump_detail_module = importlib.import_module('2_dump_detail')
@ -92,5 +94,39 @@ def routing(ctx: click.core.Context):
routing_module.calculate_route(listing_paths)
@cli.command()
@click.option(
    '--columns',
    '-C',
    help='Columns to include in the CSV file',
    type=click.Choice(
        Listing.ALL_COLUMNS,
        case_sensitive=False,
    ),
    multiple=True,
    default=Listing.ALL_COLUMNS,
)
@click.option(
    '--output-file',
    '-O',
    help='Path to the output CSV file',
    required=True,
    type=click.Path(
        writable=True,
        file_okay=True,
        dir_okay=False,
        resolve_path=True,
    ),
)
@click.pass_context
def export_csv(
    ctx: click.core.Context,
    output_file: str,
    # multiple=True options arrive as a variadic tuple: tuple[str, ...].
    columns: tuple[str, ...],
) -> None:
    """Export listings to a CSV file.

    Globs ``*/listing.json`` under the configured data directory, loads
    the listings via ``Listing.get_all_listings``, and writes the selected
    columns with ``csv_exporter.export_to_csv``.
    """
    data_dir = ctx.obj['data_dir']
    click.echo(f'Exporting data to {output_file} using {data_dir=}')
    output_file_path = pathlib.Path(output_file)
    # Sort so CSV row order is stable between runs; glob order is not.
    listing_paths = sorted(pathlib.Path(data_dir).glob("*/listing.json"))
    listings = Listing.get_all_listings(listing_paths)
    csv_exporter.export_to_csv(listings, output_file_path, list(columns))
# Script entry point: dispatch to the click command group.
if __name__ == '__main__':
    cli()