def export_to_csv(
    listings: "list[Listing]",
    output_file: Path,
    columns: list[str],
    decisions_path: str = "data/decisions.json",
) -> None:
    """Flatten *listings* into a DataFrame and write the selected columns to CSV.

    Args:
        listings: Listing objects; each must provide a ``dict_nicely()``
            method returning a flat dict of scalar fields.
        output_file: destination path for the CSV file.
        columns: names of the columns to keep, in output order (a subset of
            the flattened keys plus the derived ``decision`` column).
        decisions_path: JSON file with manually recorded decisions, keyed by
            listing identifier. Defaults to the previously hard-coded
            location, so existing callers are unaffected.
    """
    df = pd.DataFrame([listing.dict_nicely() for listing in listings])

    # Attach the manual decision (if any) recorded for each identifier.
    # NOTE(review): `decisions` is whatever pd.read_json yields for that file;
    # .get(identifier) looks the identifier up as a column — confirm the
    # decisions file shape against how it is written elsewhere.
    decisions = pd.read_json(decisions_path)
    df.loc[:, "decision"] = df.identifier.apply(lambda ident: decisions.get(ident))

    # Use a -1 placeholder instead of NaN on these fields so that Excel
    # column filters behave sensibly on the exported file.
    for col in ("service_charge", "lease_left", "sqm_ocr"):
        df.loc[:, col] = df[col].fillna(-1)

    df[columns].to_csv(str(output_file), index=False)
@cli.command()
@click.option(
    '--columns',
    '-C',
    help='Columns to include in the CSV file',
    type=click.Choice(
        Listing.ALL_COLUMNS,
        case_sensitive=False,
    ),
    multiple=True,
    default=Listing.ALL_COLUMNS,
)
@click.option(
    '--output-file',
    '-O',
    help='Path to the output CSV file',
    required=True,
    type=click.Path(
        writable=True,
        file_okay=True,
        dir_okay=False,
        resolve_path=True,
    ),
)
@click.pass_context
def export_csv(
    ctx: click.core.Context, output_file: str, columns: tuple[str, ...]
) -> None:
    """Export all recently-seen listings under the configured data dir to CSV.

    Args:
        ctx: click context; ``ctx.obj['data_dir']`` must hold the data root.
        output_file: resolved path of the CSV file to write.
        columns: column names to include (defaults to every known column).
    """
    data_dir = ctx.obj['data_dir']
    click.echo(f'Exporting data to {output_file} using {data_dir=}')
    # sorted() already returns a list, so no intermediate list() is needed.
    listing_paths = sorted(pathlib.Path(data_dir).glob("*/listing.json"))
    listings = Listing.get_all_listings(listing_paths)
    csv_exporter.export_to_csv(listings, pathlib.Path(output_file), list(columns))