Add CSV exporter command
This commit is contained in:
parent
ca5619976f
commit
96562c0895
3 changed files with 100 additions and 2 deletions
37
crawler/csv_exporter.py
Normal file
37
crawler/csv_exporter.py
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
from pathlib import Path
|
||||
from data_access import Listing
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def export_to_csv(
    listings: list[Listing],
    output_file: Path,
    columns: list[str],
    decisions_path: str = 'data/decisions.json',
) -> None:
    """Export listings to a CSV file.

    Flattens each listing via ``dict_nicely()``, annotates every row with
    the manually recorded decision from *decisions_path*, fills gaps in a
    few numeric columns with ``-1`` (so spreadsheet filters behave), and
    writes the selected *columns* to *output_file*.

    Args:
        listings: Listings to export.
        output_file: Destination path of the CSV file.
        columns: Column names to keep, in output order.
        decisions_path: JSON file holding per-listing decisions. Defaults
            to the previously hard-coded location.
    """
    df = pd.DataFrame([listing.dict_nicely() for listing in listings])

    # Attach the manually recorded decision for each listing.
    # NOTE(review): `decisions` is a DataFrame, so `.get(x)` performs a
    # *column* lookup keyed by the identifier — confirm decisions.json is
    # actually structured that way.
    decisions = pd.read_json(decisions_path)
    df.loc[:, 'decision'] = df.identifier.apply(lambda x: decisions.get(x))

    # Fill gap values so filtering in Excel works on these columns.
    for gap_column in ('service_charge', 'lease_left', 'sqm_ocr'):
        df.loc[:, gap_column] = df[gap_column].fillna(-1)

    # Keep only the requested columns, in the requested order.
    df[columns].to_csv(str(output_file), index=False)
|
||||
|
|
@ -12,9 +12,29 @@ class Listing:
|
|||
identifier: int
|
||||
_cached: Dict = None
|
||||
data_dir: pathlib.Path = pathlib.Path("data/rs/")
|
||||
ALL_COLUMNS = [
|
||||
"identifier",
|
||||
"sqm_ocr",
|
||||
"price",
|
||||
"price_per_sqm",
|
||||
"url",
|
||||
"bedrooms",
|
||||
"travel_time_fastest",
|
||||
"travel_time_second",
|
||||
"lease_left",
|
||||
"service_charge",
|
||||
"development",
|
||||
"tenure_type",
|
||||
"updated_days",
|
||||
"status",
|
||||
"last_seen",
|
||||
]
|
||||
|
||||
@staticmethod
|
||||
def get_all_listings(listing_paths: list[str]) -> List["Listing"]:
|
||||
def get_all_listings(
|
||||
listing_paths: list[str],
|
||||
seen_in_the_last_n_days: int = 30,
|
||||
) -> List["Listing"]:
|
||||
identifiers = []
|
||||
for listing_path in listing_paths:
|
||||
with open(listing_path) as f:
|
||||
|
|
@ -24,7 +44,12 @@ class Listing:
|
|||
data_dir = pathlib.Path(listing_path)
|
||||
while str(d['identifier']) in str(data_dir.resolve().absolute()):
|
||||
data_dir = data_dir.parent
|
||||
identifiers.append(Listing(d["identifier"], data_dir=data_dir))
|
||||
listing = Listing(d["identifier"], data_dir=data_dir)
|
||||
if (
|
||||
listing.last_seen is not None
|
||||
and listing.last_seen < seen_in_the_last_n_days
|
||||
):
|
||||
identifiers.append(listing)
|
||||
|
||||
return identifiers
|
||||
|
||||
|
|
|
|||
|
|
@ -3,6 +3,8 @@ import click
|
|||
import importlib
|
||||
|
||||
from rec.districts import get_districts
|
||||
from data_access import Listing
|
||||
import csv_exporter
|
||||
|
||||
dump_listings_module = importlib.import_module('1_dump_listings')
|
||||
dump_detail_module = importlib.import_module('2_dump_detail')
|
||||
|
|
@ -92,5 +94,39 @@ def routing(ctx: click.core.Context):
|
|||
routing_module.calculate_route(listing_paths)
|
||||
|
||||
|
||||
@cli.command()
@click.option(
    '--columns',
    '-C',
    help='Columns to include in the CSV file',
    type=click.Choice(
        Listing.ALL_COLUMNS,
        case_sensitive=False,
    ),
    multiple=True,
    default=Listing.ALL_COLUMNS,
)
@click.option(
    '--output-file',
    '-O',
    help='Path to the output CSV file',
    required=True,
    type=click.Path(
        writable=True,
        file_okay=True,
        dir_okay=False,
        resolve_path=True,
    ),
)
@click.pass_context
def export_csv(
    ctx: click.core.Context,
    output_file: str,
    columns: tuple[str, ...],  # variable-length: tuple[str] would mean a 1-tuple
) -> None:
    """Export all known listings to a CSV file.

    Collects every ``*/listing.json`` under the configured data directory,
    loads the listings, and writes the selected columns through
    ``csv_exporter.export_to_csv``.
    """
    data_dir = ctx.obj['data_dir']
    click.echo(f'Exporting data to {output_file} using {data_dir=}')
    output_file_path = pathlib.Path(output_file)
    # glob() yields paths lazily; sorted() consumes the iterator directly,
    # so no intermediate list() is needed.
    listing_paths = sorted(pathlib.Path(data_dir).glob("*/listing.json"))
    listings = Listing.get_all_listings(listing_paths)
    csv_exporter.export_to_csv(listings, output_file_path, list(columns))
|
||||
|
||||
|
||||
# Script entry point: dispatch to the click command group.
if __name__ == '__main__':
    cli()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue