add csv exporter command
This commit is contained in:
parent
ca5619976f
commit
96562c0895
3 changed files with 100 additions and 2 deletions
37
crawler/csv_exporter.py
Normal file
37
crawler/csv_exporter.py
Normal file
|
|
@ -0,0 +1,37 @@
|
||||||
|
from pathlib import Path
|
||||||
|
from data_access import Listing
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
def export_to_csv(
    listings: list["Listing"],
    output_file: Path,
    columns: list[str],
    decisions_path: "Path | str" = 'data/decisions.json',
) -> None:
    """Write the selected columns of the given listings to a CSV file.

    Args:
        listings: Objects exposing ``dict_nicely()``; each returned mapping
            becomes one row of the export.
        output_file: Destination of the CSV. It is written without the
            DataFrame index.
        columns: Names of the columns to export, in output order.
        decisions_path: JSON file holding the recorded decisions. If the
            file does not exist, the ``decision`` column is left empty
            instead of aborting the export.

    Raises:
        KeyError: If a name in ``columns`` is absent from the listing data.
    """
    if not listings:
        # Nothing to export: still produce a valid, header-only CSV rather
        # than failing on the missing ``identifier`` column further down.
        pd.DataFrame(columns=columns).to_csv(str(output_file), index=False)
        return

    df = pd.DataFrame([listing.dict_nicely() for listing in listings])

    # Attach the decision recorded on file for each listing, if any.
    decisions_file = Path(decisions_path)
    if decisions_file.exists():
        decisions = pd.read_json(decisions_file)
        # NOTE(review): DataFrame.get looks the identifier up among the
        # columns of the decisions frame and yields None when it is absent;
        # confirm this matches the layout of the decisions file.
        df['decision'] = df['identifier'].apply(
            lambda identifier: decisions.get(identifier)
        )
    else:
        df['decision'] = None

    # Replace gaps with -1 so spreadsheet filters can treat "unknown" as a
    # regular numeric value.
    for column in ('service_charge', 'lease_left', 'sqm_ocr'):
        if column in df.columns:
            df[column] = df[column].fillna(-1)

    df[columns].to_csv(str(output_file), index=False)
|
||||||
|
|
@ -12,9 +12,29 @@ class Listing:
|
||||||
identifier: int
|
identifier: int
|
||||||
_cached: Dict = None
|
_cached: Dict = None
|
||||||
data_dir: pathlib.Path = pathlib.Path("data/rs/")
|
data_dir: pathlib.Path = pathlib.Path("data/rs/")
|
||||||
|
ALL_COLUMNS = [
|
||||||
|
"identifier",
|
||||||
|
"sqm_ocr",
|
||||||
|
"price",
|
||||||
|
"price_per_sqm",
|
||||||
|
"url",
|
||||||
|
"bedrooms",
|
||||||
|
"travel_time_fastest",
|
||||||
|
"travel_time_second",
|
||||||
|
"lease_left",
|
||||||
|
"service_charge",
|
||||||
|
"development",
|
||||||
|
"tenure_type",
|
||||||
|
"updated_days",
|
||||||
|
"status",
|
||||||
|
"last_seen",
|
||||||
|
]
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_all_listings(listing_paths: list[str]) -> List["Listing"]:
|
def get_all_listings(
|
||||||
|
listing_paths: list[str],
|
||||||
|
seen_in_the_last_n_days: int = 30,
|
||||||
|
) -> List["Listing"]:
|
||||||
identifiers = []
|
identifiers = []
|
||||||
for listing_path in listing_paths:
|
for listing_path in listing_paths:
|
||||||
with open(listing_path) as f:
|
with open(listing_path) as f:
|
||||||
|
|
@ -24,7 +44,12 @@ class Listing:
|
||||||
data_dir = pathlib.Path(listing_path)
|
data_dir = pathlib.Path(listing_path)
|
||||||
while str(d['identifier']) in str(data_dir.resolve().absolute()):
|
while str(d['identifier']) in str(data_dir.resolve().absolute()):
|
||||||
data_dir = data_dir.parent
|
data_dir = data_dir.parent
|
||||||
identifiers.append(Listing(d["identifier"], data_dir=data_dir))
|
listing = Listing(d["identifier"], data_dir=data_dir)
|
||||||
|
if (
|
||||||
|
listing.last_seen is not None
|
||||||
|
and listing.last_seen < seen_in_the_last_n_days
|
||||||
|
):
|
||||||
|
identifiers.append(listing)
|
||||||
|
|
||||||
return identifiers
|
return identifiers
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,8 @@ import click
|
||||||
import importlib
|
import importlib
|
||||||
|
|
||||||
from rec.districts import get_districts
|
from rec.districts import get_districts
|
||||||
|
from data_access import Listing
|
||||||
|
import csv_exporter
|
||||||
|
|
||||||
dump_listings_module = importlib.import_module('1_dump_listings')
|
dump_listings_module = importlib.import_module('1_dump_listings')
|
||||||
dump_detail_module = importlib.import_module('2_dump_detail')
|
dump_detail_module = importlib.import_module('2_dump_detail')
|
||||||
|
|
@ -92,5 +94,39 @@ def routing(ctx: click.core.Context):
|
||||||
routing_module.calculate_route(listing_paths)
|
routing_module.calculate_route(listing_paths)
|
||||||
|
|
||||||
|
|
||||||
|
@cli.command()
@click.option(
    '--columns',
    '-C',
    help='Columns to include in the CSV file',
    type=click.Choice(
        Listing.ALL_COLUMNS,
        case_sensitive=False,
    ),
    multiple=True,
    default=Listing.ALL_COLUMNS,
)
@click.option(
    '--output-file',
    '-O',
    help='Path to the output CSV file',
    required=True,
    type=click.Path(
        writable=True,
        file_okay=True,
        dir_okay=False,
        resolve_path=True,
    ),
)
@click.pass_context
def export_csv(
    ctx: click.core.Context, output_file: str, columns: tuple[str, ...]
) -> None:
    """Export the listings found in the data directory to a CSV file."""
    data_dir = ctx.obj['data_dir']
    click.echo(f'Exporting data to {output_file} using {data_dir=}')

    # sorted() consumes the glob generator directly; the sort keeps the
    # order in which listing files are processed deterministic.
    listing_paths = sorted(pathlib.Path(data_dir).glob("*/listing.json"))
    listings = Listing.get_all_listings(listing_paths)

    # click hands over the repeated --columns values as a tuple, while the
    # exporter's signature asks for a list.
    csv_exporter.export_to_csv(
        listings, pathlib.Path(output_file), list(columns)
    )
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
cli()
|
cli()
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue