wrongmove/crawler/main.py
2025-05-17 20:40:44 +00:00

171 lines
4.6 KiB
Python

import pathlib
import click
import importlib
from rec.districts import get_districts
from data_access import Listing
import csv_exporter
# The pipeline step modules have names that start with a digit, which is not
# a valid identifier for a regular ``import`` statement, so they are loaded
# by name through importlib instead.
dump_listings_module = importlib.import_module("1_dump_listings")
dump_detail_module = importlib.import_module("2_dump_detail")
dump_images_module = importlib.import_module("3_dump_images")
detect_floorplan_module = importlib.import_module("4_detect_floorplan")
routing_module = importlib.import_module("5_routing")

# Maps each step name to the callable that implements it.
steps_to_handlers = {
    "dump_listings": dump_listings_module.dump_listings,
    "dump_detail": dump_detail_module.dump_detail,
    "dump_images": dump_images_module.dump_images,
    "detect_floorplan": detect_floorplan_module.detect_floorplan,
    "routing": routing_module.calculate_route,
}
@click.group()
@click.option(
    '--data-dir',
    default=pathlib.Path("data/rs/"),
    # The previous help text ("Districts to scrape") was copied from the
    # --district option and did not describe this option.
    help='Directory in which listing data is stored',
    type=click.Path(
        writable=True,
        file_okay=False,
        dir_okay=True,
        resolve_path=True,
    ),
)
@click.pass_context
def cli(ctx: click.core.Context, data_dir: str):
    """Run listing pipeline steps that share a common data directory.
    \f

    Everything after the form feed above is hidden from ``--help``.

    Stores the ``--data-dir`` value in the click context object under the
    ``'data_dir'`` key so that every subcommand can read it.

    Args:
        ctx: Click context; ``ctx.obj`` is initialised to a dict if unset.
        data_dir: Value of the ``--data-dir`` option.
    """
    ctx.ensure_object(dict)
    ctx.obj['data_dir'] = data_dir
@cli.command()
@click.option(
    '--min-bedrooms',
    default=1,
    help='Minimum number of bedrooms',
    type=click.IntRange(min=1),
)
@click.option(
    '--max-bedrooms',
    default=5,
    help='Maximum number of bedrooms',
    type=click.IntRange(min=1),
)
@click.option(
    '--min-price',
    default=0,
    help='Minimum price',
    type=click.IntRange(min=0),
)
@click.option(
    '--max-price',
    default=1000000,
    help='Maximum price',
    type=click.IntRange(min=0),
)
@click.option(
    '--district',
    default=None,
    help='Districts to scrape',
    # click.Choice documents its choices as a sequence; materialise the
    # dict keys instead of handing over a live dict view.
    type=click.Choice(list(get_districts()), case_sensitive=False),
    multiple=True,
)
@click.pass_context
def dump_listings(
    ctx: click.core.Context,
    district: tuple[str, ...],
    min_bedrooms: int,
    max_bedrooms: int,
    min_price: int,
    max_price: int,
):
    """Dump listings that match the district, bedroom and price filters.
    \f

    Everything after the form feed above is hidden from ``--help``.

    Builds a ``QueryParameters`` object from the options and passes it,
    together with the data directory, to ``dump_listings_module.dump_listings``.

    Args:
        ctx: Click context; ``ctx.obj['data_dir']`` holds the data directory.
        district: District names given with ``--district`` (the option may be
            repeated). NOTE(review): how an empty selection is interpreted is
            decided by ``dump_listings_module`` - confirm there.
        min_bedrooms: Lower bound for the number of bedrooms.
        max_bedrooms: Upper bound for the number of bedrooms.
        min_price: Lower bound for the price.
        max_price: Upper bound for the price.

    Raises:
        click.BadParameter: If a minimum is greater than its maximum.
    """
    # Reject inverted ranges up front instead of starting a run whose
    # filters contradict each other.
    if min_bedrooms > max_bedrooms:
        raise click.BadParameter(
            f'--min-bedrooms ({min_bedrooms}) must not be greater than '
            f'--max-bedrooms ({max_bedrooms})',
            ctx=ctx,
            param_hint='--min-bedrooms',
        )
    if min_price > max_price:
        raise click.BadParameter(
            f'--min-price ({min_price}) must not be greater than '
            f'--max-price ({max_price})',
            ctx=ctx,
            param_hint='--min-price',
        )

    data_dir: str = ctx.obj['data_dir']
    query_parameters = dump_listings_module.QueryParameters(
        district_names=set(district),
        min_bedrooms=min_bedrooms,
        max_bedrooms=max_bedrooms,
        min_price=min_price,
        max_price=max_price,
    )
    click.echo(
        f'Running dump_listings for districts {district}, data dir {data_dir} and parameters: '
        f'{query_parameters}'
    )
    data_dir_path = pathlib.Path(data_dir)
    dump_listings_module.dump_listings(query_parameters, data_dir_path)
@cli.command()
@click.pass_context
def dump_detail(ctx: click.core.Context):
    """Run the dump_detail step for every listing in the data directory.
    \f

    Everything after the form feed above is hidden from ``--help``.

    Collects every ``<data_dir>/*/listing.json`` file in sorted order and
    passes the paths to ``dump_detail_module.dump_detail``.

    Args:
        ctx: Click context; ``ctx.obj['data_dir']`` holds the data directory.
    """
    data_dir = ctx.obj['data_dir']
    click.echo(f'Running dump_detail for listings stored in {data_dir}')
    # sorted() already returns a list, so no intermediate list() is needed.
    listing_paths = sorted(pathlib.Path(data_dir).glob("*/listing.json"))
    dump_detail_module.dump_detail(listing_paths)
@cli.command()
@click.pass_context
def dump_images(ctx: click.core.Context):
    """Run the dump_images step for every listing in the data directory.
    \f

    Everything after the form feed above is hidden from ``--help``.

    Collects every ``<data_dir>/*/listing.json`` file in sorted order and
    passes the paths to ``dump_images_module.dump_images``.

    Args:
        ctx: Click context; ``ctx.obj['data_dir']`` holds the data directory.
    """
    data_dir = ctx.obj['data_dir']
    click.echo(f'Running dump_images stored in {data_dir}')
    # sorted() already returns a list, so no intermediate list() is needed.
    listing_paths = sorted(pathlib.Path(data_dir).glob("*/listing.json"))
    dump_images_module.dump_images(listing_paths)
@cli.command()
@click.pass_context
def detect_floorplan(ctx: click.core.Context):
    """Run the detect_floorplan step for every listing in the data directory.
    \f

    Everything after the form feed above is hidden from ``--help``.

    Collects every ``<data_dir>/*/listing.json`` file in sorted order and
    passes the paths to ``detect_floorplan_module.detect_floorplan``.

    Args:
        ctx: Click context; ``ctx.obj['data_dir']`` holds the data directory.
    """
    data_dir = ctx.obj['data_dir']
    click.echo(f'Running detect_floorplan in {data_dir}')
    # sorted() already returns a list, so no intermediate list() is needed.
    listing_paths = sorted(pathlib.Path(data_dir).glob("*/listing.json"))
    detect_floorplan_module.detect_floorplan(listing_paths)
@cli.command()
@click.pass_context
def routing(ctx: click.core.Context):
    """Run the routing step for every listing in the data directory.
    \f

    Everything after the form feed above is hidden from ``--help``.

    Collects every ``<data_dir>/*/listing.json`` file in sorted order and
    passes the paths to ``routing_module.calculate_route``.

    Args:
        ctx: Click context; ``ctx.obj['data_dir']`` holds the data directory.
    """
    data_dir = ctx.obj['data_dir']
    click.echo(f'Running routing for listings in {data_dir}')
    # sorted() already returns a list, so no intermediate list() is needed.
    listing_paths = sorted(pathlib.Path(data_dir).glob("*/listing.json"))
    routing_module.calculate_route(listing_paths)
@cli.command()
@click.option(
    '--columns',
    '-C',
    help='Columns to include in the CSV file',
    type=click.Choice(
        Listing.ALL_COLUMNS,
        case_sensitive=False,
    ),
    multiple=True,
    default=Listing.ALL_COLUMNS,
)
@click.option(
    '--output-file',
    '-O',
    help='Path to the output CSV file',
    required=True,
    type=click.Path(
        writable=True,
        file_okay=True,
        dir_okay=False,
        resolve_path=True,
    ),
)
@click.pass_context
def export_csv(
    ctx: click.core.Context,
    output_file: str,
    columns: tuple[str, ...],
):
    """Export the listings in the data directory to a CSV file.
    \f

    Everything after the form feed above is hidden from ``--help``.

    Loads listings from every ``<data_dir>/*/listing.json`` file (in sorted
    path order) via ``Listing.get_all_listings`` and hands them to
    ``csv_exporter.export_to_csv``.

    Args:
        ctx: Click context; ``ctx.obj['data_dir']`` holds the data directory.
        output_file: Value of ``--output-file``, the CSV path to write.
        columns: Column names selected with ``--columns``; the option
            default is ``Listing.ALL_COLUMNS``.
    """
    data_dir = ctx.obj['data_dir']
    click.echo(f'Exporting data to {output_file} using {data_dir=}')
    output_file_path = pathlib.Path(output_file)
    # sorted() already returns a list, so no intermediate list() is needed.
    listing_paths = sorted(pathlib.Path(data_dir).glob("*/listing.json"))
    listings = Listing.get_all_listings(listing_paths)
    # click delivers a tuple for multiple=True options; the exporter is
    # given a list, as before.
    csv_exporter.export_to_csv(listings, output_file_path, list(columns))
# Invoke the click group only when this file is executed as a script.
if __name__ == "__main__":
    cli()