2025-05-17 21:55:42 +00:00
|
|
|
import asyncio
|
2025-05-18 21:13:50 +00:00
|
|
|
import os
|
2025-05-14 20:19:08 +00:00
|
|
|
import pathlib
|
2025-05-11 18:59:41 +00:00
|
|
|
import click
|
|
|
|
|
import importlib
|
|
|
|
|
|
2025-05-14 19:41:13 +00:00
|
|
|
from rec.districts import get_districts
|
2025-05-17 20:13:28 +00:00
|
|
|
from data_access import Listing
|
|
|
|
|
import csv_exporter
|
2025-05-18 17:22:48 +00:00
|
|
|
from rec.query import ListingType, FurnishType
|
2025-05-18 21:13:50 +00:00
|
|
|
from rec.routing import API_KEY_ENVIRONMENT_VARIABLE, TravelMode
|
2025-05-14 19:41:13 +00:00
|
|
|
|
2025-05-11 18:59:41 +00:00
|
|
|
# The pipeline step modules have file names that start with a digit, which
# is not a valid Python identifier, so they cannot be loaded with a plain
# `import` statement. They are loaded by name instead, in pipeline order.
(
    dump_listings_module,
    dump_detail_module,
    dump_images_module,
    detect_floorplan_module,
    routing_module,
) = (
    importlib.import_module(step_module_name)
    for step_module_name in (
        '1_dump_listings',
        '2_dump_detail',
        '3_dump_images',
        '4_detect_floorplan',
        '5_routing',
    )
)
|
2025-05-11 18:59:41 +00:00
|
|
|
|
|
|
|
|
|
2025-05-14 19:41:13 +00:00
|
|
|
# Root command group.
#
# Resolves the shared data directory once and stores it on the click context
# under the 'data_dir' key, where every sub-command in this file reads it.
#
# NOTE: documented with comments instead of a docstring on purpose -- click
# would surface a docstring as the group's --help text.
@click.group()
@click.option(
    '--data-dir',
    default=pathlib.Path("data/rs/"),
    # The previous help text ('Districts to scrape') was copy-pasted from the
    # --district option and did not describe this option.
    help='Directory where the scraped listing data is stored',
    type=click.Path(
        writable=True,
        file_okay=False,
        dir_okay=True,
        resolve_path=True,
    ),
)
@click.pass_context
def cli(ctx: click.core.Context, data_dir: str):
    # ensure_object creates ctx.obj as a dict when the group was invoked
    # without one, so the item assignment below is always valid.
    ctx.ensure_object(dict)
    ctx.obj['data_dir'] = data_dir
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Sub-command: build the query parameters from the command-line filters and
# run the listing dump into the shared data directory.
#
# Parameters (all supplied by click):
#   ctx            click context; ctx.obj['data_dir'] is set by the `cli` group
#   district       values of the repeatable --district option (may be empty)
#   min_bedrooms / max_bedrooms   inclusive bedroom filter bounds
#   min_price / max_price         inclusive price filter bounds
#   type           name of a ListingType member (shadows the builtin, but the
#                  name is dictated by the --type option and is kept as is)
#   furnish_types  values of the repeatable --furnish-types option
#
# Raises click.BadParameter (a usage error) when a minimum bound is greater
# than its maximum.
#
# NOTE: documented with comments instead of a docstring on purpose -- click
# would surface a docstring as the command's --help text.
@cli.command()
@click.option(
    '--type',
    '-t',
    help='Type of listing to scrape',
    type=click.Choice(
        ListingType.__members__.keys(),
        case_sensitive=False,
    ),
    required=True,
)
@click.option(
    '--min-bedrooms',
    default=1,
    help='Minimum number of bedrooms',
    type=click.IntRange(min=1),
)
@click.option(
    '--max-bedrooms',
    default=5,
    help='Maximum number of bedrooms',
    type=click.IntRange(min=1),
)
@click.option(
    '--min-price',
    default=0,
    help='Minimum price',
    type=click.IntRange(min=0),
)
@click.option(
    '--max-price',
    default=1000000,
    help='Maximum price',
    type=click.IntRange(min=0),
)
@click.option(
    '--district',
    default=None,
    help='Districts to scrape',
    type=click.Choice(get_districts().keys(), case_sensitive=False),
    multiple=True,
)
@click.option(
    '--furnish-types',
    '-f',
    help='Furnish types for rented listings',
    type=click.Choice(
        [
            furnish_type.name
            for furnish_type in FurnishType.__members__.values()
        ],
        case_sensitive=False,
    ),
    multiple=True,
)
@click.pass_context
def dump_listings(
    ctx: click.core.Context,
    district: tuple[str, ...],
    min_bedrooms: int,
    max_bedrooms: int,
    min_price: int,
    max_price: int,
    type: str,
    furnish_types: tuple[str, ...],
):
    # Each bound is range-checked on its own by click.IntRange, but nothing
    # checks them against each other. Reject inverted ranges here so the user
    # gets a usage error instead of a scrape with an impossible filter.
    if min_bedrooms > max_bedrooms:
        raise click.BadParameter(
            f'{min_bedrooms} is greater than --max-bedrooms ({max_bedrooms})',
            param_hint='--min-bedrooms',
        )
    if min_price > max_price:
        raise click.BadParameter(
            f'{min_price} is greater than --max-price ({max_price})',
            param_hint='--min-price',
        )

    data_dir: str = ctx.obj['data_dir']
    query_parameters = dump_listings_module.QueryParameters(
        listing_type=ListingType[type],
        district_names=set(district),
        min_bedrooms=min_bedrooms,
        max_bedrooms=max_bedrooms,
        min_price=min_price,
        max_price=max_price,
        furnish_types=[
            FurnishType[furnish_type] for furnish_type in furnish_types
        ],
    )
    click.echo(
        f'Running dump_listings for districts {district}, data dir {data_dir} and parameters: '
        f'{query_parameters}')
    # dump_listings_module.dump_listings is awaited through asyncio.run, so
    # this synchronous command drives it to completion.
    asyncio.run(
        dump_listings_module.dump_listings(
            query_parameters, pathlib.Path(data_dir)))
|
2025-05-14 19:41:13 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
# Sub-command: run the detail dump over every `<subdir>/listing.json` found
# directly under the shared data directory, in sorted path order.
# (Comments rather than a docstring: click would show a docstring in --help.)
@cli.command()
@click.pass_context
def dump_details(ctx: click.core.Context):
    data_dir = ctx.obj['data_dir']
    click.echo(f'Running dump_detail for listings stored in {data_dir}')
    # sorted() already materialises the glob generator into a list.
    listing_json_files = sorted(pathlib.Path(data_dir).glob("*/listing.json"))
    detail_job = dump_detail_module.dump_detail(listing_json_files)
    asyncio.run(detail_job)
|
2025-05-14 19:41:13 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
# Sub-command: run the image dump over every `<subdir>/listing.json` found
# directly under the shared data directory, in sorted path order.
# (Comments rather than a docstring: click would show a docstring in --help.)
@cli.command()
@click.pass_context
def dump_images(ctx: click.core.Context):
    data_dir = ctx.obj['data_dir']
    click.echo(f'Running dump_images stored in {data_dir}')
    # sorted() already materialises the glob generator into a list.
    listing_json_files = sorted(pathlib.Path(data_dir).glob("*/listing.json"))
    images_job = dump_images_module.dump_images(listing_json_files)
    asyncio.run(images_job)
|
2025-05-14 19:41:13 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
# Sub-command: run floorplan detection over every `<subdir>/listing.json`
# found directly under the shared data directory, in sorted path order.
# (Comments rather than a docstring: click would show a docstring in --help.)
@cli.command()
@click.pass_context
def detect_floorplan(ctx: click.core.Context):
    data_dir = ctx.obj['data_dir']
    click.echo(f'Running detect_floorplan in {data_dir}')
    # sorted() already materialises the glob generator into a list.
    listing_json_files = sorted(pathlib.Path(data_dir).glob("*/listing.json"))
    floorplan_job = detect_floorplan_module.detect_floorplan(listing_json_files)
    asyncio.run(floorplan_job)
|
2025-05-14 19:41:13 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
# Sub-command: calculate routes from the first `limit` listings (in sorted
# path order) to a destination address using the chosen travel mode.
# Aborts with a click usage error when the routing API key environment
# variable is not set.
# (Comments rather than a docstring: click would show a docstring in --help.)
@cli.command()
@click.option(
    '--destination-address',
    '-d',
    type=click.STRING,
    required=True,
    help='Destination address for routing',
)
@click.option(
    '--travel-mode',
    '-m',
    type=click.Choice(TravelMode.__members__.keys(), case_sensitive=False),
    required=True,
    help='Travel mode for routing',
)
@click.option(
    '--limit',
    '-l',
    type=click.IntRange(min=1),
    default=1,  # by default limit to 1 to avoid accidental API usage
    help='Limit the number of listings to process',
)
@click.pass_context
def routing(
    ctx: click.core.Context,
    destination_address: str,
    travel_mode: str,
    limit: int,
):
    data_dir = ctx.obj['data_dir']
    click.echo(f'Running routing for the first {limit} listings in {data_dir}')

    # Sort before truncating so that "the first N" is deterministic.
    all_listing_files = sorted(pathlib.Path(data_dir).glob("*/listing.json"))
    listing_paths = all_listing_files[:limit]

    if API_KEY_ENVIRONMENT_VARIABLE not in os.environ:
        missing_key_message = (
            f'{API_KEY_ENVIRONMENT_VARIABLE} environment variable is not set. '
            'Please set it to your API key for the routing service.'
        )
        raise click.exceptions.MissingParameter(missing_key_message)

    selected_mode = TravelMode[travel_mode]
    routing_module.calculate_route(
        listing_paths, destination_address, selected_mode)
|
2025-05-11 18:59:41 +00:00
|
|
|
|
|
|
|
|
|
2025-05-17 20:13:28 +00:00
|
|
|
# Sub-command: load every `<subdir>/listing.json` under the shared data
# directory (sorted path order) and export the selected columns to a CSV
# file. All columns are exported when --columns is not given.
# (Comments rather than a docstring: click would show a docstring in --help.)
@cli.command()
@click.option(
    '--columns',
    '-C',
    type=click.Choice(Listing.ALL_COLUMNS, case_sensitive=False),
    multiple=True,
    default=Listing.ALL_COLUMNS,
    help='Columns to include in the CSV file',
)
@click.option(
    '--output-file',
    '-O',
    type=click.Path(
        writable=True, file_okay=True, dir_okay=False, resolve_path=True),
    required=True,
    help='Path to the output CSV file',
)
@click.pass_context
def export_csv(ctx: click.core.Context, output_file: str, columns: tuple[str]):
    data_dir = ctx.obj['data_dir']
    click.echo(f'Exporting data to {output_file} using {data_dir=}')

    listing_json_files = sorted(pathlib.Path(data_dir).glob("*/listing.json"))
    # Listing.get_all_listings is handed string paths, not Path objects.
    listings = Listing.get_all_listings(list(map(str, listing_json_files)))

    destination = pathlib.Path(output_file)
    selected_columns = list(columns)
    csv_exporter.export_to_csv(listings, destination, selected_columns)
|
|
|
|
|
|
|
|
|
|
|
2025-05-11 18:59:41 +00:00
|
|
|
# Script entry point: hand control to the click group, which parses the
# command line and dispatches to the selected sub-command.
if __name__ == "__main__":
    cli()
|