2025-05-17 21:55:42 +00:00
|
|
|
import asyncio
|
2025-06-01 15:17:14 +00:00
|
|
|
from datetime import datetime
|
2025-05-20 21:58:08 +00:00
|
|
|
import json
|
2025-05-18 21:13:50 +00:00
|
|
|
import os
|
2025-05-14 20:19:08 +00:00
|
|
|
import pathlib
|
2025-05-11 18:59:41 +00:00
|
|
|
import click
|
|
|
|
|
import importlib
|
|
|
|
|
|
2025-07-27 18:33:39 +00:00
|
|
|
import listing_processor
|
2025-06-08 17:01:33 +00:00
|
|
|
from models.listing import FurnishType, ListingType, QueryParameters
|
2025-05-14 19:41:13 +00:00
|
|
|
from rec.districts import get_districts
|
2025-05-17 20:13:28 +00:00
|
|
|
from data_access import Listing
|
|
|
|
|
import csv_exporter
|
2025-05-18 21:13:50 +00:00
|
|
|
from rec.routing import API_KEY_ENVIRONMENT_VARIABLE, TravelMode
|
2025-06-07 12:00:23 +00:00
|
|
|
from repositories.listing_repository import ListingRepository
|
2025-05-26 19:36:54 +00:00
|
|
|
from ui_exporter import export_immoweb as export_immoweb_ui
|
2025-06-01 12:11:15 +00:00
|
|
|
from functools import wraps
|
2025-06-04 21:56:26 +00:00
|
|
|
from database import engine
|
2025-06-01 12:11:15 +00:00
|
|
|
|
2025-05-14 19:41:13 +00:00
|
|
|
|
2025-05-31 23:50:43 +00:00
|
|
|
# The pipeline stage modules have file names that start with a digit, which is
# not a valid Python identifier, so they cannot be loaded with a regular
# `import` statement and are resolved by name instead.  The modules are loaded
# in the order listed.
(
    dump_listings_module,
    dump_images_module,
    detect_floorplan_module,
    routing_module,
) = map(
    importlib.import_module,
    (
        "1_dump_listings",
        "3_dump_images",
        "4_detect_floorplan",
        "5_routing",
    ),
)
|
2025-05-11 18:59:41 +00:00
|
|
|
|
|
|
|
|
|
2025-06-01 12:11:15 +00:00
|
|
|
def listing_filter_options(func):
    """Attach the shared listing-filter options to a click command callback.

    The options added are ``--type``/``-t``, ``--min-bedrooms``,
    ``--max-bedrooms``, ``--min-price``, ``--max-price``, ``--district``,
    ``--furnish-types``/``-f``, ``--available-from``, ``--last-seen-days`` and
    ``--min-sqm``.  ``func`` is wrapped in a pass-through function and the
    options are registered on that wrapper, which is returned.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        return func(*args, **kwargs)

    # Declared top-to-bottom in the order they would appear as stacked
    # decorators.
    option_decorators = [
        click.option(
            "--type",
            "-t",
            help="Type of listing to scrape",
            type=click.Choice(
                ListingType.__members__.keys(),
                case_sensitive=False,
            ),
            required=True,
        ),
        click.option(
            "--min-bedrooms",
            default=1,
            help="Minimum number of bedrooms",
            type=click.IntRange(min=1),
        ),
        click.option(
            "--max-bedrooms",
            default=10,
            help="Maximum number of bedrooms",
            type=click.IntRange(min=1, max=10),  # Right move gets unhappy with >10
        ),
        click.option(
            "--min-price",
            default=0,
            help="Minimum price",
            type=click.IntRange(min=0),
        ),
        click.option(
            "--max-price",
            default=999_999,
            help="Maximum price",
            type=click.IntRange(min=0),  # 40k for renting
        ),
        click.option(
            "--district",
            default=None,
            help="Districts to scrape",
            type=click.Choice(get_districts().keys(), case_sensitive=False),
            multiple=True,
        ),
        click.option(
            "--furnish-types",
            "-f",
            help="Furnish types for rented listings",
            type=click.Choice(
                [furnish_type.name for furnish_type in FurnishType.__members__.values()],
                case_sensitive=False,
            ),
            multiple=True,
        ),
        click.option(
            "--available-from",
            help="Let date available from",
            default=None,
            type=click.DateTime(),
        ),
        click.option(
            "--last-seen-days",
            help="Last seen (days). If set, only listings that were seen in the last N days will be included.",
            default=14,
            type=int,
        ),
        click.option(
            "--min-sqm",
            help="Minimum square meters for the listing",
            default=None,
            type=int,
        ),
    ]

    # Stacked decorators are applied bottom-up, so the list is walked in
    # reverse to register the options in the same order as a decorator stack.
    decorated = wrapper
    for add_option in reversed(option_decorators):
        decorated = add_option(decorated)
    return decorated
|
|
|
|
|
|
|
|
|
|
|
2025-05-14 19:41:13 +00:00
|
|
|
@click.group()
@click.option(
    "--data-dir",
    default=pathlib.Path("data/rs/"),
    # The previous help text ("Districts to scrape") was copied from the
    # --district option and did not describe this option.
    help="Base directory for listing data (listings and images)",
    type=click.Path(
        writable=True,
        file_okay=False,
        dir_okay=True,
        resolve_path=True,
    ),
)
@click.pass_context
def cli(ctx: click.core.Context, data_dir: str):
    """Command group for the listing pipeline commands."""
    # Make sure ctx.obj is a dict, then expose the data directory to the
    # subcommands, which read it back as ctx.obj["data_dir"].
    ctx.ensure_object(dict)
    ctx.obj["data_dir"] = data_dir
|
2025-05-14 20:32:37 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
@cli.command()
@listing_filter_options
@click.option("--full", is_flag=True)
@click.pass_context
def dump_listings(
    ctx: click.core.Context,
    full: bool,
    district: list[str],
    min_bedrooms: int,
    max_bedrooms: int,
    min_price: int,
    max_price: int,
    type: str,
    furnish_types: list[str],
    available_from: datetime | None,
    last_seen_days: int,
    min_sqm: int | None = None,
):
    # Build the query from the CLI filters and run the listing dump.
    #
    # Without --full only `dump_listings` of the stage module is run; with
    # --full `dump_listings_full` is run instead.  Both receive the query
    # parameters, a ListingRepository bound to `engine`, and the data dir.
    #
    # (Documented with comments rather than a docstring so that the text click
    # prints for `--help` stays unchanged.)
    data_dir: str = ctx.obj["data_dir"]

    listing_type = ListingType[type]
    selected_furnish_types = [FurnishType[name] for name in furnish_types]
    query_parameters = QueryParameters(
        listing_type=listing_type,
        district_names=set(district),
        min_bedrooms=min_bedrooms,
        max_bedrooms=max_bedrooms,
        min_price=min_price,
        max_price=max_price,
        furnish_types=selected_furnish_types,
        let_date_available_from=available_from,
        last_seen_days=last_seen_days,
        min_sqm=min_sqm,
        # The remaining values are fixed here and not exposed as CLI options.
        radius=0,
        page_size=500,
        max_days_since_added=14,
    )
    click.echo(
        f"Running dump_listings for districts {district}, data dir {data_dir} and parameters: "
        f"{query_parameters}"
    )

    repository = ListingRepository(engine=engine)
    target_dir = pathlib.Path(data_dir)

    # Pick the coroutine function first so the event loop is started in
    # exactly one place.
    if full:  # include images, floorplan detection etc.
        run_dump = dump_listings_module.dump_listings_full
    else:  # only listings
        run_dump = dump_listings_module.dump_listings
    asyncio.run(run_dump(query_parameters, repository, target_dir))
|
2025-05-14 19:41:13 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
@cli.command()
@click.pass_context
def dump_images(ctx: click.core.Context):
    # Run the image dump stage with a ListingRepository bound to `engine`,
    # using the data dir from the click context as the image base path.
    # (Comments instead of a docstring keep the `--help` output unchanged.)
    image_base_path = ctx.obj["data_dir"]
    click.echo(f"Running dump_images for listings stored in {engine.url}")

    listing_repository = ListingRepository(engine=engine)
    image_job = dump_images_module.dump_images(
        listing_repository,
        image_base_path=image_base_path,
    )
    asyncio.run(image_job)
|
2025-05-14 19:41:13 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
@cli.command()
@click.pass_context
def detect_floorplan(ctx: click.core.Context):
    """Run the floorplan detection stage for listings in the repository."""
    # `ctx` is supplied by @click.pass_context and kept for interface
    # compatibility.  The data dir was previously read from it but never used,
    # so that lookup has been dropped.
    click.echo(f"Running detect_floorplan for listings stored in {engine.url}")
    repository = ListingRepository(engine=engine)
    asyncio.run(detect_floorplan_module.detect_floorplan(repository))
|
2025-05-14 19:41:13 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
@cli.command()
@click.option(
    "--destination-address",
    "-d",
    help="Destination address for routing",
    required=True,
    type=click.STRING,
)
@click.option(
    "--travel-mode",
    "-m",
    help="Travel mode for routing",
    type=click.Choice(
        TravelMode.__members__.keys(),
        case_sensitive=False,
    ),
    required=True,
)
@click.option(
    "--limit",
    "-l",
    help="Limit the number of listings to process",
    type=click.IntRange(min=1),
    default=1,  # by default limit to 1 to avoid accidental API usage
)
@click.pass_context
def routing(
    ctx: click.core.Context, destination_address: str, travel_mode: str, limit: int
):
    """Calculate routes to a destination address for stored listings."""
    # Validate the environment before announcing or doing any work, so a
    # missing API key fails immediately.
    if os.environ.get(API_KEY_ENVIRONMENT_VARIABLE) is None:
        raise click.exceptions.MissingParameter(
            f"{API_KEY_ENVIRONMENT_VARIABLE} environment variable is not set. "
            "Please set it to your API key for the routing service."
        )

    # Listings are taken from the repository, not from listing.json files in
    # the data dir.  The earlier glob over the data dir produced a list that
    # was never used, so it has been removed and the message now names the
    # database like the other repository-backed commands do.
    click.echo(
        f"Running routing for the first {limit} listings stored in {engine.url}"
    )

    repository = ListingRepository(engine=engine)
    asyncio.run(
        routing_module.calculate_route(
            repository,
            destination_address,
            TravelMode[travel_mode],
            limit=limit,  # forwarded as-is to the routing stage
        )
    )
|
2025-05-11 18:59:41 +00:00
|
|
|
|
|
|
|
|
|
2025-05-17 20:13:28 +00:00
|
|
|
# NOTE: the `--columns`/`-C` option for choosing CSV columns is currently
# disabled; `export_to_csv` is called without a column list.
@cli.command()
@click.option(
    "--output-file",
    "-O",
    help="Path to the output CSV file",
    required=True,
    type=click.Path(
        writable=True,
        file_okay=True,
        dir_okay=False,
        resolve_path=True,
    ),
)
@click.pass_context
@listing_filter_options
def export_csv(
    ctx: click.core.Context,
    output_file: str,
    district: list[str],
    min_bedrooms: int,
    max_bedrooms: int,
    min_price: int,
    max_price: int,
    type: str,
    furnish_types: list[str],
    available_from: datetime | None,
    last_seen_days: int,
    min_sqm: int | None = None,
):
    # Export the listings matching the CLI filters to a CSV file.
    #
    # The query parameters are built from the shared filter options and handed
    # to `csv_exporter.export_to_csv` together with a ListingRepository bound
    # to `engine` and the output path.
    # (Comments instead of a docstring keep the `--help` output unchanged.)
    #
    # NOTE(review): an earlier note at this point read "use model"; what it
    # refers to is not evident from this function.
    data_dir = ctx.obj["data_dir"]  # only reported in the message below

    listing_type = ListingType[type]
    selected_furnish_types = [FurnishType[name] for name in furnish_types]
    query_parameters = QueryParameters(
        listing_type=listing_type,
        district_names=set(district),
        min_bedrooms=min_bedrooms,
        max_bedrooms=max_bedrooms,
        min_price=min_price,
        max_price=max_price,
        furnish_types=selected_furnish_types,
        let_date_available_from=available_from,
        last_seen_days=last_seen_days,
        min_sqm=min_sqm,
    )
    click.echo(
        f"Exporting data to {output_file} using {data_dir=} and query parameters: {query_parameters}"
    )

    destination = pathlib.Path(output_file)
    listing_repository = ListingRepository(engine=engine)
    export_job = csv_exporter.export_to_csv(
        listing_repository,
        destination,
        query_parameters=query_parameters,
    )
    asyncio.run(export_job)
|
|
|
|
|
|
|
|
|
|
|
2025-05-26 19:36:54 +00:00
|
|
|
@cli.command()
@click.option(
    "--output-file",
    "-O",
    help="Path to the output immoweb file",
    required=True,
    type=click.Path(
        writable=True,
        file_okay=True,
        dir_okay=False,
        resolve_path=True,
    ),
)
@listing_filter_options
@click.pass_context
def export_immoweb(
    ctx: click.core.Context,
    output_file: str,
    district: list[str],
    min_bedrooms: int,
    max_bedrooms: int,
    min_price: int,
    max_price: int,
    type: str,
    furnish_types: list[str],
    available_from: datetime | None,
    last_seen_days: int,
    min_sqm: int | None = None,
):
    # Export the listings matching the CLI filters through the immoweb UI
    # exporter (`export_immoweb_ui`), which receives a ListingRepository bound
    # to `engine`, the output file path and the query parameters.
    # `ctx` is supplied by @click.pass_context and is not read here.
    # (Comments instead of a docstring keep the `--help` output unchanged.)
    listing_type = ListingType[type]
    selected_furnish_types = [FurnishType[name] for name in furnish_types]
    query_parameters = QueryParameters(
        listing_type=listing_type,
        district_names=set(district),
        min_bedrooms=min_bedrooms,
        max_bedrooms=max_bedrooms,
        min_price=min_price,
        max_price=max_price,
        furnish_types=selected_furnish_types,
        let_date_available_from=available_from,
        last_seen_days=last_seen_days,
        min_sqm=min_sqm,
    )
    click.echo(
        f"Exporting data to {output_file} for listings stored in {engine.url} that match the query parameters: {query_parameters}"
    )

    listing_repository = ListingRepository(engine=engine)
    export_job = export_immoweb_ui(listing_repository, output_file, query_parameters)
    asyncio.run(export_job)
|
2025-05-26 19:36:54 +00:00
|
|
|
|
2025-05-17 20:13:28 +00:00
|
|
|
|
2025-06-04 21:56:26 +00:00
|
|
|
@cli.command()
@click.pass_context
def populate_db(
    ctx: click.core.Context,
):
    # Load every `*/listing.json` found directly under the data dir through
    # `Listing.get_all_listings` and hand the result to the repository's
    # `upsert_listings_legacy`.
    # (Comments instead of a docstring keep the `--help` output unchanged.)
    data_dir = ctx.obj["data_dir"]
    click.echo(f"Populating the database with data from {data_dir}")

    repository = ListingRepository(engine=engine)
    listing_files = list(pathlib.Path(data_dir).glob("*/listing.json"))
    listings = Listing.get_all_listings(listing_files)
    asyncio.run(repository.upsert_listings_legacy(listings))
|
2025-06-04 21:56:26 +00:00
|
|
|
|
|
|
|
|
|
2025-05-31 23:50:43 +00:00
|
|
|
# Invoke the click command group only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    cli()
|