Add services layer, tests, streaming UI, and clean up legacy code

This commit is contained in:
Viktor Barzin 2026-02-06 20:55:10 +00:00
parent 5514fa6381
commit d205d15c74
62 changed files with 3729 additions and 1024 deletions

View file

@ -1,28 +1,28 @@
"""CLI entry point for the Real Estate Crawler."""
import asyncio
from datetime import datetime
import os
import pathlib
from typing import Callable, ParamSpec, TypeVar
import click
import importlib
from models.listing import FurnishType, ListingType, QueryParameters
from rec.districts import get_districts
from data_access import Listing
import csv_exporter
from rec.routing import API_KEY_ENVIRONMENT_VARIABLE, TravelMode
from repositories.listing_repository import ListingRepository
from ui_exporter import export_immoweb as export_immoweb_ui
from functools import wraps
from database import engine
from services import (
listing_service,
export_service,
district_service,
)
P = ParamSpec("P")
R = TypeVar("R")
dump_listings_module = importlib.import_module("1_dump_listings")
dump_images_module = importlib.import_module("3_dump_images")
detect_floorplan_module = importlib.import_module("4_detect_floorplan")
routing_module = importlib.import_module("5_routing")
def listing_filter_options(func):
def listing_filter_options(func: Callable[P, R]) -> Callable[P, R]:
"""Decorator to add common options for filtering listings."""
@click.option(
@ -45,7 +45,7 @@ def listing_filter_options(func):
"--max-bedrooms",
default=10,
help="Maximum number of bedrooms",
type=click.IntRange(min=1, max=10), # Right move gets unhappy with >10
type=click.IntRange(min=1, max=10),
)
@click.option(
"--min-price",
@ -57,13 +57,13 @@ def listing_filter_options(func):
"--max-price",
default=999_999,
help="Maximum price",
type=click.IntRange(min=0), # 40k for renting
type=click.IntRange(min=0),
)
@click.option(
"--district",
default=None,
help="Districts to scrape",
type=click.Choice(get_districts().keys(), case_sensitive=False),
type=click.Choice(district_service.get_district_names(), case_sensitive=False),
multiple=True,
)
@click.option(
@ -95,17 +95,50 @@ def listing_filter_options(func):
type=int,
)
@wraps(func)
def wrapper(*args, **kwargs):
def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
return func(*args, **kwargs)
return wrapper
def build_query_parameters(
    type: str,
    district: list[str],
    min_bedrooms: int,
    max_bedrooms: int,
    min_price: int,
    max_price: int,
    furnish_types: list[str],
    available_from: datetime | None,
    last_seen_days: int,
    min_sqm: int | None = None,
    radius: int = 0,
    page_size: int = 500,
    max_days_since_added: int = 14,
) -> QueryParameters:
    """Translate raw CLI option values into a ``QueryParameters`` object.

    ``type`` and every entry of ``furnish_types`` are looked up by member
    name on ``ListingType`` and ``FurnishType`` respectively. An empty
    ``district`` or ``furnish_types`` selection is passed on as ``None``
    instead of an empty collection. ``available_from`` is forwarded as
    ``let_date_available_from``; all remaining arguments are forwarded
    under their own names.
    """
    # Resolve the listing type first so a bad name fails before anything else.
    listing_kind = ListingType[type]

    # Falsy selections (empty tuple/list or None) mean "no district filter".
    if district:
        selected_districts = set(district)
    else:
        selected_districts = None

    # Same convention for furnishing: nothing selected -> None.
    if furnish_types:
        selected_furnishings = [FurnishType[name] for name in furnish_types]
    else:
        selected_furnishings = None

    return QueryParameters(
        listing_type=listing_kind,
        district_names=selected_districts,
        min_bedrooms=min_bedrooms,
        max_bedrooms=max_bedrooms,
        min_price=min_price,
        max_price=max_price,
        furnish_types=selected_furnishings,
        let_date_available_from=available_from,
        last_seen_days=last_seen_days,
        min_sqm=min_sqm,
        radius=radius,
        page_size=page_size,
        max_days_since_added=max_days_since_added,
    )
@click.group()
@click.option(
"--data-dir",
default=pathlib.Path("data/rs/"),
help="Districts to scrape",
help="Data directory for storing listings",
type=click.Path(
writable=True,
file_okay=False,
@ -114,17 +147,18 @@ def listing_filter_options(func):
),
)
@click.pass_context
def cli(ctx, data_dir: str):
def cli(ctx: click.Context, data_dir: str) -> None:
ctx.ensure_object(dict)
ctx.obj["data_dir"] = data_dir
ctx.obj["data_dir"] = pathlib.Path(data_dir)
ctx.obj["repository"] = ListingRepository(engine=engine)
@cli.command()
@listing_filter_options
@click.option("--full", is_flag=True)
@click.option("--full", is_flag=True, help="Include images and floorplan detection")
@click.pass_context
def dump_listings(
ctx: click.core.Context,
ctx: click.Context,
full: bool,
district: list[str],
min_bedrooms: int,
@ -136,58 +170,63 @@ def dump_listings(
available_from: datetime | None,
last_seen_days: int,
min_sqm: int | None = None,
):
data_dir: str = ctx.obj["data_dir"]
query_parameters = QueryParameters(
listing_type=ListingType[type],
district_names=set(district),
) -> None:
"""Fetch listings from Rightmove API."""
data_dir: pathlib.Path = ctx.obj["data_dir"]
repository: ListingRepository = ctx.obj["repository"]
query_parameters = build_query_parameters(
type=type,
district=district,
min_bedrooms=min_bedrooms,
max_bedrooms=max_bedrooms,
min_price=min_price,
max_price=max_price,
furnish_types=[FurnishType[furnish_type] for furnish_type in furnish_types],
let_date_available_from=available_from,
furnish_types=furnish_types,
available_from=available_from,
last_seen_days=last_seen_days,
min_sqm=min_sqm,
radius=0,
page_size=500,
max_days_since_added=14,
)
click.echo(
f"Running dump_listings for districts {district}, data dir {data_dir} and parameters: "
f"{query_parameters}"
click.echo(f"Fetching listings with parameters: {query_parameters}")
result = asyncio.run(
listing_service.refresh_listings(
repository,
query_parameters,
full=full,
async_mode=False,
)
)
data_dir_path = pathlib.Path(data_dir)
repository = ListingRepository(engine=engine)
if not full: # only listings
asyncio.run(
dump_listings_module.dump_listings(
query_parameters, repository, data_dir_path
)
)
else: # include images, floorplan detection etc.
asyncio.run(
dump_listings_module.dump_listings_full(
query_parameters, repository, data_dir_path
)
)
click.echo(result.message)
@cli.command()
@click.pass_context
def dump_images(ctx: click.core.Context):
data_dir = ctx.obj["data_dir"]
click.echo(f"Running dump_images for listings stored in {engine.url}")
repository = ListingRepository(engine=engine)
asyncio.run(dump_images_module.dump_images(repository, image_base_path=data_dir))
def dump_images(ctx: click.Context) -> None:
"""Download floorplan images for all listings."""
data_dir: pathlib.Path = ctx.obj["data_dir"]
repository: ListingRepository = ctx.obj["repository"]
click.echo(f"Downloading images to {data_dir}")
count = asyncio.run(listing_service.download_images(repository, data_dir))
click.echo(f"Processed {count} listings")
@cli.command()
@click.pass_context
def detect_floorplan(ctx: click.core.Context):
click.echo(f"Running detect_floorplan for listings stored in {engine.url}")
repository = ListingRepository(engine=engine)
asyncio.run(detect_floorplan_module.detect_floorplan(repository))
def detect_floorplan(ctx: click.Context) -> None:
"""Run OCR on floorplan images to detect square meters."""
repository: ListingRepository = ctx.obj["repository"]
click.echo("Running floorplan detection...")
count = asyncio.run(listing_service.detect_floorplans(repository))
click.echo(f"Processed {count} listings")
@cli.command()
@ -202,10 +241,7 @@ def detect_floorplan(ctx: click.core.Context):
"--travel-mode",
"-m",
help="Travel mode for routing",
type=click.Choice(
TravelMode.__members__.keys(),
case_sensitive=False,
),
type=click.Choice(TravelMode.__members__.keys(), case_sensitive=False),
required=True,
)
@click.option(
@ -213,65 +249,50 @@ def detect_floorplan(ctx: click.core.Context):
"-l",
help="Limit the number of listings to process",
type=click.IntRange(min=1),
default=1, # by default limit to 1 to avoid accidental API usage
default=1,
)
@click.pass_context
def routing(
ctx: click.core.Context, destination_address: str, travel_mode: str, limit: int
):
data_dir = ctx.obj["data_dir"]
click.echo(f"Running routing for the first {limit} listings in {data_dir}")
listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json")))
listing_paths = listing_paths[:limit]
ctx: click.Context,
destination_address: str,
travel_mode: str,
limit: int,
) -> None:
"""Calculate transit routes for listings."""
repository: ListingRepository = ctx.obj["repository"]
if os.environ.get(API_KEY_ENVIRONMENT_VARIABLE) is None:
raise click.exceptions.MissingParameter(
f"{API_KEY_ENVIRONMENT_VARIABLE} environment variable is not set. "
"Please set it to your API key for the routing service."
raise click.ClickException(
f"{API_KEY_ENVIRONMENT_VARIABLE} environment variable is not set."
)
repository = ListingRepository(engine=engine)
asyncio.run(
routing_module.calculate_route(
click.echo(f"Calculating routes to '{destination_address}' for {limit} listings")
count = asyncio.run(
listing_service.calculate_routes(
repository,
destination_address,
# destination_address_coordinates,
TravelMode[travel_mode],
travel_mode,
limit=limit,
)
)
click.echo(f"Processed {count} listings")
@cli.command()
# @click.option(
# "--columns",
# "-C",
# help="Columns to include in the CSV file",
# type=click.Choice(
# # csv_exporter.get_columns_from_listings(),
# [1],
# case_sensitive=False,
# ),
# multiple=True,
# default=Listing.ALL_COLUMNS,
# )
@click.option(
"--output-file",
"-O",
help="Path to the output CSV file",
required=True,
type=click.Path(
writable=True,
file_okay=True,
dir_okay=False,
resolve_path=True,
),
type=click.Path(writable=True, file_okay=True, dir_okay=False, resolve_path=True),
)
@click.pass_context
@listing_filter_options
@click.pass_context
def export_csv(
ctx: click.core.Context,
ctx: click.Context,
output_file: str,
# columns: tuple[str],
district: list[str],
min_bedrooms: int,
max_bedrooms: int,
@ -282,53 +303,48 @@ def export_csv(
available_from: datetime | None,
last_seen_days: int,
min_sqm: int | None = None,
):
# use model
data_dir = ctx.obj["data_dir"]
query_parameters = QueryParameters(
listing_type=ListingType[type],
district_names=set(district),
) -> None:
"""Export listings to CSV file."""
repository: ListingRepository = ctx.obj["repository"]
query_parameters = build_query_parameters(
type=type,
district=district,
min_bedrooms=min_bedrooms,
max_bedrooms=max_bedrooms,
min_price=min_price,
max_price=max_price,
furnish_types=[FurnishType[furnish_type] for furnish_type in furnish_types],
let_date_available_from=available_from,
furnish_types=furnish_types,
available_from=available_from,
last_seen_days=last_seen_days,
min_sqm=min_sqm,
)
click.echo(
f"Exporting data to {output_file} using {data_dir=} and query parameters: {query_parameters}"
)
output_file_path = pathlib.Path(output_file)
repository = ListingRepository(engine=engine)
asyncio.run(
csv_exporter.export_to_csv(
click.echo(f"Exporting to {output_file}")
result = asyncio.run(
export_service.export_to_csv(
repository,
output_file_path,
# list(columns),
query_parameters=query_parameters,
),
pathlib.Path(output_file),
query_parameters,
)
)
click.echo(result.message)
@cli.command()
@click.option(
"--output-file",
"-O",
help="Path to the output immoweb file",
help="Path to the output GeoJSON file",
required=True,
type=click.Path(
writable=True,
file_okay=True,
dir_okay=False,
resolve_path=True,
),
type=click.Path(writable=True, file_okay=True, dir_okay=False, resolve_path=True),
)
@listing_filter_options
@click.pass_context
def export_immoweb(
ctx: click.core.Context,
ctx: click.Context,
output_file: str,
district: list[str],
min_bedrooms: int,
@ -340,39 +356,62 @@ def export_immoweb(
available_from: datetime | None,
last_seen_days: int,
min_sqm: int | None = None,
):
query_parameters = QueryParameters(
listing_type=ListingType[type],
district_names=set(district),
) -> None:
"""Export listings to GeoJSON file for map visualization."""
repository: ListingRepository = ctx.obj["repository"]
query_parameters = build_query_parameters(
type=type,
district=district,
min_bedrooms=min_bedrooms,
max_bedrooms=max_bedrooms,
min_price=min_price,
max_price=max_price,
furnish_types=[FurnishType[furnish_type] for furnish_type in furnish_types],
let_date_available_from=available_from,
furnish_types=furnish_types,
available_from=available_from,
last_seen_days=last_seen_days,
min_sqm=min_sqm,
)
click.echo(
f"Exporting data to {output_file} for listings stored in {engine.url} that match the query parameters: {query_parameters}"
click.echo(f"Exporting to {output_file}")
result = asyncio.run(
export_service.export_to_geojson(
repository,
query_parameters=query_parameters,
output_path=pathlib.Path(output_file),
)
)
repository = ListingRepository(engine=engine)
asyncio.run(export_immoweb_ui(repository, output_file, query_parameters))
click.echo(result.message)
@cli.command()
@click.pass_context
def populate_db(
ctx: click.core.Context,
):
data_dir = ctx.obj["data_dir"]
click.echo(f"Populating the database with data from {data_dir}")
repository = ListingRepository(engine=engine)
def populate_db(ctx: click.Context) -> None:
"""Populate database from filesystem data (legacy migration)."""
data_dir: pathlib.Path = ctx.obj["data_dir"]
repository: ListingRepository = ctx.obj["repository"]
click.echo(f"Populating database from {data_dir}")
listings = Listing.get_all_listings(
[path for path in pathlib.Path(data_dir).glob("*/listing.json")]
[path for path in data_dir.glob("*/listing.json")]
)
asyncio.run(repository.upsert_listings_legacy(listings))
click.echo(f"Imported {len(listings)} listings")
@cli.command()
def list_districts() -> None:
    """List all available districts."""
    # NOTE: the docstring above doubles as the command's --help text in click,
    # so it is user-facing output and is intentionally left unchanged.
    available = district_service.get_all_districts()

    # Header line with the total count, followed by one bullet per name.
    header = f"Available districts ({len(available)}):"
    click.echo(header)
    for district_name in sorted(available.keys()):
        bullet = f" - {district_name}"
        click.echo(bullet)
if __name__ == "__main__":
cli()