Add services layer, tests, streaming UI, and cleanup legacy code
This commit is contained in:
parent
5514fa6381
commit
d205d15c74
62 changed files with 3729 additions and 1024 deletions
319
crawler/main.py
319
crawler/main.py
|
|
@ -1,28 +1,28 @@
|
|||
"""CLI entry point for the Real Estate Crawler."""
|
||||
import asyncio
|
||||
from datetime import datetime
|
||||
import os
|
||||
import pathlib
|
||||
from typing import Callable, ParamSpec, TypeVar
|
||||
import click
|
||||
import importlib
|
||||
|
||||
from models.listing import FurnishType, ListingType, QueryParameters
|
||||
from rec.districts import get_districts
|
||||
from data_access import Listing
|
||||
import csv_exporter
|
||||
from rec.routing import API_KEY_ENVIRONMENT_VARIABLE, TravelMode
|
||||
from repositories.listing_repository import ListingRepository
|
||||
from ui_exporter import export_immoweb as export_immoweb_ui
|
||||
from functools import wraps
|
||||
from database import engine
|
||||
from services import (
|
||||
listing_service,
|
||||
export_service,
|
||||
district_service,
|
||||
)
|
||||
|
||||
P = ParamSpec("P")
|
||||
R = TypeVar("R")
|
||||
|
||||
|
||||
dump_listings_module = importlib.import_module("1_dump_listings")
|
||||
dump_images_module = importlib.import_module("3_dump_images")
|
||||
detect_floorplan_module = importlib.import_module("4_detect_floorplan")
|
||||
routing_module = importlib.import_module("5_routing")
|
||||
|
||||
|
||||
def listing_filter_options(func):
|
||||
def listing_filter_options(func: Callable[P, R]) -> Callable[P, R]:
|
||||
"""Decorator to add common options for filtering listings."""
|
||||
|
||||
@click.option(
|
||||
|
|
@ -45,7 +45,7 @@ def listing_filter_options(func):
|
|||
"--max-bedrooms",
|
||||
default=10,
|
||||
help="Maximum number of bedrooms",
|
||||
type=click.IntRange(min=1, max=10), # Right move gets unhappy with >10
|
||||
type=click.IntRange(min=1, max=10),
|
||||
)
|
||||
@click.option(
|
||||
"--min-price",
|
||||
|
|
@ -57,13 +57,13 @@ def listing_filter_options(func):
|
|||
"--max-price",
|
||||
default=999_999,
|
||||
help="Maximum price",
|
||||
type=click.IntRange(min=0), # 40k for renting
|
||||
type=click.IntRange(min=0),
|
||||
)
|
||||
@click.option(
|
||||
"--district",
|
||||
default=None,
|
||||
help="Districts to scrape",
|
||||
type=click.Choice(get_districts().keys(), case_sensitive=False),
|
||||
type=click.Choice(district_service.get_district_names(), case_sensitive=False),
|
||||
multiple=True,
|
||||
)
|
||||
@click.option(
|
||||
|
|
@ -95,17 +95,50 @@ def listing_filter_options(func):
|
|||
type=int,
|
||||
)
|
||||
@wraps(func)
|
||||
def wrapper(*args, **kwargs):
|
||||
def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
|
||||
return func(*args, **kwargs)
|
||||
|
||||
return wrapper
|
||||
|
||||
|
||||
def build_query_parameters(
    type: str,
    district: list[str],
    min_bedrooms: int,
    max_bedrooms: int,
    min_price: int,
    max_price: int,
    furnish_types: list[str],
    available_from: datetime | None,
    last_seen_days: int,
    min_sqm: int | None = None,
    radius: int = 0,
    page_size: int = 500,
    max_days_since_added: int = 14,
) -> QueryParameters:
    """Translate raw CLI option values into a ``QueryParameters`` instance.

    Args:
        type: Name used to look up a member via ``ListingType[type]``.
        district: Selected district names; an empty selection becomes ``None``.
        min_bedrooms: Passed through unchanged.
        max_bedrooms: Passed through unchanged.
        min_price: Passed through unchanged.
        max_price: Passed through unchanged.
        furnish_types: Names looked up one by one via ``FurnishType[name]``;
            an empty selection becomes ``None``.
        available_from: Forwarded as ``let_date_available_from``.
        last_seen_days: Passed through unchanged.
        min_sqm: Passed through unchanged.
        radius: Passed through unchanged.
        page_size: Passed through unchanged.
        max_days_since_added: Passed through unchanged.

    Returns:
        The ``QueryParameters`` object built from the values above.
    """
    # Resolve the listing type first so a bad name fails before anything else.
    listing_type = ListingType[type]

    # Empty selections are forwarded as None rather than as empty collections.
    if district:
        district_names = set(district)
    else:
        district_names = None

    if furnish_types:
        furnish_members = [FurnishType[name] for name in furnish_types]
    else:
        furnish_members = None

    return QueryParameters(
        listing_type=listing_type,
        district_names=district_names,
        min_bedrooms=min_bedrooms,
        max_bedrooms=max_bedrooms,
        min_price=min_price,
        max_price=max_price,
        furnish_types=furnish_members,
        let_date_available_from=available_from,
        last_seen_days=last_seen_days,
        min_sqm=min_sqm,
        radius=radius,
        page_size=page_size,
        max_days_since_added=max_days_since_added,
    )
|
||||
|
||||
|
||||
@click.group()
|
||||
@click.option(
|
||||
"--data-dir",
|
||||
default=pathlib.Path("data/rs/"),
|
||||
help="Districts to scrape",
|
||||
help="Data directory for storing listings",
|
||||
type=click.Path(
|
||||
writable=True,
|
||||
file_okay=False,
|
||||
|
|
@ -114,17 +147,18 @@ def listing_filter_options(func):
|
|||
),
|
||||
)
|
||||
@click.pass_context
|
||||
def cli(ctx, data_dir: str):
|
||||
def cli(ctx: click.Context, data_dir: str) -> None:
|
||||
ctx.ensure_object(dict)
|
||||
ctx.obj["data_dir"] = data_dir
|
||||
ctx.obj["data_dir"] = pathlib.Path(data_dir)
|
||||
ctx.obj["repository"] = ListingRepository(engine=engine)
|
||||
|
||||
|
||||
@cli.command()
|
||||
@listing_filter_options
|
||||
@click.option("--full", is_flag=True)
|
||||
@click.option("--full", is_flag=True, help="Include images and floorplan detection")
|
||||
@click.pass_context
|
||||
def dump_listings(
|
||||
ctx: click.core.Context,
|
||||
ctx: click.Context,
|
||||
full: bool,
|
||||
district: list[str],
|
||||
min_bedrooms: int,
|
||||
|
|
@ -136,58 +170,63 @@ def dump_listings(
|
|||
available_from: datetime | None,
|
||||
last_seen_days: int,
|
||||
min_sqm: int | None = None,
|
||||
):
|
||||
data_dir: str = ctx.obj["data_dir"]
|
||||
query_parameters = QueryParameters(
|
||||
listing_type=ListingType[type],
|
||||
district_names=set(district),
|
||||
) -> None:
|
||||
"""Fetch listings from Rightmove API."""
|
||||
data_dir: pathlib.Path = ctx.obj["data_dir"]
|
||||
repository: ListingRepository = ctx.obj["repository"]
|
||||
|
||||
query_parameters = build_query_parameters(
|
||||
type=type,
|
||||
district=district,
|
||||
min_bedrooms=min_bedrooms,
|
||||
max_bedrooms=max_bedrooms,
|
||||
min_price=min_price,
|
||||
max_price=max_price,
|
||||
furnish_types=[FurnishType[furnish_type] for furnish_type in furnish_types],
|
||||
let_date_available_from=available_from,
|
||||
furnish_types=furnish_types,
|
||||
available_from=available_from,
|
||||
last_seen_days=last_seen_days,
|
||||
min_sqm=min_sqm,
|
||||
radius=0,
|
||||
page_size=500,
|
||||
max_days_since_added=14,
|
||||
)
|
||||
click.echo(
|
||||
f"Running dump_listings for districts {district}, data dir {data_dir} and parameters: "
|
||||
f"{query_parameters}"
|
||||
|
||||
click.echo(f"Fetching listings with parameters: {query_parameters}")
|
||||
|
||||
result = asyncio.run(
|
||||
listing_service.refresh_listings(
|
||||
repository,
|
||||
query_parameters,
|
||||
full=full,
|
||||
async_mode=False,
|
||||
)
|
||||
)
|
||||
data_dir_path = pathlib.Path(data_dir)
|
||||
repository = ListingRepository(engine=engine)
|
||||
if not full: # only listings
|
||||
asyncio.run(
|
||||
dump_listings_module.dump_listings(
|
||||
query_parameters, repository, data_dir_path
|
||||
)
|
||||
)
|
||||
else: # include images, floorplan detection etc.
|
||||
asyncio.run(
|
||||
dump_listings_module.dump_listings_full(
|
||||
query_parameters, repository, data_dir_path
|
||||
)
|
||||
)
|
||||
|
||||
click.echo(result.message)
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.pass_context
|
||||
def dump_images(ctx: click.core.Context):
|
||||
data_dir = ctx.obj["data_dir"]
|
||||
click.echo(f"Running dump_images for listings stored in {engine.url}")
|
||||
repository = ListingRepository(engine=engine)
|
||||
asyncio.run(dump_images_module.dump_images(repository, image_base_path=data_dir))
|
||||
def dump_images(ctx: click.Context) -> None:
|
||||
"""Download floorplan images for all listings."""
|
||||
data_dir: pathlib.Path = ctx.obj["data_dir"]
|
||||
repository: ListingRepository = ctx.obj["repository"]
|
||||
|
||||
click.echo(f"Downloading images to {data_dir}")
|
||||
|
||||
count = asyncio.run(listing_service.download_images(repository, data_dir))
|
||||
|
||||
click.echo(f"Processed {count} listings")
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.pass_context
|
||||
def detect_floorplan(ctx: click.core.Context):
|
||||
click.echo(f"Running detect_floorplan for listings stored in {engine.url}")
|
||||
repository = ListingRepository(engine=engine)
|
||||
asyncio.run(detect_floorplan_module.detect_floorplan(repository))
|
||||
def detect_floorplan(ctx: click.Context) -> None:
|
||||
"""Run OCR on floorplan images to detect square meters."""
|
||||
repository: ListingRepository = ctx.obj["repository"]
|
||||
|
||||
click.echo("Running floorplan detection...")
|
||||
|
||||
count = asyncio.run(listing_service.detect_floorplans(repository))
|
||||
|
||||
click.echo(f"Processed {count} listings")
|
||||
|
||||
|
||||
@cli.command()
|
||||
|
|
@ -202,10 +241,7 @@ def detect_floorplan(ctx: click.core.Context):
|
|||
"--travel-mode",
|
||||
"-m",
|
||||
help="Travel mode for routing",
|
||||
type=click.Choice(
|
||||
TravelMode.__members__.keys(),
|
||||
case_sensitive=False,
|
||||
),
|
||||
type=click.Choice(TravelMode.__members__.keys(), case_sensitive=False),
|
||||
required=True,
|
||||
)
|
||||
@click.option(
|
||||
|
|
@ -213,65 +249,50 @@ def detect_floorplan(ctx: click.core.Context):
|
|||
"-l",
|
||||
help="Limit the number of listings to process",
|
||||
type=click.IntRange(min=1),
|
||||
default=1, # by default limit to 1 to avoid accidental API usage
|
||||
default=1,
|
||||
)
|
||||
@click.pass_context
|
||||
def routing(
|
||||
ctx: click.core.Context, destination_address: str, travel_mode: str, limit: int
|
||||
):
|
||||
data_dir = ctx.obj["data_dir"]
|
||||
click.echo(f"Running routing for the first {limit} listings in {data_dir}")
|
||||
listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json")))
|
||||
listing_paths = listing_paths[:limit]
|
||||
ctx: click.Context,
|
||||
destination_address: str,
|
||||
travel_mode: str,
|
||||
limit: int,
|
||||
) -> None:
|
||||
"""Calculate transit routes for listings."""
|
||||
repository: ListingRepository = ctx.obj["repository"]
|
||||
|
||||
if os.environ.get(API_KEY_ENVIRONMENT_VARIABLE) is None:
|
||||
raise click.exceptions.MissingParameter(
|
||||
f"{API_KEY_ENVIRONMENT_VARIABLE} environment variable is not set. "
|
||||
"Please set it to your API key for the routing service."
|
||||
raise click.ClickException(
|
||||
f"{API_KEY_ENVIRONMENT_VARIABLE} environment variable is not set."
|
||||
)
|
||||
|
||||
repository = ListingRepository(engine=engine)
|
||||
asyncio.run(
|
||||
routing_module.calculate_route(
|
||||
click.echo(f"Calculating routes to '{destination_address}' for {limit} listings")
|
||||
|
||||
count = asyncio.run(
|
||||
listing_service.calculate_routes(
|
||||
repository,
|
||||
destination_address,
|
||||
# destination_address_coordinates,
|
||||
TravelMode[travel_mode],
|
||||
travel_mode,
|
||||
limit=limit,
|
||||
)
|
||||
)
|
||||
|
||||
click.echo(f"Processed {count} listings")
|
||||
|
||||
|
||||
@cli.command()
|
||||
# @click.option(
|
||||
# "--columns",
|
||||
# "-C",
|
||||
# help="Columns to include in the CSV file",
|
||||
# type=click.Choice(
|
||||
# # csv_exporter.get_columns_from_listings(),
|
||||
# [1],
|
||||
# case_sensitive=False,
|
||||
# ),
|
||||
# multiple=True,
|
||||
# default=Listing.ALL_COLUMNS,
|
||||
# )
|
||||
@click.option(
|
||||
"--output-file",
|
||||
"-O",
|
||||
help="Path to the output CSV file",
|
||||
required=True,
|
||||
type=click.Path(
|
||||
writable=True,
|
||||
file_okay=True,
|
||||
dir_okay=False,
|
||||
resolve_path=True,
|
||||
),
|
||||
type=click.Path(writable=True, file_okay=True, dir_okay=False, resolve_path=True),
|
||||
)
|
||||
@click.pass_context
|
||||
@listing_filter_options
|
||||
@click.pass_context
|
||||
def export_csv(
|
||||
ctx: click.core.Context,
|
||||
ctx: click.Context,
|
||||
output_file: str,
|
||||
# columns: tuple[str],
|
||||
district: list[str],
|
||||
min_bedrooms: int,
|
||||
max_bedrooms: int,
|
||||
|
|
@ -282,53 +303,48 @@ def export_csv(
|
|||
available_from: datetime | None,
|
||||
last_seen_days: int,
|
||||
min_sqm: int | None = None,
|
||||
):
|
||||
# use model
|
||||
data_dir = ctx.obj["data_dir"]
|
||||
query_parameters = QueryParameters(
|
||||
listing_type=ListingType[type],
|
||||
district_names=set(district),
|
||||
) -> None:
|
||||
"""Export listings to CSV file."""
|
||||
repository: ListingRepository = ctx.obj["repository"]
|
||||
|
||||
query_parameters = build_query_parameters(
|
||||
type=type,
|
||||
district=district,
|
||||
min_bedrooms=min_bedrooms,
|
||||
max_bedrooms=max_bedrooms,
|
||||
min_price=min_price,
|
||||
max_price=max_price,
|
||||
furnish_types=[FurnishType[furnish_type] for furnish_type in furnish_types],
|
||||
let_date_available_from=available_from,
|
||||
furnish_types=furnish_types,
|
||||
available_from=available_from,
|
||||
last_seen_days=last_seen_days,
|
||||
min_sqm=min_sqm,
|
||||
)
|
||||
click.echo(
|
||||
f"Exporting data to {output_file} using {data_dir=} and query parameters: {query_parameters}"
|
||||
)
|
||||
output_file_path = pathlib.Path(output_file)
|
||||
repository = ListingRepository(engine=engine)
|
||||
asyncio.run(
|
||||
csv_exporter.export_to_csv(
|
||||
|
||||
click.echo(f"Exporting to {output_file}")
|
||||
|
||||
result = asyncio.run(
|
||||
export_service.export_to_csv(
|
||||
repository,
|
||||
output_file_path,
|
||||
# list(columns),
|
||||
query_parameters=query_parameters,
|
||||
),
|
||||
pathlib.Path(output_file),
|
||||
query_parameters,
|
||||
)
|
||||
)
|
||||
|
||||
click.echo(result.message)
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.option(
|
||||
"--output-file",
|
||||
"-O",
|
||||
help="Path to the output immoweb file",
|
||||
help="Path to the output GeoJSON file",
|
||||
required=True,
|
||||
type=click.Path(
|
||||
writable=True,
|
||||
file_okay=True,
|
||||
dir_okay=False,
|
||||
resolve_path=True,
|
||||
),
|
||||
type=click.Path(writable=True, file_okay=True, dir_okay=False, resolve_path=True),
|
||||
)
|
||||
@listing_filter_options
|
||||
@click.pass_context
|
||||
def export_immoweb(
|
||||
ctx: click.core.Context,
|
||||
ctx: click.Context,
|
||||
output_file: str,
|
||||
district: list[str],
|
||||
min_bedrooms: int,
|
||||
|
|
@ -340,39 +356,62 @@ def export_immoweb(
|
|||
available_from: datetime | None,
|
||||
last_seen_days: int,
|
||||
min_sqm: int | None = None,
|
||||
):
|
||||
query_parameters = QueryParameters(
|
||||
listing_type=ListingType[type],
|
||||
district_names=set(district),
|
||||
) -> None:
|
||||
"""Export listings to GeoJSON file for map visualization."""
|
||||
repository: ListingRepository = ctx.obj["repository"]
|
||||
|
||||
query_parameters = build_query_parameters(
|
||||
type=type,
|
||||
district=district,
|
||||
min_bedrooms=min_bedrooms,
|
||||
max_bedrooms=max_bedrooms,
|
||||
min_price=min_price,
|
||||
max_price=max_price,
|
||||
furnish_types=[FurnishType[furnish_type] for furnish_type in furnish_types],
|
||||
let_date_available_from=available_from,
|
||||
furnish_types=furnish_types,
|
||||
available_from=available_from,
|
||||
last_seen_days=last_seen_days,
|
||||
min_sqm=min_sqm,
|
||||
)
|
||||
click.echo(
|
||||
f"Exporting data to {output_file} for listings stored in {engine.url} that match the query parameters: {query_parameters}"
|
||||
|
||||
click.echo(f"Exporting to {output_file}")
|
||||
|
||||
result = asyncio.run(
|
||||
export_service.export_to_geojson(
|
||||
repository,
|
||||
query_parameters=query_parameters,
|
||||
output_path=pathlib.Path(output_file),
|
||||
)
|
||||
)
|
||||
repository = ListingRepository(engine=engine)
|
||||
asyncio.run(export_immoweb_ui(repository, output_file, query_parameters))
|
||||
|
||||
click.echo(result.message)
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.pass_context
|
||||
def populate_db(
|
||||
ctx: click.core.Context,
|
||||
):
|
||||
data_dir = ctx.obj["data_dir"]
|
||||
click.echo(f"Populating the database with data from {data_dir}")
|
||||
repository = ListingRepository(engine=engine)
|
||||
def populate_db(ctx: click.Context) -> None:
|
||||
"""Populate database from filesystem data (legacy migration)."""
|
||||
data_dir: pathlib.Path = ctx.obj["data_dir"]
|
||||
repository: ListingRepository = ctx.obj["repository"]
|
||||
|
||||
click.echo(f"Populating database from {data_dir}")
|
||||
|
||||
listings = Listing.get_all_listings(
|
||||
[path for path in pathlib.Path(data_dir).glob("*/listing.json")]
|
||||
[path for path in data_dir.glob("*/listing.json")]
|
||||
)
|
||||
|
||||
asyncio.run(repository.upsert_listings_legacy(listings))
|
||||
|
||||
click.echo(f"Imported {len(listings)} listings")
|
||||
|
||||
|
||||
@cli.command()
def list_districts() -> None:
    """Echo a header with the district count, then each district name sorted.

    The names are the keys of the mapping returned by
    ``district_service.get_all_districts()``.
    """
    known = district_service.get_all_districts()
    click.echo(f"Available districts ({len(known)}):")
    for district_name in sorted(known.keys()):
        click.echo(f" - {district_name}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
cli()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue