use query params to filter out models; also make csv exporter work with models

This commit is contained in:
Viktor Barzin 2025-06-08 17:01:33 +00:00
parent 80c335ba04
commit e317d2ec54
No known key found for this signature in database
GPG key ID: 4056458DBDBF8863
5 changed files with 72 additions and 113 deletions

View file

@ -1,24 +1,26 @@
import asyncio
from pathlib import Path
from data_access import Listing
import pandas as pd
from rec.query import QueryParameters, filter_listings
from rec.query import QueryParameters
from repositories.listing_repository import ListingRepository
async def export_to_csv(
listings: list[Listing],
repository: ListingRepository,
output_file: Path,
columns: list[str],
query_parameters: QueryParameters | None = None,
) -> None:
if query_parameters is not None:
listings = await filter_listings(listings, query_parameters)
ds = await asyncio.gather(*[listing.dict_nicely() for listing in listings])
listings = await repository.get_listings(query_parameters=query_parameters)
ds = [*[listing.__dict__ for listing in listings]]
df = pd.DataFrame(ds)
# read decisions on file
decisions_path = "data/decisions.json"
decisions = pd.read_json(decisions_path)
df.loc[:, "decision"] = df.identifier.apply(lambda x: decisions.get(x))
df.loc[:, "decision"] = df.id.apply(lambda x: decisions.get(x))
# remove _sa_instance_state column
drop_columns = ["_sa_instance_state", "additional_info"]
df = df.drop(columns=drop_columns)
# remove all entries where we didn't calculate transit time (probably because the distance was too far)
# df2 = df[df.travel_time_fastest.notna()]
@ -30,9 +32,15 @@ async def export_to_csv(
# s1 = df2
# fill in gap values for service charge and lease left. This is for excel so we can use filters better there
if "service_charge" not in df2.columns:
df2.loc[:, "service_charge"] = -1
df2.loc[:, "service_charge"] = df2.service_charge.fillna(-1)
if "lease_left" not in df2.columns:
df2.loc[:, "lease_left"] = -1
df2.loc[:, "lease_left"] = df2.lease_left.fillna(-1)
df2.loc[:, "sqm_ocr"] = df2.sqm_ocr.fillna(-1)
if "square_meters" not in df2.columns:
df2.loc[:, "square_meters"] = -1
df2.loc[:, "square_meters"] = df2.square_meters.fillna(-1)
df3 = df2
# df3 = pd.concat([df2.drop(['travel_time_fastest', 'travel_time_second'], axis=1), s1], axis=1)
@ -40,6 +48,11 @@ async def export_to_csv(
df3.shape
df4 = df3
df5 = df4[columns]
# df5 = df4[columns]
# Add some interesting columns
df4.loc[:, "price_per_sqm"] = df4.price / df4.square_meters
df5 = df4
df6 = df5.sort_values(by=["price_per_sqm"], ascending=True)
df6.to_csv(str(output_file), index=False)

View file

@ -6,10 +6,10 @@ import pathlib
import click
import importlib
from models.listing import FurnishType, ListingType, QueryParameters
from rec.districts import get_districts
from data_access import Listing
import csv_exporter
from rec.query import ListingType, FurnishType, QueryParameters
from rec.routing import API_KEY_ENVIRONMENT_VARIABLE, TravelMode
from repositories.listing_repository import ListingRepository
from ui_exporter import export_immoweb as export_immoweb_ui
@ -230,17 +230,18 @@ def routing(
@cli.command()
@click.option(
"--columns",
"-C",
help="Columns to include in the CSV file",
type=click.Choice(
Listing.ALL_COLUMNS,
case_sensitive=False,
),
multiple=True,
default=Listing.ALL_COLUMNS,
)
# @click.option(
# "--columns",
# "-C",
# help="Columns to include in the CSV file",
# type=click.Choice(
# # csv_exporter.get_columns_from_listings(),
# [1],
# case_sensitive=False,
# ),
# multiple=True,
# default=Listing.ALL_COLUMNS,
# )
@click.option(
"--output-file",
"-O",
@ -258,7 +259,7 @@ def routing(
def export_csv(
ctx: click.core.Context,
output_file: str,
columns: tuple[str],
# columns: tuple[str],
district: list[str],
min_bedrooms: int,
max_bedrooms: int,
@ -270,6 +271,7 @@ def export_csv(
last_seen_days: int,
min_sqm: int | None = None,
):
# use model
data_dir = ctx.obj["data_dir"]
query_parameters = QueryParameters(
listing_type=ListingType[type],
@ -287,13 +289,12 @@ def export_csv(
f"Exporting data to {output_file} using {data_dir=} and query parameters: {query_parameters}"
)
output_file_path = pathlib.Path(output_file)
listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json")))
listings = Listing.get_all_listings([path for path in listing_paths])
repository = ListingRepository(engine=engine)
asyncio.run(
csv_exporter.export_to_csv(
listings,
repository,
output_file_path,
list(columns),
# list(columns),
query_parameters=query_parameters,
),
)
@ -328,6 +329,7 @@ def export_immoweb(
last_seen_days: int,
min_sqm: int | None = None,
):
# use model
query_parameters = QueryParameters(
listing_type=ListingType[type],
district_names=set(district),

View file

@ -132,7 +132,8 @@ class Listing(SQLModel, table=False):
class FurnishType(enum.StrEnum):
FURNISHED = "furnished"
UNFURNISHED = "unfurnished"
PART_FURNISHED = "partFurnished"
PART_FURNISHED = "part furnished"
ASK_LANDLORD = "ask landlord"
UNKNOWN = "unknown"
@ -156,25 +157,6 @@ class DestinationMode:
def __hash__(self) -> int:
    """Hash the instance on its (destination_address, travel_mode) pair."""
    identity = (self.destination_address, self.travel_mode)
    return hash(identity)
# def to_dict(self) -> dict[str, str | routing.TravelMode]:
# return {
# "destination_address": self.destination_address,
# "travel_mode": self.travel_mode.value,
# }
# @classmethod
# def from_dict(cls, data: dict):
# return cls(
# destination_address=data["destination_address"],
# travel_mode=routing.TravelMode(data["travel_mode"]),
# )
# def __json__(self) -> dict[str, str | routing.TravelMode]:
# return {
# "destination_address": self.destination_address,
# "travel_mode": self.travel_mode.value,
# }
def __getstate__(self):
    """Return the result of ``asdict(self)`` as this object's state."""
    # Serializers that consult __getstate__ receive a dict representation.
    # NOTE(review): asdict is presumably dataclasses.asdict - confirm against the file's imports.
    state = asdict(self)
    return state
@ -182,3 +164,28 @@ class DestinationMode:
def __iter__(self):
    """Iterate over the ``(key, value)`` pairs of ``asdict(self)``."""
    # Yielding item pairs lets dict(obj) work where a mapping is expected.
    pairs = asdict(self).items()
    return iter(pairs)
class ListingType(enum.StrEnum):
BUY = "BUY"
RENT = "RENT"
@dataclass(frozen=True)
class QueryParameters:
    """Immutable set of search criteria for listings.

    Only ``listing_type`` is required; every other field has a default.
    """

    listing_type: ListingType

    # Bedroom and price limits.
    min_bedrooms: int = 1
    max_bedrooms: int = 999
    min_price: int = 0
    max_price: int = 10_000_000

    # default_factory gives every instance its own empty set.
    district_names: set[str] = dataclasses.field(default_factory=set)
    radius: float = 0
    page_size: int = 500  # items per page
    max_days_since_added: int = 30
    furnish_types: list[FurnishType] | None = None

    # The values below are not supported by rightmove,
    # hence we apply them after fetching.
    # available from; council tax
    let_date_available_from: datetime | None = None
    last_seen_days: int | None = None
    min_sqm: int | None = None

View file

@ -6,70 +6,7 @@ import enum
from typing import Any
import aiohttp
from data_access import Listing
from models.listing import FurnishType
class ListingType(enum.StrEnum):
BUY = "BUY"
RENT = "RENT"
@dataclass(frozen=True)
class QueryParameters:
    """Immutable search criteria for listings.

    The listing type, bedroom bounds, price bounds and district names must
    all be supplied; the remaining fields have defaults.
    """

    # Required fields (no defaults).
    listing_type: ListingType
    min_bedrooms: int
    max_bedrooms: int
    min_price: int
    max_price: int
    district_names: set[str]

    # Optional fields.
    radius: float = 0
    page_size: int = 500  # items per page
    max_days_since_added: int = 30
    furnish_types: list[FurnishType] | None = None

    # The values below are not supported by rightmove,
    # hence we apply them after fetching.
    # available from; council tax
    let_date_available_from: datetime | None = None
    last_seen_days: int | None = None
    min_sqm: int | None = None
async def filter_listings(
    listings: list[Listing],
    query_parameters: QueryParameters,
) -> list[Listing]:
    """
    Return the listings that pass every check in ``query_parameters``.

    Checks run in this order, and a listing is dropped at the first failure:
    bedrooms within [min_bedrooms, max_bedrooms]; price within
    [min_price, max_price]; ``last_seen`` not greater than ``last_seen_days``
    (when set); ``letDateAvailable`` not earlier than
    ``let_date_available_from`` (when both are set); ``sqm_ocr()`` not below
    ``min_sqm`` (when set). The input order is preserved.
    """
    params = query_parameters
    kept = []
    for candidate in listings:
        bedrooms_out_of_range = (
            candidate.bedrooms > params.max_bedrooms
            or candidate.bedrooms < params.min_bedrooms
        )
        if bedrooms_out_of_range:
            continue

        price_out_of_range = (
            candidate.price < params.min_price
            or candidate.price > params.max_price
        )
        if price_out_of_range:
            continue

        if (
            params.last_seen_days is not None
            and candidate.last_seen > params.last_seen_days
        ):
            continue

        if (
            candidate.letDateAvailable is not None
            and params.let_date_available_from is not None
            and candidate.letDateAvailable < params.let_date_available_from
        ):
            continue

        # sqm_ocr() is awaited for every listing that reaches this point,
        # even when no minimum is set; a falsy result counts as 0.
        area = (await candidate.sqm_ocr()) or 0
        if params.min_sqm is not None and area < params.min_sqm:
            continue

        kept.append(candidate)
    return kept
from models.listing import FurnishType, ListingType, QueryParameters
headers = {

View file

@ -2,7 +2,7 @@ import json
import pathlib
from data_access import Listing
from rec.query import QueryParameters, filter_listings
from rec.query import QueryParameters
async def export_immoweb(