Use query params to filter out models; also make the CSV exporter work with models
This commit is contained in:
parent
80c335ba04
commit
e317d2ec54
5 changed files with 72 additions and 113 deletions
|
|
@ -1,24 +1,26 @@
|
||||||
import asyncio
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from data_access import Listing
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from rec.query import QueryParameters, filter_listings
|
from rec.query import QueryParameters
|
||||||
|
from repositories.listing_repository import ListingRepository
|
||||||
|
|
||||||
|
|
||||||
async def export_to_csv(
|
async def export_to_csv(
|
||||||
listings: list[Listing],
|
repository: ListingRepository,
|
||||||
output_file: Path,
|
output_file: Path,
|
||||||
columns: list[str],
|
|
||||||
query_parameters: QueryParameters | None = None,
|
query_parameters: QueryParameters | None = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
if query_parameters is not None:
|
listings = await repository.get_listings(query_parameters=query_parameters)
|
||||||
listings = await filter_listings(listings, query_parameters)
|
ds = [*[listing.__dict__ for listing in listings]]
|
||||||
ds = await asyncio.gather(*[listing.dict_nicely() for listing in listings])
|
|
||||||
df = pd.DataFrame(ds)
|
df = pd.DataFrame(ds)
|
||||||
|
|
||||||
# read decisions on file
|
# read decisions on file
|
||||||
decisions_path = "data/decisions.json"
|
decisions_path = "data/decisions.json"
|
||||||
decisions = pd.read_json(decisions_path)
|
decisions = pd.read_json(decisions_path)
|
||||||
df.loc[:, "decision"] = df.identifier.apply(lambda x: decisions.get(x))
|
df.loc[:, "decision"] = df.id.apply(lambda x: decisions.get(x))
|
||||||
|
|
||||||
|
# remove the _sa_instance_state and additional_info columns
|
||||||
|
drop_columns = ["_sa_instance_state", "additional_info"]
|
||||||
|
df = df.drop(columns=drop_columns)
|
||||||
|
|
||||||
# remove all entries where we didn't calculate transit time (probably because the distance was too great)
|
# remove all entries where we didn't calculate transit time (probably because the distance was too great)
|
||||||
# df2 = df[df.travel_time_fastest.notna()]
|
# df2 = df[df.travel_time_fastest.notna()]
|
||||||
|
|
@ -30,9 +32,15 @@ async def export_to_csv(
|
||||||
# s1 = df2
|
# s1 = df2
|
||||||
|
|
||||||
# fill in gap values for service charge and lease left. This is for excel so we can use filters better there
|
# fill in gap values for service charge and lease left. This is for excel so we can use filters better there
|
||||||
|
if "service_charge" not in df2.columns:
|
||||||
|
df2.loc[:, "service_charge"] = -1
|
||||||
df2.loc[:, "service_charge"] = df2.service_charge.fillna(-1)
|
df2.loc[:, "service_charge"] = df2.service_charge.fillna(-1)
|
||||||
|
if "lease_left" not in df2.columns:
|
||||||
|
df2.loc[:, "lease_left"] = -1
|
||||||
df2.loc[:, "lease_left"] = df2.lease_left.fillna(-1)
|
df2.loc[:, "lease_left"] = df2.lease_left.fillna(-1)
|
||||||
df2.loc[:, "sqm_ocr"] = df2.sqm_ocr.fillna(-1)
|
if "square_meters" not in df2.columns:
|
||||||
|
df2.loc[:, "square_meters"] = -1
|
||||||
|
df2.loc[:, "square_meters"] = df2.square_meters.fillna(-1)
|
||||||
|
|
||||||
df3 = df2
|
df3 = df2
|
||||||
# df3 = pd.concat([df2.drop(['travel_time_fastest', 'travel_time_second'], axis=1), s1], axis=1)
|
# df3 = pd.concat([df2.drop(['travel_time_fastest', 'travel_time_second'], axis=1), s1], axis=1)
|
||||||
|
|
@ -40,6 +48,11 @@ async def export_to_csv(
|
||||||
df3.shape
|
df3.shape
|
||||||
df4 = df3
|
df4 = df3
|
||||||
|
|
||||||
df5 = df4[columns]
|
# df5 = df4[columns]
|
||||||
|
|
||||||
|
# Add some interesting columns
|
||||||
|
df4.loc[:, "price_per_sqm"] = df4.price / df4.square_meters
|
||||||
|
df5 = df4
|
||||||
|
|
||||||
df6 = df5.sort_values(by=["price_per_sqm"], ascending=True)
|
df6 = df5.sort_values(by=["price_per_sqm"], ascending=True)
|
||||||
df6.to_csv(str(output_file), index=False)
|
df6.to_csv(str(output_file), index=False)
|
||||||
|
|
|
||||||
|
|
@ -6,10 +6,10 @@ import pathlib
|
||||||
import click
|
import click
|
||||||
import importlib
|
import importlib
|
||||||
|
|
||||||
|
from models.listing import FurnishType, ListingType, QueryParameters
|
||||||
from rec.districts import get_districts
|
from rec.districts import get_districts
|
||||||
from data_access import Listing
|
from data_access import Listing
|
||||||
import csv_exporter
|
import csv_exporter
|
||||||
from rec.query import ListingType, FurnishType, QueryParameters
|
|
||||||
from rec.routing import API_KEY_ENVIRONMENT_VARIABLE, TravelMode
|
from rec.routing import API_KEY_ENVIRONMENT_VARIABLE, TravelMode
|
||||||
from repositories.listing_repository import ListingRepository
|
from repositories.listing_repository import ListingRepository
|
||||||
from ui_exporter import export_immoweb as export_immoweb_ui
|
from ui_exporter import export_immoweb as export_immoweb_ui
|
||||||
|
|
@ -230,17 +230,18 @@ def routing(
|
||||||
|
|
||||||
|
|
||||||
@cli.command()
|
@cli.command()
|
||||||
@click.option(
|
# @click.option(
|
||||||
"--columns",
|
# "--columns",
|
||||||
"-C",
|
# "-C",
|
||||||
help="Columns to include in the CSV file",
|
# help="Columns to include in the CSV file",
|
||||||
type=click.Choice(
|
# type=click.Choice(
|
||||||
Listing.ALL_COLUMNS,
|
# # csv_exporter.get_columns_from_listings(),
|
||||||
case_sensitive=False,
|
# [1],
|
||||||
),
|
# case_sensitive=False,
|
||||||
multiple=True,
|
# ),
|
||||||
default=Listing.ALL_COLUMNS,
|
# multiple=True,
|
||||||
)
|
# default=Listing.ALL_COLUMNS,
|
||||||
|
# )
|
||||||
@click.option(
|
@click.option(
|
||||||
"--output-file",
|
"--output-file",
|
||||||
"-O",
|
"-O",
|
||||||
|
|
@ -258,7 +259,7 @@ def routing(
|
||||||
def export_csv(
|
def export_csv(
|
||||||
ctx: click.core.Context,
|
ctx: click.core.Context,
|
||||||
output_file: str,
|
output_file: str,
|
||||||
columns: tuple[str],
|
# columns: tuple[str],
|
||||||
district: list[str],
|
district: list[str],
|
||||||
min_bedrooms: int,
|
min_bedrooms: int,
|
||||||
max_bedrooms: int,
|
max_bedrooms: int,
|
||||||
|
|
@ -270,6 +271,7 @@ def export_csv(
|
||||||
last_seen_days: int,
|
last_seen_days: int,
|
||||||
min_sqm: int | None = None,
|
min_sqm: int | None = None,
|
||||||
):
|
):
|
||||||
|
# use model
|
||||||
data_dir = ctx.obj["data_dir"]
|
data_dir = ctx.obj["data_dir"]
|
||||||
query_parameters = QueryParameters(
|
query_parameters = QueryParameters(
|
||||||
listing_type=ListingType[type],
|
listing_type=ListingType[type],
|
||||||
|
|
@ -287,13 +289,12 @@ def export_csv(
|
||||||
f"Exporting data to {output_file} using {data_dir=} and query parameters: {query_parameters}"
|
f"Exporting data to {output_file} using {data_dir=} and query parameters: {query_parameters}"
|
||||||
)
|
)
|
||||||
output_file_path = pathlib.Path(output_file)
|
output_file_path = pathlib.Path(output_file)
|
||||||
listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json")))
|
repository = ListingRepository(engine=engine)
|
||||||
listings = Listing.get_all_listings([path for path in listing_paths])
|
|
||||||
asyncio.run(
|
asyncio.run(
|
||||||
csv_exporter.export_to_csv(
|
csv_exporter.export_to_csv(
|
||||||
listings,
|
repository,
|
||||||
output_file_path,
|
output_file_path,
|
||||||
list(columns),
|
# list(columns),
|
||||||
query_parameters=query_parameters,
|
query_parameters=query_parameters,
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
@ -328,6 +329,7 @@ def export_immoweb(
|
||||||
last_seen_days: int,
|
last_seen_days: int,
|
||||||
min_sqm: int | None = None,
|
min_sqm: int | None = None,
|
||||||
):
|
):
|
||||||
|
# use model
|
||||||
query_parameters = QueryParameters(
|
query_parameters = QueryParameters(
|
||||||
listing_type=ListingType[type],
|
listing_type=ListingType[type],
|
||||||
district_names=set(district),
|
district_names=set(district),
|
||||||
|
|
|
||||||
|
|
@ -132,7 +132,8 @@ class Listing(SQLModel, table=False):
|
||||||
class FurnishType(enum.StrEnum):
|
class FurnishType(enum.StrEnum):
|
||||||
FURNISHED = "furnished"
|
FURNISHED = "furnished"
|
||||||
UNFURNISHED = "unfurnished"
|
UNFURNISHED = "unfurnished"
|
||||||
PART_FURNISHED = "partFurnished"
|
PART_FURNISHED = "part furnished"
|
||||||
|
ASK_LANDLORD = "ask landlord"
|
||||||
UNKNOWN = "unknown"
|
UNKNOWN = "unknown"
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -156,25 +157,6 @@ class DestinationMode:
|
||||||
def __hash__(self) -> int:
|
def __hash__(self) -> int:
|
||||||
return hash((self.destination_address, self.travel_mode))
|
return hash((self.destination_address, self.travel_mode))
|
||||||
|
|
||||||
# def to_dict(self) -> dict[str, str | routing.TravelMode]:
|
|
||||||
# return {
|
|
||||||
# "destination_address": self.destination_address,
|
|
||||||
# "travel_mode": self.travel_mode.value,
|
|
||||||
# }
|
|
||||||
|
|
||||||
# @classmethod
|
|
||||||
# def from_dict(cls, data: dict):
|
|
||||||
# return cls(
|
|
||||||
# destination_address=data["destination_address"],
|
|
||||||
# travel_mode=routing.TravelMode(data["travel_mode"]),
|
|
||||||
# )
|
|
||||||
|
|
||||||
# def __json__(self) -> dict[str, str | routing.TravelMode]:
|
|
||||||
# return {
|
|
||||||
# "destination_address": self.destination_address,
|
|
||||||
# "travel_mode": self.travel_mode.value,
|
|
||||||
# }
|
|
||||||
|
|
||||||
def __getstate__(self):
|
def __getstate__(self):
|
||||||
# This allows serializers to pick up a dict representation
|
# This allows serializers to pick up a dict representation
|
||||||
return asdict(self)
|
return asdict(self)
|
||||||
|
|
@ -182,3 +164,28 @@ class DestinationMode:
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
# Makes it behave like a dict when expected
|
# Makes it behave like a dict when expected
|
||||||
return iter(asdict(self).items())
|
return iter(asdict(self).items())
|
||||||
|
|
||||||
|
|
||||||
|
class ListingType(enum.StrEnum):
|
||||||
|
BUY = "BUY"
|
||||||
|
RENT = "RENT"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class QueryParameters:
|
||||||
|
listing_type: ListingType
|
||||||
|
min_bedrooms: int = 1
|
||||||
|
max_bedrooms: int = 999
|
||||||
|
min_price: int = 0
|
||||||
|
max_price: int = 10_000_000
|
||||||
|
district_names: set[str] = dataclasses.field(default_factory=set)
|
||||||
|
radius: float = 0
|
||||||
|
page_size: int = 500 # items per page
|
||||||
|
max_days_since_added: int = 30
|
||||||
|
furnish_types: list[FurnishType] | None = None
|
||||||
|
# The values below are not supported by rightmove
|
||||||
|
# hence we apply them after fetching
|
||||||
|
# available from; council tax
|
||||||
|
let_date_available_from: datetime | None = None
|
||||||
|
last_seen_days: int | None = None
|
||||||
|
min_sqm: int | None = None
|
||||||
|
|
|
||||||
|
|
@ -6,70 +6,7 @@ import enum
|
||||||
from typing import Any
|
from typing import Any
|
||||||
import aiohttp
|
import aiohttp
|
||||||
from data_access import Listing
|
from data_access import Listing
|
||||||
from models.listing import FurnishType
|
from models.listing import FurnishType, ListingType, QueryParameters
|
||||||
|
|
||||||
|
|
||||||
class ListingType(enum.StrEnum):
|
|
||||||
BUY = "BUY"
|
|
||||||
RENT = "RENT"
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
|
||||||
class QueryParameters:
|
|
||||||
listing_type: ListingType
|
|
||||||
min_bedrooms: int
|
|
||||||
max_bedrooms: int
|
|
||||||
min_price: int
|
|
||||||
max_price: int
|
|
||||||
district_names: set[str]
|
|
||||||
radius: float = 0
|
|
||||||
page_size: int = 500 # items per page
|
|
||||||
max_days_since_added: int = 30
|
|
||||||
furnish_types: list[FurnishType] | None = None
|
|
||||||
# The values below are not supported by rightmove
|
|
||||||
# hence we apply them after fetching
|
|
||||||
# available from; council tax
|
|
||||||
let_date_available_from: datetime | None = None
|
|
||||||
last_seen_days: int | None = None
|
|
||||||
min_sqm: int | None = None
|
|
||||||
|
|
||||||
|
|
||||||
async def filter_listings(
|
|
||||||
listings: list[Listing],
|
|
||||||
query_parameters: QueryParameters,
|
|
||||||
) -> list[Listing]:
|
|
||||||
"""
|
|
||||||
Filter listings based on the provided query parameters.
|
|
||||||
"""
|
|
||||||
filtered_listings = []
|
|
||||||
for listing in listings:
|
|
||||||
if (
|
|
||||||
listing.bedrooms > query_parameters.max_bedrooms
|
|
||||||
or listing.bedrooms < query_parameters.min_bedrooms
|
|
||||||
):
|
|
||||||
continue
|
|
||||||
if (
|
|
||||||
listing.price < query_parameters.min_price
|
|
||||||
or listing.price > query_parameters.max_price
|
|
||||||
):
|
|
||||||
continue
|
|
||||||
if (
|
|
||||||
query_parameters.last_seen_days is not None
|
|
||||||
and listing.last_seen > query_parameters.last_seen_days
|
|
||||||
):
|
|
||||||
continue
|
|
||||||
if (
|
|
||||||
listing.letDateAvailable is not None
|
|
||||||
and query_parameters.let_date_available_from is not None
|
|
||||||
and listing.letDateAvailable < query_parameters.let_date_available_from
|
|
||||||
):
|
|
||||||
continue
|
|
||||||
sqm_ocr = await listing.sqm_ocr() or 0
|
|
||||||
if query_parameters.min_sqm is not None and sqm_ocr < query_parameters.min_sqm:
|
|
||||||
continue
|
|
||||||
filtered_listings.append(listing)
|
|
||||||
|
|
||||||
return filtered_listings
|
|
||||||
|
|
||||||
|
|
||||||
headers = {
|
headers = {
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@ import json
|
||||||
import pathlib
|
import pathlib
|
||||||
|
|
||||||
from data_access import Listing
|
from data_access import Listing
|
||||||
from rec.query import QueryParameters, filter_listings
|
from rec.query import QueryParameters
|
||||||
|
|
||||||
|
|
||||||
async def export_immoweb(
|
async def export_immoweb(
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue