diff --git a/crawler/csv_exporter.py b/crawler/csv_exporter.py index 0428f3b..838c0a9 100644 --- a/crawler/csv_exporter.py +++ b/crawler/csv_exporter.py @@ -1,24 +1,26 @@ -import asyncio from pathlib import Path -from data_access import Listing import pandas as pd -from rec.query import QueryParameters, filter_listings +from rec.query import QueryParameters +from repositories.listing_repository import ListingRepository async def export_to_csv( - listings: list[Listing], + repository: ListingRepository, output_file: Path, - columns: list[str], query_parameters: QueryParameters | None = None, ) -> None: - if query_parameters is not None: - listings = await filter_listings(listings, query_parameters) - ds = await asyncio.gather(*[listing.dict_nicely() for listing in listings]) + listings = await repository.get_listings(query_parameters=query_parameters) + ds = [*[listing.__dict__ for listing in listings]] df = pd.DataFrame(ds) + # read decisions on file decisions_path = "data/decisions.json" decisions = pd.read_json(decisions_path) - df.loc[:, "decision"] = df.identifier.apply(lambda x: decisions.get(x)) + df.loc[:, "decision"] = df.id.apply(lambda x: decisions.get(x)) + + # remove the _sa_instance_state and additional_info columns + drop_columns = ["_sa_instance_state", "additional_info"] + df = df.drop(columns=drop_columns) # remove all entries where we didnt calculate transit time (probably due to a too far distance) # df2 = df[df.travel_time_fastest.notna()] @@ -30,9 +32,15 @@ async def export_to_csv( # s1 = df2 # fill in gap values for service charge and lease left. 
This is for excel so we can use filters better there + if "service_charge" not in df2.columns: + df2.loc[:, "service_charge"] = -1 df2.loc[:, "service_charge"] = df2.service_charge.fillna(-1) + if "lease_left" not in df2.columns: + df2.loc[:, "lease_left"] = -1 df2.loc[:, "lease_left"] = df2.lease_left.fillna(-1) - df2.loc[:, "sqm_ocr"] = df2.sqm_ocr.fillna(-1) + if "square_meters" not in df2.columns: + df2.loc[:, "square_meters"] = -1 + df2.loc[:, "square_meters"] = df2.square_meters.fillna(-1) df3 = df2 # df3 = pd.concat([df2.drop(['travel_time_fastest', 'travel_time_second'], axis=1), s1], axis=1) @@ -40,6 +48,11 @@ async def export_to_csv( df3.shape df4 = df3 - df5 = df4[columns] + # df5 = df4[columns] + + # Add some interesting columns + df4.loc[:, "price_per_sqm"] = df4.price / df4.square_meters + df5 = df4 + df6 = df5.sort_values(by=["price_per_sqm"], ascending=True) df6.to_csv(str(output_file), index=False) diff --git a/crawler/main.py b/crawler/main.py index 41a7331..64d9d54 100644 --- a/crawler/main.py +++ b/crawler/main.py @@ -6,10 +6,10 @@ import pathlib import click import importlib +from models.listing import FurnishType, ListingType, QueryParameters from rec.districts import get_districts from data_access import Listing import csv_exporter -from rec.query import ListingType, FurnishType, QueryParameters from rec.routing import API_KEY_ENVIRONMENT_VARIABLE, TravelMode from repositories.listing_repository import ListingRepository from ui_exporter import export_immoweb as export_immoweb_ui @@ -230,17 +230,18 @@ def routing( @cli.command() -@click.option( - "--columns", - "-C", - help="Columns to include in the CSV file", - type=click.Choice( - Listing.ALL_COLUMNS, - case_sensitive=False, - ), - multiple=True, - default=Listing.ALL_COLUMNS, -) +# @click.option( +# "--columns", +# "-C", +# help="Columns to include in the CSV file", +# type=click.Choice( +# # csv_exporter.get_columns_from_listings(), +# [1], +# case_sensitive=False, +# ), +# multiple=True, 
+# default=Listing.ALL_COLUMNS, +# ) @click.option( "--output-file", "-O", @@ -258,7 +259,7 @@ def routing( def export_csv( ctx: click.core.Context, output_file: str, - columns: tuple[str], + # columns: tuple[str], district: list[str], min_bedrooms: int, max_bedrooms: int, @@ -270,6 +271,7 @@ def export_csv( last_seen_days: int, min_sqm: int | None = None, ): + # use model data_dir = ctx.obj["data_dir"] query_parameters = QueryParameters( listing_type=ListingType[type], @@ -287,13 +289,12 @@ def export_csv( f"Exporting data to {output_file} using {data_dir=} and query parameters: {query_parameters}" ) output_file_path = pathlib.Path(output_file) - listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json"))) - listings = Listing.get_all_listings([path for path in listing_paths]) + repository = ListingRepository(engine=engine) asyncio.run( csv_exporter.export_to_csv( - listings, + repository, output_file_path, - list(columns), + # list(columns), query_parameters=query_parameters, ), ) @@ -328,6 +329,7 @@ def export_immoweb( last_seen_days: int, min_sqm: int | None = None, ): + # use model query_parameters = QueryParameters( listing_type=ListingType[type], district_names=set(district), diff --git a/crawler/models/listing.py b/crawler/models/listing.py index fc8ff0d..cf6b6bb 100644 --- a/crawler/models/listing.py +++ b/crawler/models/listing.py @@ -132,7 +132,8 @@ class Listing(SQLModel, table=False): class FurnishType(enum.StrEnum): FURNISHED = "furnished" UNFURNISHED = "unfurnished" - PART_FURNISHED = "partFurnished" + PART_FURNISHED = "part furnished" + ASK_LANDLORD = "ask landlord" UNKNOWN = "unknown" @@ -156,25 +157,6 @@ class DestinationMode: def __hash__(self) -> int: return hash((self.destination_address, self.travel_mode)) - # def to_dict(self) -> dict[str, str | routing.TravelMode]: - # return { - # "destination_address": self.destination_address, - # "travel_mode": self.travel_mode.value, - # } - - # @classmethod - # def from_dict(cls, data: 
dict): - # return cls( - # destination_address=data["destination_address"], - # travel_mode=routing.TravelMode(data["travel_mode"]), - # ) - - # def __json__(self) -> dict[str, str | routing.TravelMode]: - # return { - # "destination_address": self.destination_address, - # "travel_mode": self.travel_mode.value, - # } - def __getstate__(self): # This allows serializers to pick up a dict representation return asdict(self) @@ -182,3 +164,28 @@ class DestinationMode: def __iter__(self): # Makes it behave like a dict when expected return iter(asdict(self).items()) + + +class ListingType(enum.StrEnum): + BUY = "BUY" + RENT = "RENT" + + +@dataclass(frozen=True) +class QueryParameters: + listing_type: ListingType + min_bedrooms: int = 1 + max_bedrooms: int = 999 + min_price: int = 0 + max_price: int = 10_000_000 + district_names: set[str] = dataclasses.field(default_factory=set) + radius: float = 0 + page_size: int = 500 # items per page + max_days_since_added: int = 30 + furnish_types: list[FurnishType] | None = None + # The values below are not supported by rightmove + # hence we apply them after fetching + # available from; council tax + let_date_available_from: datetime | None = None + last_seen_days: int | None = None + min_sqm: int | None = None diff --git a/crawler/rec/query.py b/crawler/rec/query.py index 5ce2bce..003fb2b 100644 --- a/crawler/rec/query.py +++ b/crawler/rec/query.py @@ -6,70 +6,7 @@ import enum from typing import Any import aiohttp from data_access import Listing -from models.listing import FurnishType - - -class ListingType(enum.StrEnum): - BUY = "BUY" - RENT = "RENT" - - -@dataclass(frozen=True) -class QueryParameters: - listing_type: ListingType - min_bedrooms: int - max_bedrooms: int - min_price: int - max_price: int - district_names: set[str] - radius: float = 0 - page_size: int = 500 # items per page - max_days_since_added: int = 30 - furnish_types: list[FurnishType] | None = None - # The values below are not supported by rightmove - # hence 
we apply them after fetching - # available from; council tax - let_date_available_from: datetime | None = None - last_seen_days: int | None = None - min_sqm: int | None = None - - -async def filter_listings( - listings: list[Listing], - query_parameters: QueryParameters, -) -> list[Listing]: - """ - Filter listings based on the provided query parameters. - """ - filtered_listings = [] - for listing in listings: - if ( - listing.bedrooms > query_parameters.max_bedrooms - or listing.bedrooms < query_parameters.min_bedrooms - ): - continue - if ( - listing.price < query_parameters.min_price - or listing.price > query_parameters.max_price - ): - continue - if ( - query_parameters.last_seen_days is not None - and listing.last_seen > query_parameters.last_seen_days - ): - continue - if ( - listing.letDateAvailable is not None - and query_parameters.let_date_available_from is not None - and listing.letDateAvailable < query_parameters.let_date_available_from - ): - continue - sqm_ocr = await listing.sqm_ocr() or 0 - if query_parameters.min_sqm is not None and sqm_ocr < query_parameters.min_sqm: - continue - filtered_listings.append(listing) - - return filtered_listings +from models.listing import FurnishType, ListingType, QueryParameters headers = { diff --git a/crawler/ui_exporter.py b/crawler/ui_exporter.py index 32285da..5db5483 100644 --- a/crawler/ui_exporter.py +++ b/crawler/ui_exporter.py @@ -2,7 +2,7 @@ import json import pathlib from data_access import Listing -from rec.query import QueryParameters, filter_listings +from rec.query import QueryParameters async def export_immoweb(