migrate immoweb exporter to use models

This commit is contained in:
Viktor Barzin 2025-06-08 18:18:38 +00:00
parent e317d2ec54
commit 3785d01009
No known key found for this signature in database
GPG key ID: 4056458DBDBF8863
5 changed files with 94 additions and 22 deletions

View file

@ -4,7 +4,7 @@ from dataclasses import dataclass
import json import json
import pathlib import pathlib
from typing import Any, List, Dict from typing import Any, List, Dict
from models.listing import ListingSite from models.listing import ListingSite, PriceHistoryItem
from rec import floorplan, routing from rec import floorplan, routing
import re import re
import datetime import datetime
@ -381,11 +381,19 @@ class Listing:
return None return None
@property @property
def priceHistory(self) -> list[dict[str, Any]]: def priceHistory(self) -> list[PriceHistoryItem]:
if not self.path_price_history().exists(): if not self.path_price_history().exists():
return [] return []
with open(self.path_price_history(), "r") as f: with open(self.path_price_history(), "r") as f:
return json.load(f) data = json.load(f)
return [
PriceHistoryItem(
first_seen=datetime.datetime.fromisoformat(item["first_seen"]),
last_seen=datetime.datetime.fromisoformat(item["last_seen"]),
price=item["price"],
)
for item in data
]
@property @property
def longtitude(self) -> float: def longtitude(self) -> float:

View file

@ -329,7 +329,6 @@ def export_immoweb(
last_seen_days: int, last_seen_days: int,
min_sqm: int | None = None, min_sqm: int | None = None,
): ):
# use model
query_parameters = QueryParameters( query_parameters = QueryParameters(
listing_type=ListingType[type], listing_type=ListingType[type],
district_names=set(district), district_names=set(district),
@ -343,9 +342,10 @@ def export_immoweb(
min_sqm=min_sqm, min_sqm=min_sqm,
) )
click.echo( click.echo(
f"Exporting data to {output_file} that matches the query parameters: {query_parameters}" f"Exporting data to {output_file} for listings stored in {engine.url} that match the query parameters: {query_parameters}"
) )
asyncio.run(export_immoweb_ui(ctx, output_file, query_parameters)) repository = ListingRepository(engine=engine)
asyncio.run(export_immoweb_ui(repository, output_file, query_parameters))
@cli.command() @cli.command()

View file

@ -10,12 +10,19 @@ from rec import routing
from sqlmodel import JSON, SQLModel, Field, String from sqlmodel import JSON, SQLModel, Field, String
@dataclass @dataclass(frozen=True)
class PriceHistoryItem: class PriceHistoryItem:
first_seen: datetime first_seen: datetime
last_seen: datetime last_seen: datetime
price: float price: float
def to_dict(self) -> Dict[str, float | str]:
return {
"first_seen": self.first_seen.isoformat(),
"last_seen": self.last_seen.isoformat(),
"price": self.price,
}
@dataclass(frozen=True) @dataclass(frozen=True)
class Route: class Route:
@ -54,7 +61,8 @@ class Listing(SQLModel, table=False):
council_tax_band: str | None = Field(default=None, nullable=True) council_tax_band: str | None = Field(default=None, nullable=True)
longtitude: float = Field(nullable=False) longtitude: float = Field(nullable=False)
latitude: float = Field(nullable=False) latitude: float = Field(nullable=False)
price_history: List[Dict[str, Any]] = Field(default_factory=list, sa_type=JSON) # price_history: List[Dict[str, Any]] = Field(default_factory=list, sa_type=JSON)
price_history_json: str = Field(sa_type=String)
listing_site: ListingSite = Field(nullable=False) listing_site: ListingSite = Field(nullable=False)
last_seen: datetime = Field(default_factory=datetime.now, nullable=False) last_seen: datetime = Field(default_factory=datetime.now, nullable=False)
photo_thumbnail: str | None = Field(default=None, nullable=True) photo_thumbnail: str | None = Field(default=None, nullable=True)
@ -72,6 +80,56 @@ class Listing(SQLModel, table=False):
def is_removed(self) -> bool: def is_removed(self) -> bool:
return not self.additional_info["property"]["visible"] return not self.additional_info["property"]["visible"]
@property
def price_per_square_meter(self) -> float | None:
"""
Returns the price per square meter.
"""
if self.square_meters is None or self.square_meters == 0:
return None
return round(self.price / self.square_meters, 2)
@property
def url(self):
    """Build the rightmove.co.uk property page address from this listing's ``id``."""
    listing_url = f"https://www.rightmove.co.uk/properties/{self.id}"
    return listing_url
@property
def price_history(self) -> List[PriceHistoryItem]:
    """Decode ``price_history_json`` into ``PriceHistoryItem`` objects.

    Returns:
        An empty list when ``price_history_json`` is falsy (empty or unset);
        otherwise one ``PriceHistoryItem`` per element of the decoded JSON
        array, in the stored order.

    Raises:
        Nothing is caught here: malformed JSON, a missing ``first_seen`` /
        ``last_seen`` / ``price`` key, or a timestamp that
        ``datetime.fromisoformat`` rejects propagates to the caller.
    """
    if not self.price_history_json:
        return []
    # The str() cast is kept from the original implementation so the value
    # handed to json.loads is unchanged.
    entries: list = json.loads(str(self.price_history_json))
    # Build the items in a single pass; the decoded dicts are only read,
    # never rewritten in place to hold intermediate datetime values.
    return [
        PriceHistoryItem(
            first_seen=datetime.fromisoformat(entry["first_seen"]),
            last_seen=datetime.fromisoformat(entry["last_seen"]),
            price=entry["price"],
        )
        for entry in entries
    ]
@staticmethod
def serialize_price_history(price_history: List[PriceHistoryItem]) -> str:
    """Serialize price-history entries to a JSON array string.

    Args:
        price_history: The entries to encode; each must provide
            ``to_dict()`` (as ``PriceHistoryItem`` does).

    Returns:
        A JSON string holding one object per entry, in input order. An
        empty input produces ``"[]"``.
    """
    # Delegate to PriceHistoryItem.to_dict() so the stored layout
    # (first_seen / last_seen as ISO strings, plus price) is defined in
    # exactly one place instead of being re-typed here.
    return json.dumps([item.to_dict() for item in price_history])
@property @property
def routing_info(self) -> dict[DestinationMode, List[Route]]: def routing_info(self) -> dict[DestinationMode, List[Route]]:
""" """

View file

@ -2,11 +2,14 @@
set -euxo pipefail set -euxo pipefail
DATA_DIR="data/rs" DATA_DIR="data/rs/test"
LISTING_FILTER_OPTIONS="--min-price 2000 --max-price 4000 --min-bedrooms 2 --max-bedrooms 4 -t rent --available-from $(date +%Y-%m-%d) --last-seen-days 7 --furnish-types furnished" LISTING_FILTER_OPTIONS="--min-price 2000 --max-price 4000 --min-bedrooms 2 --max-bedrooms 4 -t rent --available-from $(date +%Y-%m-%d) --last-seen-days 7 --furnish-types furnished"
#LISTING_FILTER_OPTIONS="--min-price 2000 --max-price 2500 --min-bedrooms 2 --max-bedrooms 4 -t rent --available-from $(date +%Y-%m-%d) --last-seen-days 7 --furnish-types furnished --district Islington" # DEBUG: UNCOMMENT ME WHEN TESTING
poetry install
alembic upgrade head # init db alembic upgrade head # init db
python main.py --data-dir $DATA_DIR dump-listings $LISTING_FILTER_OPTIONS python main.py --data-dir $DATA_DIR dump-listings $LISTING_FILTER_OPTIONS
python main.py --data-dir $DATA_DIR dump-images python main.py --data-dir $DATA_DIR dump-images
python main.py --data-dir $DATA_DIR detect-floorplan python main.py --data-dir $DATA_DIR detect-floorplan

View file

@ -1,23 +1,22 @@
import dataclasses
import json import json
import pathlib import pathlib
from data_access import Listing from data_access import Listing
from rec.query import QueryParameters from rec.query import QueryParameters
from repositories.listing_repository import ListingRepository
async def export_immoweb( async def export_immoweb(
ctx, repository: ListingRepository,
output_file: str, output_file: str,
query_parameters: QueryParameters | None = None, query_parameters: QueryParameters | None = None,
): ):
data_dir = ctx.obj["data_dir"]
output_file_path = pathlib.Path(output_file) output_file_path = pathlib.Path(output_file)
output_file_path.touch(exist_ok=True) output_file_path.touch(exist_ok=True)
listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json"))) listings = await repository.get_listings(
# listing_paths = listing_paths[:10] query_parameters=query_parameters,
listings = Listing.get_all_listings([str(path) for path in listing_paths]) )
if query_parameters is not None:
listings = await filter_listings(listings, query_parameters)
# Convert listings to immoweb format # Convert listings to immoweb format
immoweb_listings = [] immoweb_listings = []
@ -27,18 +26,22 @@ async def export_immoweb(
"properties": { "properties": {
"city": "London", # change me "city": "London", # change me
"country": "United Kingdom", "country": "United Kingdom",
"qm": await listing.sqm_ocr(), "qm": listing.square_meters,
"qmprice": round(await listing.price_per_sqm(), 2), "qmprice": listing.price_per_square_meter,
"rooms": listing.bedrooms, "rooms": listing.number_of_bedrooms,
"total_price": listing.price, "total_price": listing.price,
"url": listing.url, "url": listing.url,
"photo_thumbnail": listing.photo_thumbnail,
"last_seen": listing.last_seen.isoformat(),
"price_history": [item.to_dict() for item in listing.price_history],
"agency": listing.agency,
# Additional info; the above is GeoJSON format # Additional info; the above is GeoJSON format
# Below is all other crap we want in the UI # Below is all other crap we want in the UI
"info": await listing.dict_nicely(), "info": listing.additional_info,
}, },
"geometry": { "geometry": {
"coordinates": [ "coordinates": [
listing.longitude, listing.longtitude,
listing.latitude, listing.latitude,
], ],
"type": "Point", "type": "Point",