From 3785d010093a0c9ca839dba8e3472996c2dbee94 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sun, 8 Jun 2025 18:18:38 +0000 Subject: [PATCH] migrate immoweb exporter to use models --- crawler/data_access.py | 14 +++++++-- crawler/main.py | 6 ++-- crawler/models/listing.py | 62 +++++++++++++++++++++++++++++++++++++-- crawler/runall.sh | 7 +++-- crawler/ui_exporter.py | 27 +++++++++-------- 5 files changed, 94 insertions(+), 22 deletions(-) diff --git a/crawler/data_access.py b/crawler/data_access.py index de6609d..f6e4bf2 100644 --- a/crawler/data_access.py +++ b/crawler/data_access.py @@ -4,7 +4,7 @@ from dataclasses import dataclass import json import pathlib from typing import Any, List, Dict -from models.listing import ListingSite +from models.listing import ListingSite, PriceHistoryItem from rec import floorplan, routing import re import datetime @@ -381,11 +381,19 @@ class Listing: return None @property - def priceHistory(self) -> list[dict[str, Any]]: + def priceHistory(self) -> list[PriceHistoryItem]: if not self.path_price_history().exists(): return [] with open(self.path_price_history(), "r") as f: - return json.load(f) + data = json.load(f) + return [ + PriceHistoryItem( + first_seen=datetime.datetime.fromisoformat(item["first_seen"]), + last_seen=datetime.datetime.fromisoformat(item["last_seen"]), + price=item["price"], + ) + for item in data + ] @property def longtitude(self) -> float: diff --git a/crawler/main.py b/crawler/main.py index 64d9d54..150efa5 100644 --- a/crawler/main.py +++ b/crawler/main.py @@ -329,7 +329,6 @@ def export_immoweb( last_seen_days: int, min_sqm: int | None = None, ): - # use model query_parameters = QueryParameters( listing_type=ListingType[type], district_names=set(district), @@ -343,9 +342,10 @@ def export_immoweb( min_sqm=min_sqm, ) click.echo( - f"Exporting data to {output_file} that matches the query parameters: {query_parameters}" + f"Exporting data to {output_file} for listings stored in {engine.url} that match the query parameters: {query_parameters}" ) - asyncio.run(export_immoweb_ui(ctx, output_file, query_parameters)) + repository = ListingRepository(engine=engine) + asyncio.run(export_immoweb_ui(repository, output_file, query_parameters)) @cli.command() diff --git a/crawler/models/listing.py b/crawler/models/listing.py index cf6b6bb..f97b53d 100644 --- a/crawler/models/listing.py +++ b/crawler/models/listing.py @@ -10,12 +10,19 @@ from rec import routing from sqlmodel import JSON, SQLModel, Field, String -@dataclass +@dataclass(frozen=True) class PriceHistoryItem: first_seen: datetime last_seen: datetime price: float + def to_dict(self) -> Dict[str, float | str]: + return { + "first_seen": self.first_seen.isoformat(), + "last_seen": self.last_seen.isoformat(), + "price": self.price, + } + @dataclass(frozen=True) class Route: @@ -54,7 +61,8 @@ class Listing(SQLModel, table=False): council_tax_band: str | None = Field(default=None, nullable=True) longtitude: float = Field(nullable=False) latitude: float = Field(nullable=False) - price_history: List[Dict[str, Any]] = Field(default_factory=list, sa_type=JSON) + # price_history: List[Dict[str, Any]] = Field(default_factory=list, sa_type=JSON) + price_history_json: str = Field(sa_type=String) listing_site: ListingSite = Field(nullable=False) last_seen: datetime = Field(default_factory=datetime.now, nullable=False) photo_thumbnail: str | None = Field(default=None, nullable=True) @@ -72,6 +80,56 @@ class Listing(SQLModel, table=False): def is_removed(self) -> bool: return not self.additional_info["property"]["visible"] + @property + def price_per_square_meter(self) -> float | None: + """ + Returns the price per square meter. + """ + if self.square_meters is None or self.square_meters == 0: + return None + return round(self.price / self.square_meters, 2) + + @property + def url(self): + return f"https://www.rightmove.co.uk/properties/{self.id}" + + @property + def price_history(self) -> List[PriceHistoryItem]: + """ + Returns a list of PriceHistoryItem objects from the price_history_json. + """ + if not self.price_history_json: + return [] + parsed: list = json.loads(str(self.price_history_json)) + for item in parsed: + item["first_seen"] = datetime.fromisoformat(item["first_seen"]) + item["last_seen"] = datetime.fromisoformat(item["last_seen"]) + return [ + PriceHistoryItem( + first_seen=item["first_seen"], + last_seen=item["last_seen"], + price=item["price"], + ) + for item in parsed + ] + + @staticmethod + def serialize_price_history(price_history: List[PriceHistoryItem]) -> str: + """ + Serializes the price history to a JSON string. + """ + serialized = json.dumps( + [ + { + "first_seen": item.first_seen.isoformat(), + "last_seen": item.last_seen.isoformat(), + "price": item.price, + } + for item in price_history + ] + ) + return serialized + @property def routing_info(self) -> dict[DestinationMode, List[Route]]: """ diff --git a/crawler/runall.sh b/crawler/runall.sh index 5ec7bc2..b5500af 100755 --- a/crawler/runall.sh +++ b/crawler/runall.sh @@ -2,11 +2,14 @@ set -euxo pipefail -DATA_DIR="data/rs" +DATA_DIR="data/rs/test" + LISTING_FILTER_OPTIONS="--min-price 2000 --max-price 4000 --min-bedrooms 2 --max-bedrooms 4 -t rent --available-from $(date +%Y-%m-%d) --last-seen-days 7 --furnish-types furnished" +#LISTING_FILTER_OPTIONS="--min-price 2000 --max-price 2500 --min-bedrooms 2 --max-bedrooms 4 -t rent --available-from $(date +%Y-%m-%d) --last-seen-days 7 --furnish-types furnished --district Islington" # DEBUG: UNCOMMENT ME WHEN TESTING + +poetry install alembic upgrade head # init db - python main.py --data-dir $DATA_DIR dump-listings $LISTING_FILTER_OPTIONS python main.py --data-dir $DATA_DIR dump-images python main.py --data-dir $DATA_DIR detect-floorplan diff --git a/crawler/ui_exporter.py b/crawler/ui_exporter.py index 5db5483..2091f86 100644 --- a/crawler/ui_exporter.py +++ b/crawler/ui_exporter.py @@ -1,23 +1,22 @@ +import dataclasses import json import pathlib from data_access import Listing from rec.query import QueryParameters +from repositories.listing_repository import ListingRepository async def export_immoweb( - ctx, + repository: ListingRepository, output_file: str, query_parameters: QueryParameters | None = None, ): - data_dir = ctx.obj["data_dir"] output_file_path = pathlib.Path(output_file) output_file_path.touch(exist_ok=True) - listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json"))) - # listing_paths = listing_paths[:10] - listings = Listing.get_all_listings([str(path) for path in listing_paths]) - if query_parameters is not None: - listings = await filter_listings(listings, query_parameters) + listings = await repository.get_listings( + query_parameters=query_parameters, + ) # Convert listings to immoweb format immoweb_listings = [] @@ -27,18 +26,22 @@ async def export_immoweb( "properties": { "city": "London", # change me "country": "United Kingdom", - "qm": await listing.sqm_ocr(), - "qmprice": round(await listing.price_per_sqm(), 2), - "rooms": listing.bedrooms, + "qm": listing.square_meters, + "qmprice": listing.price_per_square_meter, + "rooms": listing.number_of_bedrooms, "total_price": listing.price, "url": listing.url, + "photo_thumbnail": listing.photo_thumbnail, + "last_seen": listing.last_seen.isoformat(), + "price_history": [item.to_dict() for item in listing.price_history], + "agency": listing.agency, # Additional info; the above is GeoJSON format # Below is all other crap we want in the UI - "info": await listing.dict_nicely(), + "info": listing.additional_info, }, "geometry": { "coordinates": [ - listing.longitude, + listing.longtitude, listing.latitude, ], "type": "Point",