From 7e8c79d3d197804f8e66f5c90ec8a81380f7238e Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Mon, 26 May 2025 19:36:54 +0000
Subject: [PATCH] add command to export the data in a way that the ui (immoweb) can consume

Add an `export-immoweb` CLI command, backed by the new ui_exporter
module. It reads every DATA_DIR/*/listing.json, converts each listing
to a GeoJSON Feature (Point geometry plus city, country, qm, qmprice,
rooms and total_price properties) and writes the resulting
FeatureCollection to the output file prefixed with `var data = `.

Also in this change:
- listing_query: coerce `channel` with str() before upper-casing it.
- runall.sh: widen the price range to 2000-4000, drop the islington
  district filter and run export-immoweb as the final step.

---
v2 (hand-edited during review): ui_exporter.py gains docstrings, drops the
redundant list() around glob(), the commented-out code and the leading blank
line, and opens the output file with an explicit encoding; the export-immoweb
command gains a docstring (shown by --help). Hunk sizes and the diffstat were
recounted by hand; the post-image blob ids on the `index` lines of main.py and
ui_exporter.py are left over from v1.

 crawler/main.py        | 21 ++++++++++++++++++
 crawler/rec/query.py   |  2 +-
 crawler/runall.sh      |  4 ++--
 crawler/ui_exporter.py | 49 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 73 insertions(+), 3 deletions(-)
 create mode 100644 crawler/ui_exporter.py

diff --git a/crawler/main.py b/crawler/main.py
index 7a24d83..ef0dcfd 100644
--- a/crawler/main.py
+++ b/crawler/main.py
@@ -10,6 +10,7 @@ from data_access import Listing
 import csv_exporter
 from rec.query import ListingType, FurnishType
 from rec.routing import API_KEY_ENVIRONMENT_VARIABLE, TravelMode
+from ui_exporter import export_immoweb as export_immoweb_ui
 
 dump_listings_module = importlib.import_module('1_dump_listings')
 dump_detail_module = importlib.import_module('2_dump_detail')
@@ -230,6 +231,26 @@ def export_csv(ctx: click.core.Context, output_file: str, columns: tuple[str]):
     asyncio.run(
         csv_exporter.export_to_csv(listings, output_file_path, list(columns)),
     )
+
+@cli.command()
+@click.option(
+    '--output-file',
+    '-O',
+    help='Path to the output immoweb file',
+    required=True,
+    type=click.Path(
+        writable=True,
+        file_okay=True,
+        dir_okay=False,
+        resolve_path=True,
+    ),
+)
+@click.pass_context
+def export_immoweb(ctx, output_file: str):
+    """Export all listings to a file the immoweb UI can consume."""
+    click.echo(f'Exporting data to {output_file}')
+    asyncio.run(export_immoweb_ui(ctx, output_file))
+
 
 
 if __name__ == '__main__':
diff --git a/crawler/rec/query.py b/crawler/rec/query.py
index ce0a2bc..47f4fd7 100644
--- a/crawler/rec/query.py
+++ b/crawler/rec/query.py
@@ -71,7 +71,7 @@ async def listing_query(
 ) -> dict[str, Any]:
     params: dict[str, str] = {
         "locationIdentifier": location_id,
-        "channel": channel.upper(),
+        "channel": str(channel).upper(),
         "page": str(page),
         "numberOfPropertiesPerPage": str(page_size),
         "radius": str(radius),
diff --git a/crawler/runall.sh b/crawler/runall.sh
index d372983..c483545 100755
--- a/crawler/runall.sh
+++ b/crawler/runall.sh
@@ -4,10 +4,10 @@ set -euxo pipefail
 
 DATA_DIR="data/rs"
 
-python main.py --data-dir $DATA_DIR dump-listings --min-price 2500 --max-price 3500 --min-bedrooms 2 --max-bedrooms 4 --district islington -t rent
+python main.py --data-dir $DATA_DIR dump-listings --min-price 2000 --max-price 4000 --min-bedrooms 2 --max-bedrooms 4 -t rent
 python main.py --data-dir $DATA_DIR dump-details
 python main.py --data-dir $DATA_DIR dump-images
 python main.py --data-dir $DATA_DIR detect-floorplan
 #python main.py --data-dir $DATA_DIR routing --destination-address 'Meta Brock Street' -m transit # NOTE: THIS CONSUMES API CALLS; USE CAREFULLY; add -l to limit number of entries
 python main.py --data-dir $DATA_DIR export-csv -O data/listings.csv
-
+python main.py --data-dir $DATA_DIR export-immoweb -O ../immoweb/data/london_geojs.js
diff --git a/crawler/ui_exporter.py b/crawler/ui_exporter.py
new file mode 100644
index 0000000..24496ff
--- /dev/null
+++ b/crawler/ui_exporter.py
@@ -0,0 +1,49 @@
+"""Export crawled listings in the format consumed by the immoweb UI."""
+
+import json
+import pathlib
+
+from data_access import Listing
+
+
+async def export_immoweb(ctx, output_file: str):
+    """Write every listing found in the data dir to a file for immoweb.
+
+    Listings are loaded from ``DATA_DIR/*/listing.json`` (the data dir is
+    read from ``ctx.obj['data_dir']``), converted to GeoJSON ``Feature``
+    points and written to ``output_file`` as one ``var data = ...`` line.
+    """
+    data_dir = ctx.obj['data_dir']
+    output_file_path = pathlib.Path(output_file)
+    listing_paths = sorted(pathlib.Path(data_dir).glob("*/listing.json"))
+    listings = Listing.get_all_listings([str(path) for path in listing_paths])
+
+    # Convert listings to immoweb format
+    immoweb_listings = []
+    for listing in listings:
+        immoweb_listing = {
+            'type': 'Feature',
+            'properties': {
+                'city': 'London',  # change me
+                'country': 'United Kingdom',
+                'qm': await listing.sqm_ocr(),
+                'qmprice': await listing.price_per_sqm(),
+                'rooms': listing.bedrooms,
+                'total_price': listing.price,
+            },
+            'geometry': {
+                'coordinates': [
+                    listing.longitude,
+                    listing.latitude,
+                ],
+                'type': 'Point',
+            }
+        }
+        immoweb_listings.append(immoweb_listing)
+
+    # The prefix turns the JSON document into a JS variable assignment.
+    prefix = 'var data = '
+    serialized_data = {"type": "FeatureCollection", "features": immoweb_listings}
+    result = prefix + json.dumps(serialized_data, indent=4)
+    with open(output_file_path, 'w', encoding='utf-8') as f:
+        f.write(result)