From 9f3e466b23c20886faedd599e31e4d8bf1c0eb12 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sun, 18 May 2025 17:22:48 +0000 Subject: [PATCH] add filter for furnished/unfurnished type for rented listings --- crawler/1_dump_listings.py | 9 +++++++-- crawler/main.py | 33 +++++++++++++++++++++------------ crawler/rec/query.py | 15 +++++++++++++-- 3 files changed, 41 insertions(+), 16 deletions(-) diff --git a/crawler/1_dump_listings.py b/crawler/1_dump_listings.py index 6f76e04..da36915 100644 --- a/crawler/1_dump_listings.py +++ b/crawler/1_dump_listings.py @@ -1,7 +1,7 @@ import asyncio from dataclasses import dataclass import pathlib -from rec.query import ListingType, listing_query +from rec.query import ListingType, listing_query, FurnishType from rec.districts import get_districts from data_access import Listing @@ -17,7 +17,11 @@ class QueryParameters: radius: float = 0 page_size: int = 500 # items per page max_days_since_added: int = 30 - # available from; furnished/unfurnished; council tax + furnish_types: list[FurnishType] | None = None + + # The values below are not supported by rightmove + # hence we apply them after fetching + # available from; council tax async def dump_listings( @@ -44,6 +48,7 @@ async def dump_listings( location_id=locid, page_size=parameters.page_size, max_days_since_added=parameters.max_days_since_added, + furnish_types=parameters.furnish_types or [], ) for locid in districts.values() for i in [1, 2] ]) listings = [] diff --git a/crawler/main.py b/crawler/main.py index 570ef31..f6136b6 100644 --- a/crawler/main.py +++ b/crawler/main.py @@ -6,7 +6,7 @@ import importlib from rec.districts import get_districts from data_access import Listing import csv_exporter -from rec.query import ListingType +from rec.query import ListingType, FurnishType dump_listings_module = importlib.import_module('1_dump_listings') dump_detail_module = importlib.import_module('2_dump_detail') @@ -14,14 +14,6 @@ dump_images_module = importlib.import_module('3_dump_images') detect_floorplan_module = importlib.import_module('4_detect_floorplan') routing_module = importlib.import_module('5_routing') -steps_to_handlers = { - 'dump_listings': dump_listings_module.dump_listings, - 'dump_detail': dump_detail_module.dump_detail, - 'dump_images': dump_images_module.dump_images, - 'detect_floorplan': detect_floorplan_module.detect_floorplan, - 'routing': routing_module.calculate_route, -} - @click.group() @click.option( @@ -84,6 +76,19 @@ def cli(ctx, data_dir: str): type=click.Choice(get_districts().keys(), case_sensitive=False), multiple=True, ) +@click.option( + '--furnish-types', + '-f', + help='Furnish types for rented listings', + type=click.Choice( + [ + furnish_type.name + for furnish_type in FurnishType.__members__.values() + ], + case_sensitive=False, + ), + multiple=True, +) @click.pass_context def dump_listings( ctx: click.core.Context, @@ -93,6 +98,7 @@ def dump_listings( min_price: int, max_price: int, type: str, + furnish_types: list[str], ): data_dir: str = ctx.obj['data_dir'] query_parameters = dump_listings_module.QueryParameters( @@ -102,13 +108,16 @@ def dump_listings( max_bedrooms=max_bedrooms, min_price=min_price, max_price=max_price, + furnish_types=[ + FurnishType[furnish_type] for furnish_type in furnish_types + ], ) click.echo( f'Running dump_listings for districts {district}, data dir {data_dir} and parameters: ' - f'{query_parameters}' - ) + f'{query_parameters}') data_dir_path = pathlib.Path(data_dir) - asyncio.run(dump_listings_module.dump_listings(query_parameters, data_dir_path)) + asyncio.run( + dump_listings_module.dump_listings(query_parameters, data_dir_path)) @cli.command() diff --git a/crawler/rec/query.py b/crawler/rec/query.py index 4e7b2f4..ce0a2bc 100644 --- a/crawler/rec/query.py +++ b/crawler/rec/query.py @@ -1,6 +1,6 @@ # from diskcache import Cache import enum -from typing import Any, List +from typing import Any import aiohttp import requests import urllib3 @@ -13,6 +13,12 @@ class ListingType(enum.StrEnum): RENT = "RENT" +class FurnishType(enum.StrEnum): + FURNISHED = "furnished" + UNFURNISHED = "unfurnished" + PART_FURNISHED = "partFurnished" + + headers = { "Host": "api.rightmove.co.uk", # 'Accept-Encoding': 'gzip, deflate, br', @@ -48,6 +54,7 @@ async def detail_query(detail_id: int): async def listing_query( + *, page: int, channel: ListingType, min_bedrooms: int, @@ -58,8 +65,9 @@ async def listing_query( location_id: str = "STATION^5168", # kings cross station mustNewHome: bool = False, max_days_since_added: int = 30, - property_type: List["PropertyType"] = [], + property_type: list[PropertyType] = [], page_size: int = 25, + furnish_types: list[FurnishType] = [], ) -> dict[str, Any]: params: dict[str, str] = { "locationIdentifier": location_id, @@ -88,6 +96,9 @@ async def listing_query( if mustNewHome: params["mustHave"] = "newHome" + if channel is ListingType.RENT: + if furnish_types: + params["furnishTypes"] = ",".join(furnish_types) headers = { "Host": "api.rightmove.co.uk",