From df24c2c1b796558fef3770f8af6e3d463ef3fc76 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 17 May 2025 21:22:39 +0000 Subject: [PATCH] add cli param for querying properties to rent example: python main.py --data-dir data/rs2 dump-listings --max-price 3500 --min-bedrooms 2 --max-bedrooms 4 --district islington -t rent --- crawler/1_dump_listings.py | 12 +++++++----- crawler/main.py | 13 +++++++++++++ crawler/rec/query.py | 37 ++++++++++++++++++++++++------------- 3 files changed, 44 insertions(+), 18 deletions(-) diff --git a/crawler/1_dump_listings.py b/crawler/1_dump_listings.py index ea6162f..60b6a1b 100644 --- a/crawler/1_dump_listings.py +++ b/crawler/1_dump_listings.py @@ -1,12 +1,13 @@ from dataclasses import dataclass import pathlib -from rec.query import listing_query +from rec.query import ListingType, listing_query from rec.districts import get_districts from data_access import Listing @dataclass(frozen=True) class QueryParameters: + listing_type: ListingType min_bedrooms: int max_bedrooms: int min_price: int @@ -32,8 +33,9 @@ def dump_listings( print("#### District:", district) for i in [1, 2]: try: - d = listing_query( + response_json = listing_query( page=i, + channel=parameters.listing_type, min_bedrooms=parameters.min_bedrooms, max_bedrooms=parameters.max_bedrooms, radius=parameters.radius, @@ -47,12 +49,12 @@ def dump_listings( print(e) break if i == 1: - print("totalAvailableResults: ", d["totalAvailableResults"]) - if len(d["properties"]) == 0: + print("totalAvailableResults: ", response_json["totalAvailableResults"]) + if len(response_json["properties"]) == 0: break print(f"page {i}", end=", ", flush=True) - for property in d["properties"]: + for property in response_json["properties"]: identifier = property["identifier"] listing = Listing(identifier, data_dir=data_dir) diff --git a/crawler/main.py b/crawler/main.py index 66e5687..be5c93c 100644 --- a/crawler/main.py +++ b/crawler/main.py @@ -5,6 +5,7 @@ import importlib from rec.districts import get_districts from data_access import Listing import csv_exporter +from rec.query import ListingType dump_listings_module = importlib.import_module('1_dump_listings') dump_detail_module = importlib.import_module('2_dump_detail') @@ -41,6 +42,16 @@ def cli(ctx, data_dir: str): @cli.command() +@click.option( + '--type', + '-t', + help='Type of listing to scrape', + type=click.Choice( + ListingType.__members__.keys(), + case_sensitive=False, + ), + required=True, +) @click.option( '--min-bedrooms', default=1, @@ -80,9 +91,11 @@ def dump_listings( max_bedrooms: int, min_price: int, max_price: int, + type: str, ): data_dir: str = ctx.obj['data_dir'] query_parameters = dump_listings_module.QueryParameters( + listing_type=ListingType[type], district_names=set(district), min_bedrooms=min_bedrooms, max_bedrooms=max_bedrooms, diff --git a/crawler/rec/query.py b/crawler/rec/query.py index 801cd6d..304e9b9 100644 --- a/crawler/rec/query.py +++ b/crawler/rec/query.py @@ -6,7 +6,11 @@ import urllib3 urllib3.disable_warnings() -# cache = Cache(r"_cache") + +class ListingType(enum.StrEnum): + BUY = "BUY" + RENT = "RENT" + headers = { "Host": "api.rightmove.co.uk", @@ -42,9 +46,9 @@ def detail_query(detail_id: int): return response.json() -# @cache.memoize() def listing_query( page: int, + channel: ListingType, min_bedrooms: int, max_bedrooms: int, radius: float, @@ -58,29 +62,38 @@ def listing_query( ) -> dict: params = { "locationIdentifier": location_id, - "channel": "BUY", + "channel": channel.upper(), "page": str(page), "numberOfPropertiesPerPage": str(page_size), "radius": str(radius), "sortBy": "distance", "includeUnavailableProperties": "false", - "dontShow": "sharedOwnership,retirement", "minPrice": str(min_price), "maxPrice": str(max_price), "minBedrooms": str(min_bedrooms), "maxBedrooms": str(max_bedrooms), "apiApplication": "ANDROID", - "appVersion": "3.70.0", + "appVersion": "4.28.0", } - if len(property_type) > 0: - params["propertyTypes"] = ",".join(property_type) - if max_days_since_added: - if max_days_since_added not in [1, 3, 7, 14]: + if channel is ListingType.BUY: + params["dontShow"] = "sharedOwnership,retirement", + if len(property_type) > 0: + params["propertyTypes"] = ",".join(property_type) + if max_days_since_added is not None and max_days_since_added not in [ + 1, 3, 7, 14 + ]: raise Exception("Invalid max days. Can only be", [1, 3, 7, 14]) params["maxDaysSinceAdded"] = max_days_since_added - if mustNewHome: - params["mustHave"] = "newHome" + if mustNewHome: + params["mustHave"] = "newHome" + + headers = { + "Host": "api.rightmove.co.uk", + "Accept-Encoding": "gzip, deflate, br", + "User-Agent": "okhttp/4.12.0", + "Connection": "keep-alive" + } response = requests.get( "https://api.rightmove.co.uk/api/property-listing", params=params, @@ -91,5 +104,3 @@ def listing_query( raise Exception("Failed due to: ", response.text) return response.json() - -