wrongmove/crawler/1_dump_listings.py

73 lines
2.3 KiB
Python
Raw Normal View History

2025-05-17 21:55:42 +00:00
import asyncio
from dataclasses import dataclass
import pathlib
from rec.query import ListingType, listing_query, FurnishType
from rec.districts import get_districts
from data_access import Listing
@dataclass(frozen=True)
class QueryParameters:
    """Immutable bundle of search filters for one listing query.

    ``dump_listings`` forwards every field to ``listing_query`` except
    ``district_names``, which it uses to choose the districts to query.
    """

    # Forwarded to ``listing_query`` as its ``channel`` argument.
    listing_type: ListingType
    # Lower and upper bound on the number of bedrooms.
    min_bedrooms: int
    max_bedrooms: int
    # Lower and upper bound on the price.
    min_price: int
    max_price: int
    # Districts to restrict the search to; an empty set makes
    # ``dump_listings`` query every district from ``get_districts``.
    district_names: set[str]
    # Search radius around each location.
    # NOTE(review): unit is not visible in this file -- confirm.
    radius: float = 0
    # Number of items requested per page.
    page_size: int = 500
    max_days_since_added: int = 30
    # ``None`` is passed on to ``listing_query`` as an empty list.
    furnish_types: list[FurnishType] | None = None
    # Filters that Rightmove does not support, and which therefore have
    # to be applied after fetching: available from; council tax.
2025-05-17 21:55:42 +00:00
async def dump_listings(
    parameters: QueryParameters,
    data_dir: pathlib.Path = pathlib.Path("data/rs/"),
) -> list[Listing]:
    """Query listings for the selected districts and dump each result.

    Pages 1 and 2 are requested for every selected district, all requests
    being awaited together. Each property found in the responses is
    wrapped in a ``Listing`` and dumped through ``Listing.dump_listing``.

    Args:
        parameters: Search filters. An empty ``district_names`` selects
            every district returned by ``get_districts``.
        data_dir: Directory handed to each ``Listing`` that is created.

    Returns:
        One ``Listing`` per property found, in response order.
    """
    all_districts = get_districts()
    if parameters.district_names:
        districts = {
            district: locid
            for district, locid in all_districts.items()
            if district in parameters.district_names
        }
    else:
        districts = all_districts
    print("Valid districts to scrape:", districts.keys())

    # Only pages 1 and 2 are requested per district. ``gather`` returns the
    # responses in the same order the requests are listed here.
    json_responses = await asyncio.gather(*[
        listing_query(
            page=page,
            channel=parameters.listing_type,
            min_bedrooms=parameters.min_bedrooms,
            max_bedrooms=parameters.max_bedrooms,
            radius=parameters.radius,
            min_price=parameters.min_price,
            max_price=parameters.max_price,
            location_id=locid,
            page_size=parameters.page_size,
            max_days_since_added=parameters.max_days_since_added,
            # ``listing_query`` is always given a list, never ``None``.
            furnish_types=parameters.furnish_types or [],
        )
        for locid in districts.values()
        for page in (1, 2)
    ])

    listings: list[Listing] = []
    for response_json in json_responses:
        total = response_json["totalAvailableResults"]
        if total == 0:
            print("No results found")
            continue
        print("totalAvailableResults: ", total)
        for listing_json in response_json["properties"]:
            listing = Listing(listing_json["identifier"], data_dir=data_dir)
            listing.dump_listing(listing_json)
            listings.append(listing)
    return listings