wrongmove/crawler/1_dump_listings.py
Viktor Barzin df24c2c1b7
add cli param for querying properties to rent
example:
python main.py --data-dir data/rs2 dump-listings --max-price 3500 --min-bedrooms 2 --max-bedrooms 4 --district islington -t rent
2025-05-17 21:22:39 +00:00

72 lines
2.2 KiB
Python

from dataclasses import dataclass
import pathlib
from rec.query import ListingType, listing_query
from rec.districts import get_districts
from data_access import Listing
@dataclass(frozen=True)
class QueryParameters:
    """Immutable set of search filters for one listing scrape.

    ``dump_listings`` reads every field: ``district_names`` selects which
    districts are scraped, and the remaining fields are forwarded as keyword
    arguments to ``listing_query``.
    """

    # Forwarded to ``listing_query`` as its ``channel`` argument.
    listing_type: ListingType

    # Bedroom-count bounds (presumably inclusive; confirm in listing_query).
    min_bedrooms: int
    max_bedrooms: int

    # Price bounds (currency and period are not visible here; TODO confirm).
    min_price: int
    max_price: int

    # Names matched against the keys returned by ``get_districts()``.
    district_names: set[str]

    # Search radius around the location; unit is not visible here.
    radius: float = 0

    # Items per page.
    page_size: int = 500

    # Upper bound on listing age in days; ``None`` is passed through as-is
    # (presumably meaning "no limit"; confirm in listing_query).
    max_days_since_added: int | None = None
def dump_listings(
    parameters: QueryParameters,
    data_dir: pathlib.Path = pathlib.Path("data/rs/"),
    *,
    max_pages: int = 2,
) -> list[Listing]:
    """Query listings for each requested district and dump every result.

    Districts are looked up in ``get_districts()``; names in
    ``parameters.district_names`` that are not present there are skipped.
    For each remaining district, result pages are requested from
    ``listing_query`` until a request fails, a page comes back empty, or
    ``max_pages`` pages have been fetched.

    Args:
        parameters: Search filters forwarded to ``listing_query``.
        data_dir: Directory handed to each ``Listing`` (presumably where the
            listing is stored; see ``Listing.dump_listing``).
        max_pages: Maximum number of pages requested per district. Defaults
            to 2, the number of pages that was previously hard-coded.

    Returns:
        One ``Listing`` per property returned by the queries, in the order
        they were received.

    Raises:
        KeyError: If a response lacks ``"properties"`` or, on the first page,
            ``"totalAvailableResults"``, or a property lacks ``"identifier"``.
    """
    districts = {
        district: locid
        for district, locid in get_districts().items()
        if district in parameters.district_names
    }
    print("Valid districts to scrape:", districts.keys())

    listings: list[Listing] = []
    for district, locid in districts.items():
        print("#### District:", district)
        for page in range(1, max_pages + 1):
            try:
                response_json = listing_query(
                    page=page,
                    channel=parameters.listing_type,
                    min_bedrooms=parameters.min_bedrooms,
                    max_bedrooms=parameters.max_bedrooms,
                    radius=parameters.radius,
                    min_price=parameters.min_price,
                    max_price=parameters.max_price,
                    location_id=locid,
                    page_size=parameters.page_size,
                    max_days_since_added=parameters.max_days_since_added,
                )
            except Exception as e:
                # Best effort: a failed request ends this district only, so
                # the remaining districts are still scraped.
                print(e)
                break

            if page == 1:
                print("totalAvailableResults: ", response_json["totalAvailableResults"])

            properties = response_json["properties"]
            if not properties:
                # An empty page means there is nothing further to fetch.
                break

            print(f"page {page}", end=", ", flush=True)
            for raw_listing in properties:
                listing = Listing(raw_listing["identifier"], data_dir=data_dir)
                listing.dump_listing(raw_listing)
                listings.append(listing)
        print()  # break line as we used end=, above.
    return listings
def main() -> None:
    """Script entry point; exits with guidance instead of crashing.

    ``dump_listings`` has a required ``parameters`` argument and this module
    has no way to build one, so calling it with no arguments could only fail
    with ``TypeError``. The function therefore stops with a message that
    points to the project CLI.

    Raises:
        SystemExit: Always, carrying the explanatory message.
    """
    raise SystemExit(
        "dump_listings() requires a QueryParameters instance and cannot be "
        "run standalone; use the project CLI instead, e.g. "
        "`python main.py --data-dir <dir> dump-listings ...`."
    )
# Invoke the entry point only when this file is executed as a script,
# not when it is imported by another module.
if __name__ == "__main__":
    main()