98 lines
3.2 KiB
Python
98 lines
3.2 KiB
Python
import asyncio
|
|
import json
|
|
import pathlib
|
|
from typing import Any
|
|
from rec.query import detail_query, listing_query, QueryParameters
|
|
from rec.districts import get_districts
|
|
import repositories
|
|
from sqlalchemy import Engine
|
|
from tqdm.asyncio import tqdm
|
|
from data_access import Listing
|
|
|
|
|
|
async def dump_listings(
    parameters: QueryParameters,
    db_engine: Engine,
    data_dir: pathlib.Path = pathlib.Path("data/rs/"),
) -> list[Listing]:
    """Fetch search results for the selected districts and dump them to disk.

    Pages 1 and 2 of the search results are requested for every selected
    district. Each property in the responses is wrapped in a ``Listing``
    rooted at ``data_dir`` and stored with ``Listing.dump_listing``.
    Afterwards the detail payload is fetched once per distinct identifier.

    Args:
        parameters: Search filters. When ``district_names`` is truthy, only
            the districts from ``get_districts()`` whose name is in it are
            queried; otherwise every district is queried.
        db_engine: Engine handed to ``repositories.ListingRepository``.
        data_dir: Directory passed to every ``Listing`` that is created.

    Returns:
        One ``Listing`` per property found in the responses, in response
        order. A property returned by several queries appears once per
        occurrence.
    """
    districts = get_districts()
    if parameters.district_names:
        districts = {
            district: locid
            for district, locid in districts.items()
            if district in parameters.district_names
        }
    print("Valid districts to scrape:", districts.keys())

    semaphore = asyncio.Semaphore(5)  # if too high, rightmove drops connections
    json_responses = await tqdm.gather(
        *[
            _dump_listings_with_semaphore(semaphore, page, parameters, locid)
            for locid in districts.values()
            for page in (1, 2)
        ],
        desc="Fetching listings",
    )

    listings: list[Listing] = []
    for response_json in json_responses:
        if response_json["totalAvailableResults"] == 0:
            continue
        # Named raw_listing rather than `property` to avoid shadowing the builtin.
        for raw_listing in response_json["properties"]:
            listing = Listing(raw_listing["identifier"], data_dir=data_dir)
            listing.dump_listing(raw_listing)
            listings.append(listing)

    # Look up which of these listings are already stored in the database.
    # NOTE: the result is intentionally not used to filter the detail fetch
    # yet ("one day we will rely solely on the model data"); details are
    # skipped only when the detail file already exists on disk.
    repository = repositories.ListingRepository(db_engine)
    all_listings = await repository.get_listings(
        only_ids=[listing.identifier for listing in listings]
    )
    all_listing_ids = {listing.id for listing in all_listings}  # noqa: F841

    # The same property can come back from more than one district/page query.
    # Schedule a single detail fetch per identifier so the rate-limited server
    # is not asked for the same detail several times concurrently.
    # Assumes the detail path depends only on identifier and data_dir.
    unique_listings = {listing.identifier: listing for listing in listings}
    await tqdm.gather(
        *[
            _dump_detail_with_semaphore(semaphore, listing)
            for listing in unique_listings.values()
        ],
        desc="Fetching details",
    )
    return listings
|
|
|
|
|
|
async def _dump_listings_with_semaphore(
    semaphore: asyncio.Semaphore,
    page_id: int,
    parameters: QueryParameters,
    location_id: str,
) -> dict[str, Any]:
    """Run one ``listing_query`` call while holding ``semaphore``.

    Args:
        semaphore: Limits how many queries are in flight at once.
        page_id: Passed to ``listing_query`` as ``page``.
        parameters: Source of the remaining search filters.
        location_id: Passed to ``listing_query`` as ``location_id``.

    Returns:
        Whatever ``listing_query`` returns for this page and location.
    """
    async with semaphore:
        query_kwargs = {
            "page": page_id,
            "channel": parameters.listing_type,
            "min_bedrooms": parameters.min_bedrooms,
            "max_bedrooms": parameters.max_bedrooms,
            "radius": parameters.radius,
            "min_price": parameters.min_price,
            "max_price": parameters.max_price,
            "location_id": location_id,
            "page_size": parameters.page_size,
            "max_days_since_added": parameters.max_days_since_added,
            # A falsy furnish_types (e.g. None) is sent as an empty list.
            "furnish_types": parameters.furnish_types or [],
        }
        return await listing_query(**query_kwargs)
|
|
|
|
|
|
async def _dump_detail_with_semaphore(
    semaphore: asyncio.Semaphore, listing: Listing
) -> None:
    """Fetch the detail payload of ``listing`` and store it as JSON.

    Nothing is fetched when the file at ``listing.path_detail_json()``
    already exists. Otherwise ``detail_query`` is awaited while holding
    ``semaphore`` and its result is written to that path.

    Args:
        semaphore: Limits how many detail queries are in flight at once.
        listing: Listing whose detail should be downloaded.
    """
    detail_path = listing.path_detail_json()
    if detail_path.exists():
        return

    async with semaphore:
        # Re-check after waiting for a slot: another task may have written
        # the same file in the meantime, which makes this request redundant.
        if detail_path.exists():
            return

        detail = await detail_query(listing.identifier)

        # Serialise before opening the file. json.dump() streams into the
        # file, so a serialisation error would leave a truncated file behind
        # that the exists() check above would treat as a finished download.
        payload = json.dumps(detail)
        with open(detail_path, "w") as f:
            f.write(payload)
|