2025-05-17 21:55:42 +00:00
|
|
|
import asyncio
|
2025-06-21 12:04:31 +00:00
|
|
|
import importlib
|
2025-06-22 12:43:24 +00:00
|
|
|
import itertools
|
2025-06-07 12:00:23 +00:00
|
|
|
import json
|
2025-06-22 14:00:47 +00:00
|
|
|
import logging
|
2025-05-14 20:19:08 +00:00
|
|
|
import pathlib
|
2025-06-06 20:08:38 +00:00
|
|
|
from typing import Any
|
2025-06-07 12:00:23 +00:00
|
|
|
from rec.query import detail_query, listing_query, QueryParameters
|
2024-03-30 19:23:19 +01:00
|
|
|
from rec.districts import get_districts
|
2025-06-07 12:46:53 +00:00
|
|
|
from repositories import ListingRepository
|
2025-06-22 12:43:24 +00:00
|
|
|
import requests
|
2025-06-07 12:00:23 +00:00
|
|
|
from tqdm.asyncio import tqdm
|
2024-03-11 14:43:53 +00:00
|
|
|
from data_access import Listing
|
2025-06-07 12:46:53 +00:00
|
|
|
from models import Listing as modelListing
|
2024-03-10 18:49:39 +00:00
|
|
|
|
2025-06-21 12:04:31 +00:00
|
|
|
# The sibling pipeline steps live in numerically-prefixed script files
# ("3_dump_images.py", "4_detect_floorplan.py"). Their module names start with
# a digit, which is not a valid Python identifier, so they cannot be imported
# with a normal `import` statement and must be loaded via importlib.
dump_images_module = importlib.import_module("3_dump_images")
detect_floorplan_module = importlib.import_module("4_detect_floorplan")

# Reuse uvicorn's error logger so messages from this module appear in the
# server's log output (presumably this runs inside a FastAPI/uvicorn app —
# TODO confirm).
logger = logging.getLogger("uvicorn.error")
|
2025-06-22 12:43:24 +00:00
|
|
|
|
2025-06-21 12:04:31 +00:00
|
|
|
|
|
|
|
|
async def dump_listings_full(
    parameters: QueryParameters,
    repository: ListingRepository,
    data_dir: pathlib.Path = pathlib.Path("data/rs/"),
) -> list[modelListing]:
    """Fetches all listings, images as well as detects floorplans.

    Runs the full pipeline: dump new listings, fetch their images, detect
    floorplans, then re-read the (now enriched) listings from the repository.

    Args:
        parameters: search parameters forwarded to the listing query.
        repository: persistence layer used for upserts and reads.
        data_dir: base directory for on-disk listing/image dumps.

    Returns:
        The refreshed model listings that were newly upserted in this run.
    """
    new_listings = await dump_listings(parameters, repository, data_dir)
    logger.debug(f"Upserted {len(new_listings)} new listings")

    logger.debug("Starting to fetch floorplans")
    await dump_images_module.dump_images(repository, image_base_path=data_dir)
    logger.debug("Completed fetching floorplans")

    logger.debug("Starting floorplan detection")
    await detect_floorplan_module.detect_floorplan(repository)
    logger.debug("Completed floorplan detection")

    # refresh listings
    listings = await repository.get_listings(parameters)  # this can be better
    # BUG FIX: the previous filter was `l.id in new_listings`, which tested an
    # id against a list of modelListing objects and so (barring a custom
    # __eq__) never matched. Compare ids against ids instead; the set also
    # makes membership O(1).
    new_ids = {nl.id for nl in new_listings}
    new_listings = [l for l in listings if l.id in new_ids]
    return new_listings
|
|
|
|
|
|
2025-05-11 18:59:41 +00:00
|
|
|
|
2025-05-17 21:55:42 +00:00
|
|
|
async def dump_listings(
    parameters: QueryParameters,
    repository: ListingRepository,
    data_dir: pathlib.Path = pathlib.Path("data/rs/"),
) -> list[modelListing]:
    """Fetch listings for the requested districts and persist the new ones.

    Queries each district concurrently (bounded by a semaphore), fetches
    details only for listings not already in the repository, dumps them to
    the filesystem and upserts them into the database.

    Args:
        parameters: search parameters; ``district_names`` restricts which
            districts are scraped (all districts when empty/falsy).
        repository: persistence layer used to find existing listings and
            upsert new ones.
        data_dir: base directory for on-disk listing dumps.

    Returns:
        The model listings produced by the upsert.
    """
    if parameters.district_names:
        districts = {
            district: locid
            for district, locid in get_districts().items()
            if district in parameters.district_names
        }
    else:
        districts = get_districts()
    # BUG FIX: extra positional args to logger.debug are %-format arguments;
    # the previous call had no %s placeholder, so the district names were
    # silently dropped from the message.
    logger.debug("Valid districts to scrape: %s", list(districts.keys()))

    semaphore = asyncio.Semaphore(5)  # if too high, rightmove drops connections
    json_responses: list[list[dict[str, Any]]] = await tqdm.gather(
        *[
            _fetch_listings_with_semaphore(semaphore, parameters, district)
            for district in districts.keys()
        ],
        desc="Fetching listings",
    )
    json_responses_flat = list(itertools.chain.from_iterable(json_responses))
    logger.debug(f"Total listings fetched {len(json_responses_flat)}")

    listings: list[Listing] = []
    for response_json in json_responses_flat:
        if response_json == {}:
            continue
        if response_json["totalAvailableResults"] == 0:
            continue
        # renamed from `property` to avoid shadowing the builtin
        for prop in response_json["properties"]:
            identifier = prop["identifier"]

            listing = Listing(identifier, data_dir=data_dir, _listing_object=prop)
            listings.append(listing)

    # if listing is already in db, do not fetch details again
    # (set instead of list: O(1) membership tests in the filter below)
    all_listing_ids = {l.id for l in await repository.get_listings()}
    missing_listing = [
        listing for listing in listings if listing.identifier not in all_listing_ids
    ]
    logger.debug(f"Fetching details for {len(missing_listing)} missing listings")
    listing_details = await tqdm.gather(
        *[
            _fetch_detail_with_semaphore(semaphore, listing.identifier)
            for listing in missing_listing
        ],
        desc="Fetching details (only missing)",
    )
    # results of tqdm.gather are in submission order, so zip pairs each
    # listing with its own detail payload
    for listing, detail in zip(missing_listing, listing_details):
        listing._details_object = detail

    logger.debug("Dumping listings to fs")
    await dump_listings_to_fs(missing_listing)
    logger.debug("Upserting listings in db")
    model_listings = await repository.upsert_listings_legacy(
        missing_listing
    )  # upsert in db

    return model_listings
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def _fetch_listings_with_semaphore(
    semaphore: asyncio.Semaphore,
    parameters: QueryParameters,
    district: str,
) -> list[dict[str, Any]]:
    """Fetch up to two pages of listing results for one district.

    Concurrency across districts is bounded by *semaphore*. Pagination stops
    early when the upstream API rejects a page id with a GENERIC_ERROR.

    Args:
        semaphore: limits concurrent upstream requests.
        parameters: search parameters mapped onto the listing query.
        district: district identifier to query.

    Returns:
        One JSON response dict per successfully fetched page.
    """
    result = []
    # we don't know how many pages we have but we stop as soon as there's no more
    for page_id in range(1, 3):
        logger.debug(f"Processing {page_id=} for {district=}")
        # seems like all searches stop at 1500 entries (page_id * page_size)
        async with semaphore:
            try:
                listing_query_result = await listing_query(
                    page=page_id,
                    channel=parameters.listing_type,
                    min_bedrooms=parameters.min_bedrooms,
                    max_bedrooms=parameters.max_bedrooms,
                    radius=parameters.radius,
                    min_price=parameters.min_price,
                    max_price=parameters.max_price,
                    district=district,
                    page_size=parameters.page_size,
                    max_days_since_added=parameters.max_days_since_added,
                    furnish_types=parameters.furnish_types or [],
                )
            except Exception as e:
                if "GENERIC_ERROR" in str(e):  # Too big page id
                    logger.debug(f"Max page id for {district=}: {page_id-1}")
                    break
                # idiomatic bare re-raise instead of `raise e`: keeps the
                # original traceback without appending an extra frame
                raise
            result.append(listing_query_result)
    return result
|
2025-06-07 12:00:23 +00:00
|
|
|
|
|
|
|
|
|
2025-06-07 12:46:53 +00:00
|
|
|
async def _fetch_detail_with_semaphore(
    semaphore: asyncio.Semaphore, listing_id: int
) -> dict[str, Any]:
    """Fetch the detail payload for one listing, gated by *semaphore*."""
    async with semaphore:
        return await detail_query(listing_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def dump_listings_to_fs(listings: list[Listing]) -> None:
    """Persist each listing plus its detail payload as JSON on the filesystem."""
    for entry in tqdm(listings, desc="Dumping listings to FS"):
        entry.dump_listing()
        detail_path = entry.path_detail_json()
        with open(detail_path, "w") as fh:
            json.dump(entry._details_object, fh, indent=4)
|