Refactor dump_listings to start using the model Listing instead of the data_access Listing object
This commit is contained in:
parent
842f7cefbe
commit
4f5a934fa9
5 changed files with 52 additions and 38 deletions
|
|
@ -4,17 +4,18 @@ import pathlib
|
|||
from typing import Any
|
||||
from rec.query import detail_query, listing_query, QueryParameters
|
||||
from rec.districts import get_districts
|
||||
import repositories
|
||||
from repositories import ListingRepository
|
||||
from sqlalchemy import Engine
|
||||
from tqdm.asyncio import tqdm
|
||||
from data_access import Listing
|
||||
from models import Listing as modelListing
|
||||
|
||||
|
||||
async def dump_listings(
|
||||
parameters: QueryParameters,
|
||||
db_engine: Engine,
|
||||
repository: ListingRepository,
|
||||
data_dir: pathlib.Path = pathlib.Path("data/rs/"),
|
||||
) -> list[Listing]:
|
||||
) -> list[modelListing]:
|
||||
if parameters.district_names:
|
||||
districts = {
|
||||
district: locid
|
||||
|
|
@ -28,7 +29,7 @@ async def dump_listings(
|
|||
semaphore = asyncio.Semaphore(5) # if too high, rightmove drops connections
|
||||
json_responses = await tqdm.gather(
|
||||
*[
|
||||
_dump_listings_with_semaphore(semaphore, i, parameters, locid)
|
||||
_fetch_listings_with_semaphore(semaphore, i, parameters, locid)
|
||||
for locid in districts.values()
|
||||
for i in [1, 2]
|
||||
],
|
||||
|
|
@ -41,29 +42,36 @@ async def dump_listings(
|
|||
for property in response_json["properties"]:
|
||||
identifier = property["identifier"]
|
||||
|
||||
listing = Listing(identifier, data_dir=data_dir)
|
||||
listing.dump_listing(property)
|
||||
listing = Listing(identifier, data_dir=data_dir, _listing_object=property)
|
||||
listings.append(listing)
|
||||
|
||||
# if listing is already in db, do not fetch details again
|
||||
repository = repositories.ListingRepository(db_engine)
|
||||
all_listings = await repository.get_listings(
|
||||
only_ids=[listing.identifier for listing in listings]
|
||||
)
|
||||
all_listing_ids = {listing.id for listing in all_listings}
|
||||
|
||||
await tqdm.gather(
|
||||
listings_without_details = [
|
||||
listing for listing in listings if not listing.path_detail_json().exists()
|
||||
]
|
||||
listing_details = await tqdm.gather(
|
||||
*[
|
||||
_dump_detail_with_semaphore(semaphore, listing)
|
||||
for listing in listings
|
||||
_fetch_detail_with_semaphore(semaphore, listing.identifier)
|
||||
for listing in listings_without_details
|
||||
# if listing.identifier not in all_listing_ids # One day we will rely solely on the model data
|
||||
],
|
||||
desc="Fetching details",
|
||||
)
|
||||
return listings
|
||||
for listing, detail in zip(listings_without_details, listing_details):
|
||||
listing._details_object = detail
|
||||
|
||||
model_listings = await repository.upsert_listings(listings) # upsert in db
|
||||
await dump_listings_to_fs(listings)
|
||||
|
||||
return model_listings
|
||||
|
||||
|
||||
async def _dump_listings_with_semaphore(
|
||||
async def _fetch_listings_with_semaphore(
|
||||
semaphore: asyncio.Semaphore,
|
||||
page_id: int,
|
||||
parameters: QueryParameters,
|
||||
|
|
@ -87,12 +95,18 @@ async def _dump_listings_with_semaphore(
|
|||
return listing_query_result
|
||||
|
||||
|
||||
async def _dump_detail_with_semaphore(semaphore: asyncio.Semaphore, listing: Listing):
|
||||
if listing.path_detail_json().exists():
|
||||
return
|
||||
|
||||
# for listing in tqdm(filtered_listings):
|
||||
async def _fetch_detail_with_semaphore(
    semaphore: asyncio.Semaphore, listing_id: int
) -> dict[str, Any]:
    """Fetch the detail payload for one listing while holding ``semaphore``.

    Args:
        semaphore: Semaphore acquired for the duration of the query, so the
            caller can bound how many detail queries run at once.
        listing_id: Identifier passed straight to ``detail_query``.

    Returns:
        The value returned by ``detail_query(listing_id)``. Nothing is written
        to disk here; persisting the result is left to the caller.
    """
    async with semaphore:
        return await detail_query(listing_id)
|
||||
|
||||
|
||||
async def dump_listings_to_fs(listings: list[Listing]) -> None:
    """Write each listing's JSON files to disk unless they already exist.

    For every listing, the listing JSON is written through ``dump_listing()``
    when ``path_listing_json()`` does not exist, and ``_details_object`` is
    serialised to ``path_detail_json()`` when that file does not exist.

    Args:
        listings: Listings to persist. An empty list is a no-op.

    Note:
        Declared ``async`` but performs blocking file I/O and never awaits.
    """
    for listing in listings:
        if not listing.path_listing_json().exists():
            listing.dump_listing()

        detail_path = listing.path_detail_json()
        if detail_path.exists():
            continue

        details = listing._details_object
        if details is None:
            # Nothing was fetched for this listing. Writing "null" would
            # create the file, and later existence checks on the detail path
            # would then treat the details as already downloaded.
            continue

        # json.dump emits ASCII by default, so pinning the encoding only makes
        # the output independent of the platform's locale.
        with open(detail_path, "w", encoding="utf-8") as f:
            json.dump(details, f, indent=4)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue