refactor dump listings to start using model instead of the data_access object
This commit is contained in:
parent
842f7cefbe
commit
4f5a934fa9
5 changed files with 52 additions and 38 deletions
|
|
@ -4,17 +4,18 @@ import pathlib
|
|||
from typing import Any
|
||||
from rec.query import detail_query, listing_query, QueryParameters
|
||||
from rec.districts import get_districts
|
||||
import repositories
|
||||
from repositories import ListingRepository
|
||||
from sqlalchemy import Engine
|
||||
from tqdm.asyncio import tqdm
|
||||
from data_access import Listing
|
||||
from models import Listing as modelListing
|
||||
|
||||
|
||||
async def dump_listings(
|
||||
parameters: QueryParameters,
|
||||
db_engine: Engine,
|
||||
repository: ListingRepository,
|
||||
data_dir: pathlib.Path = pathlib.Path("data/rs/"),
|
||||
) -> list[Listing]:
|
||||
) -> list[modelListing]:
|
||||
if parameters.district_names:
|
||||
districts = {
|
||||
district: locid
|
||||
|
|
@ -28,7 +29,7 @@ async def dump_listings(
|
|||
semaphore = asyncio.Semaphore(5) # if too high, rightmove drops connections
|
||||
json_responses = await tqdm.gather(
|
||||
*[
|
||||
_dump_listings_with_semaphore(semaphore, i, parameters, locid)
|
||||
_fetch_listings_with_semaphore(semaphore, i, parameters, locid)
|
||||
for locid in districts.values()
|
||||
for i in [1, 2]
|
||||
],
|
||||
|
|
@ -41,29 +42,36 @@ async def dump_listings(
|
|||
for property in response_json["properties"]:
|
||||
identifier = property["identifier"]
|
||||
|
||||
listing = Listing(identifier, data_dir=data_dir)
|
||||
listing.dump_listing(property)
|
||||
listing = Listing(identifier, data_dir=data_dir, _listing_object=property)
|
||||
listings.append(listing)
|
||||
|
||||
# if listing is already in db, do not fetch details again
|
||||
repository = repositories.ListingRepository(db_engine)
|
||||
all_listings = await repository.get_listings(
|
||||
only_ids=[listing.identifier for listing in listings]
|
||||
)
|
||||
all_listing_ids = {listing.id for listing in all_listings}
|
||||
|
||||
await tqdm.gather(
|
||||
listings_without_details = [
|
||||
listing for listing in listings if not listing.path_detail_json().exists()
|
||||
]
|
||||
listing_details = await tqdm.gather(
|
||||
*[
|
||||
_dump_detail_with_semaphore(semaphore, listing)
|
||||
for listing in listings
|
||||
_fetch_detail_with_semaphore(semaphore, listing.identifier)
|
||||
for listing in listings_without_details
|
||||
# if listing.identifier not in all_listing_ids # One day we will rely solely on the model data
|
||||
],
|
||||
desc="Fetching details",
|
||||
)
|
||||
return listings
|
||||
for listing, detail in zip(listings_without_details, listing_details):
|
||||
listing._details_object = detail
|
||||
|
||||
model_listings = await repository.upsert_listings(listings) # upsert in db
|
||||
await dump_listings_to_fs(listings)
|
||||
|
||||
return model_listings
|
||||
|
||||
|
||||
async def _dump_listings_with_semaphore(
|
||||
async def _fetch_listings_with_semaphore(
|
||||
semaphore: asyncio.Semaphore,
|
||||
page_id: int,
|
||||
parameters: QueryParameters,
|
||||
|
|
@ -87,12 +95,18 @@ async def _dump_listings_with_semaphore(
|
|||
return listing_query_result
|
||||
|
||||
|
||||
async def _dump_detail_with_semaphore(semaphore: asyncio.Semaphore, listing: Listing):
|
||||
if listing.path_detail_json().exists():
|
||||
return
|
||||
|
||||
# for listing in tqdm(filtered_listings):
|
||||
async def _fetch_detail_with_semaphore(
|
||||
semaphore: asyncio.Semaphore, listing_id: int
|
||||
) -> dict[str, Any]:
|
||||
async with semaphore:
|
||||
d = await detail_query(listing.identifier)
|
||||
with open(listing.path_detail_json(), "w") as f:
|
||||
json.dump(d, f)
|
||||
d = await detail_query(listing_id)
|
||||
return d
|
||||
|
||||
|
||||
async def dump_listings_to_fs(listings: list[Listing]) -> None:
|
||||
for listing in listings:
|
||||
if not listing.path_listing_json().exists():
|
||||
listing.dump_listing()
|
||||
if not listing.path_detail_json().exists():
|
||||
with open(listing.path_detail_json(), "w") as f:
|
||||
json.dump(listing._details_object, f, indent=4)
|
||||
|
|
|
|||
|
|
@ -13,7 +13,8 @@ import datetime
|
|||
@dataclass()
|
||||
class Listing:
|
||||
identifier: int
|
||||
_cached: Dict | None = None
|
||||
_details_object: dict[str, Any] | None = None
|
||||
_listing_object: dict[str, Any] | None = None
|
||||
data_dir: pathlib.Path = pathlib.Path("data/rs/")
|
||||
ALL_COLUMNS = [
|
||||
"identifier",
|
||||
|
|
@ -98,15 +99,19 @@ class Listing:
|
|||
def path_price_history(self) -> pathlib.Path:
|
||||
return self.path_listing() / "price_history.json"
|
||||
|
||||
def dump_listing(self, d: dict):
|
||||
def dump_listing(self) -> None:
|
||||
if self._listing_object is None:
|
||||
raise ValueError("No listing data provided to dump.")
|
||||
with open(self.path_listing_json(), "w") as f:
|
||||
json.dump(d, f)
|
||||
json.dump(self._listing_object, f)
|
||||
with open(self.path_last_seen_listing(), "w") as f:
|
||||
dt = datetime.datetime.now().isoformat()
|
||||
json.dump(dt, f)
|
||||
|
||||
# some places list pw in price and others pcm
|
||||
price = max(d["price"], d.get("monthlyRent", 0))
|
||||
price = max(
|
||||
self._listing_object["price"], self._listing_object.get("monthlyRent", 0)
|
||||
)
|
||||
self.append_price_history(price)
|
||||
|
||||
def append_price_history(self, price: float) -> None:
|
||||
|
|
@ -249,19 +254,18 @@ class Listing:
|
|||
|
||||
@property
|
||||
def listingobject(self):
|
||||
if self._cached is None:
|
||||
with open(self.path_listing_json()) as f:
|
||||
return json.load(f)
|
||||
with open(self.path_listing_json()) as f:
|
||||
return json.load(f)
|
||||
|
||||
@property
|
||||
def detailobject(self) -> dict[str, Any]:
|
||||
if self._cached is None:
|
||||
if self._details_object is None:
|
||||
if self.path_detail_json().exists():
|
||||
with open(self.path_detail_json()) as f:
|
||||
self._cached = json.load(f)
|
||||
self._details_object = json.load(f)
|
||||
else:
|
||||
return {}
|
||||
return self._cached # type: ignore
|
||||
return self._details_object # type: ignore
|
||||
|
||||
@property
|
||||
def price(self) -> float:
|
||||
|
|
|
|||
|
|
@ -155,11 +155,10 @@ def dump_listings(
|
|||
f"{query_parameters}"
|
||||
)
|
||||
data_dir_path = pathlib.Path(data_dir)
|
||||
listings = asyncio.run(
|
||||
dump_listings_module.dump_listings(query_parameters, engine, data_dir_path)
|
||||
)
|
||||
repository = ListingRepository(engine=engine)
|
||||
asyncio.run(repository.upsert_listings(listings))
|
||||
asyncio.run(
|
||||
dump_listings_module.dump_listings(query_parameters, repository, data_dir_path)
|
||||
)
|
||||
|
||||
|
||||
@cli.command()
|
||||
|
|
|
|||
|
|
@ -50,9 +50,6 @@ class RentListing(Listing, table=True):
|
|||
|
||||
class BuyListing(Listing, table=True):
|
||||
service_charge: float | None = Field(default=None, nullable=True)
|
||||
council_tax_band: str | None = Field(
|
||||
default=None, nullable=True
|
||||
) # e.g., A, B, C, D, E, F, G
|
||||
lease_left: int | None = Field(
|
||||
default=None, nullable=True
|
||||
) # in years, e.g., 90, 80, etc.
|
||||
|
|
|
|||
|
|
@ -115,7 +115,7 @@ async def listing_query(
|
|||
radius: float,
|
||||
min_price: int,
|
||||
max_price: int,
|
||||
location_id: str = "STATION^5168", # kings cross station
|
||||
location_id: str, # = "STATION^5168", # kings cross station
|
||||
mustNewHome: bool = False,
|
||||
max_days_since_added: int = 30,
|
||||
property_type: list[PropertyType] = [],
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue