migrate processing to a pipeline approach where each listing is processed in a pipeline in parallel and status reported back to track progress
This commit is contained in:
parent
4fa09e31c8
commit
91a0436f7f
6 changed files with 347 additions and 26 deletions
|
|
@ -1,16 +1,17 @@
|
|||
import asyncio
|
||||
import importlib
|
||||
import itertools
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from celery import Celery, Task
|
||||
from celery import Task
|
||||
from celery_app import app
|
||||
from models.listing import FurnishType, Listing, ListingType, QueryParameters
|
||||
from listing_processor import ListingProcessor
|
||||
from models.listing import Listing, ListingType, QueryParameters
|
||||
from rec.districts import get_districts
|
||||
from rec.query import listing_query
|
||||
from repositories.listing_repository import ListingRepository
|
||||
from database import engine
|
||||
from tasks.task_state import TaskStatus
|
||||
|
||||
dump_listings_module = importlib.import_module("1_dump_listings")
|
||||
dump_images_module = importlib.import_module("3_dump_images")
|
||||
detect_floorplan_module = importlib.import_module("4_detect_floorplan")
|
||||
|
||||
|
|
@ -20,28 +21,61 @@ logger = logging.getLogger("uvicorn.error")
|
|||
@app.task(bind=True, pydantic=True)
def dump_listings_task(self: Task, parameters_json: str) -> dict[str, Any]:
    """Celery entry point: run the full listing dump for one query.

    Args:
        parameters_json: JSON document that validates as ``QueryParameters``.

    Returns:
        ``{"progress": 1}`` once the dump has run to completion.
    """
    parsed_parameters = QueryParameters.model_validate_json(parameters_json)
    # Report the initial state *before* the work starts; reporting it after
    # ``asyncio.run`` returns would overwrite the last real progress update.
    self.update_state(state="Starting...", meta={"progress": 0})
    # ``dump_listings_full`` is keyword-only, so it must be called by name.
    asyncio.run(dump_listings_full(task=self, parameters=parsed_parameters))
    return {"progress": 1}
|
||||
|
||||
|
||||
async def dump_listings_full(self: Task, parameters: QueryParameters) -> list[Listing]:
|
||||
async def async_dump_listings_task(parameters_json: str) -> dict[str, Any]:
    """Awaitable counterpart of ``dump_listings_task``.

    Validates ``parameters_json`` as ``QueryParameters`` and awaits
    ``dump_listings_full`` with a freshly constructed ``Task()`` as the
    progress sink.

    NOTE(review): the ``Task()`` built here is not bound to a running Celery
    request - confirm that ``update_state`` on it behaves as intended.
    """
    await dump_listings_full(
        task=Task(),
        parameters=QueryParameters.model_validate_json(parameters_json),
    )
    return {"progress": 0}
|
||||
|
||||
|
||||
async def dump_listings_full(
    *, task: Task, parameters: QueryParameters
) -> list[Listing]:
    """Find the listings that are not stored yet and run them through the pipeline.

    The set of ids to process comes from ``get_missing_listing_ids``; each id
    is then handed to a ``ListingProcessor`` via ``dump_listings_and_monitor``,
    which also reports progress on ``task``.

    Args:
        task: Celery task used to publish progress updates.
        parameters: Query describing which listings to look for.

    Returns:
        The listings returned by ``dump_listings_and_monitor``.
    """
    repository = ListingRepository(engine)

    missing_ids = await get_missing_listing_ids(parameters, repository)
    logger.info(f"Found {len(missing_ids)} missing listings")

    listing_processor = ListingProcessor(repository)
    logger.info(f"Starting processing {len(missing_ids)} new listings")
    return await dump_listings_and_monitor(
        task=task, listing_processor=listing_processor, missing_ids=missing_ids
    )
|
||||
|
||||
|
||||
async def dump_listings_and_monitor(
    *, task: Task, listing_processor: ListingProcessor, missing_ids: set[int]
) -> list[Listing]:
    """Process every missing listing concurrently while reporting progress.

    One ``listing_processor.process_listing`` coroutine is started per id. A
    monitor coroutine runs alongside them and, once per second, logs the
    completed count and publishes it through ``task.update_state``.

    Args:
        task: Celery task used to publish progress updates.
        listing_processor: Object whose ``process_listing(id)`` is awaited
            for each id.
        missing_ids: Ids of the listings to process.

    Returns:
        The non-``None`` results of ``process_listing``.

    Raises:
        Whatever ``process_listing`` or ``task.update_state`` raises.
    """
    total = len(missing_ids)
    task_progress = {missing_id: 0 for missing_id in missing_ids}
    # Set once processing has ended (successfully or not) so the monitor can
    # stop even when a failed listing keeps the count below ``total``.
    processing_finished = asyncio.Event()

    async def process(missing_id: int) -> Listing | None:
        listing = await listing_processor.process_listing(missing_id)
        task_progress[missing_id] = 1
        return listing

    async def process_all() -> list[Listing | None]:
        try:
            return await asyncio.gather(
                *[process(missing_id) for missing_id in missing_ids]
            )
        finally:
            processing_finished.set()

    async def monitor() -> None:
        while (
            not processing_finished.is_set()
            and (progress := sum(task_progress.values())) < total
        ):
            progress_ratio = progress / total
            logger.info(
                f"Task progress: {progress_ratio * 100:.1f}% ({progress} out of {total})"
            )
            task.update_state(
                state=f"Progress: {progress_ratio * 100}% ({progress} out of {total})",
                meta={"progress": progress_ratio},
            )
            await asyncio.sleep(1)

    processed_listings, _ = await asyncio.gather(process_all(), monitor())
    return [listing for listing in processed_listings if listing is not None]
|
||||
|
||||
|
||||
@app.on_after_finalize.connect
|
||||
|
|
@ -57,5 +91,105 @@ def setup_periodic_tasks(sender, **kwargs):
|
|||
max_price=4000,
|
||||
).model_dump_json()
|
||||
),
|
||||
name='Daily dump of interesting rent listings',
|
||||
name="Daily dump of interesting rent listings",
|
||||
)
|
||||
|
||||
|
||||
async def get_missing_listing_ids(
    parameters: QueryParameters,
    repository: ListingRepository,
) -> set[int]:
    """Return the ids of scraped listings that are not in the repository yet.

    Every district selected by ``parameters.district_names`` is queried
    concurrently (at most 5 queries in flight), the ``identifier`` of each
    entry under ``"properties"`` is collected, and the ids already returned by
    ``repository.get_listings()`` are removed.

    Args:
        parameters: Query describing which listings to look for.
        repository: Source of the listings that are already stored.

    Returns:
        Identifiers present in the search results but absent from the
        repository.
    """
    semaphore = asyncio.Semaphore(5)  # if too high, rightmove drops connections
    districts = await get_valid_districts_to_scrape(parameters.district_names)
    json_responses: list[list[dict[str, Any]]] = await asyncio.gather(
        *[
            _fetch_listings_with_semaphore(semaphore, parameters, district)
            for district in districts
        ],
    )
    json_responses_flat = list(itertools.chain.from_iterable(json_responses))
    logger.debug(f"Total listings fetched {len(json_responses_flat)}")

    identifiers: set[int] = set()
    for response_json in json_responses_flat:
        if not response_json:
            continue
        if response_json["totalAvailableResults"] == 0:
            continue
        for listing_json in response_json["properties"]:
            identifiers.add(listing_json["identifier"])

    # If a listing is already in the db, do not fetch its details again:
    # keep only the scraped ids that the repository does not know about.
    all_listing_ids = {listing.id for listing in await repository.get_listings()}
    return identifiers - all_listing_ids
|
||||
|
||||
|
||||
async def get_valid_districts_to_scrape(
    district_names: set[str] | None,
) -> dict[str, str]:
    """Select the districts to scrape from ``get_districts()``.

    Args:
        district_names: Names to keep. ``None`` or an empty set selects every
            district.

    Returns:
        The ``get_districts()`` mapping, restricted to ``district_names`` when
        any were given.
    """
    known_districts = get_districts()
    if not district_names:
        return known_districts
    return {
        name: location_id
        for name, location_id in known_districts.items()
        if name in district_names
    }
|
||||
|
||||
|
||||
async def _fetch_listings_with_semaphore(
    semaphore: asyncio.Semaphore,
    parameters: QueryParameters,
    district: str,
) -> list[dict[str, Any]]:
    """Run every listing query for one district and collect the raw responses.

    Queries are issued per price band, per bedroom count and per result page;
    each call to ``listing_query`` is made while holding ``semaphore``.

    Args:
        semaphore: Limits how many queries run at the same time.
        parameters: Search criteria forwarded to ``listing_query``.
        district: District to search in.

    Returns:
        One ``listing_query`` result per successful query.

    Raises:
        Any exception from ``listing_query`` whose message does not contain
        ``"GENERIC_ERROR"``.
    """
    result = []
    # Split the price range into ``number_of_steps`` bands to avoid the 1.5k
    # result cap applied by rightmove: several narrower queries each have a
    # smaller chance of exceeding the cap than one query over the whole range.
    # With the current value of 1 the whole range is a single band.
    number_of_steps = 1
    price_step = parameters.max_price // number_of_steps

    for step in range(number_of_steps):
        min_price = step * price_step
        # The last band ends exactly at ``max_price`` so that floor division
        # never cuts off the top of the requested range.
        if step == number_of_steps - 1:
            max_price = parameters.max_price
        else:
            max_price = (step + 1) * price_step
        logger.debug(
            f"Step {step} of {number_of_steps} with {min_price=} and {max_price=}"
        )

        # One query per exact bedroom count rather than one over the whole
        # bedroom range.
        for num_bedrooms in range(parameters.min_bedrooms, parameters.max_bedrooms + 1):
            for page_id in range(
                1,
                3,  # seems like all searches stop at 1500 entries (page_id * page_size)
            ):
                logger.debug(f"Processing {page_id=} for {district=}")

                async with semaphore:
                    try:
                        listing_query_result = await listing_query(
                            page=page_id,
                            channel=parameters.listing_type,
                            min_bedrooms=num_bedrooms,
                            max_bedrooms=num_bedrooms,
                            radius=parameters.radius,
                            min_price=min_price,
                            max_price=max_price,
                            district=district,
                            page_size=parameters.page_size,
                            max_days_since_added=parameters.max_days_since_added,
                            furnish_types=parameters.furnish_types or [],
                        )
                    except Exception as e:
                        if "GENERIC_ERROR" in str(e):  # Too big page id
                            logger.debug(f"Max page id for {district=}: {page_id-1}")
                            break
                        raise
                result.append(listing_query_result)
    return result
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue