migrate processing to a pipeline approach where each listing is processed in a pipeline in parallel and status reported back to track progress

2025-07-27 18:33:39 +00:00 · 2025-07-27 18:33:39 +00:00 · 91a0436f7f
commit 91a0436f7f
parent 4fa09e31c8
6 changed files with 347 additions and 26 deletions
--- a/crawler/tasks/listing_tasks.py
+++ b/crawler/tasks/listing_tasks.py
@ -1,16 +1,17 @@
 import asyncio
 import importlib
+import itertools
 import logging
-from pathlib import Path
 from typing import Any
-from celery import Celery, Task
+from celery import Task
 from celery_app import app
-from models.listing import FurnishType, Listing, ListingType, QueryParameters
+from listing_processor import ListingProcessor
+from models.listing import Listing, ListingType, QueryParameters
+from rec.districts import get_districts
+from rec.query import listing_query
 from repositories.listing_repository import ListingRepository
 from database import engine
-from tasks.task_state import TaskStatus

-dump_listings_module = importlib.import_module("1_dump_listings")
 dump_images_module = importlib.import_module("3_dump_images")
 detect_floorplan_module = importlib.import_module("4_detect_floorplan")

@ -20,28 +21,61 @@ logger = logging.getLogger("uvicorn.error")
@app.task(bind=True, pydantic=True)
 def dump_listings_task(self: Task, parameters_json: str) -> dict[str, Any]:
    parsed_parameters = QueryParameters.model_validate_json(parameters_json)
-    asyncio.run(dump_listings_full(self, parsed_parameters))
-    return {"progress": 1}
+    asyncio.run(dump_listings_full(task=self, parameters=parsed_parameters))
+    self.update_state(state="Starting...", meta={"progress": 0})
+    return {"progress": 0}


-async def dump_listings_full(self: Task, parameters: QueryParameters) -> list[Listing]:
+async def async_dump_listings_task(parameters_json: str) -> dict[str, Any]:
+    parsed_parameters = QueryParameters.model_validate_json(parameters_json)
+    await dump_listings_full(task=Task(), parameters=parsed_parameters)
+    return {"progress": 0}
+
+
+async def dump_listings_full(
+    *, task: Task, parameters: QueryParameters
+) -> list[Listing]:
    """Fetches all listings, images as well as detects floorplans"""
-    self.update_state(state="FETCHING_LISTINGS", meta={"progress": 0.1})
    repository = ListingRepository(engine)
-    new_listings = await dump_listings_module.dump_listings(parameters, repository)
-    self.update_state(state="FETCHING_FLOORPLANS", meta={"progress": 0.3})
-    logger.debug(f"Upserted {len(new_listings)} new listings")
-    logger.debug("Starting to fetch floorplans")
-    await dump_images_module.dump_images(repository)
-    self.update_state(state="RUNNING_OCR_ON_FLOORPLANS", meta={"progress": 0.6})
-    logger.debug("Completed fetching floorplans")
-    logger.debug("Starting floorplan detection")
-    await detect_floorplan_module.detect_floorplan(repository)
-    logger.debug("Completed floorplan detection")
-    # refresh listings
-    listings = await repository.get_listings(parameters)  # this can be better
-    new_listings = [l for l in listings if l.id in new_listings]
-    return new_listings
+
+    missing_ids = await get_missing_listing_ids(parameters, repository)
+    logger.info(f"Found {len(missing_ids)} missing listings")
+
+    listing_processor = ListingProcessor(repository)
+    logger.info(f"Starting processing {len(missing_ids)} new listings")
+    return await dump_listings_and_monitor(
+        task=task, listing_processor=listing_processor, missing_ids=missing_ids
+    )
+
+
+async def dump_listings_and_monitor(
+    *, task: Task, listing_processor: ListingProcessor, missing_ids: set[int]
+) -> list[Listing]:
+    task_progress = {missing_id: 0 for missing_id in missing_ids}
+
+    async def process(missing_id: int) -> Listing | None:
+        listing = await listing_processor.process_listing(missing_id)
+        task_progress[missing_id] = 1
+        return listing
+
+    async def monitor() -> None:
+        while (progress := sum(task_progress.values())) < len(missing_ids):
+            progress_ratio = progress / len(missing_ids)
+            logger.error(
+                f"Task progress: {task_progress}% ({progress} out of {len(missing_ids)})"
+            )
+            task.update_state(
+                state=f"Progress: {progress_ratio * 100}% ({progress} out of {len(missing_ids)})",
+                meta={"progress": progress_ratio},
+            )
+            await asyncio.sleep(1)
+
+    processed_listings = await asyncio.gather(
+        *[process(id) for id in missing_ids], *[monitor()]
+    )
+    filtered_listings = [l for l in processed_listings if l is not None]
+
+    return filtered_listings


@app.on_after_finalize.connect
@ -57,5 +91,105 @@ def setup_periodic_tasks(sender, **kwargs):
                max_price=4000,
            ).model_dump_json()
        ),
-        name='Daily dump of interesting rent listings',
+        name="Daily dump of interesting rent listings",
    )
+
+
+async def get_missing_listing_ids(
+    parameters: QueryParameters,
+    repository: ListingRepository,
+) -> set[int]:
+    semaphore = asyncio.Semaphore(5)  # if too high, rightmove drops connections
+    districts = await get_valid_districts_to_scrape(parameters.district_names)
+    json_responses: list[list[dict[str, Any]]] = await asyncio.gather(
+        *[
+            _fetch_listings_with_semaphore(semaphore, parameters, district)
+            for district in districts.keys()
+        ],
+    )
+    json_responses_flat = list(itertools.chain.from_iterable(json_responses))
+    logger.debug(f"Total listings fetched {len(json_responses_flat)}")
+
+    identifiers: set[int] = set()
+    for response_json in json_responses_flat:
+        if response_json == {}:
+            continue
+        if response_json["totalAvailableResults"] == 0:
+            continue
+        for property in response_json["properties"]:
+            identifier = property["identifier"]
+            identifiers.add(identifier)
+
+    # if listing is already in db, do not fetch details again
+    all_listing_ids = {l.id for l in await repository.get_listings()}
+    missing_ids = all_listing_ids - identifiers
+    return missing_ids
+
+
+async def get_valid_districts_to_scrape(
+    district_names: set[str] | None,
+) -> dict[str, str]:
+    if district_names:
+        districts = {
+            district: locid
+            for district, locid in get_districts().items()
+            if district in district_names
+        }
+    else:
+        districts = get_districts()
+    return districts
+
+
+async def _fetch_listings_with_semaphore(
+    semaphore: asyncio.Semaphore,
+    parameters: QueryParameters,
+    district: str,
+) -> list[dict[str, Any]]:
+    result = []
+    # split the price in N bands to avoid the 1.5k capping by rightmove
+    # basically instead of 1 query with price between 1k and 5k that is capped at 1500 results
+    # we do 10 queries each with an increment in price range so we send more queries but each
+    # has a smaller chance of returning more than 1.5k results
+
+    number_of_steps = 1
+    price_step = parameters.max_price // number_of_steps
+
+    for step in range(number_of_steps):
+        min_price = step * price_step
+        max_price = (step + 1) * price_step
+        logger.debug(
+            f"Step {step} of {number_of_steps} with {min_price=} and {max_price=}"
+        )
+
+        for num_bedrooms in range(parameters.min_bedrooms, parameters.max_bedrooms + 1):
+            for page_id in range(
+                1,
+                3,  # seems like all searches stop at 1500 entries (page_id * page_size)
+            ):
+                logger.debug(f"Processing {page_id=} for {district=}")
+
+                async with semaphore:
+                    try:
+                        listing_query_result = await listing_query(
+                            page=page_id,
+                            channel=parameters.listing_type,
+                            # min_bedrooms=parameters.min_bedrooms,
+                            # max_bedrooms=parameters.max_bedrooms,
+                            min_bedrooms=num_bedrooms,
+                            max_bedrooms=num_bedrooms,
+                            radius=parameters.radius,
+                            min_price=min_price,
+                            max_price=max_price,
+                            district=district,
+                            page_size=parameters.page_size,
+                            max_days_since_added=parameters.max_days_since_added,
+                            furnish_types=parameters.furnish_types or [],
+                        )
+
+                    except Exception as e:
+                        if "GENERIC_ERROR" in str(e):  # Too big page id
+                            logger.debug(f"Max page id for {district=}: {page_id-1}")
+                            break
+                        raise e
+                result.append(listing_query_result)
+    return result