From b1e0ed170bdd76bbce027e09e6e62281f4468d8d Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Sat, 17 May 2025 22:58:35 +0000
Subject: [PATCH] detect floorplan using asyncio

---
Review notes (ignored by git am; not part of the commit message):
- The semaphore limit is computed with max(1, cpu_count() // 4) instead of
  cpu_count() / 4, which produced a float (below 1 on hosts with fewer than
  four CPUs) as the asyncio.Semaphore counter.
- crawler/main.py now calls asyncio.run(), but this patch changes no import
  there; presumably `import asyncio` already exists in main.py -- TODO confirm.
- main() in crawler/4_detect_floorplan.py lies outside the hunk; if it calls
  detect_floorplan() directly it now needs asyncio.run() -- TODO confirm.
- Line breaks, indentation and blank context lines were restored from the
  hunk line counts; verify them against the tree before applying. The
  post-image blob id on the first "index" line predates the edit above.

 crawler/4_detect_floorplan.py | 23 +++++++++++++++++------
 crawler/data_access.py        |  7 +++++--
 crawler/main.py               |  2 +-
 3 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/crawler/4_detect_floorplan.py b/crawler/4_detect_floorplan.py
index 525dc55..bcb1c71 100644
--- a/crawler/4_detect_floorplan.py
+++ b/crawler/4_detect_floorplan.py
@@ -1,15 +1,26 @@
+import asyncio
 import pathlib
 from data_access import Listing
-from tqdm import tqdm
+from tqdm.asyncio import tqdm
+import multiprocessing
 
 
-def detect_floorplan(listing_paths: list[str]):
+async def detect_floorplan(listing_paths: list[str]):
     listings = Listing.get_all_listings(listing_paths)
+    # Semaphore needs a whole-number limit of at least 1 ("/" would give a float).
+    cpu_count = max(1, multiprocessing.cpu_count() // 4)
+    semaphore = asyncio.Semaphore(cpu_count)
 
-    for listing in tqdm(listings):
-        tqdm.write(str(listing.identifier))
-        # listing.calculate_sqm_model()  # using google/deplot model. Too slow, rather use tesseract
-        listing.calculate_sqm_ocr(recalculate=False)
+    await tqdm.gather(
+        *[_detect_floorplan_with_semaphore(listing, semaphore) for listing in listings]
+    )
+
+
+async def _detect_floorplan_with_semaphore(
+    listing: Listing, semaphore: asyncio.Semaphore
+):
+    async with semaphore:
+        return await listing.calculate_sqm_ocr(recalculate=False)
 
 
 def main():
diff --git a/crawler/data_access.py b/crawler/data_access.py
index 53c4791..1955255 100644
--- a/crawler/data_access.py
+++ b/crawler/data_access.py
@@ -1,3 +1,4 @@
+import asyncio
 from dataclasses import dataclass
 import json
 import pathlib
@@ -135,13 +136,15 @@ class Listing:
         )  # filter out Nones
         return max_sqm
 
-    def calculate_sqm_ocr(self, recalculate=True):
+    async def calculate_sqm_ocr(self, recalculate=True):
         if not recalculate and self.path_floorplan_ocr_json().exists():
             return
 
         objs = []
         for floorplan_path in self.list_floorplans():
-            estimated_sqm, model_output = floorplan.calculate_ocr(floorplan_path)
+            estimated_sqm, model_output = await asyncio.to_thread(
+                floorplan.calculate_ocr, floorplan_path
+            )
             objs.append(
                 {
                     "floorplan_path": str(floorplan_path),
diff --git a/crawler/main.py b/crawler/main.py
index 23faeea..570ef31 100644
--- a/crawler/main.py
+++ b/crawler/main.py
@@ -135,7 +135,7 @@ def detect_floorplan(ctx: click.core.Context):
     data_dir = ctx.obj['data_dir']
     click.echo(f'Running detect_floorplan in {data_dir}')
     listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json")))
-    detect_floorplan_module.detect_floorplan(listing_paths)
+    asyncio.run(detect_floorplan_module.detect_floorplan(listing_paths))
 
 
 @cli.command()