From 325823e631e583f2693e0fbd20fc14b8d6f56cb2 Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Sat, 7 Jun 2025 14:30:32 +0000
Subject: [PATCH] refactor detect floorplan to use model listings

---
 crawler/4_detect_floorplan.py | 58 ++++++++++++++++++++++++++++-------
 crawler/main.py               |  8 +++----
 crawler/rec/floorplan.py      |  2 +-
 3 files changed, 52 insertions(+), 16 deletions(-)

diff --git a/crawler/4_detect_floorplan.py b/crawler/4_detect_floorplan.py
index 09078a7..b15902f 100644
--- a/crawler/4_detect_floorplan.py
+++ b/crawler/4_detect_floorplan.py
@@ -1,21 +1,57 @@
 import asyncio
-from data_access import Listing
+from models import Listing
+from rec import floorplan
+from repositories.listing_repository import ListingRepository
 from tqdm.asyncio import tqdm
 import multiprocessing
 
 
-async def detect_floorplan(listing_paths: list[str]):
-    listings = Listing.get_all_listings(listing_paths)
-    cpu_count = multiprocessing.cpu_count() // 4
+async def detect_floorplan(repository: ListingRepository):
+    """Estimate missing listing areas from floorplan images and persist them.
+
+    Loads listings via ``repository.get_listings()``, runs
+    ``_calculate_sqm_ocr`` on each one concurrently, and upserts only the
+    listings that received a new ``square_meters`` value.
+    """
+    listings = await repository.get_listings()
+    # Keep at least one slot: with fewer than four cores the integer division
+    # yields 0, and a semaphore created with 0 would block every task forever.
+    cpu_count = max(1, multiprocessing.cpu_count() // 4)
     semaphore = asyncio.Semaphore(cpu_count)
 
-    await tqdm.gather(
-        *[_detect_floorplan_with_semaphore(listing, semaphore) for listing in listings]
-    )
+    updated_listings = [
+        listing
+        for listing in await tqdm.gather(
+            *[_calculate_sqm_ocr(listing, semaphore) for listing in listings]
+        )
+        if listing is not None
+    ]
+    await repository.upsert_listings(updated_listings)
 
 
-async def _detect_floorplan_with_semaphore(
+async def _calculate_sqm_ocr(
     listing: Listing, semaphore: asyncio.Semaphore
-):
-    async with semaphore:
-        return await listing.calculate_sqm_ocr(recalculate=False)
+) -> Listing | None:
+    """Set ``listing.square_meters`` from OCR of its floorplan images.
+
+    Returns the updated listing, or ``None`` when the listing already has an
+    area or no floorplan produced an estimate. When several floorplans yield
+    an estimate, the largest one is used.
+    """
+    if listing.square_meters is not None:
+        return None
+    sqms = []
+    for floorplan_path in listing.floorplan_image_paths:
+        # Hold the semaphore only while OCR runs in a worker thread, so the
+        # semaphore bounds how many OCR jobs are in flight at once.
+        async with semaphore:
+            estimated_sqm, _ = await asyncio.to_thread(
+                floorplan.calculate_ocr, floorplan_path
+            )
+        if estimated_sqm is not None:
+            sqms.append(estimated_sqm)
+    max_sqm = max(sqms, default=None)
+    if max_sqm is None:
+        return None
+    listing.square_meters = max_sqm
+    return listing
diff --git a/crawler/main.py b/crawler/main.py
index a9942d7..931518b 100644
--- a/crawler/main.py
+++ b/crawler/main.py
@@ -165,7 +165,7 @@ def dump_listings(
 @click.pass_context
 def dump_images(ctx: click.core.Context):
     data_dir = ctx.obj["data_dir"]
-    click.echo(f"Running dump_images stored in {data_dir}")
+    click.echo(f"Running dump_images for listings stored in {engine.url}")
     repository = ListingRepository(engine=engine)
     asyncio.run(dump_images_module.dump_images(repository, image_base_path=data_dir))
 
@@ -174,9 +174,9 @@ def dump_images(ctx: click.core.Context):
 @click.pass_context
 def detect_floorplan(ctx: click.core.Context):
     data_dir = ctx.obj["data_dir"]
-    click.echo(f"Running detect_floorplan in {data_dir}")
-    listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json")))
-    asyncio.run(detect_floorplan_module.detect_floorplan(listing_paths))
+    click.echo(f"Running detect_floorplan for listings stored in {engine.url}")
+    repository = ListingRepository(engine=engine)
+    asyncio.run(detect_floorplan_module.detect_floorplan(repository))
 
 
 @cli.command()
diff --git a/crawler/rec/floorplan.py b/crawler/rec/floorplan.py
index bf9aa66..657e8a6 100644
--- a/crawler/rec/floorplan.py
+++ b/crawler/rec/floorplan.py
@@ -44,7 +44,7 @@ def improve_img_for_ocr(img: Image):
     return Image.fromarray(thresh)
 
 
-def calculate_ocr(image_path):
+def calculate_ocr(image_path) -> tuple[float | None, str]:
     import pytesseract
 
     img = Image.open(image_path)