# wrongmove/crawler/rec/floorplan.py

import logging
import re
from pathlib import Path
from typing import Any
from PIL import Image
import cv2
import numpy as np

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Plausibility bounds used by extract_total_sqm(): only extracted values
# strictly between MIN_SQM and MAX_SQM (both exclusive) are accepted.
MIN_SQM = 30
MAX_SQM = 160


def inference(image_path: str | Path) -> tuple[str, Any]:
    """Run the ``google/deplot`` Pix2Struct model on a floor plan image.

    The processor and model are loaded via ``from_pretrained`` on every call.

    Args:
        image_path: Path to the image file to analyse.

    Returns:
        A ``(output, predictions)`` tuple: the first generated sequence
        decoded to text with special tokens skipped, and the raw result of
        ``model.generate``.
    """
    # Imported lazily so that importing this module does not require transformers.
    from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration

    question = "How many living rooms are displayed on this floor plan?"  # not sure if it even has an effect

    # Image.open keeps the underlying file open until the image is closed, so
    # everything that reads the image happens inside the context manager.
    with Image.open(image_path) as image:
        processor = Pix2StructProcessor.from_pretrained("google/deplot")
        model = Pix2StructForConditionalGeneration.from_pretrained("google/deplot")
        inputs = processor(images=image, text=question, return_tensors="pt")

    predictions = model.generate(**inputs, max_new_tokens=512)
    output = processor.decode(predictions[0], skip_special_tokens=True)

    return output, predictions


def extract_total_sqm(input_str: str) -> float | None:
    """Extract the largest plausible square-metre figure from free text.

    Matching is case-insensitive. A number (optionally with a decimal part)
    must be followed by one of the unit spellings ``sqm``, ``sq m``, ``sq.m``
    or ``sq. m``. Values outside the open interval ``(MIN_SQM, MAX_SQM)`` are
    discarded.

    Note: can be used on the output of inference() to extract sqm from model
    predictions.

    Args:
        input_str: Text to search, e.g. OCR output.

    Returns:
        The largest accepted value, or ``None`` when nothing qualifies.
    """
    # The dot after "sq" is escaped: unescaped it would match any character,
    # so strings such as "sqxm" would wrongly be treated as a unit.
    sqm_regex = r"(\d+\.?\d*) ?sq\.? ?m"
    candidates = (float(number) for number in re.findall(sqm_regex, input_str.lower()))
    plausible = (value for value in candidates if MIN_SQM < value < MAX_SQM)
    return max(plausible, default=None)


def improve_img_for_ocr(img: Image.Image) -> Image.Image:
    """Return a preprocessed copy of *img* intended for a second OCR attempt.

    The image is converted to 8-bit grayscale, upscaled by a factor of 1.2
    with bicubic interpolation, and binarised with a Gaussian adaptive
    threshold.
    """
    scale = 1.2
    block_size = 11  # neighbourhood size passed to the adaptive threshold
    offset = 2  # constant passed as the final adaptiveThreshold argument

    gray = np.array(img.convert("L"))
    upscaled = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
    binarised = cv2.adaptiveThreshold(
        upscaled,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        block_size,
        offset,
    )
    return Image.fromarray(binarised)


def calculate_ocr(image_path: str | Path) -> tuple[float | None, str]:
    import pytesseract

    path = Path(image_path)
    if not path.exists():
        raise FileNotFoundError(f"Image not found: {image_path}")

    img = Image.open(path)
    text = pytesseract.image_to_string(img)
    estimated_sqm = extract_total_sqm(text)
    if estimated_sqm is None:
        improved_img = improve_img_for_ocr(img)
        text2 = pytesseract.image_to_string(improved_img)
        estimated_sqm2 = extract_total_sqm(text2)
        logger.debug(f"before: {estimated_sqm} after: {estimated_sqm2} - {image_path}")
        return estimated_sqm2, text2

    return estimated_sqm, text