import logging
import re
from pathlib import Path
from typing import Any

import cv2
import numpy as np
from PIL import Image

logger = logging.getLogger(__name__)

# Exclusive bounds applied by extract_total_sqm(): extracted values at or
# outside this window are discarded before the maximum is taken.
MIN_SQM = 30
MAX_SQM = 160

# A number (optional decimal part) followed by "sq m", "sqm", "sq. m" or
# "sq.m". The dot is escaped so that it only matches a literal period instead
# of an arbitrary character (which would accept e.g. "sqkm").
_SQM_PATTERN = re.compile(r"(\d+\.?\d*) ?sq\.? ?m")


def inference(image_path: str | Path) -> tuple[str, Any]:
    """Run the ``google/deplot`` Pix2Struct model on an image.

    The processor and model are loaded from the pretrained checkpoint on
    every call.

    Args:
        image_path: Path of the image file to feed to the model.

    Returns:
        A ``(output, predictions)`` tuple: the decoded text of the first
        generated sequence and the raw result of ``model.generate``.
    """
    # Imported lazily so that importing this module does not require
    # transformers to be installed.
    from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor

    question = "How many living rooms are displayed on this floor plan?"  # not sure if it even has an effect

    # Open the image first so an unreadable path fails before the model is
    # loaded; the context manager closes the file handle afterwards.
    with Image.open(image_path) as image:
        processor = Pix2StructProcessor.from_pretrained("google/deplot")
        model = Pix2StructForConditionalGeneration.from_pretrained("google/deplot")
        inputs = processor(images=image, text=question, return_tensors="pt")

    predictions = model.generate(**inputs, max_new_tokens=512)
    output = processor.decode(predictions[0], skip_special_tokens=True)
    return output, predictions


def extract_total_sqm(input_str: str) -> float | None:
    """Extract the largest plausible square-metre figure from text.

    Can be used on the output of ``inference()`` or on OCR text. The input is
    lower-cased, every number followed by a "sq m" style unit is collected,
    and values that are not strictly between ``MIN_SQM`` and ``MAX_SQM`` are
    dropped.

    Args:
        input_str: Text to search.

    Returns:
        The maximum remaining value, or ``None`` when nothing qualifies.
    """
    values = (float(number) for number in _SQM_PATTERN.findall(input_str.lower()))
    plausible = [value for value in values if MIN_SQM < value < MAX_SQM]
    return max(plausible, default=None)


def improve_img_for_ocr(img: Image.Image) -> Image.Image:
    """Return a binarised, slightly enlarged copy of ``img`` for OCR.

    The image is converted to greyscale, scaled by a factor of 1.2 with cubic
    interpolation, and thresholded with OpenCV's Gaussian adaptive threshold.

    Args:
        img: Source image.

    Returns:
        A new PIL image holding the thresholded result.
    """
    grayscale_image = np.array(img.convert("L"))
    grayscale_image = cv2.resize(
        grayscale_image, None, fx=1.2, fy=1.2, interpolation=cv2.INTER_CUBIC
    )
    # Positional arguments after the method/type flags are blockSize=11, C=2.
    thresh = cv2.adaptiveThreshold(
        grayscale_image,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        11,
        2,
    )
    return Image.fromarray(thresh)


def calculate_ocr(image_path: str | Path) -> tuple[float | None, str]:
    """Estimate the floor area of an image via Tesseract OCR.

    OCR is first run on the image as-is. If no plausible area is found, the
    image is preprocessed with ``improve_img_for_ocr`` and OCR is run again;
    in that case the second attempt's result is returned even if it is also
    ``None``.

    Args:
        image_path: Path of the image file.

    Returns:
        A ``(estimated_sqm, text)`` tuple with the extracted area (or
        ``None``) and the OCR text it was extracted from.

    Raises:
        FileNotFoundError: If ``image_path`` does not exist.
    """
    # Imported lazily so that importing this module does not require
    # pytesseract to be installed.
    import pytesseract

    path = Path(image_path)
    if not path.exists():
        raise FileNotFoundError(f"Image not found: {image_path}")

    # The context manager guarantees the image file handle is released.
    with Image.open(path) as img:
        text = pytesseract.image_to_string(img)
        estimated_sqm = extract_total_sqm(text)
        if estimated_sqm is not None:
            return estimated_sqm, text

        # First pass found nothing usable: retry on the preprocessed image.
        improved_img = improve_img_for_ocr(img)
        text2 = pytesseract.image_to_string(improved_img)

    estimated_sqm2 = extract_total_sqm(text2)
    logger.debug(
        "before: %s after: %s - %s", estimated_sqm, estimated_sqm2, image_path
    )
    return estimated_sqm2, text2