import functools
import re
from typing import Optional

import pytesseract
from PIL import Image
from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor

# Hugging Face checkpoint used for both the processor and the model.
_DEPLOT_CHECKPOINT = "google/deplot"

# Text prompt passed to the processor together with the image.
# NOTE: not sure if it even has an effect on the generated output.
_DEPLOT_QUESTION = "How many living rooms are displayed on this floor plan?"

# A number (integer or decimal) followed by a square-metre unit written as
# "sqm", "sq m", "sq.m" or "sq. m".  Applied to lower-cased text.
_SQM_PATTERN = re.compile(r"(\d+(?:\.\d*)?)\s*sq\.?\s*m")


@functools.lru_cache(maxsize=1)
def _load_deplot():
    """Load the DePlot processor and model once and reuse them afterwards."""
    processor = Pix2StructProcessor.from_pretrained(_DEPLOT_CHECKPOINT)
    model = Pix2StructForConditionalGeneration.from_pretrained(_DEPLOT_CHECKPOINT)
    return processor, model


def inference(image_path):
    """Run the DePlot model on an image and decode its text output.

    Args:
        image_path: Anything accepted by ``PIL.Image.open``.

    Returns:
        A ``(output, predictions)`` tuple: the decoded text of the first
        generated sequence, and the raw value returned by ``model.generate``.
    """
    # Open the image first so an unreadable path fails before the (expensive)
    # model load; the context manager releases the file handle afterwards.
    with Image.open(image_path) as image:
        processor, model = _load_deplot()
        inputs = processor(images=image, text=_DEPLOT_QUESTION, return_tensors="pt")

    predictions = model.generate(**inputs, max_new_tokens=512)
    output = processor.decode(predictions[0], skip_special_tokens=True)
    return output, predictions


def extract_total_sqm(deplot_input_str: str) -> Optional[float]:
    """Return the largest square-metre figure mentioned in a text.

    The text is searched case-insensitively for numbers followed by a
    square-metre unit ("sqm", "sq m", "sq.m", "sq. m").  Both integer and
    decimal numbers are recognised.

    Args:
        deplot_input_str: Text to search (model output or OCR text).

    Returns:
        The maximum matched value as a float, or ``None`` when the text
        contains no square-metre figure.
    """
    matches = _SQM_PATTERN.findall(deplot_input_str.lower())
    if not matches:
        return None
    # The largest area found is reported as the total.
    return max(float(number) for number in matches)


def calculate_model(image_path):
    """Estimate the area of a floor plan image using the DePlot model.

    Returns:
        A ``(estimated_sqm, output, predictions_tensor)`` tuple, where
        ``estimated_sqm`` is ``None`` if no area was found in the model output.
    """
    output, predictions_tensor = inference(image_path)
    estimated_sqm = extract_total_sqm(output)
    return estimated_sqm, output, predictions_tensor


def calculate_ocr(image_path):
    """Estimate the area of a floor plan image using Tesseract OCR.

    Returns:
        An ``(estimated_sqm, text)`` tuple, where ``estimated_sqm`` is ``None``
        if no area was found in the recognised text.
    """
    # Close the image file as soon as OCR has finished with it.
    with Image.open(image_path) as img:
        text = pytesseract.image_to_string(img)
    estimated_sqm = extract_total_sqm(text)
    return estimated_sqm, text