# wrongmove/crawler/rec/floorplan.py

import functools
import re

from PIL import Image
from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration

@functools.lru_cache(maxsize=1)
def _load_deplot():
    """Load the 'google/deplot' processor and model once and reuse them."""
    processor = Pix2StructProcessor.from_pretrained('google/deplot')
    model = Pix2StructForConditionalGeneration.from_pretrained('google/deplot')
    return processor, model


def inference(image_path):
    """Run the DePlot model on a floor plan image and return its text output.

    Args:
        image_path: Path (or file object) accepted by ``PIL.Image.open``.

    Returns:
        A ``(output, predictions)`` tuple: ``output`` is the decoded text of
        the first generated sequence (special tokens removed) and
        ``predictions`` is the raw value returned by ``model.generate``.
    """
    question = "How many living rooms are displayed on this floor plan?" # not sure if it even has an effect

    # The context manager closes the underlying file once the processor has
    # turned the image into model inputs, instead of leaking the handle.
    with Image.open(image_path) as image:
        # Loaded after the image is opened so a bad path still fails before
        # any (slow) model loading; later calls reuse the cached pair.
        processor, model = _load_deplot()
        inputs = processor(images=image, text=question, return_tensors="pt")

    predictions = model.generate(**inputs, max_new_tokens=512)
    output = processor.decode(predictions[0], skip_special_tokens=True)

    return output, predictions


# Decimal figures: the original, deliberately loose unit match is kept as-is.
# A bare "m" is accepted (presumably to catch "m2"/"m²" -- TODO confirm), which
# means linear dimensions such as "3.5m" are collected too.
_DECIMAL_AREA_RE = re.compile(r'(\d+\.\d*) ?(?:sqm|sq.m|sq m|m)')

# Integer figures: only explicit area units are accepted, so text such as
# "2 master bedrooms" is not read as "2 m". The lookbehind stops the pattern
# from matching the fractional tail of a decimal number (the "5" in "12.5 sqm").
_INTEGER_AREA_RE = re.compile(r'(?<![\d.])(\d+) ?(?:sq\.? ?m|m2|m\u00b2)')


def extract_total_sqm(deplot_input_str):
    """Return the largest square-metre figure found in *deplot_input_str*.

    The text is lower-cased and scanned for numbers followed by a
    square-metre unit ("sqm", "sq.m", "sq m", and for whole numbers also
    "m2"/"m²"). Both decimal ("72.4 sq m") and whole-number ("85 sqm")
    figures are recognised.

    Args:
        deplot_input_str: Text to scan; ``None`` or an empty string is allowed.

    Returns:
        The maximum matched value as a ``float``, or ``None`` when the text
        is empty or contains no matching figure.
    """
    if not deplot_input_str:
        return None

    text = deplot_input_str.lower()
    found = _DECIMAL_AREA_RE.findall(text) + _INTEGER_AREA_RE.findall(text)
    if not found:
        return None

    # The largest figure is taken as the total; individual room sizes and
    # stray linear dimensions are expected to be smaller.
    return max(float(value) for value in found)


def calculate(image_path):
    """Estimate the floor area shown on a floor plan image.

    Runs ``inference`` on the image and extracts the largest square-metre
    figure from the decoded model output.

    Args:
        image_path: Path to the floor plan image, passed to ``inference``.

    Returns:
        A ``(estimated_sqm, output, predictions_tensor)`` tuple, where
        ``estimated_sqm`` is the value from ``extract_total_sqm`` (``None``
        if no figure was found) and the other two items are the values
        returned by ``inference``.
    """
    output, predictions_tensor = inference(image_path)
    # The decoded text must be passed in; calling extract_total_sqm() with no
    # argument raised TypeError on every invocation.
    estimated_sqm = extract_total_sqm(output)
    return estimated_sqm, output, predictions_tensor