adding tesseract OCR for floorplan detection

This commit is contained in:
Kadir 2024-03-10 22:32:34 +00:00
parent 508aa02812
commit d108bf11ee
8 changed files with 153 additions and 29 deletions

View file

@ -4,4 +4,6 @@ from tqdm import tqdm
# Run the OCR-based floor-area estimation for every known listing.
listings = Listing.get_all_listings()
for listing in tqdm(listings):
    # tqdm.write keeps the progress bar intact while printing the identifier.
    tqdm.write(str(listing.identifier))
    # listing.calculate_sqm_model() relies on the google/deplot model, which is
    # too slow here; the tesseract OCR path is used instead.
    listing.calculate_sqm_ocr()

View file

@ -30,8 +30,11 @@ class Listing():
def path_detail_json(self) -> pathlib.Path:
    """Return ``detail.json`` under the path given by ``path_listing()``."""
    listing_dir = self.path_listing()
    return listing_dir / 'detail.json'
def path_floorplan_json(self) -> pathlib.Path:
    """Return ``floorplan.json`` under the path given by ``path_listing()``."""
    listing_dir = self.path_listing()
    return listing_dir / 'floorplan.json'
def path_floorplan_model_json(self) -> pathlib.Path:
    """Return ``floorplan_model.json`` under the path given by ``path_listing()``."""
    listing_dir = self.path_listing()
    return listing_dir / 'floorplan_model.json'
def path_floorplan_ocr_json(self) -> pathlib.Path:
    """Return ``floorplan_ocr.json`` under the path given by ``path_listing()``."""
    listing_dir = self.path_listing()
    return listing_dir / 'floorplan_ocr.json'
def path_pic_folder(self) -> pathlib.Path:
    """Return the ``pics`` entry under the path given by ``path_listing()``."""
    listing_dir = self.path_listing()
    return listing_dir / 'pics'
@ -51,36 +54,58 @@ class Listing():
# todo add check if return is image
return images
def calculate_sqm_model(self):
    """Estimate the floor area of every floorplan via ``floorplan.calculate_model``.

    One record per path returned by ``list_floorplans()`` is collected and the
    whole list is written as JSON to ``path_floorplan_model_json()``.

    Returns:
        The largest non-``None`` ``estimated_sqm`` among the records, or
        ``None`` when there are no floorplans or no estimate was produced.
    """
    objs = []
    for floorplan_path in self.list_floorplans():
        estimated_sqm, model_output, predictions = floorplan.calculate_model(floorplan_path)
        objs.append({
            # str() keeps the entry JSON-serialisable (presumably a pathlib.Path).
            'floorplan_path': str(floorplan_path),
            'estimated_sqm': estimated_sqm,
            'model_output': model_output,
            # The predictions are a tensor and cannot be serialised; keep the count only.
            'no_predictions': len(predictions),
        })
    with open(self.path_floorplan_model_json(), 'w') as f:
        json.dump(objs, f)
    # Keep only real estimates; ``default`` avoids a ValueError when none exist.
    return max(
        (o['estimated_sqm'] for o in objs if o['estimated_sqm'] is not None),
        default=None,
    )
@property
def sqm_model(self, recalculate=True):
    """Largest model-estimated floor area stored for this listing.

    Reads the records from ``path_floorplan_model_json()``; when that file
    does not exist yet it is first produced by ``calculate_sqm_model()``.

    Note: this is a property, so attribute access never passes
    ``recalculate`` and the default (``True``) is always used.

    Returns:
        The largest non-``None`` ``estimated_sqm`` in the stored records, or
        ``None`` when no record carries an estimate.
    """
    if recalculate and not self.path_floorplan_model_json().exists():
        self.calculate_sqm_model()
    # Read the same file that was checked/generated above.
    with open(self.path_floorplan_model_json()) as f:
        objs = json.load(f)
    # Keep only real estimates; ``default`` avoids a ValueError when none exist.
    return max(
        (o['estimated_sqm'] for o in objs if o['estimated_sqm'] is not None),
        default=None,
    )
def calculate_sqm_ocr(self):
    """Run ``floorplan.calculate_ocr`` on each floorplan and store the results.

    One record (path, estimated area, recognised text) is built per path
    returned by ``list_floorplans()``; the list is written as JSON to
    ``path_floorplan_ocr_json()``. Nothing is returned.
    """
    def _record(path):
        # Build the JSON record for a single floorplan image.
        sqm, ocr_text = floorplan.calculate_ocr(path)
        return {
            'floorplan_path': str(path),
            'estimated_sqm': sqm,
            'text': ocr_text,
        }

    records = [_record(path) for path in self.list_floorplans()]
    with open(self.path_floorplan_ocr_json(), 'w') as out:
        json.dump(records, out)
@property
def sqm_ocr(self, recalculate=True):
    """Largest OCR-estimated floor area stored for this listing.

    Reads the records from ``path_floorplan_ocr_json()``; when that file does
    not exist yet it is first produced by ``calculate_sqm_ocr()``.

    Note: this is a property, so attribute access never passes
    ``recalculate`` and the default (``True``) is always used.

    Returns:
        The largest non-``None`` ``estimated_sqm`` in the stored records, or
        ``None`` when no record carries an estimate.
    """
    if recalculate and not self.path_floorplan_ocr_json().exists():
        self.calculate_sqm_ocr()
    with open(self.path_floorplan_ocr_json()) as f:
        objs = json.load(f)
    # Keep only real estimates; ``default`` avoids a ValueError when none exist.
    return max(
        (o['estimated_sqm'] for o in objs if o['estimated_sqm'] is not None),
        default=None,
    )
if __name__ == '__main__':
    # Manual smoke check: show the floorplans found for the first listing.
    listings = Listing.get_all_listings()
    print(listings[0].list_floorplans())

17
crawler/poetry.lock generated
View file

@ -712,6 +712,21 @@ tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "pa
typing = ["typing-extensions"]
xmp = ["defusedxml"]
[[package]]
name = "pytesseract"
version = "0.3.10"
description = "Python-tesseract is a python wrapper for Google's Tesseract-OCR"
optional = false
python-versions = ">=3.7"
files = [
{file = "pytesseract-0.3.10-py3-none-any.whl", hash = "sha256:8f22cc98f765bf13517ead0c70effedb46c153540d25783e04014f28b55a5fc6"},
{file = "pytesseract-0.3.10.tar.gz", hash = "sha256:f1c3a8b0f07fd01a1085d451f5b8315be6eec1d5577a6796d46dc7a62bd4120f"},
]
[package.dependencies]
packaging = ">=21.3"
Pillow = ">=8.0.0"
[[package]]
name = "pyyaml"
version = "6.0.1"
@ -1455,4 +1470,4 @@ zstd = ["zstandard (>=0.18.0)"]
[metadata]
lock-version = "2.0"
python-versions = ">3.11"
content-hash = "29e82860db598c8356b279e9287d0a96c1093724b0057ddcb374ae8f71881f3d"
content-hash = "30b432cae95b5a4facbca747f698614e256df27cb0f8b1c96608bba61eca1f0c"

View file

@ -15,6 +15,7 @@ pillow = "^10.2.0"
torch = "^2.2.1"
numpy = "^1.26.4"
transformers = "^4.38.2"
pytesseract = "^0.3.10"
[tool.poetry.dev-dependencies]

View file

@ -1,6 +1,7 @@
import re
from PIL import Image
from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration
import pytesseract
def inference(image_path):
image = Image.open(image_path)
@ -24,7 +25,14 @@ def extract_total_sqm(deplot_input_str):
return max(sqms)
def calculate_model(image_path):
    """Estimate the floor area of a floorplan image using the model pipeline.

    Runs ``inference`` on the image and passes its text output to
    ``extract_total_sqm``.

    Returns:
        A tuple ``(estimated_sqm, output, predictions_tensor)`` where
        ``output`` and ``predictions_tensor`` are the two values returned by
        ``inference``.
    """
    output, predictions_tensor = inference(image_path)
    # The model's text output must be passed on for the area to be parsed.
    estimated_sqm = extract_total_sqm(output)
    return estimated_sqm, output, predictions_tensor
def calculate_ocr(image_path):
    """Estimate the floor area of a floorplan image using tesseract OCR.

    The image is read with Pillow, converted to text by
    ``pytesseract.image_to_string`` and that text is handed to
    ``extract_total_sqm``.

    Returns:
        A tuple ``(estimated_sqm, text)`` with the parsed area and the raw
        OCR text.
    """
    # The context manager releases the underlying file once OCR is done.
    with Image.open(image_path) as img:
        text = pytesseract.image_to_string(img)
    estimated_sqm = extract_total_sqm(text)
    return estimated_sqm, text