adding tesseract OCR for floorplan detection

2024-03-10 22:32:34 +00:00 · 2024-03-10 22:32:34 +00:00 · d108bf11ee
commit d108bf11ee
parent 508aa02812
8 changed files with 153 additions and 29 deletions
--- a/crawler/4_detect_floorplan.py
+++ b/crawler/4_detect_floorplan.py
@ -4,4 +4,6 @@ from tqdm import tqdm
 listings = Listing.get_all_listings()

 for listing in tqdm(listings):
-    tqdm.write(listing.calculate_sqm())
+    tqdm.write(str(listing.identifier))  # one progress line per listing; tqdm.write expects a str
+    # listing.calculate_sqm_model() # using google/deplot model. Too slow, rather use tesseract
+    listing.calculate_sqm_ocr()  # writes this listing's OCR results to its floorplan_ocr.json
--- a/crawler/data_access.py
+++ b/crawler/data_access.py
@ -30,8 +30,11 @@ class Listing():
    def path_detail_json(self) -> pathlib.Path:
        return self.path_listing() / 'detail.json'
    
-    def path_floorplan_json(self) -> pathlib.Path:
-        return self.path_listing() / 'floorplan.json'
+    def path_floorplan_model_json(self) -> pathlib.Path:  # cache file written by calculate_sqm_model()
+        return self.path_listing() / 'floorplan_model.json'
+    
+    def path_floorplan_ocr_json(self) -> pathlib.Path:  # cache file written by calculate_sqm_ocr()
+        return self.path_listing() / 'floorplan_ocr.json'
    
    def path_pic_folder(self) -> pathlib.Path:
        return self.path_listing() / 'pics'
@ -51,36 +54,58 @@ class Listing():
        # todo add check if return is image
        return images
    
-    def calculate_sqm(self):
+    def calculate_sqm_model(self):  # run the model on every floorplan and cache the results as JSON
        objs = []
        for floorplan_path in self.list_floorplans():
-            estimated_sqm, model_output, predictions = floorplan.calculate(floorplan_path)
+            estimated_sqm, model_output, predictions = floorplan.calculate_model(floorplan_path)
            objs.append({
-                'floorplan_path': floorplan_path,
+                'floorplan_path': str(floorplan_path),  # stored as str; presumably a Path that json.dump rejects - TODO confirm
                'estimated_sqm': estimated_sqm,
                'model_output': model_output,
                'no_predictions': len(predictions) # cant serialize the predictions itself since its a tensor
            })
        
-        with open(self.path_floorplan_json(), 'w') as f:
+        with open(self.path_floorplan_model_json(), 'w') as f:
            json.dump(objs, f)
-            
-        max_sqm = max([o['estimated_sqm'] for o in objs if o is None]) # filter out Nones
-        return max_sqm
    
    @property
-    def sqm(self, recalculate=True):
-        if recalculate and not self.path_floorplan_json().exists():
-            self.calculate_sqm()
-        
-        with open(self.path_floorplan_json()) as f:
-            objs = json.load(f)
-        
-        max_sqm = max([o['estimated_sqm'] for o in objs if o is None]) # filter out Nones
-        return max_sqm
+    def sqm_model(self, recalculate=True):
+        """Largest model-estimated sqm over all floorplans, or None if no estimate exists."""
+        if recalculate and not self.path_floorplan_model_json().exists():
+            self.calculate_sqm_model()  # first access: build the cache file
+        with open(self.path_floorplan_model_json()) as f:  # path_floorplan_json() no longer exists
+            objs = json.load(f)
+        # Keep real estimates only; the former `if o is None` test rejected every entry.
+        sqms = [o['estimated_sqm'] for o in objs if o['estimated_sqm'] is not None]
+        return max(sqms, default=None)
+    
+    def calculate_sqm_ocr(self):  # OCR every floorplan of this listing and cache the results as JSON
+        objs = []
+        for floorplan_path in self.list_floorplans():
+            estimated_sqm, model_output = floorplan.calculate_ocr(floorplan_path)  # (sqm estimate, raw OCR text)
+            objs.append({
+                'floorplan_path': str(floorplan_path),  # stored as str; presumably a Path that json.dump rejects - TODO confirm
+                'estimated_sqm': estimated_sqm,
+                'text': model_output,
+            })
        
+        with open(self.path_floorplan_ocr_json(), 'w') as f:
+            json.dump(objs, f)
+    
+    @property
+    def sqm_ocr(self, recalculate=True):
+        """Largest OCR-estimated sqm over all floorplans, or None if no estimate exists."""
+        if recalculate and not self.path_floorplan_ocr_json().exists():
+            self.calculate_sqm_ocr()  # first access: build the cache file
+        with open(self.path_floorplan_ocr_json()) as f:
+            objs = json.load(f)
+        # Keep real estimates only; the former `if o is None` test rejected every entry.
+        sqms = [o['estimated_sqm'] for o in objs if o['estimated_sqm'] is not None]
+        return max(sqms, default=None)
+    
+    
            
    
 if __name__ == '__main__':    
    listings = Listing.get_all_listings()
-    print(listings[0].list_floorplans())
+    print(listings[0].list_floorplans())
--- a/crawler/poetry.lock
+++ b/crawler/poetry.lock
@ -712,6 +712,21 @@ tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "pa
 typing = ["typing-extensions"]
 xmp = ["defusedxml"]

+[[package]]
+name = "pytesseract"
+version = "0.3.10"
+description = "Python-tesseract is a python wrapper for Google's Tesseract-OCR"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "pytesseract-0.3.10-py3-none-any.whl", hash = "sha256:8f22cc98f765bf13517ead0c70effedb46c153540d25783e04014f28b55a5fc6"},
+    {file = "pytesseract-0.3.10.tar.gz", hash = "sha256:f1c3a8b0f07fd01a1085d451f5b8315be6eec1d5577a6796d46dc7a62bd4120f"},
+]
+
+[package.dependencies]
+packaging = ">=21.3"
+Pillow = ">=8.0.0"
+
 [[package]]
 name = "pyyaml"
 version = "6.0.1"
@ -1455,4 +1470,4 @@ zstd = ["zstandard (>=0.18.0)"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">3.11"
-content-hash = "29e82860db598c8356b279e9287d0a96c1093724b0057ddcb374ae8f71881f3d"
+content-hash = "30b432cae95b5a4facbca747f698614e256df27cb0f8b1c96608bba61eca1f0c"
--- a/crawler/pyproject.toml
+++ b/crawler/pyproject.toml
@ -15,6 +15,7 @@ pillow = "^10.2.0"
 torch = "^2.2.1"
 numpy = "^1.26.4"
 transformers = "^4.38.2"
+pytesseract = "^0.3.10"

 [tool.poetry.dev-dependencies]

--- a/crawler/rec/floorplan.py
+++ b/crawler/rec/floorplan.py
@ -1,6 +1,7 @@
 import re
 from PIL import Image
 from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration
+import pytesseract

 def inference(image_path):
    image = Image.open(image_path)
@ -24,7 +25,14 @@ def extract_total_sqm(deplot_input_str):
    return max(sqms)


-def calculate(image_path):
+def calculate_model(image_path):  # returns (estimated_sqm, raw model output, predictions tensor)
     output, predictions_tensor = inference(image_path)
-    estimated_sqm = extract_total_sqm()
+    estimated_sqm = extract_total_sqm(output)  # derive the sqm figure from the model output
     return estimated_sqm, output, predictions_tensor
+
+
+def calculate_ocr(image_path):
+    """Return (estimated_sqm, ocr_text) for one floorplan image, read via pytesseract."""
+    with Image.open(image_path) as img:  # releases the image file handle even if OCR raises
+        text = pytesseract.image_to_string(img)
+    return extract_total_sqm(text), text