adding tesseract OCR for floorplan detection

2024-03-10 22:32:34 +00:00 · 2024-03-10 22:32:34 +00:00 · d108bf11ee
commit d108bf11ee
parent 508aa02812
8 changed files with 153 additions and 29 deletions
--- a/vqa/Untitled.ipynb
+++ b/vqa/Untitled.ipynb
@ -113,28 +113,85 @@
    "extract_total_sqm(x)"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "4bbf0ef5-0a38-4710-9ce1-e5e24a65d83b",
+   "metadata": {},
+   "source": [
+    "# tesseract"
+   ]
+  },
  {
   "cell_type": "code",
-   "execution_count": 16,
-   "id": "06d50e06-fead-40be-bd9d-7e8207d8df4e",
+   "execution_count": 18,
+   "id": "c73b5339-d493-4a55-a93c-9dcd7ee7a0af",
   "metadata": {},
   "outputs": [
    {
-     "ename": "TypeError",
-     "evalue": "'>' not supported between instances of 'int' and 'NoneType'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[16], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;43mmax\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n",
-      "\u001b[0;31mTypeError\u001b[0m: '>' not supported between instances of 'int' and 'NoneType'"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "DEFOE HOUSE EC2Y 8DN\n",
+      "\n",
+      "Prepared for Nicola Lee Ltd.\n",
+      "\n",
+      "Kitchen\n",
+      "93 x 74\n",
+      "\n",
+      "\\\n",
+      "Living Space “ (2.81mx 2.23m) Bedroom\n",
+      "162 11'8 18x 118\n",
+      "(3.55m x 3.55m)\n",
+      "\n",
+      "(4.90m x 3.55m)\n",
+      "\n",
+      "ee\n",
+      "\n",
+      "Balcony\n",
+      "12'6x4'll\n",
+      "(3.80m x 1.49m)\n",
+      "r\n",
+      "\n",
+      "Seventh Floor\n",
+      "Terrace\n",
+      "21'S x 16'10\n",
+      "(6.50m x 5.12m)\n",
+      "\n",
+      "Approximate Gross Internal Floor Area : 579 sq ft / 53.8 sqm\n",
+      "\n",
+      "\n"
     ]
    }
   ],
   "source": [
-    "max([None,1])"
+    "from PIL import Image\n",
+    "\n",
+    "import pytesseract\n",
+    "\n",
+    "# If you don't have tesseract executable in your PATH, include the following:\n",
+    "# pytesseract.pytesseract.tesseract_cmd = r'<full_path_to_your_tesseract_executable>'\n",
+    "# Example tesseract_cmd = r'C:\\Program Files (x86)\\Tesseract-OCR\\tesseract'\n",
+    "\n",
+    "# Simple image to string.\n",
+    "# Open the image in a `with` block so the underlying file is closed as soon as OCR\n",
+    "# has run, rather than relying on garbage collection to release the handle.\n",
+    "with Image.open('/Users/kadir/code/realestate/crawler/data/rs/135175484/floorplans/0_219962_DWR0004BF_FLP_00_0001.gif') as floorplan_image:\n",
+    "    print(pytesseract.image_to_string(floorplan_image))"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1066255c-79f8-4360-aa6b-515576cb8b03",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "232d19bb-869a-4b8b-829a-0d8e211e9c7b",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
  {
   "cell_type": "markdown",
   "id": "e0d94c3e-e8de-452a-8556-12dd2e3e2351",
--- a/vqa/poetry.lock
+++ b/vqa/poetry.lock
@ -1803,6 +1803,21 @@ files = [
 plugins = ["importlib-metadata"]
 windows-terminal = ["colorama (>=0.4.6)"]

+[[package]]
+name = "pytesseract"
+version = "0.3.10"
+description = "Python-tesseract is a python wrapper for Google's Tesseract-OCR"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "pytesseract-0.3.10-py3-none-any.whl", hash = "sha256:8f22cc98f765bf13517ead0c70effedb46c153540d25783e04014f28b55a5fc6"},
+    {file = "pytesseract-0.3.10.tar.gz", hash = "sha256:f1c3a8b0f07fd01a1085d451f5b8315be6eec1d5577a6796d46dc7a62bd4120f"},
+]
+
+[package.dependencies]
+packaging = ">=21.3"
+Pillow = ">=8.0.0"
+
 [[package]]
 name = "python-dateutil"
 version = "2.9.0"
@ -3072,4 +3087,4 @@ test = ["websockets"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.12"
-content-hash = "ec7099d470acdee1c418ff102390f5a71764b90628f9523eff9f22a8fb3e5018"
+content-hash = "dcb74934d7385cf192b13d8767dc1cdb7b20907e66acca361a941d439f487b96"
--- a/vqa/pyproject.toml
+++ b/vqa/pyproject.toml
@ -15,6 +15,7 @@ torchvision = "^0.17.1"
 torchaudio = "^2.2.1"
 jupyterlab = "^4.1.2"
 pandas = "^2.2.1"
+pytesseract = "^0.3.10"


 [build-system]