adding tesseract OCR for floorplan detection

This commit is contained in:
Kadir 2024-03-10 22:32:34 +00:00
parent 508aa02812
commit d108bf11ee
8 changed files with 153 additions and 29 deletions

View file

@ -113,28 +113,85 @@
"extract_total_sqm(x)"
]
},
{
"cell_type": "markdown",
"id": "4bbf0ef5-0a38-4710-9ce1-e5e24a65d83b",
"metadata": {},
"source": [
"# tesseract"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "06d50e06-fead-40be-bd9d-7e8207d8df4e",
"execution_count": 18,
"id": "c73b5339-d493-4a55-a93c-9dcd7ee7a0af",
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "'>' not supported between instances of 'int' and 'NoneType'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[16], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;43mmax\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[0;31mTypeError\u001b[0m: '>' not supported between instances of 'int' and 'NoneType'"
"name": "stdout",
"output_type": "stream",
"text": [
"DEFOE HOUSE EC2Y 8DN\n",
"\n",
"Prepared for Nicola Lee Ltd.\n",
"\n",
"Kitchen\n",
"93 x 74\n",
"\n",
"\\\n",
"Living Space “ (2.81mx 2.23m) Bedroom\n",
"162 11'8 18x 118\n",
"(3.55m x 3.55m)\n",
"\n",
"(4.90m x 3.55m)\n",
"\n",
"ee\n",
"\n",
"Balcony\n",
"12'6x4'll\n",
"(3.80m x 1.49m)\n",
"r\n",
"\n",
"Seventh Floor\n",
"Terrace\n",
"21'S x 16'10\n",
"(6.50m x 5.12m)\n",
"\n",
"Approximate Gross Internal Floor Area : 579 sq ft / 53.8 sqm\n",
"\n",
"\n"
]
}
],
"source": [
"max([None,1])"
"from PIL import Image\n",
"\n",
"import pytesseract\n",
"\n",
"# If you don't have tesseract executable in your PATH, include the following:\n",
"# pytesseract.pytesseract.tesseract_cmd = r'<full_path_to_your_tesseract_executable>'\n",
"# Example tesseract_cmd = r'C:\\Program Files (x86)\\Tesseract-OCR\\tesseract'\n",
"\n",
"# Simple image to string\n",
"print(pytesseract.image_to_string(Image.open('/Users/kadir/code/realestate/crawler/data/rs/135175484/floorplans/0_219962_DWR0004BF_FLP_00_0001.gif')))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1066255c-79f8-4360-aa6b-515576cb8b03",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "232d19bb-869a-4b8b-829a-0d8e211e9c7b",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "e0d94c3e-e8de-452a-8556-12dd2e3e2351",