adding tesseract OCR for floorplan detection
This commit is contained in:
parent
508aa02812
commit
d108bf11ee
8 changed files with 153 additions and 29 deletions
|
|
@ -4,4 +4,6 @@ from tqdm import tqdm
|
||||||
listings = Listing.get_all_listings()
|
listings = Listing.get_all_listings()
|
||||||
|
|
||||||
for listing in tqdm(listings):
|
for listing in tqdm(listings):
|
||||||
tqdm.write(listing.calculate_sqm())
|
tqdm.write(str(listing.identifier))
|
||||||
|
# listing.calculate_sqm_model()  # uses the google/deplot model; too slow, so use Tesseract OCR instead
|
||||||
|
listing.calculate_sqm_ocr()
|
||||||
|
|
|
||||||
|
|
@ -30,8 +30,11 @@ class Listing():
|
||||||
def path_detail_json(self) -> pathlib.Path:
|
def path_detail_json(self) -> pathlib.Path:
|
||||||
return self.path_listing() / 'detail.json'
|
return self.path_listing() / 'detail.json'
|
||||||
|
|
||||||
def path_floorplan_json(self) -> pathlib.Path:
|
def path_floorplan_model_json(self) -> pathlib.Path:
|
||||||
return self.path_listing() / 'floorplan.json'
|
return self.path_listing() / 'floorplan_model.json'
|
||||||
|
|
||||||
|
def path_floorplan_ocr_json(self) -> pathlib.Path:
|
||||||
|
return self.path_listing() / 'floorplan_ocr.json'
|
||||||
|
|
||||||
def path_pic_folder(self) -> pathlib.Path:
|
def path_pic_folder(self) -> pathlib.Path:
|
||||||
return self.path_listing() / 'pics'
|
return self.path_listing() / 'pics'
|
||||||
|
|
@ -51,27 +54,24 @@ class Listing():
|
||||||
# todo add check if return is image
|
# todo add check if return is image
|
||||||
return images
|
return images
|
||||||
|
|
||||||
def calculate_sqm(self):
|
def calculate_sqm_model(self):
|
||||||
objs = []
|
objs = []
|
||||||
for floorplan_path in self.list_floorplans():
|
for floorplan_path in self.list_floorplans():
|
||||||
estimated_sqm, model_output, predictions = floorplan.calculate(floorplan_path)
|
estimated_sqm, model_output, predictions = floorplan.calculate_model(floorplan_path)
|
||||||
objs.append({
|
objs.append({
|
||||||
'floorplan_path': floorplan_path,
|
'floorplan_path': str(floorplan_path),
|
||||||
'estimated_sqm': estimated_sqm,
|
'estimated_sqm': estimated_sqm,
|
||||||
'model_output': model_output,
|
'model_output': model_output,
|
||||||
'no_predictions': len(predictions) # cant serialize the predictions itself since its a tensor
|
'no_predictions': len(predictions) # cant serialize the predictions itself since its a tensor
|
||||||
})
|
})
|
||||||
|
|
||||||
with open(self.path_floorplan_json(), 'w') as f:
|
with open(self.path_floorplan_model_json(), 'w') as f:
|
||||||
json.dump(objs, f)
|
json.dump(objs, f)
|
||||||
|
|
||||||
max_sqm = max([o['estimated_sqm'] for o in objs if o is None]) # filter out Nones
|
|
||||||
return max_sqm
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def sqm(self, recalculate=True):
|
def sqm_model(self, recalculate=True):
|
||||||
if recalculate and not self.path_floorplan_json().exists():
|
if recalculate and not self.path_floorplan_model_json().exists():
|
||||||
self.calculate_sqm()
|
self.calculate_sqm_model()
|
||||||
|
|
||||||
with open(self.path_floorplan_json()) as f:
|
with open(self.path_floorplan_json()) as f:
|
||||||
objs = json.load(f)
|
objs = json.load(f)
|
||||||
|
|
@ -79,6 +79,31 @@ class Listing():
|
||||||
max_sqm = max([o['estimated_sqm'] for o in objs if o is None]) # filter out Nones
|
max_sqm = max([o['estimated_sqm'] for o in objs if o is None]) # filter out Nones
|
||||||
return max_sqm
|
return max_sqm
|
||||||
|
|
||||||
|
def calculate_sqm_ocr(self):
    """Run OCR over every floorplan of this listing and store the results.

    For each path yielded by ``self.list_floorplans()`` this calls
    ``floorplan.calculate_ocr`` and records the floorplan path (as a
    string), the estimated sqm value and the recognised text. The list of
    records is written as JSON to ``self.path_floorplan_ocr_json()``.
    """
    records = [
        {
            'floorplan_path': str(plan_path),
            'estimated_sqm': sqm,
            'text': ocr_text,
        }
        for plan_path in self.list_floorplans()
        # Single-element list so the returned pair can be unpacked inline.
        for sqm, ocr_text in [floorplan.calculate_ocr(plan_path)]
    ]

    with self.path_floorplan_ocr_json().open('w') as out_file:
        json.dump(records, out_file)
|
||||||
|
|
||||||
|
@property
def sqm_ocr(self, recalculate=True):
    """Return the largest OCR-estimated sqm value stored for this listing.

    If the OCR result file does not exist yet (and ``recalculate`` is
    true) it is generated first via ``self.calculate_sqm_ocr()``.

    NOTE(review): because this is a ``@property``, callers cannot actually
    pass ``recalculate``; it always takes its default. The signature is
    kept unchanged for compatibility.

    Returns:
        The maximum ``estimated_sqm`` across all stored floorplan records,
        or ``None`` when no record has an estimate.
    """
    ocr_json = self.path_floorplan_ocr_json()
    if recalculate and not ocr_json.exists():
        self.calculate_sqm_ocr()

    with open(ocr_json) as f:
        objs = json.load(f)

    # Skip records without an estimate. The previous filter (`if o is None`)
    # was inverted and always produced an empty list, making max() raise.
    estimates = [
        o['estimated_sqm'] for o in objs if o['estimated_sqm'] is not None
    ]
    return max(estimates, default=None)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
|
||||||
17
crawler/poetry.lock
generated
17
crawler/poetry.lock
generated
|
|
@ -712,6 +712,21 @@ tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "pa
|
||||||
typing = ["typing-extensions"]
|
typing = ["typing-extensions"]
|
||||||
xmp = ["defusedxml"]
|
xmp = ["defusedxml"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pytesseract"
|
||||||
|
version = "0.3.10"
|
||||||
|
description = "Python-tesseract is a python wrapper for Google's Tesseract-OCR"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.7"
|
||||||
|
files = [
|
||||||
|
{file = "pytesseract-0.3.10-py3-none-any.whl", hash = "sha256:8f22cc98f765bf13517ead0c70effedb46c153540d25783e04014f28b55a5fc6"},
|
||||||
|
{file = "pytesseract-0.3.10.tar.gz", hash = "sha256:f1c3a8b0f07fd01a1085d451f5b8315be6eec1d5577a6796d46dc7a62bd4120f"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
packaging = ">=21.3"
|
||||||
|
Pillow = ">=8.0.0"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pyyaml"
|
name = "pyyaml"
|
||||||
version = "6.0.1"
|
version = "6.0.1"
|
||||||
|
|
@ -1455,4 +1470,4 @@ zstd = ["zstandard (>=0.18.0)"]
|
||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = ">3.11"
|
python-versions = ">3.11"
|
||||||
content-hash = "29e82860db598c8356b279e9287d0a96c1093724b0057ddcb374ae8f71881f3d"
|
content-hash = "30b432cae95b5a4facbca747f698614e256df27cb0f8b1c96608bba61eca1f0c"
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,7 @@ pillow = "^10.2.0"
|
||||||
torch = "^2.2.1"
|
torch = "^2.2.1"
|
||||||
numpy = "^1.26.4"
|
numpy = "^1.26.4"
|
||||||
transformers = "^4.38.2"
|
transformers = "^4.38.2"
|
||||||
|
pytesseract = "^0.3.10"
|
||||||
|
|
||||||
[tool.poetry.dev-dependencies]
|
[tool.poetry.dev-dependencies]
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
import re
|
import re
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration
|
from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration
|
||||||
|
import pytesseract
|
||||||
|
|
||||||
def inference(image_path):
|
def inference(image_path):
|
||||||
image = Image.open(image_path)
|
image = Image.open(image_path)
|
||||||
|
|
@ -24,7 +25,14 @@ def extract_total_sqm(deplot_input_str):
|
||||||
return max(sqms)
|
return max(sqms)
|
||||||
|
|
||||||
|
|
||||||
def calculate(image_path):
|
def calculate_model(image_path):
|
||||||
output, predictions_tensor = inference(image_path)
|
output, predictions_tensor = inference(image_path)
|
||||||
estimated_sqm = extract_total_sqm()
|
estimated_sqm = extract_total_sqm(output)
|
||||||
return estimated_sqm, output, predictions_tensor
|
return estimated_sqm, output, predictions_tensor
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_ocr(image_path):
    """Estimate a floorplan's sqm value from text recognised by Tesseract.

    Args:
        image_path: Path of the floorplan image to read.

    Returns:
        A ``(estimated_sqm, text)`` tuple: the result of
        ``extract_total_sqm`` applied to the OCR text, and the raw OCR text.
    """
    # Close the image file as soon as OCR is done; this function is called
    # once per floorplan, so unclosed handles would accumulate.
    with Image.open(image_path) as img:
        text = pytesseract.image_to_string(img)
    estimated_sqm = extract_total_sqm(text)
    return estimated_sqm, text
|
||||||
|
|
|
||||||
|
|
@ -113,28 +113,85 @@
|
||||||
"extract_total_sqm(x)"
|
"extract_total_sqm(x)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "4bbf0ef5-0a38-4710-9ce1-e5e24a65d83b",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# tesseract"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 16,
|
"execution_count": 18,
|
||||||
"id": "06d50e06-fead-40be-bd9d-7e8207d8df4e",
|
"id": "c73b5339-d493-4a55-a93c-9dcd7ee7a0af",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"ename": "TypeError",
|
"name": "stdout",
|
||||||
"evalue": "'>' not supported between instances of 'int' and 'NoneType'",
|
"output_type": "stream",
|
||||||
"output_type": "error",
|
"text": [
|
||||||
"traceback": [
|
"DEFOE HOUSE EC2Y 8DN\n",
|
||||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
"\n",
|
||||||
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
|
"Prepared for Nicola Lee Ltd.\n",
|
||||||
"Cell \u001b[0;32mIn[16], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;43mmax\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n",
|
"\n",
|
||||||
"\u001b[0;31mTypeError\u001b[0m: '>' not supported between instances of 'int' and 'NoneType'"
|
"Kitchen\n",
|
||||||
|
"93 x 74\n",
|
||||||
|
"\n",
|
||||||
|
"\\\n",
|
||||||
|
"Living Space “ (2.81mx 2.23m) Bedroom\n",
|
||||||
|
"162 11'8 18x 118\n",
|
||||||
|
"(3.55m x 3.55m)\n",
|
||||||
|
"\n",
|
||||||
|
"(4.90m x 3.55m)\n",
|
||||||
|
"\n",
|
||||||
|
"ee\n",
|
||||||
|
"\n",
|
||||||
|
"Balcony\n",
|
||||||
|
"12'6x4'll\n",
|
||||||
|
"(3.80m x 1.49m)\n",
|
||||||
|
"r\n",
|
||||||
|
"\n",
|
||||||
|
"Seventh Floor\n",
|
||||||
|
"Terrace\n",
|
||||||
|
"21'S x 16'10\n",
|
||||||
|
"(6.50m x 5.12m)\n",
|
||||||
|
"\n",
|
||||||
|
"Approximate Gross Internal Floor Area : 579 sq ft / 53.8 sqm\n",
|
||||||
|
"\n",
|
||||||
|
"\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"max([None,1])"
|
"from PIL import Image\n",
|
||||||
|
"\n",
|
||||||
|
"import pytesseract\n",
|
||||||
|
"\n",
|
||||||
|
"# If you don't have tesseract executable in your PATH, include the following:\n",
|
||||||
|
"# pytesseract.pytesseract.tesseract_cmd = r'<full_path_to_your_tesseract_executable>'\n",
|
||||||
|
"# Example tesseract_cmd = r'C:\\Program Files (x86)\\Tesseract-OCR\\tesseract'\n",
|
||||||
|
"\n",
|
||||||
|
"# Simple image to string\n",
|
||||||
|
"print(pytesseract.image_to_string(Image.open('/Users/kadir/code/realestate/crawler/data/rs/135175484/floorplans/0_219962_DWR0004BF_FLP_00_0001.gif')))"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "1066255c-79f8-4360-aa6b-515576cb8b03",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "232d19bb-869a-4b8b-829a-0d8e211e9c7b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "e0d94c3e-e8de-452a-8556-12dd2e3e2351",
|
"id": "e0d94c3e-e8de-452a-8556-12dd2e3e2351",
|
||||||
|
|
|
||||||
17
vqa/poetry.lock
generated
17
vqa/poetry.lock
generated
|
|
@ -1803,6 +1803,21 @@ files = [
|
||||||
plugins = ["importlib-metadata"]
|
plugins = ["importlib-metadata"]
|
||||||
windows-terminal = ["colorama (>=0.4.6)"]
|
windows-terminal = ["colorama (>=0.4.6)"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pytesseract"
|
||||||
|
version = "0.3.10"
|
||||||
|
description = "Python-tesseract is a python wrapper for Google's Tesseract-OCR"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.7"
|
||||||
|
files = [
|
||||||
|
{file = "pytesseract-0.3.10-py3-none-any.whl", hash = "sha256:8f22cc98f765bf13517ead0c70effedb46c153540d25783e04014f28b55a5fc6"},
|
||||||
|
{file = "pytesseract-0.3.10.tar.gz", hash = "sha256:f1c3a8b0f07fd01a1085d451f5b8315be6eec1d5577a6796d46dc7a62bd4120f"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
packaging = ">=21.3"
|
||||||
|
Pillow = ">=8.0.0"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "python-dateutil"
|
name = "python-dateutil"
|
||||||
version = "2.9.0"
|
version = "2.9.0"
|
||||||
|
|
@ -3072,4 +3087,4 @@ test = ["websockets"]
|
||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = "^3.12"
|
python-versions = "^3.12"
|
||||||
content-hash = "ec7099d470acdee1c418ff102390f5a71764b90628f9523eff9f22a8fb3e5018"
|
content-hash = "dcb74934d7385cf192b13d8767dc1cdb7b20907e66acca361a941d439f487b96"
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,7 @@ torchvision = "^0.17.1"
|
||||||
torchaudio = "^2.2.1"
|
torchaudio = "^2.2.1"
|
||||||
jupyterlab = "^4.1.2"
|
jupyterlab = "^4.1.2"
|
||||||
pandas = "^2.2.1"
|
pandas = "^2.2.1"
|
||||||
|
pytesseract = "^0.3.10"
|
||||||
|
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue