diff --git a/crawler/4_detect_floorplan.py b/crawler/4_detect_floorplan.py index 5715916..fbbc818 100644 --- a/crawler/4_detect_floorplan.py +++ b/crawler/4_detect_floorplan.py @@ -4,4 +4,6 @@ from tqdm import tqdm listings = Listing.get_all_listings() for listing in tqdm(listings): - tqdm.write(listing.calculate_sqm()) + tqdm.write(str(listing.identifier)) + # listing.calculate_sqm_model() # using google/deplot model. Too slow, rather use tesseract + listing.calculate_sqm_ocr() diff --git a/crawler/data_access.py b/crawler/data_access.py index 0a974f5..68f3c2e 100644 --- a/crawler/data_access.py +++ b/crawler/data_access.py @@ -30,8 +30,11 @@ class Listing(): def path_detail_json(self) -> pathlib.Path: return self.path_listing() / 'detail.json' - def path_floorplan_json(self) -> pathlib.Path: - return self.path_listing() / 'floorplan.json' + def path_floorplan_model_json(self) -> pathlib.Path: + return self.path_listing() / 'floorplan_model.json' + + def path_floorplan_ocr_json(self) -> pathlib.Path: + return self.path_listing() / 'floorplan_ocr.json' def path_pic_folder(self) -> pathlib.Path: return self.path_listing() / 'pics' @@ -51,36 +54,58 @@ class Listing(): # todo add check if return is image return images - def calculate_sqm(self): + def calculate_sqm_model(self): objs = [] for floorplan_path in self.list_floorplans(): - estimated_sqm, model_output, predictions = floorplan.calculate(floorplan_path) + estimated_sqm, model_output, predictions = floorplan.calculate_model(floorplan_path) objs.append({ - 'floorplan_path': floorplan_path, + 'floorplan_path': str(floorplan_path), 'estimated_sqm': estimated_sqm, 'model_output': model_output, 'no_predictions': len(predictions) # cant serialize the predictions itself since its a tensor }) - with open(self.path_floorplan_json(), 'w') as f: + with open(self.path_floorplan_model_json(), 'w') as f: json.dump(objs, f) - - max_sqm = max([o['estimated_sqm'] for o in objs if o is None]) # filter out Nones - return max_sqm @property - def sqm(self, recalculate=True): - if recalculate and not self.path_floorplan_json().exists(): - self.calculate_sqm() + def sqm_model(self, recalculate=True): + if recalculate and not self.path_floorplan_model_json().exists(): + self.calculate_sqm_model() with open(self.path_floorplan_json()) as f: objs = json.load(f) max_sqm = max([o['estimated_sqm'] for o in objs if o is None]) # filter out Nones return max_sqm + + def calculate_sqm_ocr(self): + objs = [] + for floorplan_path in self.list_floorplans(): + estimated_sqm, model_output = floorplan.calculate_ocr(floorplan_path) + objs.append({ + 'floorplan_path': str(floorplan_path), + 'estimated_sqm': estimated_sqm, + 'text': model_output, + }) + with open(self.path_floorplan_ocr_json(), 'w') as f: + json.dump(objs, f) + + @property + def sqm_ocr(self, recalculate=True): + if recalculate and not self.path_floorplan_ocr_json().exists(): + self.calculate_sqm_ocr() + + with open(self.path_floorplan_ocr_json()) as f: + objs = json.load(f) + + max_sqm = max([o['estimated_sqm'] for o in objs if o is None]) # filter out Nones + return max_sqm + + if __name__ == '__main__': listings = Listing.get_all_listings() - print(listings[0].list_floorplans()) \ No newline at end of file + print(listings[0].list_floorplans()) diff --git a/crawler/poetry.lock b/crawler/poetry.lock index d8e6e25..96ad22f 100644 --- a/crawler/poetry.lock +++ b/crawler/poetry.lock @@ -712,6 +712,21 @@ tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "pa typing = ["typing-extensions"] xmp = ["defusedxml"] +[[package]] +name = "pytesseract" +version = "0.3.10" +description = "Python-tesseract is a python wrapper for Google's Tesseract-OCR" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pytesseract-0.3.10-py3-none-any.whl", hash = "sha256:8f22cc98f765bf13517ead0c70effedb46c153540d25783e04014f28b55a5fc6"}, + {file = "pytesseract-0.3.10.tar.gz", hash = "sha256:f1c3a8b0f07fd01a1085d451f5b8315be6eec1d5577a6796d46dc7a62bd4120f"}, +] + +[package.dependencies] +packaging = ">=21.3" +Pillow = ">=8.0.0" + [[package]] name = "pyyaml" version = "6.0.1" @@ -1455,4 +1470,4 @@ zstd = ["zstandard (>=0.18.0)"] [metadata] lock-version = "2.0" python-versions = ">3.11" -content-hash = "29e82860db598c8356b279e9287d0a96c1093724b0057ddcb374ae8f71881f3d" +content-hash = "30b432cae95b5a4facbca747f698614e256df27cb0f8b1c96608bba61eca1f0c" diff --git a/crawler/pyproject.toml b/crawler/pyproject.toml index a535095..2ef0aa3 100644 --- a/crawler/pyproject.toml +++ b/crawler/pyproject.toml @@ -15,6 +15,7 @@ pillow = "^10.2.0" torch = "^2.2.1" numpy = "^1.26.4" transformers = "^4.38.2" +pytesseract = "^0.3.10" [tool.poetry.dev-dependencies] diff --git a/crawler/rec/floorplan.py b/crawler/rec/floorplan.py index 74d086c..17f9a98 100644 --- a/crawler/rec/floorplan.py +++ b/crawler/rec/floorplan.py @@ -1,6 +1,7 @@ import re from PIL import Image from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration +import pytesseract def inference(image_path): image = Image.open(image_path) @@ -24,7 +25,14 @@ def extract_total_sqm(deplot_input_str): return max(sqms) -def calculate(image_path): +def calculate_model(image_path): output, predictions_tensor = inference(image_path) - estimated_sqm = extract_total_sqm() + estimated_sqm = extract_total_sqm(output) return estimated_sqm, output, predictions_tensor + + +def calculate_ocr(image_path): + img = Image.open(image_path) + text = pytesseract.image_to_string(img) + estimated_sqm = extract_total_sqm(text) + return estimated_sqm, text diff --git a/vqa/Untitled.ipynb b/vqa/Untitled.ipynb index 0e558a7..9ee8422 100644 --- a/vqa/Untitled.ipynb +++ b/vqa/Untitled.ipynb @@ -113,28 +113,85 @@ "extract_total_sqm(x)" ] }, + { + "cell_type": "markdown", + "id": "4bbf0ef5-0a38-4710-9ce1-e5e24a65d83b", + "metadata": {}, + "source": [ + "# tesseract" + ] + }, { "cell_type": "code", - "execution_count": 16, - "id": "06d50e06-fead-40be-bd9d-7e8207d8df4e", + "execution_count": 18, + "id": "c73b5339-d493-4a55-a93c-9dcd7ee7a0af", "metadata": {}, "outputs": [ { - "ename": "TypeError", - "evalue": "'>' not supported between instances of 'int' and 'NoneType'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[16], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;43mmax\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[0;31mTypeError\u001b[0m: '>' not supported between instances of 'int' and 'NoneType'" + "name": "stdout", + "output_type": "stream", + "text": [ + "DEFOE HOUSE EC2Y 8DN\n", + "\n", + "Prepared for Nicola Lee Ltd.\n", + "\n", + "Kitchen\n", + "93 x 74\n", + "\n", + "\\\n", + "Living Space “ (2.81mx 2.23m) Bedroom\n", + "162 11'8 18x 118\n", + "(3.55m x 3.55m)\n", + "\n", + "(4.90m x 3.55m)\n", + "\n", + "ee\n", + "\n", + "Balcony\n", + "12'6x4'll\n", + "(3.80m x 1.49m)\n", + "r\n", + "\n", + "Seventh Floor\n", + "Terrace\n", + "21'S x 16'10\n", + "(6.50m x 5.12m)\n", + "\n", + "Approximate Gross Internal Floor Area : 579 sq ft / 53.8 sqm\n", + "\n", + "\n" ] } ], "source": [ - "max([None,1])" + "from PIL import Image\n", + "\n", + "import pytesseract\n", + "\n", + "# If you don't have tesseract executable in your PATH, include the following:\n", + "# pytesseract.pytesseract.tesseract_cmd = r''\n", + "# Example tesseract_cmd = r'C:\\Program Files (x86)\\Tesseract-OCR\\tesseract'\n", + "\n", + "# Simple image to string\n", + "print(pytesseract.image_to_string(Image.open('/Users/kadir/code/realestate/crawler/data/rs/135175484/floorplans/0_219962_DWR0004BF_FLP_00_0001.gif')))" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "1066255c-79f8-4360-aa6b-515576cb8b03", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "232d19bb-869a-4b8b-829a-0d8e211e9c7b", + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "id": "e0d94c3e-e8de-452a-8556-12dd2e3e2351", diff --git a/vqa/poetry.lock b/vqa/poetry.lock index cf5cd6b..cec899a 100644 --- a/vqa/poetry.lock +++ b/vqa/poetry.lock @@ -1803,6 +1803,21 @@ files = [ plugins = ["importlib-metadata"] windows-terminal = ["colorama (>=0.4.6)"] +[[package]] +name = "pytesseract" +version = "0.3.10" +description = "Python-tesseract is a python wrapper for Google's Tesseract-OCR" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pytesseract-0.3.10-py3-none-any.whl", hash = "sha256:8f22cc98f765bf13517ead0c70effedb46c153540d25783e04014f28b55a5fc6"}, + {file = "pytesseract-0.3.10.tar.gz", hash = "sha256:f1c3a8b0f07fd01a1085d451f5b8315be6eec1d5577a6796d46dc7a62bd4120f"}, +] + +[package.dependencies] +packaging = ">=21.3" +Pillow = ">=8.0.0" + [[package]] name = "python-dateutil" version = "2.9.0" @@ -3072,4 +3087,4 @@ test = ["websockets"] [metadata] lock-version = "2.0" python-versions = "^3.12" -content-hash = "ec7099d470acdee1c418ff102390f5a71764b90628f9523eff9f22a8fb3e5018" +content-hash = "dcb74934d7385cf192b13d8767dc1cdb7b20907e66acca361a941d439f487b96" diff --git a/vqa/pyproject.toml b/vqa/pyproject.toml index be9c9bf..c1d8b1a 100644 --- a/vqa/pyproject.toml +++ b/vqa/pyproject.toml @@ -15,6 +15,7 @@ torchvision = "^0.17.1" torchaudio = "^2.2.1" jupyterlab = "^4.1.2" pandas = "^2.2.1" +pytesseract = "^0.3.10" [build-system]