merging the visual question answering with the crawler. Monorepo go!

Kadir 2024-03-01 16:42:48 +01:00
parent 85686a8b24
commit e2f7998ee9
32 changed files with 3449 additions and 0 deletions


3 vqa/.gitignore vendored Normal file

@@ -0,0 +1,3 @@
venv/
__pycache__/
.ipynb_checkpoints/

300 vqa/Untitled.ipynb Normal file

@@ -0,0 +1,300 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "546cee84-249f-43ce-a1f9-8475682a833d",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/kadir/code/vqa/venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration\n",
"from PIL import Image\n"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "74e4a41f-2dfc-428e-8bca-4e9cc1c076c6",
"metadata": {},
"outputs": [],
"source": [
"image = Image.open(\"floorplans/46001_32532509_FLP_00_0000.jpeg\")\n",
"question = \"How many living rooms are displayed on this floor plan?\""
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "c24b1560-563b-4ff2-8744-4591ac1cc57b",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [],
"source": [
"processor = Pix2StructProcessor.from_pretrained('google/deplot')\n",
"model = Pix2StructForConditionalGeneration.from_pretrained('google/deplot')\n",
"\n",
"inputs = processor(images=image, text=question, return_tensors=\"pt\")\n",
"predictions = model.generate(**inputs, max_new_tokens=512)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "ea990e1f-a660-4efb-be48-d095ab05b50d",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"'TITLE | <0x0A> Fifth Floor<0x0A>Floor Area 642 Sq Ft - 59.64 Sq M | Licence<0x0A>Remung<0x0A>Floor Area 642 Sq Ft - 59.64 Sq M <0x0A> Reception Room<0x0A>1210 x 12.10%<0x0A>39 x 39m | 1280 <0x0A> Balcony<0x0A>Reception Room<0x0A>1210 x 12.10%<0x0A>39 x 39m | 1280 <0x0A> Property<0x0A>Measurer | 1280 <0x0A> Ipaplus.com | 1280'"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x = processor.decode(predictions[0], skip_special_tokens=True)\n",
"x"
]
},
{
"cell_type": "markdown",
"id": "e0d94c3e-e8de-452a-8556-12dd2e3e2351",
"metadata": {},
"source": [
"# google/pix2struct-ai2d-base"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "cd57d77b-3d70-46f0-97cf-ddd37cd61d23",
"metadata": {},
"outputs": [],
"source": [
"question = \"How many living rooms?\"\n",
"model = Pix2StructForConditionalGeneration.from_pretrained(\"google/pix2struct-ai2d-base\")\n",
"processor = Pix2StructProcessor.from_pretrained(\"google/pix2struct-ai2d-base\")\n",
"\n",
"inputs = processor(images=image, text=question, return_tensors=\"pt\")\n",
"predictions = model.generate(**inputs, max_new_tokens=10000)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "a507a4a4-33b6-4ca3-93dd-3aab489b5c24",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"predictions 1\n"
]
},
{
"data": {
"text/plain": [
"'floor plan is for illustrative purposes only and is not to scale. Every attempt has been made to ensure the accuracy of the floor plan shown, however all measurements, fixtures, fittings and data shown are an approximate interpretation for illustrative purposes only. 1 sq m + 10.76 sq feet. 7/8/2023>'"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x = processor.decode(predictions[0], skip_special_tokens=True)\n",
"print(\"predictions: \", len(predictions))\n",
"x"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a13e12c9-0406-4568-9a36-e61d70e1e683",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "4865a480-230d-48bc-be05-ddc54d107c79",
"metadata": {},
"source": [
"# pix2struct-docvqa-large"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "cd209233-ad64-4af0-a8ce-f1d640458d02",
"metadata": {},
"outputs": [],
"source": [
"question = \"How many living rooms?\"\n",
"model = Pix2StructForConditionalGeneration.from_pretrained(\"google/pix2struct-docvqa-large\")\n",
"processor = Pix2StructProcessor.from_pretrained(\"google/pix2struct-docvqa-large\")\n",
"\n",
"inputs = processor(images=image, text=question, return_tensors=\"pt\")\n",
"predictions = model.generate(**inputs, max_new_tokens=10000)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "17378986-ae8b-4ab5-be40-837db0f7aaa6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"predictions: 1\n"
]
},
{
"data": {
"text/plain": [
"'2'"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x = processor.decode(predictions[0], skip_special_tokens=True)\n",
"x"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "2d3f5a65-34e8-4c2e-ab37-03e6da86fc1c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['what is the total area measured in square meters? 8. 5']\n"
]
}
],
"source": [
"from transformers import AutoProcessor, AutoModelForCausalLM\n",
"from huggingface_hub import hf_hub_download\n",
"import torch\n",
"\n",
"processor = AutoProcessor.from_pretrained(\"microsoft/git-base-textvqa\")\n",
"model = AutoModelForCausalLM.from_pretrained(\"microsoft/git-base-textvqa\")\n",
"file_path = hf_hub_download(repo_id=\"nielsr/textvqa-sample\", filename=\"bus.png\", repo_type=\"dataset\")\n",
"\n",
"pixel_values = processor(images=image, return_tensors=\"pt\").pixel_values\n",
"question = \"What is the total area measured in square meters?\"\n",
"\n",
"input_ids = processor(text=question, add_special_tokens=False).input_ids\n",
"\n",
"input_ids = [processor.tokenizer.cls_token_id] + input_ids\n",
"\n",
"input_ids = torch.tensor(input_ids).unsqueeze(0)\n",
"\n",
"generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)\n",
"\n",
"print(processor.batch_decode(generated_ids, skip_special_tokens=True))"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "254054f0-0102-4927-a93c-c6eba97b437c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['how many living rooms are displayed on this floor plan? 8']\n"
]
}
],
"source": [
"from transformers import AutoProcessor, AutoModelForCausalLM\n",
"\n",
"from huggingface_hub import hf_hub_download\n",
"\n",
"from PIL import Image\n",
"\n",
"processor = AutoProcessor.from_pretrained(\"microsoft/git-base-textvqa\")\n",
"\n",
"model = AutoModelForCausalLM.from_pretrained(\"microsoft/git-base-textvqa\")\n",
"\n",
"file_path = hf_hub_download(repo_id=\"nielsr/textvqa-sample\", filename=\"bus.png\", repo_type=\"dataset\")\n",
"\n",
"pixel_values = processor(images=image, return_tensors=\"pt\").pixel_values\n",
"\n",
"input_ids = processor(text=question, add_special_tokens=False).input_ids\n",
"\n",
"input_ids = [processor.tokenizer.cls_token_id] + input_ids\n",
"\n",
"input_ids = torch.tensor(input_ids).unsqueeze(0)\n",
"\n",
"generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)\n",
"\n",
"print(processor.batch_decode(generated_ids, skip_special_tokens=True))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
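Note on the two GIT cells above: GIT is a causal LM, so `generate()` returns the prompt tokens followed by the answer, which is why the decoded strings repeat the question ("how many living rooms are displayed on this floor plan? 8"). A minimal sketch of a helper that keeps only the newly generated tokens; `decode_answer` is hypothetical and not part of this commit, but it reuses the `input_ids` and `generated_ids` from the cells above:

```python
# Hypothetical helper (not in this commit): slice off the prompt tokens
# before decoding, so only the generated answer text remains.
def decode_answer(processor, input_ids, generated_ids):
    answer_ids = generated_ids[:, input_ids.shape[1]:]
    return processor.batch_decode(answer_ids, skip_special_tokens=True)[0].strip()
```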

Binary file not shown (59 KiB).

24 vqa/main.py Normal file

@@ -0,0 +1,24 @@
from vqa import Blip, MicrosoftGIT, PixStructDocVA, Vilt, Deplot, VQA
from PIL import Image
from typing import List
from questions import load_questions

image = Image.open("floorplans/46001_32532509_FLP_00_0000.jpeg")
questions = load_questions(False)

models: List[VQA] = [
    # Blip(),
    # Vilt(),
    # Deplot(),
    # PixStructDocVA(),
    MicrosoftGIT(),
]

for question, expected in questions.items():
    answers = {model.name: model.query(image, question) for model in models}
    print("# Question:", question)
    for model_name, answer in answers.items():
        print(f"{model_name}: {answer}")
    print("Expected:", expected)

2984 vqa/poetry.lock generated Normal file
File diff suppressed because it is too large.

21 vqa/pyproject.toml Normal file

@@ -0,0 +1,21 @@
[tool.poetry]
name = "vqa"
version = "0.1.0"
description = ""
authors = ["Kadir"]
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.12"
requests = "^2.31.0"
transformers = "^4.38.2"
pillow = "^10.2.0"
torch = "^2.2.1"
torchvision = "^0.17.1"
torchaudio = "^2.2.1"
jupyterlab = "^4.1.2"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

8 vqa/questions.json Normal file

@@ -0,0 +1,8 @@
{
"How many bedrooms?": "2",
"How many living rooms?":"1",
"How many reception rooms?":"1",
"How many bathrooms?": "1",
"What is the total square meter area?": "59.64",
"Which floor is it?": "5"
}

37 vqa/questions.py Normal file

@@ -0,0 +1,37 @@
import json


def load_questions(bigquestion=False):
    with open("questions.json") as f:
        qs: dict = json.load(f)
    if bigquestion:
        s = """
Extract information from this floor plan and return it as valid JSON in the format below. Only fill in a field if the information is visible on the floor plan. Do not convert square footage and square meters into each other, and do not multiply width by height when the area is not given. Instead, use null whenever the data is not explicitly displayed.
```
{
    "balcony": "boolean",
    "terrace": "boolean",
    "total_number_of_rooms": "number",
    "number_of_bathrooms": "number",
    "number_of_bedrooms": "number",
    "total_square_meter": "number", // if given
    "total_square_footage": "number", // if given
    "kitchen_separate": "boolean",
    "bedroom_faces_south_or_east": "boolean",
    // For each room I also want the following information
    "rooms": {
        "title": "e.g. bedroom 1",
        "type": "bedroom|livingroom|bathroom",
        "dimension_width_metric": "3", // in meters, if given
        "dimension_height_metric": "4", // in meters, if given
        "sqm": "number", // if given
        "dimension_width_foot": "5.5", // in feet, if given
        "dimension_height_foot": "7", // in feet, if given
    }
}
```
""".strip()
        qs[s] = "<not set>"
    return qs
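The "big question" prompt asks the model for a fenced JSON blob, so the reply has to be unfenced and parsed before it can be compared with anything. A minimal sketch, assuming the reply follows the prompt's ``` example; `parse_big_answer` is a made-up helper, not part of this commit:

```python
import json

# Hypothetical helper: strip the ``` fences the prompt asks for, then
# parse; return None when the model's reply is not valid JSON.
def parse_big_answer(reply: str):
    body = reply.strip().removeprefix("```json").removeprefix("```").removesuffix("```").strip()
    try:
        return json.loads(body)
    except json.JSONDecodeError:
        return None
```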

0 vqa/util.py Normal file

72 vqa/vqa.py Normal file

@@ -0,0 +1,72 @@
import torch
from transformers import BlipProcessor, BlipForQuestionAnswering
from transformers import ViltProcessor, ViltForQuestionAnswering
from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration
from transformers import AutoProcessor, AutoModelForCausalLM


class VQA:
    name = "Not defined"

    def query(self, image, question: str) -> str:
        raise NotImplementedError


class Blip(VQA):
    name = "Blip"

    def query(self, image, question):
        processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-capfilt-large")
        model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-capfilt-large")
        inputs = processor(image, question, return_tensors="pt")
        out = model.generate(max_new_tokens=50000, **inputs)
        return processor.decode(out[0], skip_special_tokens=True)


class Vilt(VQA):
    name = "Vilt"

    def query(self, image, question):
        processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
        model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
        # prepare inputs
        encoding = processor(image, question, return_tensors="pt")
        # forward pass: ViLT classifies over a fixed answer vocabulary
        outputs = model(**encoding)
        logits = outputs.logits
        idx = logits.argmax(-1).item()
        return model.config.id2label[idx]


class Deplot(VQA):
    name = "Deplot"

    def query(self, image, question):
        processor = Pix2StructProcessor.from_pretrained("google/deplot")
        model = Pix2StructForConditionalGeneration.from_pretrained("google/deplot")
        inputs = processor(images=image, text=question, return_tensors="pt")
        predictions = model.generate(**inputs, max_new_tokens=512)
        return processor.decode(predictions[0], skip_special_tokens=True)


class PixStructDocVA(VQA):
    name = "google/pix2struct-docvqa-large"

    def query(self, image, question):
        model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-docvqa-large")
        processor = Pix2StructProcessor.from_pretrained("google/pix2struct-docvqa-large")
        inputs = processor(images=image, text=question, return_tensors="pt")
        predictions = model.generate(**inputs, max_new_tokens=10000)
        return processor.decode(predictions[0], skip_special_tokens=True)


class MicrosoftGIT(VQA):
    name = "microsoft/git-base-textvqa"

    def query(self, image, question):
        processor = AutoProcessor.from_pretrained("microsoft/git-base-textvqa")
        model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-textvqa")
        pixel_values = processor(images=image, return_tensors="pt").pixel_values
        # GIT's VQA recipe (mirrors the notebook): tokenize without special
        # tokens, then prepend [CLS] so no trailing [SEP] precedes the answer
        input_ids = processor(text=question, add_special_tokens=False).input_ids
        input_ids = torch.tensor([processor.tokenizer.cls_token_id] + input_ids).unsqueeze(0)
        generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)
        return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
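One design observation on vqa.py: every `query()` call reloads its checkpoint from the Hub cache, so looping over the six questions in main.py loads the same weights six times. A sketch of the obvious refactor, loading once in `__init__`; the class name `BlipCached` is illustrative and not part of this commit:

```python
from transformers import BlipProcessor, BlipForQuestionAnswering

# Hypothetical refactor: keep processor and weights on the instance so
# query() only runs inference; main.py's loop then pays the load cost once.
class BlipCached(VQA):
    name = "Blip"

    def __init__(self):
        self.processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-capfilt-large")
        self.model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-capfilt-large")

    def query(self, image, question):
        inputs = self.processor(image, question, return_tensors="pt")
        out = self.model.generate(max_new_tokens=50, **inputs)
        return self.processor.decode(out[0], skip_special_tokens=True)
```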