merging the visual question answering with the crawler. Monorepo go!

Kadir 2024-03-01 16:42:48 +01:00
parent 85686a8b24
commit e2f7998ee9
32 changed files with 3449 additions and 0 deletions


3 vqa/.gitignore vendored Normal file

@@ -0,0 +1,3 @@
venv/
__pycache__/
.ipynb_checkpoints/

300 vqa/Untitled.ipynb Normal file

@@ -0,0 +1,300 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "546cee84-249f-43ce-a1f9-8475682a833d",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/kadir/code/vqa/venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration\n",
"from PIL import Image\n"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "74e4a41f-2dfc-428e-8bca-4e9cc1c076c6",
"metadata": {},
"outputs": [],
"source": [
"image = Image.open(\"floorplans/46001_32532509_FLP_00_0000.jpeg\")\n",
"question = \"How many living rooms are displayed on this floor plan?\""
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "c24b1560-563b-4ff2-8744-4591ac1cc57b",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [],
"source": [
"processor = Pix2StructProcessor.from_pretrained('google/deplot')\n",
"model = Pix2StructForConditionalGeneration.from_pretrained('google/deplot')\n",
"\n",
"inputs = processor(images=image, text=question, return_tensors=\"pt\")\n",
"predictions = model.generate(**inputs, max_new_tokens=512)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "ea990e1f-a660-4efb-be48-d095ab05b50d",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"'TITLE | <0x0A> Fifth Floor<0x0A>Floor Area 642 Sq Ft - 59.64 Sq M | Licence<0x0A>Remung<0x0A>Floor Area 642 Sq Ft - 59.64 Sq M <0x0A> Reception Room<0x0A>1210 x 12.10%<0x0A>39 x 39m | 1280 <0x0A> Balcony<0x0A>Reception Room<0x0A>1210 x 12.10%<0x0A>39 x 39m | 1280 <0x0A> Property<0x0A>Measurer | 1280 <0x0A> Ipaplus.com | 1280'"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x = processor.decode(predictions[0], skip_special_tokens=True)\n",
"x"
]
},
{
"cell_type": "markdown",
"id": "e0d94c3e-e8de-452a-8556-12dd2e3e2351",
"metadata": {},
"source": [
"# google/pix2struct-ai2d-base"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "cd57d77b-3d70-46f0-97cf-ddd37cd61d23",
"metadata": {},
"outputs": [],
"source": [
"question = \"How many living rooms?\"\n",
"model = Pix2StructForConditionalGeneration.from_pretrained(\"google/pix2struct-ai2d-base\")\n",
"processor = Pix2StructProcessor.from_pretrained(\"google/pix2struct-ai2d-base\")\n",
"\n",
"inputs = processor(images=image, text=question, return_tensors=\"pt\")\n",
"predictions = model.generate(**inputs, max_new_tokens=10000)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "a507a4a4-33b6-4ca3-93dd-3aab489b5c24",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"predictions 1\n"
]
},
{
"data": {
"text/plain": [
"'floor plan is for illustrative purposes only and is not to scale. Every attempt has been made to ensure the accuracy of the floor plan shown, however all measurements, fixtures, fittings and data shown are an approximate interpretation for illustrative purposes only. 1 sq m + 10.76 sq feet. 7/8/2023>'"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x = processor.decode(predictions[0], skip_special_tokens=True)\n",
"print(\"predictions: \", len(predictions))\n",
"x"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a13e12c9-0406-4568-9a36-e61d70e1e683",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "4865a480-230d-48bc-be05-ddc54d107c79",
"metadata": {},
"source": [
"# pix2struct-docvqa-large"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "cd209233-ad64-4af0-a8ce-f1d640458d02",
"metadata": {},
"outputs": [],
"source": [
"question = \"How many living rooms?\"\n",
"model = Pix2StructForConditionalGeneration.from_pretrained(\"google/pix2struct-docvqa-large\")\n",
"processor = Pix2StructProcessor.from_pretrained(\"google/pix2struct-docvqa-large\")\n",
"\n",
"inputs = processor(images=image, text=question, return_tensors=\"pt\")\n",
"predictions = model.generate(**inputs, max_new_tokens=10000)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "17378986-ae8b-4ab5-be40-837db0f7aaa6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"predictions: 1\n"
]
},
{
"data": {
"text/plain": [
"'2'"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x = processor.decode(predictions[0], skip_special_tokens=True)\n",
"x"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "2d3f5a65-34e8-4c2e-ab37-03e6da86fc1c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['what is the total area measured in square meters? 8. 5']\n"
]
}
],
"source": [
"from transformers import AutoProcessor, AutoModelForCausalLM\n",
"from huggingface_hub import hf_hub_download\n",
"import torch\n",
"\n",
"processor = AutoProcessor.from_pretrained(\"microsoft/git-base-textvqa\")\n",
"model = AutoModelForCausalLM.from_pretrained(\"microsoft/git-base-textvqa\")\n",
"file_path = hf_hub_download(repo_id=\"nielsr/textvqa-sample\", filename=\"bus.png\", repo_type=\"dataset\")\n",
"\n",
"pixel_values = processor(images=image, return_tensors=\"pt\").pixel_values\n",
"question = \"What is the total area measured in square meters?\"\n",
"\n",
"input_ids = processor(text=question, add_special_tokens=False).input_ids\n",
"\n",
"input_ids = [processor.tokenizer.cls_token_id] + input_ids\n",
"\n",
"input_ids = torch.tensor(input_ids).unsqueeze(0)\n",
"\n",
"generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)\n",
"\n",
"print(processor.batch_decode(generated_ids, skip_special_tokens=True))"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "254054f0-0102-4927-a93c-c6eba97b437c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['how many living rooms are displayed on this floor plan? 8']\n"
]
}
],
"source": [
"from transformers import AutoProcessor, AutoModelForCausalLM\n",
"\n",
"from huggingface_hub import hf_hub_download\n",
"\n",
"from PIL import Image\n",
"\n",
"processor = AutoProcessor.from_pretrained(\"microsoft/git-base-textvqa\")\n",
"\n",
"model = AutoModelForCausalLM.from_pretrained(\"microsoft/git-base-textvqa\")\n",
"\n",
"file_path = hf_hub_download(repo_id=\"nielsr/textvqa-sample\", filename=\"bus.png\", repo_type=\"dataset\")\n",
"\n",
"pixel_values = processor(images=image, return_tensors=\"pt\").pixel_values\n",
"\n",
"input_ids = processor(text=question, add_special_tokens=False).input_ids\n",
"\n",
"input_ids = [processor.tokenizer.cls_token_id] + input_ids\n",
"\n",
"input_ids = torch.tensor(input_ids).unsqueeze(0)\n",
"\n",
"generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)\n",
"\n",
"print(processor.batch_decode(generated_ids, skip_special_tokens=True))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
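Note on the two GIT cells above: GIT is a causal LM, so `generate()` returns the prompt tokens followed by the answer, which is why the decoded strings repeat the question ("how many living rooms are displayed on this floor plan? 8"). A minimal sketch of a helper that keeps only the newly generated tokens; `decode_answer` is hypothetical and not part of this commit, but it reuses the `input_ids` and `generated_ids` from the cells above:

```python
# Hypothetical helper (not in this commit): slice off the prompt tokens
# before decoding, so only the generated answer text remains.
def decode_answer(processor, input_ids, generated_ids):
    answer_ids = generated_ids[:, input_ids.shape[1]:]
    return processor.batch_decode(answer_ids, skip_special_tokens=True)[0].strip()
```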

Binary file not shown (59 KiB).

24 vqa/main.py Normal file

@@ -0,0 +1,24 @@
from vqa import Blip, MicrosoftGIT, PixStructDocVA, Vilt, Deplot, VQA
from PIL import Image
from typing import List
from questions import load_questions

image = Image.open("floorplans/46001_32532509_FLP_00_0000.jpeg")
questions = load_questions(False)

models: List[VQA] = [
    # Blip(),
    # Vilt(),
    # Deplot(),
    # PixStructDocVA(),
    MicrosoftGIT(),
]

for question, expected in questions.items():
    answers = {model.name: model.query(image, question) for model in models}
    print("# Question:", question)
    for model_name, answer in answers.items():
        print(f"{model_name}: {answer}")
    print("Expected:", expected)

2984 vqa/poetry.lock generated Normal file
File diff suppressed because it is too large.

21 vqa/pyproject.toml Normal file

@@ -0,0 +1,21 @@
[tool.poetry]
name = "vqa"
version = "0.1.0"
description = ""
authors = ["Kadir"]
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.12"
requests = "^2.31.0"
transformers = "^4.38.2"
pillow = "^10.2.0"
torch = "^2.2.1"
torchvision = "^0.17.1"
torchaudio = "^2.2.1"
jupyterlab = "^4.1.2"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

8 vqa/questions.json Normal file

@@ -0,0 +1,8 @@
{
"How many bedrooms?": "2",
"How many living rooms?":"1",
"How many reception rooms?":"1",
"How many bathrooms?": "1",
"What is the total square meter area?": "59.64",
"Which floor is it?": "5"
}

37 vqa/questions.py Normal file

@@ -0,0 +1,37 @@
import json


def load_questions(bigquestion=False):
    with open("questions.json") as f:
        qs: dict = json.load(f)
    if bigquestion:
        s = """
Extract information from this floor plan and return it as valid JSON in the format below. Only fill in a field if the information is visible on the floor plan. Do not convert square footage and square meters into each other, and do not multiply width by height when the area is not given. Instead, use null whenever the data is not explicitly displayed.
```
{
    "balcony": "boolean",
    "terrace": "boolean",
    "total_number_of_rooms": "number",
    "number_of_bathrooms": "number",
    "number_of_bedrooms": "number",
    "total_square_meter": "number", // if given
    "total_square_footage": "number", // if given
    "kitchen_separate": "boolean",
    "bedroom_faces_south_or_east": "boolean",
    // For each room I also want the following information
    "rooms": {
        "title": "e.g. bedroom 1",
        "type": "bedroom|livingroom|bathroom",
        "dimension_width_metric": "3", // in meters, if given
        "dimension_height_metric": "4", // in meters, if given
        "sqm": "number", // if given
        "dimension_width_foot": "5.5", // in feet, if given
        "dimension_height_foot": "7", // in feet, if given
    }
}
```
""".strip()
        qs[s] = "<not set>"
    return qs
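The "big question" prompt asks the model for a fenced JSON blob, so the reply has to be unfenced and parsed before it can be compared with anything. A minimal sketch, assuming the reply follows the prompt's ``` example; `parse_big_answer` is a made-up helper, not part of this commit:

```python
import json

# Hypothetical helper: strip the ``` fences the prompt asks for, then
# parse; return None when the model's reply is not valid JSON.
def parse_big_answer(reply: str):
    body = reply.strip().removeprefix("```json").removeprefix("```").removesuffix("```").strip()
    try:
        return json.loads(body)
    except json.JSONDecodeError:
        return None
```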

0 vqa/util.py Normal file

72 vqa/vqa.py Normal file

@@ -0,0 +1,72 @@
import torch
from transformers import BlipProcessor, BlipForQuestionAnswering
from transformers import ViltProcessor, ViltForQuestionAnswering
from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration
from transformers import AutoProcessor, AutoModelForCausalLM


class VQA:
    name = "Not defined"

    def query(self, image, question: str) -> str:
        raise NotImplementedError


class Blip(VQA):
    name = "Blip"

    def query(self, image, question):
        processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-capfilt-large")
        model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-capfilt-large")
        inputs = processor(image, question, return_tensors="pt")
        out = model.generate(max_new_tokens=50000, **inputs)
        return processor.decode(out[0], skip_special_tokens=True)


class Vilt(VQA):
    name = "Vilt"

    def query(self, image, question):
        processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
        model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
        # prepare inputs
        encoding = processor(image, question, return_tensors="pt")
        # forward pass: ViLT classifies over a fixed answer vocabulary
        outputs = model(**encoding)
        logits = outputs.logits
        idx = logits.argmax(-1).item()
        return model.config.id2label[idx]


class Deplot(VQA):
    name = "Deplot"

    def query(self, image, question):
        processor = Pix2StructProcessor.from_pretrained("google/deplot")
        model = Pix2StructForConditionalGeneration.from_pretrained("google/deplot")
        inputs = processor(images=image, text=question, return_tensors="pt")
        predictions = model.generate(**inputs, max_new_tokens=512)
        return processor.decode(predictions[0], skip_special_tokens=True)


class PixStructDocVA(VQA):
    name = "google/pix2struct-docvqa-large"

    def query(self, image, question):
        model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-docvqa-large")
        processor = Pix2StructProcessor.from_pretrained("google/pix2struct-docvqa-large")
        inputs = processor(images=image, text=question, return_tensors="pt")
        predictions = model.generate(**inputs, max_new_tokens=10000)
        return processor.decode(predictions[0], skip_special_tokens=True)


class MicrosoftGIT(VQA):
    name = "microsoft/git-base-textvqa"

    def query(self, image, question):
        processor = AutoProcessor.from_pretrained("microsoft/git-base-textvqa")
        model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-textvqa")
        pixel_values = processor(images=image, return_tensors="pt").pixel_values
        # GIT's VQA recipe (mirrors the notebook): tokenize without special
        # tokens, then prepend [CLS] so no trailing [SEP] precedes the answer
        input_ids = processor(text=question, add_special_tokens=False).input_ids
        input_ids = torch.tensor([processor.tokenizer.cls_token_id] + input_ids).unsqueeze(0)
        generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)
        return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
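One design observation on vqa.py: every `query()` call reloads its checkpoint from the Hub cache, so looping over the six questions in main.py loads the same weights six times. A sketch of the obvious refactor, loading once in `__init__`; the class name `BlipCached` is illustrative and not part of this commit:

```python
from transformers import BlipProcessor, BlipForQuestionAnswering

# Hypothetical refactor: keep processor and weights on the instance so
# query() only runs inference; main.py's loop then pays the load cost once.
class BlipCached(VQA):
    name = "Blip"

    def __init__(self):
        self.processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-capfilt-large")
        self.model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-capfilt-large")

    def query(self, image, question):
        inputs = self.processor(image, question, return_tensors="pt")
        out = self.model.generate(max_new_tokens=50, **inputs)
        return self.processor.decode(out[0], skip_special_tokens=True)
```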