merging the visual question answering with the crawler. Monorepo, go!
parent 85686a8b24
commit e2f7998ee9
32 changed files with 3449 additions and 0 deletions
0  poetry.lock → crawler/poetry.lock (generated)
3  vqa/.gitignore (vendored, new file)
@@ -0,0 +1,3 @@
venv/
__pycache__/
.ipynb_checkpoints/
300  vqa/Untitled.ipynb (new file)
@@ -0,0 +1,300 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "546cee84-249f-43ce-a1f9-8475682a833d",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/kadir/code/vqa/venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      " from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration\n",
    "from PIL import Image\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "74e4a41f-2dfc-428e-8bca-4e9cc1c076c6",
   "metadata": {},
   "outputs": [],
   "source": [
    "image = Image.open(\"floorplans/46001_32532509_FLP_00_0000.jpeg\")\n",
    "question = \"How many living rooms are displayed on this floor plan?\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "c24b1560-563b-4ff2-8744-4591ac1cc57b",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "processor = Pix2StructProcessor.from_pretrained('google/deplot')\n",
    "model = Pix2StructForConditionalGeneration.from_pretrained('google/deplot')\n",
    "\n",
    "inputs = processor(images=image, text=question, return_tensors=\"pt\")\n",
    "predictions = model.generate(**inputs, max_new_tokens=512)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "ea990e1f-a660-4efb-be48-d095ab05b50d",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'TITLE | <0x0A> Fifth Floor<0x0A>Floor Area 642 Sq Ft - 59.64 Sq M | Licence<0x0A>Remung<0x0A>Floor Area 642 Sq Ft - 59.64 Sq M <0x0A> Reception Room<0x0A>1210 x 12.10%<0x0A>39 x 39m | 1280 <0x0A> Balcony<0x0A>Reception Room<0x0A>1210 x 12.10%<0x0A>39 x 39m | 1280 <0x0A> Property<0x0A>Measurer | 1280 <0x0A> Ipaplus.com | 1280'"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x = processor.decode(predictions[0], skip_special_tokens=True)\n",
    "x"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e0d94c3e-e8de-452a-8556-12dd2e3e2351",
   "metadata": {},
   "source": [
    "# google/pix2struct-ai2d-base"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "cd57d77b-3d70-46f0-97cf-ddd37cd61d23",
   "metadata": {},
   "outputs": [],
   "source": [
    "question = \"How many living rooms?\"\n",
    "model = Pix2StructForConditionalGeneration.from_pretrained(\"google/pix2struct-ai2d-base\")\n",
    "processor = Pix2StructProcessor.from_pretrained(\"google/pix2struct-ai2d-base\")\n",
    "\n",
    "inputs = processor(images=image, text=question, return_tensors=\"pt\")\n",
    "predictions = model.generate(**inputs, max_new_tokens=10000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "a507a4a4-33b6-4ca3-93dd-3aab489b5c24",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "predictions 1\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'floor plan is for illustrative purposes only and is not to scale. Every attempt has been made to ensure the accuracy of the floor plan shown, however all measurements, fixtures, fittings and data shown are an approximate interpretation for illustrative purposes only. 1 sq m + 10.76 sq feet. 7/8/2023>'"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x = processor.decode(predictions[0], skip_special_tokens=True)\n",
    "print(\"predictions: \", len(predictions))\n",
    "x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a13e12c9-0406-4568-9a36-e61d70e1e683",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "4865a480-230d-48bc-be05-ddc54d107c79",
   "metadata": {},
   "source": [
    "# pix2struct-docvqa-large"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "cd209233-ad64-4af0-a8ce-f1d640458d02",
   "metadata": {},
   "outputs": [],
   "source": [
    "question = \"How many living rooms?\"\n",
    "model = Pix2StructForConditionalGeneration.from_pretrained(\"google/pix2struct-docvqa-large\")\n",
    "processor = Pix2StructProcessor.from_pretrained(\"google/pix2struct-docvqa-large\")\n",
    "\n",
    "inputs = processor(images=image, text=question, return_tensors=\"pt\")\n",
    "predictions = model.generate(**inputs, max_new_tokens=10000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "17378986-ae8b-4ab5-be40-837db0f7aaa6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "predictions: 1\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'2'"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x = processor.decode(predictions[0], skip_special_tokens=True)\n",
    "x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "2d3f5a65-34e8-4c2e-ab37-03e6da86fc1c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['what is the total area measured in square meters? 8. 5']\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoProcessor, AutoModelForCausalLM\n",
    "from huggingface_hub import hf_hub_download\n",
    "import torch\n",
    "\n",
    "processor = AutoProcessor.from_pretrained(\"microsoft/git-base-textvqa\")\n",
    "model = AutoModelForCausalLM.from_pretrained(\"microsoft/git-base-textvqa\")\n",
    "file_path = hf_hub_download(repo_id=\"nielsr/textvqa-sample\", filename=\"bus.png\", repo_type=\"dataset\")\n",
    "\n",
    "pixel_values = processor(images=image, return_tensors=\"pt\").pixel_values\n",
    "question = \"What is the total area measured in square meters?\"\n",
    "\n",
    "input_ids = processor(text=question, add_special_tokens=False).input_ids\n",
    "\n",
    "input_ids = [processor.tokenizer.cls_token_id] + input_ids\n",
    "\n",
    "input_ids = torch.tensor(input_ids).unsqueeze(0)\n",
    "\n",
    "generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)\n",
    "\n",
    "print(processor.batch_decode(generated_ids, skip_special_tokens=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "254054f0-0102-4927-a93c-c6eba97b437c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['how many living rooms are displayed on this floor plan? 8']\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoProcessor, AutoModelForCausalLM\n",
    "\n",
    "from huggingface_hub import hf_hub_download\n",
    "\n",
    "from PIL import Image\n",
    "\n",
    "processor = AutoProcessor.from_pretrained(\"microsoft/git-base-textvqa\")\n",
    "\n",
    "model = AutoModelForCausalLM.from_pretrained(\"microsoft/git-base-textvqa\")\n",
    "\n",
    "file_path = hf_hub_download(repo_id=\"nielsr/textvqa-sample\", filename=\"bus.png\", repo_type=\"dataset\")\n",
    "\n",
    "pixel_values = processor(images=image, return_tensors=\"pt\").pixel_values\n",
    "\n",
    "input_ids = processor(text=question, add_special_tokens=False).input_ids\n",
    "\n",
    "input_ids = [processor.tokenizer.cls_token_id] + input_ids\n",
    "\n",
    "input_ids = torch.tensor(input_ids).unsqueeze(0)\n",
    "\n",
    "generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)\n",
    "\n",
    "print(processor.batch_decode(generated_ids, skip_special_tokens=True))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
BIN  vqa/floorplans/46001_32532509_FLP_00_0000.jpeg (new file)
Binary file not shown (after: 59 KiB).
24  vqa/main.py (new file)
@@ -0,0 +1,24 @@
from vqa import Blip, MicrosoftGIT, PixStructDocVA, Vilt, Deplot, VQA
from PIL import Image
from typing import List
from questions import load_questions

image = Image.open("floorplans/46001_32532509_FLP_00_0000.jpeg")

questions = load_questions(False)

models: List[VQA] = [
    # Blip(),
    # Vilt(),
    # Deplot(),
    # PixStructDocVA(),
    MicrosoftGIT(),
]

# Rename the loop variable to `expected` so the inner loop does not shadow it.
for question, expected in questions.items():
    answers = {model.name: model.query(image, question) for model in models}

    print("# Question:", question)
    for model_name, model_answer in answers.items():
        print(f"{model_name}: {model_answer}")
    print("Expected:", expected)
2984  vqa/poetry.lock (generated, new file)
File diff suppressed because it is too large.
21  vqa/pyproject.toml (new file)
@@ -0,0 +1,21 @@
[tool.poetry]
name = "vqa"
version = "0.1.0"
description = ""
authors = ["Kadir"]
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.12"
requests = "^2.31.0"
transformers = "^4.38.2"
pillow = "^10.2.0"
torch = "^2.2.1"
torchvision = "^0.17.1"
torchaudio = "^2.2.1"
jupyterlab = "^4.1.2"


[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
8  vqa/questions.json (new file)
@@ -0,0 +1,8 @@
{
    "How many bedrooms?": "2",
    "How many living rooms?": "1",
    "How many reception rooms?": "1",
    "How many bathrooms?": "1",
    "What is the total square meter area?": "59.64",
    "Which floor is it?": "5"
}
37  vqa/questions.py (new file)
@@ -0,0 +1,37 @@
import json


def load_questions(bigquestion=False):
    with open("questions.json") as f:
        qs: dict = json.load(f)

    if bigquestion:
        s = """
Extract the information from this floor plan as valid JSON, following the schema below. Only fill in a field if the information is visible on the floor plan. Do not convert square footage and square meters into each other, and do not multiply width by height when the area is not given. Instead, use null whenever the data is not explicitly displayed.
```
{
    "balcony": "boolean",
    "terrace": "boolean",
    "total_number_of_rooms": "number",
    "number_of_bathrooms": "number",
    "number_of_bedrooms": "number",
    "total_square_meter": "number", // if described
    "total_square_footage": "number", // if described
    "kitchen_separate": "boolean",
    "bedroom_faces_south_or_east": "boolean",
    // For each room I also want the following information
    "rooms": {
        "title": "e.g. bedroom 1",
        "type": "bedroom|livingroom|bathroom",
        "dimension_width_metric": "3", // in meters, if present
        "dimension_height_metric": "4", // in meters, if present
        "sqm": "number", // if present
        "dimension_width_foot": "5.5", // in feet, if present
        "dimension_height_foot": "7", // in feet, if present
    }
}
```
""".strip()
        qs[s] = "<not set>"

    return qs
0  vqa/util.py (new file)
72  vqa/vqa.py (new file)
@@ -0,0 +1,72 @@
import torch
from transformers import BlipProcessor, BlipForQuestionAnswering
from transformers import ViltProcessor, ViltForQuestionAnswering
from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration
from transformers import AutoProcessor, AutoModelForCausalLM


class VQA:
    name = "Not defined"

    def query(self, image, question: str) -> str:
        raise NotImplementedError


class Blip(VQA):
    name = "Blip"

    def query(self, image, question):
        processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-capfilt-large")
        model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-capfilt-large")
        inputs = processor(image, question, return_tensors="pt")
        out = model.generate(max_new_tokens=50000, **inputs)
        return processor.decode(out[0], skip_special_tokens=True)


class Vilt(VQA):
    name = "Vilt"

    def query(self, image, question):
        processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
        model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

        # prepare inputs
        encoding = processor(image, question, return_tensors="pt")

        # forward pass: ViLT classifies over a fixed answer vocabulary
        outputs = model(**encoding)
        logits = outputs.logits
        idx = logits.argmax(-1).item()
        return model.config.id2label[idx]


class Deplot(VQA):
    name = "Deplot"

    def query(self, image, question):
        processor = Pix2StructProcessor.from_pretrained('google/deplot')
        model = Pix2StructForConditionalGeneration.from_pretrained('google/deplot')

        inputs = processor(images=image, text=question, return_tensors="pt")
        predictions = model.generate(**inputs, max_new_tokens=512)
        return processor.decode(predictions[0], skip_special_tokens=True)


class PixStructDocVA(VQA):
    name = "google/pix2struct-docvqa-large"

    def query(self, image, question):
        model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-docvqa-large")
        processor = Pix2StructProcessor.from_pretrained("google/pix2struct-docvqa-large")

        inputs = processor(images=image, text=question, return_tensors="pt")
        predictions = model.generate(**inputs, max_new_tokens=10000)
        return processor.decode(predictions[0], skip_special_tokens=True)


class MicrosoftGIT(VQA):
    name = "microsoft/git-base-textvqa"

    def query(self, image, question):
        # GIT needs a causal LM head to generate; GitVisionModel alone cannot answer questions
        processor = AutoProcessor.from_pretrained("microsoft/git-base-textvqa")
        model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-textvqa")

        pixel_values = processor(images=image, return_tensors="pt").pixel_values

        # tokenize the question and prepend the [CLS] token, as in the notebook
        input_ids = processor(text=question, add_special_tokens=False).input_ids
        input_ids = torch.tensor([processor.tokenizer.cls_token_id] + input_ids).unsqueeze(0)

        generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)
        return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]