{
 "cells": [
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 1,
|
||
|
|
"id": "546cee84-249f-43ce-a1f9-8475682a833d",
|
||
|
|
"metadata": {
|
||
|
|
"editable": true,
|
||
|
|
"slideshow": {
|
||
|
|
"slide_type": ""
|
||
|
|
},
|
||
|
|
"tags": []
|
||
|
|
},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"name": "stderr",
|
||
|
|
"output_type": "stream",
|
||
|
|
"text": [
|
||
|
|
"/Users/kadir/code/vqa/venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
||
|
|
" from .autonotebook import tqdm as notebook_tqdm\n"
|
||
|
|
]
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
|
"from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration\n",
|
||
|
|
"from PIL import Image\n"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 45,
|
||
|
|
"id": "74e4a41f-2dfc-428e-8bca-4e9cc1c076c6",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"image = Image.open(\"floorplans/46001_32532509_FLP_00_0000.jpeg\")\n",
|
||
|
|
"question = \"How many living rooms are displayed on this floor plan?\""
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 17,
|
||
|
|
"id": "c24b1560-563b-4ff2-8744-4591ac1cc57b",
|
||
|
|
"metadata": {
|
||
|
|
"editable": true,
|
||
|
|
"slideshow": {
|
||
|
|
"slide_type": ""
|
||
|
|
},
|
||
|
|
"tags": []
|
||
|
|
},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"processor = Pix2StructProcessor.from_pretrained('google/deplot')\n",
|
||
|
|
"model = Pix2StructForConditionalGeneration.from_pretrained('google/deplot')\n",
|
||
|
|
"\n",
|
||
|
|
"inputs = processor(images=image, text=question, return_tensors=\"pt\")\n",
|
||
|
|
"predictions = model.generate(**inputs, max_new_tokens=512)"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 19,
|
||
|
|
"id": "ea990e1f-a660-4efb-be48-d095ab05b50d",
|
||
|
|
"metadata": {
|
||
|
|
"editable": true,
|
||
|
|
"slideshow": {
|
||
|
|
"slide_type": ""
|
||
|
|
},
|
||
|
|
"tags": []
|
||
|
|
},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"data": {
|
||
|
|
"text/plain": [
|
||
|
|
"'TITLE | <0x0A> Fifth Floor<0x0A>Floor Area 642 Sq Ft - 59.64 Sq M | Licence<0x0A>Remung<0x0A>Floor Area 642 Sq Ft - 59.64 Sq M <0x0A> Reception Room<0x0A>1210 x 12.10%<0x0A>39 x 39m | 1280 <0x0A> Balcony<0x0A>Reception Room<0x0A>1210 x 12.10%<0x0A>39 x 39m | 1280 <0x0A> Property<0x0A>Measurer | 1280 <0x0A> Ipaplus.com | 1280'"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
"execution_count": 19,
|
||
|
|
"metadata": {},
|
||
|
|
"output_type": "execute_result"
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
|
"x = processor.decode(predictions[0], skip_special_tokens=True)\n",
|
||
|
|
"x"
|
||
|
|
]
|
||
|
|
},
  {
   "cell_type": "markdown",
   "id": "e0d94c3e-e8de-452a-8556-12dd2e3e2351",
   "metadata": {},
   "source": [
    "# google/pix2struct-ai2d-base"
   ]
  },
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 23,
|
||
|
|
"id": "cd57d77b-3d70-46f0-97cf-ddd37cd61d23",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"question = \"How many living rooms?\"\n",
|
||
|
|
"model = Pix2StructForConditionalGeneration.from_pretrained(\"google/pix2struct-ai2d-base\")\n",
|
||
|
|
"processor = Pix2StructProcessor.from_pretrained(\"google/pix2struct-ai2d-base\")\n",
|
||
|
|
"\n",
|
||
|
|
"inputs = processor(images=image, text=question, return_tensors=\"pt\")\n",
|
||
|
|
"predictions = model.generate(**inputs, max_new_tokens=10000)"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 26,
|
||
|
|
"id": "a507a4a4-33b6-4ca3-93dd-3aab489b5c24",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"name": "stdout",
|
||
|
|
"output_type": "stream",
|
||
|
|
"text": [
|
||
|
|
"predictions 1\n"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"data": {
|
||
|
|
"text/plain": [
|
||
|
|
"'floor plan is for illustrative purposes only and is not to scale. Every attempt has been made to ensure the accuracy of the floor plan shown, however all measurements, fixtures, fittings and data shown are an approximate interpretation for illustrative purposes only. 1 sq m + 10.76 sq feet. 7/8/2023>'"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
"execution_count": 26,
|
||
|
|
"metadata": {},
|
||
|
|
"output_type": "execute_result"
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
|
"x = processor.decode(predictions[0], skip_special_tokens=True)\n",
|
||
|
|
"print(\"predictions: \", len(predictions))\n",
|
||
|
|
"x"
|
||
|
|
]
|
||
|
|
},
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a13e12c9-0406-4568-9a36-e61d70e1e683",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "4865a480-230d-48bc-be05-ddc54d107c79",
   "metadata": {},
   "source": [
    "# pix2struct-docvqa-large"
   ]
  },
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 27,
|
||
|
|
"id": "cd209233-ad64-4af0-a8ce-f1d640458d02",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"question = \"How many living rooms?\"\n",
|
||
|
|
"model = Pix2StructForConditionalGeneration.from_pretrained(\"google/pix2struct-docvqa-large\")\n",
|
||
|
|
"processor = Pix2StructProcessor.from_pretrained(\"google/pix2struct-docvqa-large\")\n",
|
||
|
|
"\n",
|
||
|
|
"inputs = processor(images=image, text=question, return_tensors=\"pt\")\n",
|
||
|
|
"predictions = model.generate(**inputs, max_new_tokens=10000)"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 28,
|
||
|
|
"id": "17378986-ae8b-4ab5-be40-837db0f7aaa6",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"name": "stdout",
|
||
|
|
"output_type": "stream",
|
||
|
|
"text": [
|
||
|
|
"predictions: 1\n"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"data": {
|
||
|
|
"text/plain": [
|
||
|
|
"'2'"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
"execution_count": 28,
|
||
|
|
"metadata": {},
|
||
|
|
"output_type": "execute_result"
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
|
"x = processor.decode(predictions[0], skip_special_tokens=True)\n",
|
||
|
|
"x"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 39,
|
||
|
|
"id": "2d3f5a65-34e8-4c2e-ab37-03e6da86fc1c",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"name": "stdout",
|
||
|
|
"output_type": "stream",
|
||
|
|
"text": [
|
||
|
|
"['what is the total area measured in square meters? 8. 5']\n"
|
||
|
|
]
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
|
"from transformers import AutoProcessor, AutoModelForCausalLM\n",
|
||
|
|
"from huggingface_hub import hf_hub_download\n",
|
||
|
|
"import torch\n",
|
||
|
|
"\n",
|
||
|
|
"processor = AutoProcessor.from_pretrained(\"microsoft/git-base-textvqa\")\n",
|
||
|
|
"model = AutoModelForCausalLM.from_pretrained(\"microsoft/git-base-textvqa\")\n",
|
||
|
|
"file_path = hf_hub_download(repo_id=\"nielsr/textvqa-sample\", filename=\"bus.png\", repo_type=\"dataset\")\n",
|
||
|
|
"\n",
|
||
|
|
"pixel_values = processor(images=image, return_tensors=\"pt\").pixel_values\n",
|
||
|
|
"question = \"What is the total area measured in square meters?\"\n",
|
||
|
|
"\n",
|
||
|
|
"input_ids = processor(text=question, add_special_tokens=False).input_ids\n",
|
||
|
|
"\n",
|
||
|
|
"input_ids = [processor.tokenizer.cls_token_id] + input_ids\n",
|
||
|
|
"\n",
|
||
|
|
"input_ids = torch.tensor(input_ids).unsqueeze(0)\n",
|
||
|
|
"\n",
|
||
|
|
"generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)\n",
|
||
|
|
"\n",
|
||
|
|
"print(processor.batch_decode(generated_ids, skip_special_tokens=True))"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 46,
|
||
|
|
"id": "254054f0-0102-4927-a93c-c6eba97b437c",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"name": "stdout",
|
||
|
|
"output_type": "stream",
|
||
|
|
"text": [
|
||
|
|
"['how many living rooms are displayed on this floor plan? 8']\n"
|
||
|
|
]
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
|
"from transformers import AutoProcessor, AutoModelForCausalLM\n",
|
||
|
|
"\n",
|
||
|
|
"from huggingface_hub import hf_hub_download\n",
|
||
|
|
"\n",
|
||
|
|
"from PIL import Image\n",
|
||
|
|
"\n",
|
||
|
|
"processor = AutoProcessor.from_pretrained(\"microsoft/git-base-textvqa\")\n",
|
||
|
|
"\n",
|
||
|
|
"model = AutoModelForCausalLM.from_pretrained(\"microsoft/git-base-textvqa\")\n",
|
||
|
|
"\n",
|
||
|
|
"file_path = hf_hub_download(repo_id=\"nielsr/textvqa-sample\", filename=\"bus.png\", repo_type=\"dataset\")\n",
|
||
|
|
"\n",
|
||
|
|
"pixel_values = processor(images=image, return_tensors=\"pt\").pixel_values\n",
|
||
|
|
"\n",
|
||
|
|
"input_ids = processor(text=question, add_special_tokens=False).input_ids\n",
|
||
|
|
"\n",
|
||
|
|
"input_ids = [processor.tokenizer.cls_token_id] + input_ids\n",
|
||
|
|
"\n",
|
||
|
|
"input_ids = torch.tensor(input_ids).unsqueeze(0)\n",
|
||
|
|
"\n",
|
||
|
|
"generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)\n",
|
||
|
|
"\n",
|
||
|
|
"print(processor.batch_decode(generated_ids, skip_special_tokens=True))"
|
||
|
|
]
|
||
|
|
}
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}