wrongmove/vqa/Untitled.ipynb

301 lines
8.4 KiB
Text
Raw Normal View History

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "546cee84-249f-43ce-a1f9-8475682a833d",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/kadir/code/vqa/venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration\n",
"from PIL import Image\n"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "74e4a41f-2dfc-428e-8bca-4e9cc1c076c6",
"metadata": {},
"outputs": [],
"source": [
"# Shared inputs for the cells below: the question text and the floor-plan image.\n",
"question = \"How many living rooms are displayed on this floor plan?\"\n",
"image = Image.open(\"floorplans/46001_32532509_FLP_00_0000.jpeg\")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "c24b1560-563b-4ff2-8744-4591ac1cc57b",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [],
"source": [
"# Load the google/deplot checkpoint and generate output for `image` and `question`.\n",
"model = Pix2StructForConditionalGeneration.from_pretrained(\"google/deplot\")\n",
"processor = Pix2StructProcessor.from_pretrained(\"google/deplot\")\n",
"\n",
"inputs = processor(text=question, images=image, return_tensors=\"pt\")\n",
"predictions = model.generate(max_new_tokens=512, **inputs)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "ea990e1f-a660-4efb-be48-d095ab05b50d",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"'TITLE | <0x0A> Fifth Floor<0x0A>Floor Area 642 Sq Ft - 59.64 Sq M | Licence<0x0A>Remung<0x0A>Floor Area 642 Sq Ft - 59.64 Sq M <0x0A> Reception Room<0x0A>1210 x 12.10%<0x0A>39 x 39m | 1280 <0x0A> Balcony<0x0A>Reception Room<0x0A>1210 x 12.10%<0x0A>39 x 39m | 1280 <0x0A> Property<0x0A>Measurer | 1280 <0x0A> Ipaplus.com | 1280'"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Turn the first generated sequence back into text; the bare name displays it.\n",
"x = processor.decode(\n",
"    predictions[0],\n",
"    skip_special_tokens=True,\n",
")\n",
"x"
]
},
{
"cell_type": "markdown",
"id": "e0d94c3e-e8de-452a-8556-12dd2e3e2351",
"metadata": {},
"source": [
"# google/pix2struct-ai2d-base"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "cd57d77b-3d70-46f0-97cf-ddd37cd61d23",
"metadata": {},
"outputs": [],
"source": [
"# Retry with the google/pix2struct-ai2d-base checkpoint and a shorter question.\n",
"question = \"How many living rooms?\"\n",
"processor = Pix2StructProcessor.from_pretrained(\"google/pix2struct-ai2d-base\")\n",
"model = Pix2StructForConditionalGeneration.from_pretrained(\"google/pix2struct-ai2d-base\")\n",
"\n",
"inputs = processor(text=question, images=image, return_tensors=\"pt\")\n",
"predictions = model.generate(max_new_tokens=10000, **inputs)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "a507a4a4-33b6-4ca3-93dd-3aab489b5c24",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"predictions 1\n"
]
},
{
"data": {
"text/plain": [
"'floor plan is for illustrative purposes only and is not to scale. Every attempt has been made to ensure the accuracy of the floor plan shown, however all measurements, fixtures, fittings and data shown are an approximate interpretation for illustrative purposes only. 1 sq m + 10.76 sq feet. 7/8/2023>'"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Report how many sequences were generated, then display the first one decoded.\n",
"print(\"predictions: \", len(predictions))\n",
"x = processor.decode(\n",
"    predictions[0],\n",
"    skip_special_tokens=True,\n",
")\n",
"x"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a13e12c9-0406-4568-9a36-e61d70e1e683",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "4865a480-230d-48bc-be05-ddc54d107c79",
"metadata": {},
"source": [
"# google/pix2struct-docvqa-large"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "cd209233-ad64-4af0-a8ce-f1d640458d02",
"metadata": {},
"outputs": [],
"source": [
"# Same short question, this time against the google/pix2struct-docvqa-large checkpoint.\n",
"question = \"How many living rooms?\"\n",
"processor = Pix2StructProcessor.from_pretrained(\"google/pix2struct-docvqa-large\")\n",
"model = Pix2StructForConditionalGeneration.from_pretrained(\"google/pix2struct-docvqa-large\")\n",
"\n",
"inputs = processor(text=question, images=image, return_tensors=\"pt\")\n",
"predictions = model.generate(max_new_tokens=10000, **inputs)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "17378986-ae8b-4ab5-be40-837db0f7aaa6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"predictions: 1\n"
]
},
{
"data": {
"text/plain": [
"'2'"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Decode the first generated sequence and show it as the cell result.\n",
"x = processor.decode(\n",
"    predictions[0],\n",
"    skip_special_tokens=True,\n",
")\n",
"x"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "2d3f5a65-34e8-4c2e-ab37-03e6da86fc1c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['what is the total area measured in square meters? 8. 5']\n"
]
}
],
"source": [
"from transformers import AutoProcessor, AutoModelForCausalLM\n",
"from huggingface_hub import hf_hub_download\n",
"import torch\n",
"\n",
"# Ask microsoft/git-base-textvqa a question about the floor plan held in `image`.\n",
"processor = AutoProcessor.from_pretrained(\"microsoft/git-base-textvqa\")\n",
"model = AutoModelForCausalLM.from_pretrained(\"microsoft/git-base-textvqa\")\n",
"\n",
"question = \"What is the total area measured in square meters?\"\n",
"pixel_values = processor(images=image, return_tensors=\"pt\").pixel_values\n",
"\n",
"# Tokenize the question without special tokens, prepend the CLS token id by hand,\n",
"# then add a leading batch dimension.\n",
"input_ids = processor(text=question, add_special_tokens=False).input_ids\n",
"input_ids = [processor.tokenizer.cls_token_id] + input_ids\n",
"input_ids = torch.tensor(input_ids).unsqueeze(0)\n",
"\n",
"generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)\n",
"\n",
"# The decoded text contains the question followed by the generated answer.\n",
"print(processor.batch_decode(generated_ids, skip_special_tokens=True))"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "254054f0-0102-4927-a93c-c6eba97b437c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['how many living rooms are displayed on this floor plan? 8']\n"
]
}
],
"source": [
"from transformers import AutoProcessor, AutoModelForCausalLM\n",
"\n",
"from huggingface_hub import hf_hub_download\n",
"\n",
"from PIL import Image\n",
"\n",
"# Run microsoft/git-base-textvqa on the living-room question.\n",
"# `image` and `torch` are provided by earlier cells.\n",
"processor = AutoProcessor.from_pretrained(\"microsoft/git-base-textvqa\")\n",
"model = AutoModelForCausalLM.from_pretrained(\"microsoft/git-base-textvqa\")\n",
"\n",
"# Set the question here rather than inheriting whatever the last-run cell left behind,\n",
"# so a top-to-bottom run asks the living-room question.\n",
"question = \"How many living rooms are displayed on this floor plan?\"\n",
"pixel_values = processor(images=image, return_tensors=\"pt\").pixel_values\n",
"\n",
"# Tokenize the question without special tokens, prepend the CLS token id by hand,\n",
"# then add a leading batch dimension.\n",
"input_ids = processor(text=question, add_special_tokens=False).input_ids\n",
"input_ids = [processor.tokenizer.cls_token_id] + input_ids\n",
"input_ids = torch.tensor(input_ids).unsqueeze(0)\n",
"\n",
"generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)\n",
"\n",
"# The decoded text contains the question followed by the generated answer.\n",
"print(processor.batch_decode(generated_ids, skip_special_tokens=True))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}