wrongmove/vqa/Untitled.ipynb

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "546cee84-249f-43ce-a1f9-8475682a833d",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/kadir/code/realestate/vqa/venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration\n",
"from PIL import Image\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "74e4a41f-2dfc-428e-8bca-4e9cc1c076c6",
"metadata": {},
"outputs": [],
"source": [
"image = Image.open(\"floorplans/46001_32532509_FLP_00_0000.jpeg\")\n",
"question = \"How many living rooms are displayed on this floor plan?\""
]
},
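{
"cell_type": "markdown",
"id": "0a1b2c3d-4e5f-4678-9abc-def012345678",
"metadata": {},
"source": [
"# google/deplot"
]
},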
{
"cell_type": "code",
"execution_count": 3,
"id": "c24b1560-563b-4ff2-8744-4591ac1cc57b",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [],
"source": [
"processor = Pix2StructProcessor.from_pretrained('google/deplot')\n",
"model = Pix2StructForConditionalGeneration.from_pretrained('google/deplot')\n",
"\n",
"inputs = processor(images=image, text=question, return_tensors=\"pt\")\n",
"predictions = model.generate(**inputs, max_new_tokens=512)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "ea990e1f-a660-4efb-be48-d095ab05b50d",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TITLE | <0x0A> Fifth Floor<0x0A>Floor Area 642 Sq Ft - 59.64 Sq M | LUX<0x0A>Floor Area 642 Sq Ft - 59.64 Sq M<0x0A>Reception Room<0x0A>1210×1210<0x0A>3.9×3.9m | 12.10×1210<0x0A>Reception Room<0x0A>3.9×3.9m | 12.10×1210<0x0A>Balcony<0x0A>1.5×4.5 | 1.5×4.5 <0x0A> Bathroom<0x0A>131°×102°<0x0A>4.0×3.1m | 1.40×2.7m | 1.40×2.7m <0x0A> Kitchen<0x0A>102°×710°<0x0A>3.1×2.4m | 1.10×3.1m | 1.00×3.1m <0x0A> Bedroom<0x0A>131°×810<0x0A>4.0×2.7m | 1.00×3.1m | 1.00×3.1m <0x0A> Kitchen<0x0A>102°×710<0x0A>3.1×2.4m | 1.00×3.1m | 1.00×3.1m <0x0A> Reception Room<0x0A>1210×1210<0x0A>3.9×3.9m | 1.50×4.5 | 1.00×3.5m <0x0A> Balcony<0x0A>1.5×4.5 | 1.50×4.5 | 1.00×3.5m\n"
]
}
],
"source": [
"x = processor.decode(predictions[0], skip_special_tokens=True)\n",
"print(x)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "b4a8b5fb-7cc5-441d-ad60-cff7c03a103e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TITLE | \n",
" Fifth Floor\n",
"Floor Area 642 Sq Ft - 59.64 Sq M | LUX\n",
"Floor Area 642 Sq Ft - 59.64 Sq M\n",
"Reception Room\n",
"1210×1210\n",
"3.9×3.9m | 12.10×1210\n",
"Reception Room\n",
"3.9×3.9m | 12.10×1210\n",
"Balcony\n",
"1.5×4.5 | 1.5×4.5 \n",
" Bathroom\n",
"131°×102°\n",
"4.0×3.1m | 1.40×2.7m | 1.40×2.7m \n",
" Kitchen\n",
"102°×710°\n",
"3.1×2.4m | 1.10×3.1m | 1.00×3.1m \n",
" Bedroom\n",
"131°×810\n",
"4.0×2.7m | 1.00×3.1m | 1.00×3.1m \n",
" Kitchen\n",
"102°×710\n",
"3.1×2.4m | 1.00×3.1m | 1.00×3.1m \n",
" Reception Room\n",
"1210×1210\n",
"3.9×3.9m | 1.50×4.5 | 1.00×3.5m \n",
" Balcony\n",
"1.5×4.5 | 1.50×4.5 | 1.00×3.5m\n"
]
}
],
"source": [
"x = r'\\d+.\\d*'"
]
},
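{
"cell_type": "code",
"execution_count": null,
"id": "f1c2d3e4-0a1b-4c2d-8e3f-a0b1c2d3e4f5",
"metadata": {},
"outputs": [],
"source": [
"# Quick sketch, not part of the original run: pull the decimal figures (room dimensions,\n",
"# floor area) out of the decoded DePlot string. Assumes `x` still holds the output decoded above.\n",
"import re\n",
"\n",
"dimensions = re.findall(r'\\d+\\.\\d*', x)\n",
"dimensions[:10]"
]
},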
{
"cell_type": "markdown",
"id": "e0d94c3e-e8de-452a-8556-12dd2e3e2351",
"metadata": {},
"source": [
"# google/pix2struct-ai2d-base"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "cd57d77b-3d70-46f0-97cf-ddd37cd61d23",
"metadata": {},
"outputs": [],
"source": [
"question = \"How many living rooms?\"\n",
"model = Pix2StructForConditionalGeneration.from_pretrained(\"google/pix2struct-ai2d-base\")\n",
"processor = Pix2StructProcessor.from_pretrained(\"google/pix2struct-ai2d-base\")\n",
"\n",
"inputs = processor(images=image, text=question, return_tensors=\"pt\")\n",
"predictions = model.generate(**inputs, max_new_tokens=10000)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "a507a4a4-33b6-4ca3-93dd-3aab489b5c24",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"predictions 1\n"
]
},
{
"data": {
"text/plain": [
"'floor plan is for illustrative purposes only and is not to scale. Every attempt has been made to ensure the accuracy of the floor plan shown, however all measurements, fixtures, fittings and data shown are an approximate interpretation for illustrative purposes only. 1 sq m + 10.76 sq feet. 7/8/2023>'"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x = processor.decode(predictions[0], skip_special_tokens=True)\n",
"print(\"predictions: \", len(predictions))\n",
"x"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a13e12c9-0406-4568-9a36-e61d70e1e683",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "4865a480-230d-48bc-be05-ddc54d107c79",
"metadata": {},
"source": [
"# pix2struct-docvqa-large"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "cd209233-ad64-4af0-a8ce-f1d640458d02",
"metadata": {},
"outputs": [],
"source": [
"question = \"How many living rooms?\"\n",
"model = Pix2StructForConditionalGeneration.from_pretrained(\"google/pix2struct-docvqa-large\")\n",
"processor = Pix2StructProcessor.from_pretrained(\"google/pix2struct-docvqa-large\")\n",
"\n",
"inputs = processor(images=image, text=question, return_tensors=\"pt\")\n",
"predictions = model.generate(**inputs, max_new_tokens=10000)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "17378986-ae8b-4ab5-be40-837db0f7aaa6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"predictions: 1\n"
]
},
{
"data": {
"text/plain": [
"'2'"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x = processor.decode(predictions[0], skip_special_tokens=True)\n",
"x"
]
},
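{
"cell_type": "code",
"execution_count": null,
"id": "b7a6c5d4-3e2f-4a1b-9c8d-0f1e2d3c4b5a",
"metadata": {},
"outputs": [],
"source": [
"# Quick sketch, not part of the original run: ask the same question for a handful of\n",
"# floor plans with the pix2struct-docvqa-large model loaded above and collect the answers\n",
"# in a pandas DataFrame (pd is imported in the first cell). Assumes the images sit in floorplans/.\n",
"from glob import glob\n",
"\n",
"rows = []\n",
"for path in sorted(glob(\"floorplans/*.jpeg\"))[:5]:\n",
"    img = Image.open(path)\n",
"    batch = processor(images=img, text=question, return_tensors=\"pt\")\n",
"    out_ids = model.generate(**batch, max_new_tokens=32)\n",
"    rows.append({\"image\": path, \"answer\": processor.decode(out_ids[0], skip_special_tokens=True)})\n",
"\n",
"pd.DataFrame(rows)"
]
},
{
"cell_type": "markdown",
"id": "c8d9e0f1-2a3b-4c5d-8e6f-102030405060",
"metadata": {},
"source": [
"# microsoft/git-base-textvqa"
]
},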
{
"cell_type": "code",
"execution_count": 39,
"id": "2d3f5a65-34e8-4c2e-ab37-03e6da86fc1c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['what is the total area measured in square meters? 8. 5']\n"
]
}
],
"source": [
"from transformers import AutoProcessor, AutoModelForCausalLM\n",
"from huggingface_hub import hf_hub_download\n",
"import torch\n",
"\n",
"processor = AutoProcessor.from_pretrained(\"microsoft/git-base-textvqa\")\n",
"model = AutoModelForCausalLM.from_pretrained(\"microsoft/git-base-textvqa\")\n",
"file_path = hf_hub_download(repo_id=\"nielsr/textvqa-sample\", filename=\"bus.png\", repo_type=\"dataset\")\n",
"\n",
"pixel_values = processor(images=image, return_tensors=\"pt\").pixel_values\n",
"question = \"What is the total area measured in square meters?\"\n",
"\n",
"input_ids = processor(text=question, add_special_tokens=False).input_ids\n",
"\n",
"input_ids = [processor.tokenizer.cls_token_id] + input_ids\n",
"\n",
"input_ids = torch.tensor(input_ids).unsqueeze(0)\n",
"\n",
"generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)\n",
"\n",
"print(processor.batch_decode(generated_ids, skip_special_tokens=True))"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "254054f0-0102-4927-a93c-c6eba97b437c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['how many living rooms are displayed on this floor plan? 8']\n"
]
}
],
"source": [
"from transformers import AutoProcessor, AutoModelForCausalLM\n",
"\n",
"from huggingface_hub import hf_hub_download\n",
"\n",
"from PIL import Image\n",
"\n",
"processor = AutoProcessor.from_pretrained(\"microsoft/git-base-textvqa\")\n",
"\n",
"model = AutoModelForCausalLM.from_pretrained(\"microsoft/git-base-textvqa\")\n",
"\n",
"file_path = hf_hub_download(repo_id=\"nielsr/textvqa-sample\", filename=\"bus.png\", repo_type=\"dataset\")\n",
"\n",
"pixel_values = processor(images=image, return_tensors=\"pt\").pixel_values\n",
"\n",
"input_ids = processor(text=question, add_special_tokens=False).input_ids\n",
"\n",
"input_ids = [processor.tokenizer.cls_token_id] + input_ids\n",
"\n",
"input_ids = torch.tensor(input_ids).unsqueeze(0)\n",
"\n",
"generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)\n",
"\n",
"print(processor.batch_decode(generated_ids, skip_special_tokens=True))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8b5b4fd0-1152-4834-9e3f-89be944bad16",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "4fef9f95-22c3-4fd9-907d-471520494533",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "1e56a1f7-c10f-4169-a7c1-a862937c22ba",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "b6eec969-ebd6-4dac-8ca6-17569a9a3f8a",
"metadata": {},
"source": [
"# asd"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "7c0a2360-5de2-475b-9d10-d84bb5bede17",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"a diagram of a floor plan of a two bedroom apartment\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/kadir/code/realestate/vqa/venv/lib/python3.12/site-packages/transformers/generation/configuration_utils.py:410: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `1.2` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"this is an image of a floor plan of a two bedroom apartment\n"
]
}
],
"source": [
"from transformers import BlipProcessor, BlipForConditionalGeneration\n",
"\n",
"processor = BlipProcessor.from_pretrained(\"Salesforce/blip-image-captioning-large\")\n",
"model = BlipForConditionalGeneration.from_pretrained(\"Salesforce/blip-image-captioning-large\")\n",
"\n",
"# conditional image captioning\n",
"text = \"\"\n",
"inputs = processor(image, text, return_tensors=\"pt\")\n",
"\n",
"out = model.generate(**inputs)\n",
"print(processor.decode(out[0], skip_special_tokens=True))\n",
"\n",
"# unconditional image captioning\n",
"inputs = processor(image, return_tensors=\"pt\")\n",
"\n",
"out = model.generate(**inputs)\n",
"print(processor.decode(out[0], skip_special_tokens=True))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9f5a7d57-30f3-4b1b-bc07-dbf076d41792",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "41c23bf0-cf64-4c71-9bc0-57f78af609b4",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}