{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "546cee84-249f-43ce-a1f9-8475682a833d",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/kadir/code/realestate/vqa/venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration\n",
"from PIL import Image\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "74e4a41f-2dfc-428e-8bca-4e9cc1c076c6",
"metadata": {},
"outputs": [],
"source": [
"# image = Image.open(\"floorplans/46001_32532509_FLP_00_0000.jpeg\")\n",
"image = Image.open(\"/Users/kadir/code/realestate/crawler/data/floorplans/15508_EnfieldRd_FLP_02_0000.jpg\")\n",
"question = \"How many living rooms are displayed on this floor plan?\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "c24b1560-563b-4ff2-8744-4591ac1cc57b",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [],
"source": [
"processor = Pix2StructProcessor.from_pretrained('google/deplot')\n",
"model = Pix2StructForConditionalGeneration.from_pretrained('google/deplot')\n",
"\n",
"inputs = processor(images=image, text=question, return_tensors=\"pt\")\n",
"predictions = model.generate(**inputs, max_new_tokens=512)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "ea990e1f-a660-4efb-be48-d095ab05b50d",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TITLE | Approx. Gross Internal Area *<0x0A>636 Ft * - 59.08 M *<0x0A>First Floor | Store<0x0A>Reception<0x0A>Room<0x0A>16'8\" x 129\"<0x0A>5.08 x 3.89m | 1<0x0A>Reception<0x0A>Room<0x0A>16'8\" x 129\"<0x0A>5.08 x 3.89m | 1<0x0A>Reception<0x0A>Room<0x0A>16'8\" x 129\"<0x0A>5.08 x 3.89m | 1<0x0A>Reception<0x0A>Room<0x0A>16'8\" x 129\"<0x0A>5.08 x 3.89m | 1\n"
]
}
],
"source": [
"x = processor.decode(predictions[0], skip_special_tokens=True)\n",
"print(x)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "b4a8b5fb-7cc5-441d-ad60-cff7c03a103e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"59.08"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def extract_total_sqm(deplot_input_str):\n",
" sqmregex = r'(\\d+(?:\\.\\d+)?) ?(sqm|sq\\.m|sq m|m)'\n",
" matches = re.findall(sqmregex, deplot_input_str.lower())\n",
" sqms = [float(m[0]) for m in matches]\n",
" # print(sqms)\n",
" return max(sqms) if sqms else None\n",
"\n",
"extract_total_sqm(x)"
]
},
{
"cell_type": "markdown",
"id": "4bbf0ef5-0a38-4710-9ce1-e5e24a65d83b",
"metadata": {},
"source": [
"# tesseract"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "c73b5339-d493-4a55-a93c-9dcd7ee7a0af",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DEFOE HOUSE EC2Y 8DN\n",
"\n",
"Prepared for Nicola Lee Ltd.\n",
"\n",
"Kitchen\n",
"93 x 74\n",
"\n",
"\\\n",
"Living Space “ (2.81mx 2.23m) Bedroom\n",
"162 11'8 18x 118\n",
"(3.55m x 3.55m)\n",
"\n",
"(4.90m x 3.55m)\n",
"\n",
"ee\n",
"\n",
"Balcony\n",
"12'6x4'll\n",
"(3.80m x 1.49m)\n",
"r\n",
"\n",
"Seventh Floor\n",
"Terrace\n",
"21'S x 16'10\n",
"(6.50m x 5.12m)\n",
"\n",
"Approximate Gross Internal Floor Area : 579 sq ft / 53.8 sqm\n",
"\n",
"\n"
]
}
],
"source": [
"from PIL import Image\n",
"\n",
"import pytesseract\n",
"\n",
"# If you don't have tesseract executable in your PATH, include the following:\n",
"# pytesseract.pytesseract.tesseract_cmd = r'<full_path_to_your_tesseract_executable>'\n",
"# Example tesseract_cmd = r'C:\\Program Files (x86)\\Tesseract-OCR\\tesseract'\n",
"\n",
"# Simple image to string\n",
"print(pytesseract.image_to_string(Image.open('/Users/kadir/code/realestate/crawler/data/rs/135175484/floorplans/0_219962_DWR0004BF_FLP_00_0001.gif')))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1066255c-79f8-4360-aa6b-515576cb8b03",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "232d19bb-869a-4b8b-829a-0d8e211e9c7b",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "e0d94c3e-e8de-452a-8556-12dd2e3e2351",
"metadata": {},
"source": [
"# google/pix2struct-ai2d-base"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "cd57d77b-3d70-46f0-97cf-ddd37cd61d23",
"metadata": {},
"outputs": [],
"source": [
"question = \"How many living rooms?\"\n",
"model = Pix2StructForConditionalGeneration.from_pretrained(\"google/pix2struct-ai2d-base\")\n",
"processor = Pix2StructProcessor.from_pretrained(\"google/pix2struct-ai2d-base\")\n",
"\n",
"inputs = processor(images=image, text=question, return_tensors=\"pt\")\n",
"predictions = model.generate(**inputs, max_new_tokens=10000)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "a507a4a4-33b6-4ca3-93dd-3aab489b5c24",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"predictions 1\n"
]
},
{
"data": {
"text/plain": [
"'floor plan is for illustrative purposes only and is not to scale. Every attempt has been made to ensure the accuracy of the floor plan shown, however all measurements, fixtures, fittings and data shown are an approximate interpretation for illustrative purposes only. 1 sq m + 10.76 sq feet. 7/8/2023>'"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x = processor.decode(predictions[0], skip_special_tokens=True)\n",
"print(\"predictions: \", len(predictions))\n",
"x"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a13e12c9-0406-4568-9a36-e61d70e1e683",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "4865a480-230d-48bc-be05-ddc54d107c79",
"metadata": {},
"source": [
"# pix2struct-docvqa-large"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "cd209233-ad64-4af0-a8ce-f1d640458d02",
"metadata": {},
"outputs": [],
"source": [
"question = \"How many living rooms?\"\n",
"model = Pix2StructForConditionalGeneration.from_pretrained(\"google/pix2struct-docvqa-large\")\n",
"processor = Pix2StructProcessor.from_pretrained(\"google/pix2struct-docvqa-large\")\n",
"\n",
"inputs = processor(images=image, text=question, return_tensors=\"pt\")\n",
"predictions = model.generate(**inputs, max_new_tokens=10000)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "17378986-ae8b-4ab5-be40-837db0f7aaa6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"predictions: 1\n"
]
},
{
"data": {
"text/plain": [
"'2'"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x = processor.decode(predictions[0], skip_special_tokens=True)\n",
"x"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "2d3f5a65-34e8-4c2e-ab37-03e6da86fc1c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['what is the total area measured in square meters? 8. 5']\n"
]
}
],
"source": [
"from transformers import AutoProcessor, AutoModelForCausalLM\n",
"from huggingface_hub import hf_hub_download\n",
"import torch\n",
"\n",
"processor = AutoProcessor.from_pretrained(\"microsoft/git-base-textvqa\")\n",
"model = AutoModelForCausalLM.from_pretrained(\"microsoft/git-base-textvqa\")\n",
"file_path = hf_hub_download(repo_id=\"nielsr/textvqa-sample\", filename=\"bus.png\", repo_type=\"dataset\")\n",
"\n",
"pixel_values = processor(images=image, return_tensors=\"pt\").pixel_values\n",
"question = \"What is the total area measured in square meters?\"\n",
"\n",
"input_ids = processor(text=question, add_special_tokens=False).input_ids\n",
"\n",
"input_ids = [processor.tokenizer.cls_token_id] + input_ids\n",
"\n",
"input_ids = torch.tensor(input_ids).unsqueeze(0)\n",
"\n",
"generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)\n",
"\n",
"print(processor.batch_decode(generated_ids, skip_special_tokens=True))"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "254054f0-0102-4927-a93c-c6eba97b437c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['how many living rooms are displayed on this floor plan? 8']\n"
]
}
],
"source": [
"from transformers import AutoProcessor, AutoModelForCausalLM\n",
"\n",
"from huggingface_hub import hf_hub_download\n",
"\n",
"import torch\n",
"\n",
"from PIL import Image\n",
"\n",
"processor = AutoProcessor.from_pretrained(\"microsoft/git-base-textvqa\")\n",
"\n",
"model = AutoModelForCausalLM.from_pretrained(\"microsoft/git-base-textvqa\")\n",
"\n",
"file_path = hf_hub_download(repo_id=\"nielsr/textvqa-sample\", filename=\"bus.png\", repo_type=\"dataset\")\n",
"\n",
"pixel_values = processor(images=image, return_tensors=\"pt\").pixel_values\n",
"\n",
"input_ids = processor(text=question, add_special_tokens=False).input_ids\n",
"\n",
"input_ids = [processor.tokenizer.cls_token_id] + input_ids\n",
"\n",
"input_ids = torch.tensor(input_ids).unsqueeze(0)\n",
"\n",
"generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)\n",
"\n",
"print(processor.batch_decode(generated_ids, skip_special_tokens=True))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8b5b4fd0-1152-4834-9e3f-89be944bad16",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "4fef9f95-22c3-4fd9-907d-471520494533",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "1e56a1f7-c10f-4169-a7c1-a862937c22ba",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "b6eec969-ebd6-4dac-8ca6-17569a9a3f8a",
"metadata": {},
"source": [
"# BLIP image captioning (Salesforce/blip-image-captioning-large)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "7c0a2360-5de2-475b-9d10-d84bb5bede17",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"a diagram of a floor plan of a two bedroom apartment\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/kadir/code/realestate/vqa/venv/lib/python3.12/site-packages/transformers/generation/configuration_utils.py:410: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `1.2` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"this is an image of a floor plan of a two bedroom apartment\n"
]
}
],
"source": [
"from transformers import BlipProcessor, BlipForConditionalGeneration\n",
"\n",
"processor = BlipProcessor.from_pretrained(\"Salesforce/blip-image-captioning-large\")\n",
"model = BlipForConditionalGeneration.from_pretrained(\"Salesforce/blip-image-captioning-large\")\n",
"\n",
"# conditional image captioning\n",
"text = \"\"\n",
"inputs = processor(image, text, return_tensors=\"pt\")\n",
"\n",
"out = model.generate(**inputs)\n",
"print(processor.decode(out[0], skip_special_tokens=True))\n",
"\n",
"# unconditional image captioning\n",
"inputs = processor(image, return_tensors=\"pt\")\n",
"\n",
"out = model.generate(**inputs)\n",
"print(processor.decode(out[0], skip_special_tokens=True))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9f5a7d57-30f3-4b1b-bc07-dbf076d41792",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "41c23bf0-cf64-4c71-9bc0-57f78af609b4",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}