wrongmove/vqa/Untitled.ipynb
Kadir 508aa02812 Real crawling scripts and floorplan detection
1. get all listings
2. get all detail jsons
3. get all images
4. get all floorplans
5. detecting floorplans

Also updating dependencies for huggingface etc.
2024-03-10 18:49:39 +00:00

447 lines
13 KiB
Text

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "546cee84-249f-43ce-a1f9-8475682a833d",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/kadir/code/realestate/vqa/venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import os\n",
"import re\n",
"from pathlib import Path\n",
"\n",
"import pandas as pd\n",
"import torch\n",
"from PIL import Image\n",
"from transformers import (\n",
"    AutoModelForCausalLM,\n",
"    AutoProcessor,\n",
"    BlipForConditionalGeneration,\n",
"    BlipProcessor,\n",
"    Pix2StructForConditionalGeneration,\n",
"    Pix2StructProcessor,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "74e4a41f-2dfc-428e-8bca-4e9cc1c076c6",
"metadata": {},
"outputs": [],
"source": [
"# Directory with the crawled floorplan images. Defaults to the original author's\n",
"# checkout; set the FLOORPLAN_DIR environment variable to run on another machine.\n",
"FLOORPLAN_DIR = Path(os.environ.get(\"FLOORPLAN_DIR\", \"/Users/kadir/code/realestate/crawler/data/floorplans\"))\n",
"\n",
"# Sample floorplan and question used by the model cells that follow.\n",
"image = Image.open(FLOORPLAN_DIR / \"15508_EnfieldRd_FLP_02_0000.jpg\")\n",
"question = \"How many living rooms are displayed on this floor plan?\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "c24b1560-563b-4ff2-8744-4591ac1cc57b",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [],
"source": [
"# Load the google/deplot checkpoint: the processor prepares the image + text inputs,\n",
"# the model generates the token ids that the next cell decodes.\n",
"model = Pix2StructForConditionalGeneration.from_pretrained('google/deplot')\n",
"processor = Pix2StructProcessor.from_pretrained('google/deplot')\n",
"\n",
"inputs = processor(\n",
"    text=question,\n",
"    images=image,\n",
"    return_tensors=\"pt\",\n",
")\n",
"predictions = model.generate(max_new_tokens=512, **inputs)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "ea990e1f-a660-4efb-be48-d095ab05b50d",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TITLE | Approx. Gross Internal Area *<0x0A>636 Ft * - 59.08 M *<0x0A>First Floor | Store<0x0A>Reception<0x0A>Room<0x0A>16'8\" x 129\"<0x0A>5.08 x 3.89m | 1<0x0A>Reception<0x0A>Room<0x0A>16'8\" x 129\"<0x0A>5.08 x 3.89m | 1<0x0A>Reception<0x0A>Room<0x0A>16'8\" x 129\"<0x0A>5.08 x 3.89m | 1<0x0A>Reception<0x0A>Room<0x0A>16'8\" x 129\"<0x0A>5.08 x 3.89m | 1\n"
]
}
],
"source": [
"# Decode the first generated sequence (special tokens stripped) and print it.\n",
"x = processor.decode(\n",
"    predictions[0],\n",
"    skip_special_tokens=True,\n",
")\n",
"print(x)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "b4a8b5fb-7cc5-441d-ad60-cff7c03a103e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"59.08"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def extract_total_sqm(deplot_input_str: str) -> \"float | None\":\n",
"    \"\"\"Return the largest metric figure (number followed by a metre unit) in the text.\n",
"\n",
"    The text is lower-cased and scanned for numbers followed, optionally after one\n",
"    space, by ``sqm``, ``sq.m``, ``sq m``, ``sq. m``, ``square m...`` or a bare ``m``\n",
"    (as in the \"59.08 M\" of the DePlot output this notebook parses). The largest\n",
"    number found is returned, on the assumption that it is the total area.\n",
"\n",
"    Args:\n",
"        deplot_input_str: Text to scan, e.g. decoded model output.\n",
"\n",
"    Returns:\n",
"        The largest matched value as a float, or None when the text contains no\n",
"        number followed by one of the units above.\n",
"    \"\"\"\n",
"    # Integers (\"75 sq m\") and decimals (\"59.08 m\") both count. The dot in \"sq.m\" is\n",
"    # literal, and a bare \"m\" must not start a longer word, so \"10.5 mins\" or\n",
"    # \"2 master bedrooms\" are not mistaken for measurements.\n",
"    sqm_pattern = r'(\\d+(?:\\.\\d*)?) ?(?:sq(?:uare)?[. ]{0,2}m|m(?![a-z]))'\n",
"    values = [float(number) for number in re.findall(sqm_pattern, deplot_input_str.lower())]\n",
"    # default=None: max() of an empty list would otherwise raise ValueError.\n",
"    return max(values, default=None)\n",
"\n",
"extract_total_sqm(x)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "06d50e06-fead-40be-bd9d-7e8207d8df4e",
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "'>' not supported between instances of 'int' and 'NoneType'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[16], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;43mmax\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[0;31mTypeError\u001b[0m: '>' not supported between instances of 'int' and 'NoneType'"
]
}
],
"source": [
"# Scratch check: max() cannot order None against a number, so None entries must be\n",
"# filtered out before taking the maximum of a list of values.\n",
"# The error is caught and displayed so the notebook still runs top to bottom.\n",
"try:\n",
"    max([None, 1])\n",
"except TypeError as err:\n",
"    none_comparison_error = str(err)\n",
"else:\n",
"    none_comparison_error = None\n",
"\n",
"none_comparison_error"
]
},
{
"cell_type": "markdown",
"id": "e0d94c3e-e8de-452a-8556-12dd2e3e2351",
"metadata": {},
"source": [
"# google/pix2struct-ai2d-base"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "cd57d77b-3d70-46f0-97cf-ddd37cd61d23",
"metadata": {},
"outputs": [],
"source": [
"# Ask a shorter question of the google/pix2struct-ai2d-base checkpoint on the same image.\n",
"question = \"How many living rooms?\"\n",
"processor = Pix2StructProcessor.from_pretrained(\"google/pix2struct-ai2d-base\")\n",
"model = Pix2StructForConditionalGeneration.from_pretrained(\"google/pix2struct-ai2d-base\")\n",
"\n",
"inputs = processor(\n",
"    text=question,\n",
"    images=image,\n",
"    return_tensors=\"pt\",\n",
")\n",
"predictions = model.generate(max_new_tokens=10000, **inputs)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "a507a4a4-33b6-4ca3-93dd-3aab489b5c24",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"predictions 1\n"
]
},
{
"data": {
"text/plain": [
"'floor plan is for illustrative purposes only and is not to scale. Every attempt has been made to ensure the accuracy of the floor plan shown, however all measurements, fixtures, fittings and data shown are an approximate interpretation for illustrative purposes only. 1 sq m + 10.76 sq feet. 7/8/2023>'"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Report how many sequences were generated, then decode and display the first one.\n",
"print(\"predictions: \", len(predictions))\n",
"x = processor.decode(\n",
"    predictions[0],\n",
"    skip_special_tokens=True,\n",
")\n",
"x"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a13e12c9-0406-4568-9a36-e61d70e1e683",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "4865a480-230d-48bc-be05-ddc54d107c79",
"metadata": {},
"source": [
"# pix2struct-docvqa-large"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "cd209233-ad64-4af0-a8ce-f1d640458d02",
"metadata": {},
"outputs": [],
"source": [
"# Ask the same short question of the google/pix2struct-docvqa-large checkpoint.\n",
"question = \"How many living rooms?\"\n",
"processor = Pix2StructProcessor.from_pretrained(\"google/pix2struct-docvqa-large\")\n",
"model = Pix2StructForConditionalGeneration.from_pretrained(\"google/pix2struct-docvqa-large\")\n",
"\n",
"inputs = processor(\n",
"    text=question,\n",
"    images=image,\n",
"    return_tensors=\"pt\",\n",
")\n",
"predictions = model.generate(max_new_tokens=10000, **inputs)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "17378986-ae8b-4ab5-be40-837db0f7aaa6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"predictions: 1\n"
]
},
{
"data": {
"text/plain": [
"'2'"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Decode the first generated sequence (special tokens stripped) and display it.\n",
"x = processor.decode(\n",
"    predictions[0],\n",
"    skip_special_tokens=True,\n",
")\n",
"x"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "2d3f5a65-34e8-4c2e-ab37-03e6da86fc1c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['what is the total area measured in square meters? 8. 5']\n"
]
}
],
"source": [
"# Ask microsoft/git-base-textvqa for the total area shown on the floorplan.\n",
"# (The unused download of the sample \"bus.png\" image was removed; only `image` is queried.)\n",
"processor = AutoProcessor.from_pretrained(\"microsoft/git-base-textvqa\")\n",
"model = AutoModelForCausalLM.from_pretrained(\"microsoft/git-base-textvqa\")\n",
"\n",
"pixel_values = processor(images=image, return_tensors=\"pt\").pixel_values\n",
"question = \"What is the total area measured in square meters?\"\n",
"\n",
"# Build the prompt as [CLS] + question token ids (no other special tokens),\n",
"# then add a leading batch dimension.\n",
"input_ids = processor(text=question, add_special_tokens=False).input_ids\n",
"input_ids = [processor.tokenizer.cls_token_id] + input_ids\n",
"input_ids = torch.tensor(input_ids).unsqueeze(0)\n",
"\n",
"generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)\n",
"print(processor.batch_decode(generated_ids, skip_special_tokens=True))"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "254054f0-0102-4927-a93c-c6eba97b437c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['how many living rooms are displayed on this floor plan? 8']\n"
]
}
],
"source": [
"# Ask microsoft/git-base-textvqa how many living rooms the floorplan shows.\n",
"# The question is assigned here so the cell does not depend on whichever cell\n",
"# last set `question`. (The unused download of the sample \"bus.png\" was removed.)\n",
"question = \"How many living rooms are displayed on this floor plan?\"\n",
"\n",
"processor = AutoProcessor.from_pretrained(\"microsoft/git-base-textvqa\")\n",
"model = AutoModelForCausalLM.from_pretrained(\"microsoft/git-base-textvqa\")\n",
"\n",
"pixel_values = processor(images=image, return_tensors=\"pt\").pixel_values\n",
"\n",
"# Build the prompt as [CLS] + question token ids (no other special tokens),\n",
"# then add a leading batch dimension.\n",
"input_ids = processor(text=question, add_special_tokens=False).input_ids\n",
"input_ids = [processor.tokenizer.cls_token_id] + input_ids\n",
"input_ids = torch.tensor(input_ids).unsqueeze(0)\n",
"\n",
"generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)\n",
"print(processor.batch_decode(generated_ids, skip_special_tokens=True))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8b5b4fd0-1152-4834-9e3f-89be944bad16",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "4fef9f95-22c3-4fd9-907d-471520494533",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "1e56a1f7-c10f-4169-a7c1-a862937c22ba",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "b6eec969-ebd6-4dac-8ca6-17569a9a3f8a",
"metadata": {},
"source": [
"# Salesforce/blip-image-captioning-large"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "7c0a2360-5de2-475b-9d10-d84bb5bede17",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"a diagram of a floor plan of a two bedroom apartment\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/kadir/code/realestate/vqa/venv/lib/python3.12/site-packages/transformers/generation/configuration_utils.py:410: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `1.2` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"this is an image of a floor plan of a two bedroom apartment\n"
]
}
],
"source": [
"# Caption the floorplan with Salesforce/blip-image-captioning-large, once with a\n",
"# text prompt and once from the image alone.\n",
"processor = BlipProcessor.from_pretrained(\"Salesforce/blip-image-captioning-large\")\n",
"model = BlipForConditionalGeneration.from_pretrained(\"Salesforce/blip-image-captioning-large\")\n",
"\n",
"# conditional image captioning (the prompt text is left empty here)\n",
"text = \"\"\n",
"inputs = processor(image, text, return_tensors=\"pt\")\n",
"\n",
"out = model.generate(**inputs)\n",
"print(processor.decode(out[0], skip_special_tokens=True))\n",
"\n",
"# unconditional image captioning (image only, no prompt)\n",
"inputs = processor(image, return_tensors=\"pt\")\n",
"\n",
"out = model.generate(**inputs)\n",
"print(processor.decode(out[0], skip_special_tokens=True))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9f5a7d57-30f3-4b1b-bc07-dbf076d41792",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "41c23bf0-cf64-4c71-9bc0-57f78af609b4",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}