Real crawling scripts and floorplan detection

1. get all listings
2. get all detail jsons
3. get all images
4. get all floorplans
5. detecting floorplans

Also updating dependencies for huggingface etc.
This commit is contained in:
Kadir 2024-03-10 18:49:39 +00:00
parent 46bb641026
commit 508aa02812
12 changed files with 1531 additions and 170 deletions

View file

@ -24,7 +24,8 @@
"source": [
"from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration\n",
"from PIL import Image\n",
"import pandas as pd"
"import pandas as pd\n",
"import re"
]
},
{
@ -34,7 +35,8 @@
"metadata": {},
"outputs": [],
"source": [
"image = Image.open(\"floorplans/46001_32532509_FLP_00_0000.jpeg\")\n",
"# image = Image.open(\"floorplans/46001_32532509_FLP_00_0000.jpeg\")\n",
"image = Image.open(\"/Users/kadir/code/realestate/crawler/data/floorplans/15508_EnfieldRd_FLP_02_0000.jpg\")\n",
"question = \"How many living rooms are displayed on this floor plan?\""
]
},
@ -60,7 +62,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 4,
"id": "ea990e1f-a660-4efb-be48-d095ab05b50d",
"metadata": {
"editable": true,
@ -74,7 +76,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"TITLE | <0x0A> Fifth Floor<0x0A>Floor Area 642 Sq Ft - 59.64 Sq M | LUX<0x0A>Floor Area 642 Sq Ft - 59.64 Sq M<0x0A>Reception Room<0x0A>1210×1210<0x0A>3.9×3.9m | 12.10×1210<0x0A>Reception Room<0x0A>3.9×3.9m | 12.10×1210<0x0A>Balcony<0x0A>1.5×4.5 | 1.5×4.5 <0x0A> Bathroom<0x0A>131°×102°<0x0A>4.0×3.1m | 1.40×2.7m | 1.40×2.7m <0x0A> Kitchen<0x0A>102°×710°<0x0A>3.1×2.4m | 1.10×3.1m | 1.00×3.1m <0x0A> Bedroom<0x0A>131°×810<0x0A>4.0×2.7m | 1.00×3.1m | 1.00×3.1m <0x0A> Kitchen<0x0A>102°×710<0x0A>3.1×2.4m | 1.00×3.1m | 1.00×3.1m <0x0A> Reception Room<0x0A>1210×1210<0x0A>3.9×3.9m | 1.50×4.5 | 1.00×3.5m <0x0A> Balcony<0x0A>1.5×4.5 | 1.50×4.5 | 1.00×3.5m\n"
"TITLE | Approx. Gross Internal Area *<0x0A>636 Ft * - 59.08 M *<0x0A>First Floor | Store<0x0A>Reception<0x0A>Room<0x0A>16'8\" x 129\"<0x0A>5.08 x 3.89m | 1<0x0A>Reception<0x0A>Room<0x0A>16'8\" x 129\"<0x0A>5.08 x 3.89m | 1<0x0A>Reception<0x0A>Room<0x0A>16'8\" x 129\"<0x0A>5.08 x 3.89m | 1<0x0A>Reception<0x0A>Room<0x0A>16'8\" x 129\"<0x0A>5.08 x 3.89m | 1\n"
]
}
],
@ -85,47 +87,52 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 6,
"id": "b4a8b5fb-7cc5-441d-ad60-cff7c03a103e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TITLE | \n",
" Fifth Floor\n",
"Floor Area 642 Sq Ft - 59.64 Sq M | LUX\n",
"Floor Area 642 Sq Ft - 59.64 Sq M\n",
"Reception Room\n",
"1210×1210\n",
"3.9×3.9m | 12.10×1210\n",
"Reception Room\n",
"3.9×3.9m | 12.10×1210\n",
"Balcony\n",
"1.5×4.5 | 1.5×4.5 \n",
" Bathroom\n",
"131°×102°\n",
"4.0×3.1m | 1.40×2.7m | 1.40×2.7m \n",
" Kitchen\n",
"102°×710°\n",
"3.1×2.4m | 1.10×3.1m | 1.00×3.1m \n",
" Bedroom\n",
"131°×810\n",
"4.0×2.7m | 1.00×3.1m | 1.00×3.1m \n",
" Kitchen\n",
"102°×710\n",
"3.1×2.4m | 1.00×3.1m | 1.00×3.1m \n",
" Reception Room\n",
"1210×1210\n",
"3.9×3.9m | 1.50×4.5 | 1.00×3.5m \n",
" Balcony\n",
"1.5×4.5 | 1.50×4.5 | 1.00×3.5m\n"
"data": {
"text/plain": [
"59.08"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def extract_total_sqm(deplot_input_str):\n",
"    \"\"\"Return the largest metric figure found in DePlot floor-plan text.\n",
"\n",
"    The text is lower-cased and scanned for numbers (whole or decimal)\n",
"    followed by a metre-style unit: m, m2, sqm, sq m, sq.m, sq. m or\n",
"    square metres/meters. A bare `m` is accepted because the OCR output\n",
"    can lose the superscript (e.g. `59.08 M *`). As a consequence room\n",
"    lengths such as `3.89m` are matched too; the maximum of all matches is\n",
"    returned on the assumption that the total area is the largest figure.\n",
"\n",
"    Args:\n",
"        deplot_input_str: Text produced by the model; may be empty or None.\n",
"\n",
"    Returns:\n",
"        The largest matched value as a float, or None when the text holds\n",
"        no metric figure (previously this case raised ValueError from max()).\n",
"    \"\"\"\n",
"    if not deplot_input_str:\n",
"        return None\n",
"    # - decimals are optional, so whole-number areas like `59 sq m` count\n",
"    # - the separator after sq/square is an explicit set, not a wildcard `.`\n",
"    # - the trailing lookahead stops `m` matching word prefixes (`1.5 miles`)\n",
"    sqmregex = r'(\\d+(?:\\.\\d*)?) ?(?:(?:sq|square)[ ./-]{0,2})?m(?:et(?:re|er)s?)?(?![a-z])'\n",
"    sqms = [float(value) for value in re.findall(sqmregex, deplot_input_str.lower())]\n",
"    return max(sqms, default=None)\n",
"\n",
"extract_total_sqm(x)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "06d50e06-fead-40be-bd9d-7e8207d8df4e",
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "'>' not supported between instances of 'int' and 'NoneType'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[16], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;43mmax\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[0;31mTypeError\u001b[0m: '>' not supported between instances of 'int' and 'NoneType'"
]
}
],
"source": [
"x = r'\\d+.\\d*'"
"max([None,1])"
]
},
{

View file

@ -0,0 +1,33 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "96658076-0295-4a52-9b82-2083198fbd57",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}