Real crawling scripts and floorplan detection

1. get all listings
2. get all detail jsons
3. get all images
4. get all floorplans
5. detect floorplans

Also updating dependencies for huggingface etc.
This commit is contained in:
Kadir 2024-03-10 18:49:39 +00:00
parent 46bb641026
commit 508aa02812
12 changed files with 1531 additions and 170 deletions

1
crawler/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
data/rs

View file

@ -0,0 +1,22 @@
from rec.query import listing_query
import pathlib
import json

# Crawl the paginated listing search and persist each property as
# data/rs/<identifier>/listing.json.
folder = pathlib.Path("data/rs/")
for page in range(1, 10000):
    print(f"page {page}")
    try:
        # args: page, min_bedrooms, max_bedrooms, radius, min_price, max_price
        d = listing_query(page, 1, 2, 15, 0, 800000)
    except Exception:
        # presumably the API errors once we page past the last result page;
        # stop crawling there instead of silently eating every other error
        break
    for prop in d['properties']:  # 'prop' avoids shadowing the builtin 'property'
        identifier = prop['identifier']
        listing_folder = folder / str(identifier)
        listing_folder.mkdir(exist_ok=True, parents=True)
        with open(listing_folder / "listing.json", 'w') as f:
            json.dump(prop, f)

25
crawler/2_dump_detail.py Normal file
View file

@ -0,0 +1,25 @@
import pathlib
import json
from rec.query import detail_query

# Fetch the detail record for every crawled listing and store it next to the
# listing file as data/rs/<identifier>/detail.json.
folder = pathlib.Path('data/rs/')
for listing_path in folder.glob('*/listing.json'):
    with open(listing_path) as f:
        listing = json.load(f)
    identifier = listing['identifier']
    detail_path = listing_path.parent / 'detail.json'
    if detail_path.exists():
        continue  # already fetched; makes re-runs resumable (matches 3_dump_images)
    try:
        d = detail_query(identifier)
    except Exception:
        # surface which listing broke, then let the error propagate
        print('Failed at: ', identifier)
        raise
    print(identifier)
    with open(detail_path, 'w') as f:
        json.dump(d, f)

42
crawler/3_dump_images.py Normal file
View file

@ -0,0 +1,42 @@
import pathlib
import json
from urllib.request import urlretrieve
from tqdm import tqdm


def _download_images(target_folder, photos, url_key):
    """Download each photo dict's image (photo[url_key]) into target_folder
    as '<order>_<basename>', skipping files that already exist (resumable)."""
    for photo in photos:
        url = photo[url_key]
        picname = url.split('/')[-1]
        fullpicpath = target_folder / f"{photo['order']}_{picname}"
        if fullpicpath.exists():
            continue
        fullpicpath.parent.mkdir(parents=True, exist_ok=True)  # create the target folder
        tqdm.write(str(fullpicpath))
        urlretrieve(url, fullpicpath)


# For every crawled detail.json, pull down the listing photos and floorplans.
folder = pathlib.Path('data/rs/')
details = folder.glob('*/detail.json')
for detail_path in tqdm(list(details)):
    with open(detail_path) as f:
        detail = json.load(f)
    prop = detail['property']
    rsfolder = folder / str(prop['identifier'])
    # photos carry the image under 'maxSizeUrl', floorplans under 'url'
    _download_images(rsfolder / 'pics', prop['photos'], 'maxSizeUrl')
    _download_images(rsfolder / 'floorplans', prop['floorplans'], 'url')

View file

@ -0,0 +1,7 @@
from data_access import Listing
from tqdm import tqdm

# Run floorplan sqm estimation for every crawled listing.
listings = Listing.get_all_listings()
for listing in tqdm(listings):
    # bug fix: calculate_sqm() returns a float (or None), but tqdm.write
    # requires a string — coerce before writing
    tqdm.write(str(listing.calculate_sqm()))

86
crawler/data_access.py Normal file
View file

@ -0,0 +1,86 @@
from dataclasses import dataclass
import json
import pathlib
from typing import List
from rec import floorplan
_DATA_DIR = pathlib.Path('data/rs/')


@dataclass()
class Listing():
    """Filesystem-backed accessor for one crawled listing under data/rs/<id>/."""

    # listing identifier; doubles as the per-listing folder name
    identifier: int

    @staticmethod
    def get_all_listings() -> List['Listing']:
        """Build a Listing for every */listing.json found under _DATA_DIR."""
        listing_paths = sorted(_DATA_DIR.glob('*/listing.json'))
        listings = []
        for listing_path in listing_paths:
            with open(listing_path) as f:
                d = json.load(f)
            listings.append(Listing(d['identifier']))
        return listings

    def path_listing(self) -> pathlib.Path:
        return _DATA_DIR / str(self.identifier)

    def path_listing_json(self) -> pathlib.Path:
        return self.path_listing() / 'listing.json'

    def path_detail_json(self) -> pathlib.Path:
        return self.path_listing() / 'detail.json'

    def path_floorplan_json(self) -> pathlib.Path:
        return self.path_listing() / 'floorplan.json'

    def path_pic_folder(self) -> pathlib.Path:
        return self.path_listing() / 'pics'

    def path_pic_file(self, order, name) -> pathlib.Path:
        # bug fix: this was a second `def path_pic_folder(self, order, name)`,
        # which shadowed the zero-argument version above and broke every
        # internal `self.path_pic_folder()` call
        return self.path_pic_folder() / f'{order}_{name}'

    def path_floorplan_folder(self) -> pathlib.Path:
        return self.path_listing() / 'floorplans'

    def path_floorplan_file(self, order, name) -> pathlib.Path:
        # bug fix: previously built the path under pics/ instead of floorplans/
        return self.path_floorplan_folder() / f'{order}_{name}'

    def list_floorplans(self):
        # todo add check if return is image
        return list(self.path_floorplan_folder().glob('*'))

    def calculate_sqm(self):
        """Run floorplan detection on every floorplan image, cache the results
        in floorplan.json, and return the largest estimated sqm (None when no
        floorplan yielded an estimate)."""
        objs = []
        for floorplan_path in self.list_floorplans():
            estimated_sqm, model_output, predictions = floorplan.calculate(floorplan_path)
            objs.append({
                # str(): pathlib.Path is not JSON-serializable
                'floorplan_path': str(floorplan_path),
                'estimated_sqm': estimated_sqm,
                'model_output': model_output,
                'no_predictions': len(predictions)  # cant serialize the predictions itself since its a tensor
            })
        with open(self.path_floorplan_json(), 'w') as f:
            json.dump(objs, f)
        # bug fix: `if o is None` kept ONLY the Nones; we want to drop them.
        # default=None avoids ValueError when no floorplan produced a number.
        sqms = [o['estimated_sqm'] for o in objs if o['estimated_sqm'] is not None]
        return max(sqms, default=None)

    @property
    def sqm(self):
        """Largest estimated sqm, computed on first access and cached in
        floorplan.json. (The old `recalculate` parameter was unusable — a
        property cannot take arguments — so it has been dropped.)"""
        if not self.path_floorplan_json().exists():
            self.calculate_sqm()
        with open(self.path_floorplan_json()) as f:
            objs = json.load(f)
        sqms = [o['estimated_sqm'] for o in objs if o['estimated_sqm'] is not None]
        return max(sqms, default=None)
# Smoke test: show the floorplan files of the first crawled listing.
if __name__ == '__main__':
    all_listings = Listing.get_all_listings()
    first = all_listings[0]
    print(first.list_floorplans())

1364
crawler/poetry.lock generated

File diff suppressed because it is too large Load diff

View file

@ -5,11 +5,16 @@ description = ""
authors = ["Kadir Tugan <git@k8n.dev>"]
[tool.poetry.dependencies]
python = ">3.8"
python = ">3.11"
SQLAlchemy = "^2.0.23"
requests = "^2.31.0"
cachetools = "^5.3.2"
diskcache = "^5.6.3"
tqdm = "^4.66.2"
pillow = "^10.2.0"
torch = "^2.2.1"
numpy = "^1.26.4"
transformers = "^4.38.2"
[tool.poetry.dev-dependencies]

30
crawler/rec/floorplan.py Normal file
View file

@ -0,0 +1,30 @@
import re
from PIL import Image
from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration
def inference(image_path):
    """Run the DePlot (Pix2Struct) model on a floorplan image.

    Returns (output, predictions): the decoded text of the first generated
    sequence, and the raw generated token tensor.

    NOTE(review): the processor and model are re-downloaded/re-loaded from the
    hub on every call — consider hoisting or caching if this runs per image.
    """
    image = Image.open(image_path)
    question = "How many living rooms are displayed on this floor plan?" # not sure if it even has an effect
    processor = Pix2StructProcessor.from_pretrained('google/deplot')
    model = Pix2StructForConditionalGeneration.from_pretrained('google/deplot')
    inputs = processor(images=image, text=question, return_tensors="pt")
    # cap generation length; 512 tokens is plenty for a floorplan table dump
    predictions = model.generate(**inputs, max_new_tokens=512)
    output = processor.decode(predictions[0], skip_special_tokens=True)
    return output, predictions
def extract_total_sqm(deplot_input_str):
    """Extract the largest square-metre figure from the model's text output.

    Accepts integer or decimal numbers followed by an optional space and a
    unit ('sqm', 'sq.m', 'sq m' or bare 'm'). Returns the maximum as a float,
    or None when nothing matches.
    """
    # bug fix: the old pattern (\d+\.\d*) required a decimal point, so plain
    # integers like '85 sqm' never matched; the unescaped dot in 'sq.m' also
    # matched any character.
    sqmregex = r'(\d+(?:\.\d*)?) ?(sqm|sq\.m|sq m|m)'
    matches = re.findall(sqmregex, deplot_input_str.lower())
    if not matches:
        return None
    return max(float(m[0]) for m in matches)
def calculate(image_path):
    """Run floorplan inference and parse an sqm estimate from its output.

    Returns (estimated_sqm, output, predictions_tensor); estimated_sqm is
    None when no sqm figure can be found in the decoded model output.
    """
    output, predictions_tensor = inference(image_path)
    # bug fix: extract_total_sqm was called with no argument (TypeError);
    # it must be given the decoded model output to parse
    estimated_sqm = extract_total_sqm(output)
    return estimated_sqm, output, predictions_tensor

View file

@ -30,8 +30,7 @@ def detail_query(detail_id: int):
# @cache.memoize()
def listing_query(page: int, min_bedrooms: int, max_bedrooms: int, radius: float, min_price: int, max_price: int):
print("Querying")
def listing_query(page: int, min_bedrooms: int, max_bedrooms: int, radius: float, min_price: int, max_price: int) -> dict:
params = {
"locationIdentifier": "POSTCODE^4228216",
"channel": "BUY",