Real crawling scripts and floorplan detection
1. get all listings 2. get all detail JSONs 3. get all images 4. get all floorplans 5. detect floorplans. Also updates dependencies for Hugging Face etc.
This commit is contained in:
parent
46bb641026
commit
508aa02812
12 changed files with 1531 additions and 170 deletions
1
crawler/.gitignore
vendored
Normal file
1
crawler/.gitignore
vendored
Normal file
|
|
@ -0,0 +1 @@
|
|||
data/rs
|
||||
22
crawler/1_dump_listings.py
Normal file
22
crawler/1_dump_listings.py
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
from rec.query import listing_query
import pathlib
import json

folder = pathlib.Path("data/rs/")

# Crawl result pages until the API stops returning them; each listing is
# persisted to data/rs/<identifier>/listing.json.
for i in range(1, 10000):
    try:
        print(f"page {i}")
        # args: page, min_bedrooms, max_bedrooms, radius, min_price, max_price
        d = listing_query(i, 1, 2, 15, 0, 800000)
    except Exception:
        # assume a failure means we ran past the last page — stop crawling
        # (was a bare `except:`, which also swallowed KeyboardInterrupt)
        break

    # `prop` instead of `property` — don't shadow the builtin
    for prop in d['properties']:
        identifier = prop['identifier']
        listing_folder = folder / str(identifier)
        listing_folder.mkdir(exist_ok=True, parents=True)
        listing_path = listing_folder / "listing.json"
        with open(listing_path, 'w') as f:
            json.dump(prop, f)
||||
25
crawler/2_dump_detail.py
Normal file
25
crawler/2_dump_detail.py
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
import pathlib
import json
from rec.query import detail_query

folder = pathlib.Path('data/rs/')
listings = folder.glob('*/listing.json')

# For every crawled listing.json, fetch the detail payload and store it
# next to it as detail.json.
for listing_path in listings:
    with open(listing_path) as f:
        listing = json.load(f)
    identifier = listing['identifier']
    try:
        d = detail_query(identifier)
    except Exception:
        # was a bare `except:`; still surface the failing identifier and abort
        print('Failed at: ', identifier)
        raise
    print(identifier)

    # derive the output path from the input path instead of re-building the
    # string — keeps the two in sync if the data dir ever moves
    detail_path = listing_path.parent / 'detail.json'
    with open(detail_path, 'w') as f:
        json.dump(d, f)
42
crawler/3_dump_images.py
Normal file
42
crawler/3_dump_images.py
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
import pathlib
import json
from urllib.request import urlretrieve
from tqdm import tqdm


def _download(url, target):
    """Fetch url to target unless it already exists; creates parent dirs."""
    if target.exists():
        return
    target.parent.mkdir(parents=True, exist_ok=True)
    tqdm.write(str(target))
    urlretrieve(url, target)


folder = pathlib.Path('data/rs/')
details = folder.glob('*/detail.json')

# Download every photo and floorplan referenced by each detail.json into
# data/rs/<identifier>/{pics,floorplans}/<order>_<name>.
for detail_path in tqdm(list(details)):

    with open(detail_path) as f:
        detail = json.load(f)

    identifier = detail['property']['identifier']
    rsfolder = folder / str(identifier)

    for photo in detail['property']['photos']:
        url = photo['maxSizeUrl']
        picname = url.split('/')[-1]
        _download(url, rsfolder / 'pics' / f"{photo['order']}_{picname}")

    for plan in detail['property']['floorplans']:
        url = plan['url']
        picname = url.split('/')[-1]
        _download(url, rsfolder / 'floorplans' / f"{plan['order']}_{picname}")
7
crawler/4_detect_floorplan.py
Normal file
7
crawler/4_detect_floorplan.py
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
from data_access import Listing
from tqdm import tqdm

listings = Listing.get_all_listings()

# Run floorplan sqm detection for every listing, printing the estimate.
for listing in tqdm(listings):
    # calculate_sqm returns a number (or None); tqdm.write requires a str,
    # so the previous un-wrapped call raised TypeError
    tqdm.write(str(listing.calculate_sqm()))
86
crawler/data_access.py
Normal file
86
crawler/data_access.py
Normal file
|
|
@ -0,0 +1,86 @@
|
|||
from dataclasses import dataclass
|
||||
import json
|
||||
import pathlib
|
||||
from typing import List
|
||||
from rec import floorplan
|
||||
|
||||
_DATA_DIR = pathlib.Path('data/rs/')


@dataclass()
class Listing():
    """One crawled listing, identified by the site's numeric identifier.

    All paths are derived from _DATA_DIR/<identifier>/ as laid out by the
    crawl scripts: listing.json, detail.json, pics/, floorplans/.
    """
    identifier: int

    @staticmethod
    def get_all_listings() -> List['Listing']:
        """Build a Listing for every */listing.json found under _DATA_DIR."""
        listing_paths = sorted(_DATA_DIR.glob('*/listing.json'))
        listings = []
        for listing_path in listing_paths:
            with open(listing_path) as f:
                d = json.load(f)
            listings.append(Listing(d['identifier']))

        return listings

    def path_listing(self) -> pathlib.Path:
        return _DATA_DIR / str(self.identifier)

    def path_listing_json(self) -> pathlib.Path:
        return self.path_listing() / 'listing.json'

    def path_detail_json(self) -> pathlib.Path:
        return self.path_listing() / 'detail.json'

    def path_floorplan_json(self) -> pathlib.Path:
        return self.path_listing() / 'floorplan.json'

    def path_pic_folder(self) -> pathlib.Path:
        return self.path_listing() / 'pics'

    def path_pic_file(self, order, name) -> pathlib.Path:
        # bugfix: this was a second `def path_pic_folder` that shadowed the
        # folder accessor above and then called it with the wrong arity
        return self.path_pic_folder() / f'{order}_{name}'

    def path_floorplan_folder(self) -> pathlib.Path:
        return self.path_listing() / 'floorplans'

    def path_floorplan_file(self, order, name) -> pathlib.Path:
        # bugfix: previously pointed into the pics folder
        return self.path_floorplan_folder() / f'{order}_{name}'

    def list_floorplans(self):
        images = list(self.path_floorplan_folder().glob('*'))
        # todo add check if return is image
        return images

    def calculate_sqm(self):
        """Detect sqm on every floorplan image, persist per-image results to
        floorplan.json and return the largest estimate (None when nothing
        usable was found)."""
        objs = []
        for floorplan_path in self.list_floorplans():
            estimated_sqm, model_output, predictions = floorplan.calculate(floorplan_path)
            objs.append({
                'floorplan_path': str(floorplan_path),  # Path objects are not JSON-serializable
                'estimated_sqm': estimated_sqm,
                'model_output': model_output,
                'no_predictions': len(predictions)  # cant serialize the predictions itself since its a tensor
            })

        with open(self.path_floorplan_json(), 'w') as f:
            json.dump(objs, f)

        # bugfix: the old filter `if o is None` kept ONLY the Nones, and
        # max([]) would raise; return None when there is no estimate
        sqms = [o['estimated_sqm'] for o in objs if o['estimated_sqm'] is not None]
        return max(sqms, default=None)

    @property
    def sqm(self):
        """Cached sqm estimate: computed on first access, then read from
        floorplan.json. (The old `recalculate=True` parameter was dead —
        a property getter can never receive arguments.)"""
        if not self.path_floorplan_json().exists():
            self.calculate_sqm()

        with open(self.path_floorplan_json()) as f:
            objs = json.load(f)

        # same None-filter fix as calculate_sqm
        sqms = [o['estimated_sqm'] for o in objs if o['estimated_sqm'] is not None]
        return max(sqms, default=None)
||||
if __name__ == '__main__':
    # smoke test: show the floorplan files of the first crawled listing
    listings = Listing.get_all_listings()
    if listings:
        print(listings[0].list_floorplans())
    else:
        # bugfix: listings[0] raised IndexError on an empty/missing data dir
        print('no listings found under data/rs/')
1364
crawler/poetry.lock
generated
1364
crawler/poetry.lock
generated
File diff suppressed because it is too large
Load diff
|
|
@ -5,11 +5,16 @@ description = ""
|
|||
authors = ["Kadir Tugan <git@k8n.dev>"]
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = ">3.8"
|
||||
python = ">3.11"
|
||||
SQLAlchemy = "^2.0.23"
|
||||
requests = "^2.31.0"
|
||||
cachetools = "^5.3.2"
|
||||
diskcache = "^5.6.3"
|
||||
tqdm = "^4.66.2"
|
||||
pillow = "^10.2.0"
|
||||
torch = "^2.2.1"
|
||||
numpy = "^1.26.4"
|
||||
transformers = "^4.38.2"
|
||||
|
||||
[tool.poetry.dev-dependencies]
|
||||
|
||||
|
|
|
|||
30
crawler/rec/floorplan.py
Normal file
30
crawler/rec/floorplan.py
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
import functools
import re


@functools.lru_cache(maxsize=1)
def _load_deplot():
    """Load and cache the DePlot processor/model pair.

    Loading the weights is slow, so do it once per process instead of once
    per image (the originals were re-created on every inference call).
    The heavy imports are done lazily so that extract_total_sqm can be used
    without the ML dependencies installed.
    """
    from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration
    processor = Pix2StructProcessor.from_pretrained('google/deplot')
    model = Pix2StructForConditionalGeneration.from_pretrained('google/deplot')
    return processor, model


def inference(image_path):
    """Run DePlot on one floorplan image.

    Returns (decoded_text, raw_prediction_tensor).
    """
    from PIL import Image  # lazy: only needed when actually running inference
    image = Image.open(image_path)
    question = "How many living rooms are displayed on this floor plan?"  # not sure if it even has an effect
    processor, model = _load_deplot()

    inputs = processor(images=image, text=question, return_tensors="pt")
    predictions = model.generate(**inputs, max_new_tokens=512)
    output = processor.decode(predictions[0], skip_special_tokens=True)

    return output, predictions


def extract_total_sqm(deplot_input_str):
    """Pull the largest square-metre figure out of the model's text output.

    Returns the figure as a float, or None when nothing matches.
    The old pattern required a decimal point (missing "85 sqm") and left
    the dot in "sq.m" unescaped; both are fixed here.
    """
    sqmregex = r'(\d+(?:\.\d+)?) ?(sqm|sq\.m|sq m|m)'
    matches = re.findall(sqmregex, deplot_input_str.lower())
    if len(matches) == 0:
        return None
    sqms = [float(m[0]) for m in matches]
    return max(sqms)


def calculate(image_path):
    """End-to-end: run inference on the floorplan image and return
    (estimated_sqm, raw_text_output, prediction_tensor)."""
    output, predictions_tensor = inference(image_path)
    # bugfix: the model output was not passed in, which raised TypeError
    # on every call
    estimated_sqm = extract_total_sqm(output)
    return estimated_sqm, output, predictions_tensor
|
|
@ -30,8 +30,7 @@ def detail_query(detail_id: int):
|
|||
|
||||
|
||||
# @cache.memoize()
|
||||
def listing_query(page: int, min_bedrooms: int, max_bedrooms: int, radius: float, min_price: int, max_price: int):
|
||||
print("Querying")
|
||||
def listing_query(page: int, min_bedrooms: int, max_bedrooms: int, radius: float, min_price: int, max_price: int) -> dict:
|
||||
params = {
|
||||
"locationIdentifier": "POSTCODE^4228216",
|
||||
"channel": "BUY",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue