Real crawling scripts and floorplan detection

1. get all listings
2. get all detail jsons
3. get all images
4. get all floorplans
5. detect floorplans

Also updating dependencies for huggingface etc.
This commit is contained in:
Kadir 2024-03-10 18:49:39 +00:00
parent 46bb641026
commit 508aa02812
12 changed files with 1531 additions and 170 deletions

1
crawler/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
data/rs

View file

@ -0,0 +1,22 @@
from rec.query import listing_query
import pathlib
import json

# Crawl the paginated listing search and persist each property as
# data/rs/<identifier>/listing.json.
folder = pathlib.Path("data/rs/")
for page in range(1, 10000):
    print(f"page {page}")
    try:
        # args: page, min_bedrooms, max_bedrooms, radius, min_price, max_price
        d = listing_query(page, 1, 2, 15, 0, 800000)
    except Exception:
        # presumably the API errors once we page past the last result page;
        # stop crawling there instead of silently eating every other error
        break
    for prop in d['properties']:  # 'prop' avoids shadowing the builtin 'property'
        identifier = prop['identifier']
        listing_folder = folder / str(identifier)
        listing_folder.mkdir(exist_ok=True, parents=True)
        with open(listing_folder / "listing.json", 'w') as f:
            json.dump(prop, f)

25
crawler/2_dump_detail.py Normal file
View file

@ -0,0 +1,25 @@
import pathlib
import json
from rec.query import detail_query

# Fetch the detail record for every crawled listing and store it next to the
# listing file as data/rs/<identifier>/detail.json.
folder = pathlib.Path('data/rs/')
for listing_path in folder.glob('*/listing.json'):
    with open(listing_path) as f:
        listing = json.load(f)
    identifier = listing['identifier']
    detail_path = listing_path.parent / 'detail.json'
    if detail_path.exists():
        continue  # already fetched; makes re-runs resumable (matches 3_dump_images)
    try:
        d = detail_query(identifier)
    except Exception:
        # surface which listing broke, then let the error propagate
        print('Failed at: ', identifier)
        raise
    print(identifier)
    with open(detail_path, 'w') as f:
        json.dump(d, f)

42
crawler/3_dump_images.py Normal file
View file

@ -0,0 +1,42 @@
import pathlib
import json
from urllib.request import urlretrieve
from tqdm import tqdm


def _download_images(target_folder, photos, url_key):
    """Download each photo dict's image (photo[url_key]) into target_folder
    as '<order>_<basename>', skipping files that already exist (resumable)."""
    for photo in photos:
        url = photo[url_key]
        picname = url.split('/')[-1]
        fullpicpath = target_folder / f"{photo['order']}_{picname}"
        if fullpicpath.exists():
            continue
        fullpicpath.parent.mkdir(parents=True, exist_ok=True)  # create the target folder
        tqdm.write(str(fullpicpath))
        urlretrieve(url, fullpicpath)


# For every crawled detail.json, pull down the listing photos and floorplans.
folder = pathlib.Path('data/rs/')
details = folder.glob('*/detail.json')
for detail_path in tqdm(list(details)):
    with open(detail_path) as f:
        detail = json.load(f)
    prop = detail['property']
    rsfolder = folder / str(prop['identifier'])
    # photos carry the image under 'maxSizeUrl', floorplans under 'url'
    _download_images(rsfolder / 'pics', prop['photos'], 'maxSizeUrl')
    _download_images(rsfolder / 'floorplans', prop['floorplans'], 'url')

View file

@ -0,0 +1,7 @@
from data_access import Listing
from tqdm import tqdm

# Run floorplan sqm estimation for every crawled listing.
listings = Listing.get_all_listings()
for listing in tqdm(listings):
    # bug fix: calculate_sqm() returns a float (or None), but tqdm.write
    # requires a string — coerce before writing
    tqdm.write(str(listing.calculate_sqm()))

86
crawler/data_access.py Normal file
View file

@ -0,0 +1,86 @@
from dataclasses import dataclass
import json
import pathlib
from typing import List
from rec import floorplan
_DATA_DIR = pathlib.Path('data/rs/')


@dataclass()
class Listing():
    """Filesystem-backed accessor for one crawled listing under data/rs/<id>/."""

    # listing identifier; doubles as the per-listing folder name
    identifier: int

    @staticmethod
    def get_all_listings() -> List['Listing']:
        """Build a Listing for every */listing.json found under _DATA_DIR."""
        listing_paths = sorted(_DATA_DIR.glob('*/listing.json'))
        listings = []
        for listing_path in listing_paths:
            with open(listing_path) as f:
                d = json.load(f)
            listings.append(Listing(d['identifier']))
        return listings

    def path_listing(self) -> pathlib.Path:
        return _DATA_DIR / str(self.identifier)

    def path_listing_json(self) -> pathlib.Path:
        return self.path_listing() / 'listing.json'

    def path_detail_json(self) -> pathlib.Path:
        return self.path_listing() / 'detail.json'

    def path_floorplan_json(self) -> pathlib.Path:
        return self.path_listing() / 'floorplan.json'

    def path_pic_folder(self) -> pathlib.Path:
        return self.path_listing() / 'pics'

    def path_pic_file(self, order, name) -> pathlib.Path:
        # bug fix: this was a second `def path_pic_folder(self, order, name)`,
        # which shadowed the zero-argument version above and broke every
        # internal `self.path_pic_folder()` call
        return self.path_pic_folder() / f'{order}_{name}'

    def path_floorplan_folder(self) -> pathlib.Path:
        return self.path_listing() / 'floorplans'

    def path_floorplan_file(self, order, name) -> pathlib.Path:
        # bug fix: previously built the path under pics/ instead of floorplans/
        return self.path_floorplan_folder() / f'{order}_{name}'

    def list_floorplans(self):
        # todo add check if return is image
        return list(self.path_floorplan_folder().glob('*'))

    def calculate_sqm(self):
        """Run floorplan detection on every floorplan image, cache the results
        in floorplan.json, and return the largest estimated sqm (None when no
        floorplan yielded an estimate)."""
        objs = []
        for floorplan_path in self.list_floorplans():
            estimated_sqm, model_output, predictions = floorplan.calculate(floorplan_path)
            objs.append({
                # str(): pathlib.Path is not JSON-serializable
                'floorplan_path': str(floorplan_path),
                'estimated_sqm': estimated_sqm,
                'model_output': model_output,
                'no_predictions': len(predictions)  # cant serialize the predictions itself since its a tensor
            })
        with open(self.path_floorplan_json(), 'w') as f:
            json.dump(objs, f)
        # bug fix: `if o is None` kept ONLY the Nones; we want to drop them.
        # default=None avoids ValueError when no floorplan produced a number.
        sqms = [o['estimated_sqm'] for o in objs if o['estimated_sqm'] is not None]
        return max(sqms, default=None)

    @property
    def sqm(self):
        """Largest estimated sqm, computed on first access and cached in
        floorplan.json. (The old `recalculate` parameter was unusable — a
        property cannot take arguments — so it has been dropped.)"""
        if not self.path_floorplan_json().exists():
            self.calculate_sqm()
        with open(self.path_floorplan_json()) as f:
            objs = json.load(f)
        sqms = [o['estimated_sqm'] for o in objs if o['estimated_sqm'] is not None]
        return max(sqms, default=None)
# Smoke test: show the floorplan files of the first crawled listing.
if __name__ == '__main__':
    all_listings = Listing.get_all_listings()
    first = all_listings[0]
    print(first.list_floorplans())

1364
crawler/poetry.lock generated

File diff suppressed because it is too large Load diff

View file

@ -5,11 +5,16 @@ description = ""
authors = ["Kadir Tugan <git@k8n.dev>"]
[tool.poetry.dependencies]
python = ">3.8"
python = ">3.11"
SQLAlchemy = "^2.0.23"
requests = "^2.31.0"
cachetools = "^5.3.2"
diskcache = "^5.6.3"
tqdm = "^4.66.2"
pillow = "^10.2.0"
torch = "^2.2.1"
numpy = "^1.26.4"
transformers = "^4.38.2"
[tool.poetry.dev-dependencies]

30
crawler/rec/floorplan.py Normal file
View file

@ -0,0 +1,30 @@
import re
from PIL import Image
from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration
def inference(image_path):
    """Run the DePlot (Pix2Struct) model on a floorplan image.

    Returns (output, predictions): the decoded text of the first generated
    sequence, and the raw generated token tensor.

    NOTE(review): the processor and model are re-downloaded/re-loaded from the
    hub on every call — consider hoisting or caching if this runs per image.
    """
    image = Image.open(image_path)
    question = "How many living rooms are displayed on this floor plan?" # not sure if it even has an effect
    processor = Pix2StructProcessor.from_pretrained('google/deplot')
    model = Pix2StructForConditionalGeneration.from_pretrained('google/deplot')
    inputs = processor(images=image, text=question, return_tensors="pt")
    # cap generation length; 512 tokens is plenty for a floorplan table dump
    predictions = model.generate(**inputs, max_new_tokens=512)
    output = processor.decode(predictions[0], skip_special_tokens=True)
    return output, predictions
def extract_total_sqm(deplot_input_str):
    """Extract the largest square-metre figure from the model's text output.

    Accepts integer or decimal numbers followed by an optional space and a
    unit ('sqm', 'sq.m', 'sq m' or bare 'm'). Returns the maximum as a float,
    or None when nothing matches.
    """
    # bug fix: the old pattern (\d+\.\d*) required a decimal point, so plain
    # integers like '85 sqm' never matched; the unescaped dot in 'sq.m' also
    # matched any character.
    sqmregex = r'(\d+(?:\.\d*)?) ?(sqm|sq\.m|sq m|m)'
    matches = re.findall(sqmregex, deplot_input_str.lower())
    if not matches:
        return None
    return max(float(m[0]) for m in matches)
def calculate(image_path):
    """Run floorplan inference and parse an sqm estimate from its output.

    Returns (estimated_sqm, output, predictions_tensor); estimated_sqm is
    None when no sqm figure can be found in the decoded model output.
    """
    output, predictions_tensor = inference(image_path)
    # bug fix: extract_total_sqm was called with no argument (TypeError);
    # it must be given the decoded model output to parse
    estimated_sqm = extract_total_sqm(output)
    return estimated_sqm, output, predictions_tensor

View file

@ -30,8 +30,7 @@ def detail_query(detail_id: int):
# @cache.memoize()
def listing_query(page: int, min_bedrooms: int, max_bedrooms: int, radius: float, min_price: int, max_price: int):
print("Querying")
def listing_query(page: int, min_bedrooms: int, max_bedrooms: int, radius: float, min_price: int, max_price: int) -> dict:
params = {
"locationIdentifier": "POSTCODE^4228216",
"channel": "BUY",