Flatten repo structure: move crawler/ to root, remove vqa/ and immoweb/

The crawler subdirectory was the only active project. Moving it to the repo root simplifies paths and removes the unnecessary nesting. The vqa/ and immoweb/ directories were legacy/unused and have been removed. Updated .drone.yml, .gitignore, .claude/ docs, and skills to reflect the new flat structure.
2026-02-07 23:01:20 +00:00 · 2026-02-07 23:01:20 +00:00 · eafbc1ac52
commit eafbc1ac52
parent e2247be700
221 changed files with 70 additions and 146140 deletions
--- a/rec/floorplan.py
+++ b/rec/floorplan.py
@ -0,0 +1,67 @@
+import logging
+import re
+from pathlib import Path
+from typing import Any
+from PIL import Image
+import cv2
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+# Plausibility bounds (both exclusive) for a total floor area in square metres.
+# extract_total_sqm() discards any parsed value that is not strictly between them.
+MIN_SQM = 30
+MAX_SQM = 160
+
+
+def inference(image_path: str | Path) -> tuple[str, Any]:
+    """Run the ``google/deplot`` Pix2Struct model on an image.
+
+    The processor and the model are both loaded with ``from_pretrained`` on
+    every call.
+
+    Args:
+        image_path: Location of the image file to feed to the model.
+
+    Returns:
+        A ``(output, predictions)`` tuple: the first generated sequence decoded
+        to text with special tokens skipped, and the raw value returned by
+        ``model.generate``.
+    """
+    # Function-scope import: transformers is only needed when inference runs.
+    from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration
+
+    # Open the image in a context manager so the underlying file handle is
+    # closed once the processor has turned it into model inputs.
+    with Image.open(image_path) as image:
+        question = "How many living rooms are displayed on this floor plan?"  # not sure if it even has an effect
+        processor = Pix2StructProcessor.from_pretrained("google/deplot")
+        model = Pix2StructForConditionalGeneration.from_pretrained("google/deplot")
+
+        inputs = processor(images=image, text=question, return_tensors="pt")
+
+    predictions = model.generate(**inputs, max_new_tokens=512)
+    output = processor.decode(predictions[0], skip_special_tokens=True)
+
+    return output, predictions
+
+
+def extract_total_sqm(input_str: str) -> float | None:
+    """Extract the largest plausible square-metre figure from free text.
+
+    The text is lower-cased and searched for numbers followed by ``sqm``,
+    ``sq m``, ``sq.m`` or ``sq. m`` (with an optional space before the unit).
+    Only values strictly between ``MIN_SQM`` and ``MAX_SQM`` are kept.
+
+    Args:
+        input_str: Text to search, e.g. OCR output.
+
+    Returns:
+        The largest value inside the plausible range, or ``None`` when no
+        match falls inside it.
+    """
+    # Note: can be used on the output of inference() to extract sqm from model predictions.
+    # The period after "sq" is escaped and optional; left unescaped it would
+    # match any character (e.g. "sqxm"), which was never the intent.
+    sqm_regex = r"(\d+\.?\d*) ?sq\.? ?m"
+    values = (float(number) for number in re.findall(sqm_regex, input_str.lower()))
+    plausible = [value for value in values if MIN_SQM < value < MAX_SQM]
+    return max(plausible, default=None)
+
+
+def improve_img_for_ocr(img: Image.Image) -> Image.Image:
+    """Preprocess an image before handing it to OCR.
+
+    The image is converted to 8-bit grayscale, enlarged by a factor of 1.2 in
+    both directions with bicubic interpolation, and binarized with a Gaussian
+    adaptive threshold.
+
+    Args:
+        img: Source image.
+
+    Returns:
+        A new image built from the thresholded array.
+    """
+    gray = img.convert("L")
+    pixels = np.array(gray)
+    upscaled = cv2.resize(
+        pixels,
+        None,  # no explicit output size; the scale factors below decide it
+        fx=1.2,
+        fy=1.2,
+        interpolation=cv2.INTER_CUBIC,
+    )
+    binarized = cv2.adaptiveThreshold(
+        upscaled,
+        255,  # value assigned to pixels that pass the threshold
+        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+        cv2.THRESH_BINARY,
+        11,  # neighbourhood size used to compute the local threshold
+        2,  # constant subtracted from the weighted local mean
+    )
+    return Image.fromarray(binarized)
+
+
+def calculate_ocr(image_path: str | Path) -> tuple[float | None, str]:
+    """OCR an image and estimate the total square metres mentioned in it.
+
+    Tesseract is first run on the image as-is. If ``extract_total_sqm`` finds
+    no plausible value in that text, the image is preprocessed with
+    ``improve_img_for_ocr`` and OCR is run a second time; the result of that
+    second pass is returned even if it is also ``None``.
+
+    Args:
+        image_path: Location of the image file.
+
+    Returns:
+        A ``(estimated_sqm, text)`` tuple holding the extracted value (or
+        ``None``) and the OCR text it was extracted from.
+
+    Raises:
+        FileNotFoundError: If ``image_path`` does not exist.
+    """
+    # Function-scope import: pytesseract is only needed when OCR runs.
+    import pytesseract
+
+    path = Path(image_path)
+    if not path.exists():
+        raise FileNotFoundError(f"Image not found: {image_path}")
+
+    # Context manager guarantees the image file handle is released on every
+    # exit path, including when OCR raises.
+    with Image.open(path) as img:
+        text = pytesseract.image_to_string(img)
+        estimated_sqm = extract_total_sqm(text)
+        if estimated_sqm is not None:
+            return estimated_sqm, text
+
+        # First pass found nothing usable: retry on a preprocessed copy.
+        improved_img = improve_img_for_ocr(img)
+
+    text2 = pytesseract.image_to_string(improved_img)
+    estimated_sqm2 = extract_total_sqm(text2)
+    # Lazy %-formatting: the message is only built when DEBUG is enabled.
+    logger.debug("before: %s after: %s - %s", estimated_sqm, estimated_sqm2, image_path)
+    return estimated_sqm2, text2