Flatten repo structure: move crawler/ to root, remove vqa/ and immoweb/

The crawler subdirectory was the only active project. Moving it to the repo root simplifies paths and removes the unnecessary nesting. The vqa/ and immoweb/ directories were legacy/unused and have been removed. Updated .drone.yml, .gitignore, .claude/ docs, and skills to reflect the new flat structure.
2026-02-07 23:01:20 +00:00 · 2026-02-07 23:01:20 +00:00 · eafbc1ac52
commit eafbc1ac52
parent e2247be700
221 changed files with 70 additions and 146140 deletions
--- a/services/image_fetcher.py
+++ b/services/image_fetcher.py
@ -0,0 +1,88 @@
+"""Image fetcher service - downloads floorplan images for listings."""
+import asyncio
+import logging
+from pathlib import Path
+from urllib.parse import urlparse
+
+import aiohttp
+from repositories import ListingRepository
+from tenacity import retry, stop_after_attempt, wait_random
+from tqdm.asyncio import tqdm
+
+from models import Listing
+
+logger = logging.getLogger(__name__)
+
+# Upper bound on simultaneous image downloads, enforced via `semaphore` below.
+# Setting this too high either crashes Rightmove or gets us blocked.
+MAX_CONCURRENT_DOWNLOADS = 5
+semaphore = asyncio.Semaphore(MAX_CONCURRENT_DOWNLOADS)
+
+
+async def dump_images(
+    repository: ListingRepository,
+    image_base_path: Path = Path("data/rs/"),
+) -> None:
+    """Download floorplan images for all listings."""
+    listings = await repository.get_listings()
+    async with aiohttp.ClientSession() as session:
+        updated_listings = await tqdm.gather(
+            *[
+                dump_images_for_listing(listing, image_base_path, session=session)
+                for listing in listings
+            ]
+        )
+    await repository.upsert_listings(
+        [listing for listing in updated_listings if listing is not None]
+    )
+
+
+@retry(wait=wait_random(min=1, max=2), stop=stop_after_attempt(3))
+async def dump_images_for_listing(
+    listing: Listing,
+    base_path: Path,
+    session: aiohttp.ClientSession | None = None,
+) -> Listing | None:
+    """Download floorplan images for a single listing."""
+    all_floorplans = listing.additional_info.get("property", {}).get("floorplans", [])
+    for floorplan in all_floorplans:
+        url = floorplan["url"]
+        picname = Path(urlparse(url).path).name
+        floorplan_path = Path(base_path, str(listing.id), "floorplans", picname)
+        if floorplan_path.exists():
+            continue
+        try:
+            owns_session = session is None
+            active_session = session or aiohttp.ClientSession()
+            try:
+                async with semaphore:
+                    async with active_session.get(url) as response:
+                        if response.status == 404:
+                            logger.warning(
+                                "Listing %s: floorplan not found (404) at %s",
+                                listing.id,
+                                url,
+                            )
+                            return None
+                        if response.status != 200:
+                            raise Exception(
+                                f"Error downloading floorplan for listing {listing.id} "
+                                f"from {url}: HTTP {response.status}"
+                            )
+                        floorplan_path.parent.mkdir(parents=True, exist_ok=True)
+                        with open(floorplan_path, "wb") as f:
+                            f.write(await response.read())
+                        listing.floorplan_image_paths.append(str(floorplan_path))
+                        return listing
+            finally:
+                if owns_session:
+                    await active_session.close()
+        except Exception as e:
+            logger.error(
+                "Listing %s: error downloading floorplan from %s: %s",
+                listing.id,
+                url,
+                e,
+            )
+            raise
+    return None