Flatten repo structure: move crawler/ to root, remove vqa/ and immoweb/

The crawler subdirectory was the only active project. Moving it to the repo root simplifies paths and removes the unnecessary nesting. The vqa/ and immoweb/ directories were legacy/unused and have been removed. Updated .drone.yml, .gitignore, .claude/ docs, and skills to reflect the new flat structure.
2026-02-07 23:01:20 +00:00 · 2026-02-07 23:01:20 +00:00 · eafbc1ac52
commit eafbc1ac52
parent e2247be700
221 changed files with 70 additions and 146140 deletions
--- a/services/listing_fetcher.py
+++ b/services/listing_fetcher.py
@ -0,0 +1,211 @@
+"""Listing fetcher service - fetches listing data from Rightmove API."""
+import asyncio
+import logging
+
+from config.scraper_config import ScraperConfig
+from listing_processor import ListingProcessor
+from rec.query import create_session, listing_query
+from rec.exceptions import CircuitBreakerOpenError, ThrottlingError
+from rec.throttle_detector import get_throttle_metrics, reset_throttle_metrics
+from models.listing import Listing, QueryParameters
+from repositories import ListingRepository
+from services.query_splitter import QuerySplitter, SubQuery
+
+# Module logger. It is bound to the "uvicorn.error" logger name rather than
+# to this module's own name -- presumably so messages show up alongside the
+# server's own log output; confirm against the logging configuration.
+logger = logging.getLogger("uvicorn.error")
+
+# Number of concurrent workers that process listing details (fetch details,
+# download images, run OCR) from the streaming queue in parallel.
+# The producer in dump_listings also enqueues exactly this many ``None``
+# sentinels, one per worker, to shut the workers down.
+NUM_WORKERS = 20
+
+
+async def dump_listings_full(
+    parameters: QueryParameters,
+    repository: ListingRepository,
+) -> list[Listing]:
+    """Fetches all listings, images as well as detects floorplans."""
+    new_listings = await dump_listings(parameters, repository)
+    logger.debug(f"Upserted {len(new_listings)} new listings")
+    new_listing_ids = [listing.id for listing in new_listings]
+    return await repository.get_listings(only_ids=new_listing_ids)
+
+
+async def _fetch_subquery(
+    sq: SubQuery,
+    parameters: QueryParameters,
+    session: object,
+    config: ScraperConfig,
+    semaphore: asyncio.Semaphore,
+    existing_ids: set[int],
+    queue: asyncio.Queue[int | None],
+) -> int:
+    """Fetch listing IDs for a single subquery and enqueue new ones.
+
+    Iterates through pages of results for the given subquery, adding any
+    newly discovered listing IDs to the processing queue.
+
+    Args:
+        sq: The subquery to fetch results for.
+        parameters: The original query parameters (for page_size, etc.).
+        session: The aiohttp session for making requests.
+        config: Scraper configuration.
+        semaphore: Concurrency limiter for HTTP requests.
+        existing_ids: Set of already-known listing IDs (mutated in place).
+        queue: Queue to push new listing IDs onto for processing.
+
+    Returns:
+        The number of new IDs discovered and enqueued.
+    """
+    estimated = sq.estimated_results or 0
+    if estimated == 0:
+        return 0
+
+    ids_found = 0
+    page_size = parameters.page_size
+    max_pages = min(
+        config.max_pages_per_query,
+        (estimated // page_size) + 1,
+    )
+
+    for page_id in range(1, max_pages + 1):
+        async with semaphore:
+            await asyncio.sleep(config.request_delay_ms / 1000)
+            try:
+                result = await listing_query(
+                    page=page_id,
+                    channel=parameters.listing_type,
+                    min_bedrooms=sq.min_bedrooms,
+                    max_bedrooms=sq.max_bedrooms,
+                    radius=parameters.radius,
+                    min_price=sq.min_price,
+                    max_price=sq.max_price,
+                    district=sq.district,
+                    page_size=page_size,
+                    max_days_since_added=parameters.max_days_since_added,
+                    furnish_types=parameters.furnish_types or [],
+                    session=session,
+                    config=config,
+                )
+
+                # Extract and enqueue new IDs inline
+                properties = result.get("properties", [])
+                for prop in properties:
+                    identifier = prop.get("identifier")
+                    if identifier and identifier not in existing_ids:
+                        existing_ids.add(identifier)
+                        ids_found += 1
+                        await queue.put(identifier)
+
+                if len(properties) < page_size:
+                    break
+
+            except CircuitBreakerOpenError as e:
+                logger.error(f"Circuit breaker open: {e}")
+                break
+            except ThrottlingError as e:
+                logger.warning(
+                    f"Throttling error on page {page_id} for "
+                    f"{sq.district}: {e}"
+                )
+                break
+            except Exception as e:
+                # Rightmove returns GENERIC_ERROR when requesting pages
+                # past the last page of results. This is expected behavior
+                # and signals we've exhausted this subquery's results.
+                if "GENERIC_ERROR" in str(e):
+                    logger.debug(
+                        f"Max page for {sq.district}: {page_id - 1}"
+                    )
+                    break
+                logger.warning(
+                    f"Error fetching page {page_id} for "
+                    f"{sq.district}: {e}"
+                )
+                break
+
+    return ids_found
+
+
+async def dump_listings(
+    parameters: QueryParameters,
+    repository: ListingRepository,
+) -> list[Listing]:
+    """Fetch listings from Rightmove API and process them.
+
+    Uses intelligent query splitting and a streaming pipeline so that
+    listing processing starts as soon as IDs become available.
+    """
+    config = ScraperConfig.from_env()
+    splitter = QuerySplitter(config)
+
+    # Reset throttle metrics at start
+    reset_throttle_metrics()
+
+    try:
+        async with create_session(config) as session:
+            # Phase 1: Split and probe queries
+            logger.info("Splitting query and probing result counts...")
+            subqueries = await splitter.split(parameters, session)
+
+            total_estimated = splitter.calculate_total_estimated_results(subqueries)
+            logger.info(
+                f"Split into {len(subqueries)} subqueries, "
+                f"estimated {total_estimated} total results"
+            )
+
+            # Load existing IDs (fast, ID-only projection)
+            existing_ids = repository.get_listing_ids(parameters.listing_type)
+            logger.info(f"Found {len(existing_ids)} existing listings in DB")
+
+            # Phase 2: Streaming fetch & process
+            queue: asyncio.Queue[int | None] = asyncio.Queue()
+            semaphore = asyncio.Semaphore(config.max_concurrent_requests)
+            processed_listings: list[Listing] = []
+
+            async def producer() -> int:
+                """Fetch all subqueries and send sentinel values to workers."""
+                tasks = [
+                    _fetch_subquery(
+                        sq, parameters, session, config,
+                        semaphore, existing_ids, queue,
+                    )
+                    for sq in subqueries
+                ]
+                counts = await asyncio.gather(*tasks)
+                ids_collected = sum(counts)
+                logger.info(f"Fetch complete: {ids_collected} new IDs found")
+                for _ in range(NUM_WORKERS):
+                    await queue.put(None)
+                return ids_collected
+
+            async def worker() -> None:
+                while True:
+                    listing_id = await queue.get()
+                    if listing_id is None:
+                        break
+                    listing_processor = ListingProcessor(repository)
+                    listing = await listing_processor.process_listing(listing_id)
+                    if listing is not None:
+                        processed_listings.append(listing)
+
+            results = await asyncio.gather(
+                producer(),
+                *[worker() for _ in range(NUM_WORKERS)],
+            )
+            ids_collected = results[0]
+
+    except CircuitBreakerOpenError as e:
+        logger.error(f"Circuit breaker prevented listing fetch: {e}")
+        logger.info(get_throttle_metrics().summary())
+        return []
+    finally:
+        # Log throttle metrics at end
+        metrics = get_throttle_metrics()
+        if metrics.total_requests > 0:
+            logger.info("\n" + metrics.summary())
+
+    logger.info(
+        f"Processed {len(processed_listings)} new listings "
+        f"({ids_collected} total found)"
+    )
+
+    return processed_listings