# wrongmove/crawler/services/listing_fetcher.py

"""Listing fetcher service - fetches listing data from Rightmove API."""
import asyncio
import logging

from config.scraper_config import ScraperConfig
from listing_processor import ListingProcessor
from rec.query import create_session, listing_query
from rec.exceptions import CircuitBreakerOpenError, ThrottlingError
from rec.throttle_detector import get_throttle_metrics, reset_throttle_metrics
from models.listing import Listing, QueryParameters
from repositories import ListingRepository
from services.query_splitter import QuerySplitter, SubQuery

# Log through the "uvicorn.error" logger rather than a module-named logger.
logger = logging.getLogger("uvicorn.error")

# Number of concurrent worker coroutines that dump_listings starts to consume
# listing IDs from the streaming queue. Each worker hands every ID to
# ListingProcessor.process_listing (fetch details, download images, run OCR).
# The producer enqueues this many None sentinels to shut the workers down.
NUM_WORKERS = 20


async def dump_listings_full(
    parameters: QueryParameters,
    repository: ListingRepository,
) -> list[Listing]:
    """Run a full listing dump and return the new listings from the repository.

    Delegates the fetch/processing work to ``dump_listings`` and then
    re-reads the resulting listings from the repository by their IDs.

    Args:
        parameters: Query parameters forwarded to ``dump_listings``.
        repository: Repository used for processing and for the final lookup.

    Returns:
        Whatever ``repository.get_listings`` returns for the IDs of the
        listings produced by this run.
    """
    new_listings = await dump_listings(parameters, repository)
    logger.debug(f"Upserted {len(new_listings)} new listings")
    # Look the listings up again by ID instead of returning the in-memory
    # objects handed back by the pipeline.
    return await repository.get_listings(
        only_ids=[entry.id for entry in new_listings],
    )


async def _fetch_subquery(
    sq: SubQuery,
    parameters: QueryParameters,
    session: object,
    config: ScraperConfig,
    semaphore: asyncio.Semaphore,
    existing_ids: set[int],
    queue: asyncio.Queue[int | None],
) -> int:
    """Page through one subquery and queue every listing ID not seen before.

    Pages are requested one at a time, each under ``semaphore`` and after a
    ``config.request_delay_ms`` pause. Paging stops at the first short page,
    at the computed page cap, or on the first error of any kind (errors are
    logged, never raised to the caller).

    Args:
        sq: Subquery providing the bedroom/price/district filters and the
            estimated result count.
        parameters: Original query parameters (page size, listing type,
            radius, recency and furnishing filters).
        session: HTTP session object passed through to ``listing_query``.
        config: Scraper configuration (page cap, request delay).
        semaphore: Limits how many requests run concurrently.
        existing_ids: Known listing IDs; newly found IDs are added in place.
        queue: Receives each newly discovered listing ID.

    Returns:
        The number of new IDs discovered and enqueued.
    """
    expected_total = sq.estimated_results or 0
    if not expected_total:
        # Nothing is expected for this subquery, so skip the request entirely.
        return 0

    enqueued = 0
    per_page = parameters.page_size
    # One page more than the estimate strictly needs, capped by the config.
    page_cap = min(
        config.max_pages_per_query,
        expected_total // per_page + 1,
    )

    for page_id in range(1, page_cap + 1):
        async with semaphore:
            await asyncio.sleep(config.request_delay_ms / 1000)
            try:
                response = await listing_query(
                    page=page_id,
                    channel=parameters.listing_type,
                    min_bedrooms=sq.min_bedrooms,
                    max_bedrooms=sq.max_bedrooms,
                    radius=parameters.radius,
                    min_price=sq.min_price,
                    max_price=sq.max_price,
                    district=sq.district,
                    page_size=per_page,
                    max_days_since_added=parameters.max_days_since_added,
                    furnish_types=parameters.furnish_types or [],
                    session=session,
                    config=config,
                )

                page_items = response.get("properties", [])
                for item in page_items:
                    listing_id = item.get("identifier")
                    # Skip entries without an ID and IDs already known.
                    if not listing_id or listing_id in existing_ids:
                        continue
                    existing_ids.add(listing_id)
                    enqueued += 1
                    await queue.put(listing_id)

                # A short page means the result set is exhausted.
                if len(page_items) < per_page:
                    break

            except CircuitBreakerOpenError as e:
                logger.error(f"Circuit breaker open: {e}")
                break
            except ThrottlingError as e:
                logger.warning(
                    f"Throttling error on page {page_id} for "
                    f"{sq.district}: {e}"
                )
                break
            except Exception as e:
                # Rightmove answers with GENERIC_ERROR for pages beyond the
                # last one; that is the normal end-of-results signal, so it
                # is only logged at debug level. Anything else is a warning.
                # Either way this subquery is finished.
                if "GENERIC_ERROR" in str(e):
                    logger.debug(
                        f"Max page for {sq.district}: {page_id - 1}"
                    )
                else:
                    logger.warning(
                        f"Error fetching page {page_id} for "
                        f"{sq.district}: {e}"
                    )
                break

    return enqueued


async def _gather_cancelling_stragglers(*aws) -> list:
    """Await all awaitables concurrently without leaving stragglers behind.

    A plain ``asyncio.gather`` propagates the first exception but leaves the
    remaining awaitables running in the background. This wrapper cancels
    whatever is still unfinished and waits for it to wind down before the
    exception (or cancellation) reaches the caller.

    Args:
        *aws: Coroutines or futures to run concurrently.

    Returns:
        The results, in the same order as ``aws``.

    Raises:
        Exception: Whatever the first failing awaitable raised.
    """
    tasks = [asyncio.ensure_future(aw) for aw in aws]
    try:
        return await asyncio.gather(*tasks)
    finally:
        unfinished = [task for task in tasks if not task.done()]
        for task in unfinished:
            task.cancel()
        if unfinished:
            # return_exceptions=True keeps the CancelledErrors of the
            # cancelled tasks from masking the original exception.
            await asyncio.gather(*unfinished, return_exceptions=True)


async def dump_listings(
    parameters: QueryParameters,
    repository: ListingRepository,
) -> list[Listing]:
    """Fetch listings from Rightmove API and process them.

    Uses intelligent query splitting and a streaming pipeline so that
    listing processing starts as soon as IDs become available: one producer
    fetches all subqueries and feeds new listing IDs into a queue, while
    ``NUM_WORKERS`` workers consume the queue and process each listing.

    If the producer or any worker fails, the remaining pipeline tasks are
    cancelled before the session is closed, so no task is left blocked on
    the queue or using a closed session.

    Args:
        parameters: Query parameters describing what to fetch.
        repository: Repository used to look up known listing IDs and handed
            to each ``ListingProcessor``.

    Returns:
        The listings processed during this run, or an empty list when a
        ``CircuitBreakerOpenError`` aborted the fetch.

    Raises:
        Exception: Any error other than ``CircuitBreakerOpenError`` raised
            by the splitter, the producer or a worker is propagated.
    """
    config = ScraperConfig.from_env()
    splitter = QuerySplitter(config)

    # Reset throttle metrics at start
    reset_throttle_metrics()

    # Defined up front so the names exist regardless of where the try block
    # is left.
    processed_listings: list[Listing] = []
    ids_collected = 0

    try:
        async with create_session(config) as session:
            # Phase 1: Split and probe queries
            logger.info("Splitting query and probing result counts...")
            subqueries = await splitter.split(parameters, session)

            total_estimated = splitter.calculate_total_estimated_results(subqueries)
            logger.info(
                f"Split into {len(subqueries)} subqueries, "
                f"estimated {total_estimated} total results"
            )

            # Load existing IDs (fast, ID-only projection)
            existing_ids = repository.get_listing_ids(parameters.listing_type)
            logger.info(f"Found {len(existing_ids)} existing listings in DB")

            # Phase 2: Streaming fetch & process
            queue: asyncio.Queue[int | None] = asyncio.Queue()
            semaphore = asyncio.Semaphore(config.max_concurrent_requests)

            async def producer() -> int:
                """Fetch all subqueries and send sentinel values to workers."""
                counts = await _gather_cancelling_stragglers(
                    *(
                        _fetch_subquery(
                            sq, parameters, session, config,
                            semaphore, existing_ids, queue,
                        )
                        for sq in subqueries
                    )
                )
                found = sum(counts)
                logger.info(f"Fetch complete: {found} new IDs found")
                # One sentinel per worker so every worker terminates.
                for _ in range(NUM_WORKERS):
                    await queue.put(None)
                return found

            async def worker() -> None:
                """Process queued listing IDs until a None sentinel arrives."""
                while True:
                    listing_id = await queue.get()
                    if listing_id is None:
                        break
                    listing_processor = ListingProcessor(repository)
                    listing = await listing_processor.process_listing(listing_id)
                    if listing is not None:
                        processed_listings.append(listing)

            # The producer comes first, so its return value is results[0].
            results = await _gather_cancelling_stragglers(
                producer(),
                *[worker() for _ in range(NUM_WORKERS)],
            )
            ids_collected = results[0]

    except CircuitBreakerOpenError as e:
        logger.error(f"Circuit breaker prevented listing fetch: {e}")
        logger.info(get_throttle_metrics().summary())
        return []
    finally:
        # Log throttle metrics at end
        metrics = get_throttle_metrics()
        if metrics.total_requests > 0:
            logger.info("\n" + metrics.summary())

    logger.info(
        f"Processed {len(processed_listings)} new listings "
        f"({ids_collected} total found)"
    )

    return processed_listings