# wrongmove/crawler/services/listing_fetcher.py
"""Listing fetcher service - fetches listing data from Rightmove API."""
import asyncio
import logging
from typing import Any
from config.scraper_config import ScraperConfig
from listing_processor import ListingProcessor
from rec.query import create_session, listing_query
from models.listing import QueryParameters
from repositories import ListingRepository
from tqdm.asyncio import tqdm
from models import Listing as modelListing
from services.query_splitter import QuerySplitter, SubQuery
# Module-level logger; attaches to uvicorn's error logger so output is routed
# through uvicorn's configured handlers/formatters.
logger = logging.getLogger("uvicorn.error")
async def dump_listings_full(
    parameters: QueryParameters,
    repository: ListingRepository,
) -> list[modelListing]:
    """Fetches all listings, images as well as detects floorplans.

    Runs the full pipeline via :func:`dump_listings`, then re-reads the
    freshly upserted rows from the repository so the returned objects are
    the persisted versions.

    Args:
        parameters: Search parameters forwarded to the fetch pipeline.
        repository: Persistence layer used to re-read the stored listings.

    Returns:
        The newly upserted listings, as re-fetched from the repository.
    """
    new_listings = await dump_listings(parameters, repository)
    logger.debug(f"Upserted {len(new_listings)} new listings")
    # BUG FIX: the membership test must compare ids to ids. The previous
    # `x.id in new_listings` checked an int against a list of Listing model
    # objects, which never matches, so the function always returned [].
    new_ids = {listing.id for listing in new_listings}
    # refresh listings
    listings = await repository.get_listings(parameters)  # this can be better
    return [x for x in listings if x.id in new_ids]
async def dump_listings(
    parameters: QueryParameters,
    repository: ListingRepository,
) -> list[modelListing]:
    """Fetch listings from Rightmove API and process them.

    Uses intelligent query splitting to maximize data extraction
    while respecting Rightmove's result caps.

    Args:
        parameters: Search parameters (channel, radius, page size,
            max days since added, furnish types, ...).
        repository: Persistence layer used to skip already-stored listings
            and passed through to the listing processor.

    Returns:
        The successfully processed listings that were not already present
        in the repository.
    """
    config = ScraperConfig.from_env()
    splitter = QuerySplitter(config)

    async with create_session(config) as session:
        # Phase 1 & 2: Split and probe queries
        logger.info("Splitting query and probing result counts...")
        subqueries = await splitter.split(parameters, session)
        total_estimated = splitter.calculate_total_estimated_results(subqueries)
        logger.info(
            f"Split into {len(subqueries)} subqueries, "
            f"estimated {total_estimated} total results"
        )

        # Phase 3: Fetch all pages for each subquery, bounded by a shared
        # semaphore so at most `max_concurrent_requests` are in flight.
        semaphore = asyncio.Semaphore(config.max_concurrent_requests)

        async def fetch_subquery(sq: SubQuery) -> list[dict[str, Any]]:
            """Fetch all pages for a single subquery."""
            results: list[dict[str, Any]] = []
            estimated = sq.estimated_results or 0
            if estimated == 0:
                return results
            page_size = parameters.page_size
            # Cap paging both by config and by the probed result estimate.
            max_pages = min(
                config.max_pages_per_query,
                (estimated // page_size) + 1,
            )
            for page_id in range(1, max_pages + 1):
                async with semaphore:
                    # Throttle each request to respect the remote rate limit.
                    await asyncio.sleep(config.request_delay_ms / 1000)
                    try:
                        result = await listing_query(
                            page=page_id,
                            channel=parameters.listing_type,
                            min_bedrooms=sq.min_bedrooms,
                            max_bedrooms=sq.max_bedrooms,
                            radius=parameters.radius,
                            min_price=sq.min_price,
                            max_price=sq.max_price,
                            district=sq.district,
                            page_size=page_size,
                            max_days_since_added=parameters.max_days_since_added,
                            furnish_types=parameters.furnish_types or [],
                            session=session,
                        )
                        results.append(result)
                        properties = result.get("properties", [])
                        # A short page means this was the last page.
                        if len(properties) < page_size:
                            break
                    except Exception as e:
                        # NOTE(review): the API appears to report
                        # past-the-end pages as "GENERIC_ERROR" — treated
                        # as normal end-of-results, not a failure.
                        if "GENERIC_ERROR" in str(e):
                            logger.debug(
                                f"Max page for {sq.district}: {page_id - 1}"
                            )
                            break
                        # Best-effort: keep whatever pages we already have.
                        logger.warning(
                            f"Error fetching page {page_id} for {sq.district}: {e}"
                        )
                        break
            return results

        # Fetch all subqueries with progress bar
        all_results = await tqdm.gather(
            *[fetch_subquery(sq) for sq in subqueries],
            desc="Fetching listings",
        )

    # Extract listing identifiers from results
    listing_ids: list[int] = []
    for subquery_results in all_results:
        for response_json in subquery_results:
            if not response_json:
                continue
            if response_json.get("totalAvailableResults", 0) == 0:
                continue
            for property_data in response_json.get("properties", []):
                identifier = property_data.get("identifier")
                if identifier:
                    listing_ids.append(identifier)
    logger.info(f"Found {len(listing_ids)} total listings")

    # Deduplicate
    unique_ids = list(set(listing_ids))
    logger.info(f"After deduplication: {len(unique_ids)} unique listings")

    # Filter out listings already in database. Use a set for membership:
    # the previous list-based `not in` was O(n) per id, O(n^2) overall.
    existing_ids = {x.id for x in await repository.get_listings()}
    missing_ids = [
        listing_id for listing_id in unique_ids if listing_id not in existing_ids
    ]

    listing_processor = ListingProcessor(repository)
    logger.info(f"Starting processing {len(missing_ids)} new listings")
    processed_listings = await tqdm.gather(
        *[listing_processor.process_listing(id) for id in missing_ids]
    )
    # process_listing returns None on failure; drop those.
    filtered_listings = [x for x in processed_listings if x is not None]
    return filtered_listings