From 29ba73906345895ebf8472b288c28acea44a7cdd Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Mon, 2 Feb 2026 20:08:03 +0000 Subject: [PATCH 1/5] Improve login UI with error handling and callback page --- crawler/frontend/src/App.tsx | 16 +- crawler/frontend/src/auth/authService.ts | 33 +++- crawler/frontend/src/auth/errors.ts | 60 +++++++ .../frontend/src/components/AuthCallback.tsx | 111 +++++++++++++ .../frontend/src/components/LoginModal.tsx | 148 ++++++++++++++---- 5 files changed, 324 insertions(+), 44 deletions(-) create mode 100644 crawler/frontend/src/auth/errors.ts create mode 100644 crawler/frontend/src/components/AuthCallback.tsx diff --git a/crawler/frontend/src/App.tsx b/crawler/frontend/src/App.tsx index 08be85c..5e90bdf 100644 --- a/crawler/frontend/src/App.tsx +++ b/crawler/frontend/src/App.tsx @@ -1,9 +1,10 @@ import type { User } from 'oidc-client-ts'; import { useEffect, useState, useRef, useCallback } from 'react'; import './App.css'; -import { getUser, handleCallback } from './auth/authService'; +import { getUser } from './auth/authService'; import AlertError from './components/AlertError'; import LoginModal from './components/LoginModal'; +import AuthCallback from './components/AuthCallback'; import { Map } from './components/Map'; import { FilterPanel, type ParameterValues, DEFAULT_FILTER_VALUES } from './components/FilterPanel'; import { Header } from './components/Header'; @@ -34,15 +35,12 @@ function App() { // Ref to track if initial load has been triggered const initialLoadTriggeredRef = useRef(false); - useEffect(() => { - // Check if this is a callback from Authentik (after login) - if (window.location.pathname === '/callback') { - handleCallback().then(() => { - window.location.href = '/'; // Redirect to home after login - }); - return; - } + // Check if this is the callback route - render dedicated component + if (window.location.pathname === '/callback') { + return ; + } + useEffect(() => { // Load user data getUser().then(setUser); }, []); diff --git a/crawler/frontend/src/auth/authService.ts b/crawler/frontend/src/auth/authService.ts index 726dbd3..c63b253 100644 --- a/crawler/frontend/src/auth/authService.ts +++ b/crawler/frontend/src/auth/authService.ts @@ -1,11 +1,36 @@ import { User, UserManager } from 'oidc-client-ts'; import { oidcConfig } from './config'; +import { parseOidcError, type AuthError } from './errors'; const userManager = new UserManager(oidcConfig); -export const login = () => userManager.signinRedirect(); -export const logout = () => userManager.signoutRedirect(); -export const handleCallback = () => userManager.signinRedirectCallback(); +export const login = async (): Promise => { + try { + await userManager.signinRedirect(); + } catch (error) { + console.error('Login redirect failed:', error); + throw parseOidcError(error); + } +}; + +export const logout = async (): Promise => { + try { + await userManager.signoutRedirect(); + } catch (error) { + console.error('Logout redirect failed:', error); + throw parseOidcError(error); + } +}; + +export const handleCallback = async (): Promise => { + try { + const user = await userManager.signinRedirectCallback(); + return user; + } catch (error) { + console.error('Callback handling failed:', error); + throw parseOidcError(error); + } +}; export const getUser = async (): Promise => { try { @@ -16,3 +41,5 @@ export const getUser = async (): Promise => { return null; } }; + +export type { AuthError }; diff --git a/crawler/frontend/src/auth/errors.ts b/crawler/frontend/src/auth/errors.ts new file mode 100644 index 0000000..ce82fc5 --- /dev/null +++ b/crawler/frontend/src/auth/errors.ts @@ -0,0 +1,60 @@ +export enum AuthErrorType { + REDIRECT_FAILED = 'REDIRECT_FAILED', + CALLBACK_FAILED = 'CALLBACK_FAILED', + NETWORK_ERROR = 'NETWORK_ERROR', + USER_CANCELLED = 'USER_CANCELLED', +} + +export interface AuthError { + type: AuthErrorType; + message: string; + retryable: boolean; +} + +export function parseOidcError(error: unknown): AuthError { + const errorMessage = error instanceof Error ? error.message : String(error); + const errorString = errorMessage.toLowerCase(); + + // Check for popup/redirect blocked errors + if (errorString.includes('popup') || errorString.includes('blocked') || errorString.includes('window')) { + return { + type: AuthErrorType.REDIRECT_FAILED, + message: 'Unable to redirect. Please check if popups are blocked.', + retryable: true, + }; + } + + // Check for user cancellation + if (errorString.includes('cancel') || errorString.includes('closed') || errorString.includes('denied')) { + return { + type: AuthErrorType.USER_CANCELLED, + message: 'Sign in was cancelled.', + retryable: true, + }; + } + + // Check for network errors + if (errorString.includes('network') || errorString.includes('fetch') || errorString.includes('timeout') || errorString.includes('failed to fetch')) { + return { + type: AuthErrorType.NETWORK_ERROR, + message: 'Unable to reach authentication server. Please check your connection.', + retryable: true, + }; + } + + // Check for callback/state errors + if (errorString.includes('state') || errorString.includes('invalid') || errorString.includes('mismatch') || errorString.includes('no matching state')) { + return { + type: AuthErrorType.CALLBACK_FAILED, + message: 'Login verification failed. Please try again.', + retryable: true, + }; + } + + // Default error + return { + type: AuthErrorType.CALLBACK_FAILED, + message: errorMessage || 'An unexpected error occurred during sign in.', + retryable: true, + }; +} diff --git a/crawler/frontend/src/components/AuthCallback.tsx b/crawler/frontend/src/components/AuthCallback.tsx new file mode 100644 index 0000000..165a4e3 --- /dev/null +++ b/crawler/frontend/src/components/AuthCallback.tsx @@ -0,0 +1,111 @@ +import React, { useEffect, useState } from 'react'; +import { handleCallback, login, type AuthError } from '@/auth/authService'; +import { Loader2, CheckCircle, AlertCircle, Home } from 'lucide-react'; +import { Button } from './ui/button'; + +type CallbackState = 'processing' | 'success' | 'error'; + +const AuthCallback: React.FC = () => { + const [state, setState] = useState('processing'); + const [error, setError] = useState(null); + + useEffect(() => { + const processCallback = async () => { + try { + await handleCallback(); + setState('success'); + // Auto-redirect after success + setTimeout(() => { + window.location.href = '/'; + }, 1500); + } catch (err) { + setError(err as AuthError); + setState('error'); + } + }; + + processCallback(); + }, []); + + const handleRetry = async () => { + setState('processing'); + setError(null); + try { + await login(); + } catch (err) { + setError(err as AuthError); + setState('error'); + } + }; + + const handleGoHome = () => { + window.location.href = '/'; + }; + + return ( +
+
+
+ {state === 'processing' && ( +
+
+
+ +
+
+
+

Completing Sign In

+

+ Please wait while we verify your credentials... +

+
+
+ )} + + {state === 'success' && ( +
+
+
+ +
+
+
+

Welcome Back!

+

+ Redirecting you to the dashboard... +

+
+
+ )} + + {state === 'error' && ( +
+
+
+ +
+
+
+

Sign In Failed

+

+ {error?.message || 'An unexpected error occurred.'} +

+
+
+ + +
+
+ )} +
+
+
+ ); +}; + +export default AuthCallback; diff --git a/crawler/frontend/src/components/LoginModal.tsx b/crawler/frontend/src/components/LoginModal.tsx index 556f9fd..1eaf4c5 100644 --- a/crawler/frontend/src/components/LoginModal.tsx +++ b/crawler/frontend/src/components/LoginModal.tsx @@ -1,43 +1,127 @@ -import { login } from '@/auth/authService'; +import { login, type AuthError } from '@/auth/authService'; import { Button } from "@/components/ui/button"; import { DialogDescription } from '@radix-ui/react-dialog'; import React, { useState } from 'react'; import { Dialog, DialogContent, DialogFooter, DialogHeader, DialogTitle } from './ui/dialog'; +import { Home, LogIn, AlertCircle, Loader2 } from 'lucide-react'; -interface ModalProps { - isOpen: boolean; +interface LoginModalProps { + isOpen: boolean; } -const Modal: React.FC = ({ - isOpen, -}) => { - if (!isOpen) return null; - const [isLoading, setIsLoading] = useState(false) +const LoginModal: React.FC = ({ isOpen }) => { + const [isLoading, setIsLoading] = useState(false); + const [error, setError] = useState(null); - return ( - -
- - - Login to Wrongmove - (We are currently in closed beta; ask Viktor to send you an invitation) + if (!isOpen) return null; - - - {isLoading && ( -
Signing in. Please wait...
- ) - } - -
-
-
-
- ) + const handleLogin = async () => { + setIsLoading(true); + setError(null); + try { + await login(); + } catch (err) { + setError(err as AuthError); + setIsLoading(false); + } + }; + + const handleRetry = () => { + setError(null); + handleLogin(); + }; + + const handleCancel = () => { + setError(null); + setIsLoading(false); + }; + + return ( + + + +
+
+ +
+
+ Wrongmove + + Your smart property search companion + +
+
+
+ +
+ {/* Beta Notice */} +
+

+ We are currently in closed beta. Please contact Viktor to request an invitation. +

+
+ + {/* Error State */} + {error && ( +
+ +
+

{error.message}

+
+ + +
+
+
+ )} + + {/* Loading State */} + {isLoading && !error && ( +
+ + Redirecting to login... +
+ )} +
+ + + {!error && ( + + )} + +
+
+ ); }; -export default Modal; +export default LoginModal; From e8293c60421cc3d335da41cb3d53bef11e25d22d Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Mon, 2 Feb 2026 21:57:45 +0000 Subject: [PATCH 2/5] Add intelligent query splitting to maximize Rightmove data extraction --- crawler/.env.sample | 9 + crawler/CLAUDE.md | 233 ++++++++++ crawler/config/__init__.py | 3 +- crawler/config/scraper_config.py | 65 +++ crawler/poetry.lock | 526 +++++++++++++++++++++- crawler/pyproject.toml | 31 +- crawler/rec/query.py | 190 +++++++- crawler/services/listing_fetcher.py | 146 ++++++ crawler/services/query_splitter.py | 303 +++++++++++++ crawler/tasks/listing_tasks.py | 203 +++++---- crawler/tests/unit/test_query_splitter.py | 374 +++++++++++++++ 11 files changed, 1970 insertions(+), 113 deletions(-) create mode 100644 crawler/CLAUDE.md create mode 100644 crawler/config/scraper_config.py create mode 100644 crawler/services/listing_fetcher.py create mode 100644 crawler/services/query_splitter.py create mode 100644 crawler/tests/unit/test_query_splitter.py diff --git a/crawler/.env.sample b/crawler/.env.sample index 1e7fe00..aa1c93d 100644 --- a/crawler/.env.sample +++ b/crawler/.env.sample @@ -7,6 +7,15 @@ export DB_CONNECTION_STRING="sqlite:///data/wrongmove.db" # by default use SQLit export CELERY_BROKER_URL="redis://localhost:6379/0" # processing background tasks export CELERY_RESULT_BACKEND="redis://localhost:6379/1" +# Rightmove scraper configuration +# These settings control query splitting to work around Rightmove's ~1500 result cap +RIGHTMOVE_MAX_CONCURRENT=5 # Max concurrent HTTP requests +RIGHTMOVE_REQUEST_DELAY_MS=100 # Delay between requests in milliseconds +RIGHTMOVE_SPLIT_THRESHOLD=1200 # Split query when results exceed this threshold +RIGHTMOVE_MIN_PRICE_BAND=100 # Minimum price band width (won't split below this) +RIGHTMOVE_MAX_PAGES=60 # Max pages per subquery (60 * 25 = 1500 max results) +RIGHTMOVE_PROXY_URL= # Optional SOCKS proxy URL (e.g., socks5://localhost:9050 for Tor) + # Periodic scraping schedules (JSON array) # Each schedule has: name, enabled, hour, minute, day_of_week, listing_type, min/max_bedrooms, min/max_price, district_names, furnish_types # Cron fields: minute (0-59), hour (0-23), day_of_week (0-6, 0=Sunday) diff --git a/crawler/CLAUDE.md b/crawler/CLAUDE.md new file mode 100644 index 0000000..14714ab --- /dev/null +++ b/crawler/CLAUDE.md @@ -0,0 +1,233 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +A real estate listing crawler and aggregator that scrapes property listings from Rightmove UK, extracts square meter data from floorplan images using OCR, calculates transit routes, and provides a web UI for browsing listings. + +## Development Environment + +**IMPORTANT**: This project runs on a remote host, not locally. Always use the remote executor to run commands: + +- **All shell commands** (Python, pytest, poetry, alembic, etc.) must be executed via the remote executor +- **Starting the project**: Use the remote executor to run `./start.sh` +- **Running tests**: Use the remote executor to run `pytest` +- **Any CLI operations**: Use the remote executor to run `python main.py ...` + +Never run commands directly on the local machine - always route them through the remote executor. + +## Commands + +### Setup and Run (Docker - Recommended) + +```bash +# Start all services (Redis, MySQL, API, Celery) with Docker +./start.sh + +# Rebuild images and start +./start.sh --build + +# Stop all containers +./start.sh --down + +# View logs +./start.sh --logs +``` + +### Setup and Run (Local with Poetry) + +```bash +# Install dependencies +poetry install && cp .env.sample .env + +# Start backend locally (requires Redis running) +./start.sh --local + +# Start frontend (from frontend/ directory) +cd frontend && ./start.sh +``` + +### CLI Operations + +The main CLI (`main.py`) uses Click with a `--data-dir` option (default: `data/rs/`): + +```bash +# Dump listings from Rightmove API +python main.py dump-listings --type rent --min-price 2000 --max-price 4000 --min-bedrooms 2 + +# Download floorplan images +python main.py dump-images + +# Extract square meters from floorplans using OCR +python main.py detect-floorplan + +# Calculate transit routes (consumes Google Maps API calls) +python main.py routing --destination-address 'Address' -m transit -l 10 + +# Export to GeoJSON for visualization +python main.py export-immoweb -O output.js --type rent [filter options] +``` + +### Testing + +```bash +# Run tests with coverage +pytest tests/ -v --cov=. --cov-report=term-missing + +# Run type checker +mypy . +``` + +### Database Migrations + +```bash +alembic upgrade head # Apply migrations +alembic revision -m "description" # Create new migration +``` + +### Code Formatting + +```bash +yapf --style .style.yapf --recursive . +``` + +## Architecture + +### Core Data Flow + +1. **Scraping** (`rec/query.py`): Fetches listing IDs and details from Rightmove's Android API +2. **Processing** (`listing_processor.py`): Pipeline with steps for fetching details, downloading images, and OCR detection +3. **Storage**: SQLModel/SQLAlchemy with MySQL or SQLite, plus JSON files in `data/rs//` +4. **API** (`api/app.py`): FastAPI endpoints authenticated via JWT from external Authentik service +5. **Background Tasks** (`tasks/listing_tasks.py`): Celery tasks for async listing processing with Redis broker + +### Key Models + +- `models/listing.py`: SQLModel entities (`RentListing`, `BuyListing`) with `QueryParameters` for filtering +- `data_access.py`: **DEPRECATED** - Legacy `Listing` dataclass for filesystem-based data access. Use `models.listing.RentListing` or `models.listing.BuyListing` instead. + +### Services Layer (Unified CLI and API) + +**IMPORTANT**: The `services/` directory contains unified handler functions that both the CLI and HTTP API use. This ensures consistency and code reuse. + +#### High-level services (use these in CLI and API): +- **`listing_service.py`**: Listing operations + - `get_listings()` - Retrieve listings from database + - `refresh_listings()` - Fetch new listings from Rightmove (sync or async) + - `download_images()` - Download floorplan images + - `detect_floorplans()` - Run OCR on floorplans + - `calculate_routes()` - Calculate transit routes + +- **`export_service.py`**: Export operations + - `export_to_csv()` - Export listings to CSV file + - `export_to_geojson()` - Export listings to GeoJSON (file or in-memory) + +- **`district_service.py`**: District management + - `get_all_districts()` - Get district name → region ID mapping + - `get_district_names()` - Get list of district names + - `validate_districts()` - Validate district names + +- **`task_service.py`**: Background task management + - `get_task_status()` - Get Celery task status + - `get_user_tasks()` - Get all tasks for a user + - `add_task_for_user()` - Associate task with user + +#### Low-level services (internal implementation): +- `listing_fetcher.py`: Fetches listing data from Rightmove API +- `image_fetcher.py`: Downloads floorplan images +- `floorplan_detector.py`: OCR-based square meter detection +- `route_calculator.py`: Calculates transit routes using Google Maps API +- `query_splitter.py`: Intelligent query splitting to maximize data extraction + +### Query Splitting System + +Rightmove's API caps search results at ~1,500 listings per query. The query splitting system works around this limitation to fetch **all matching listings**. + +#### How it works: + +1. **Initial Split**: Queries are split by district and bedroom count +2. **Probe**: Each subquery is probed (minimal API request) to get `totalAvailableResults` +3. **Adaptive Split**: If results exceed threshold (1,200), the price range is binary-split +4. **Recursive Refinement**: Splitting continues until all subqueries are under threshold +5. **Full Fetch**: Each subquery fetches up to 60 pages (1,500 results max) + +``` +Original: 2BR, £1000-£5000 → 3,000 results (over cap!) + ↓ split by price +£1000-£3000: 1,800 (still over!) | £3000-£5000: 1,200 ✓ + ↓ split again +£1000-£2000: 900 ✓ | £2000-£3000: 900 ✓ + +Final: 3 subqueries → 900 + 900 + 1,200 = 3,000 total results ✓ +``` + +#### Key components: +- `config/scraper_config.py`: Configuration with env var loading +- `services/query_splitter.py`: `QuerySplitter` class with `SubQuery` dataclass +- `rec/query.py`: `probe_query()` for result count probing, `create_session()` for connection pooling + +### Processing Pipeline + +`ListingProcessor` runs sequential steps defined in `listing_processor.py`: +1. `FetchListingDetailsStep` - Get property details from API +2. `FetchImagesStep` - Download floorplan images +3. `DetectFloorplanStep` - OCR to extract square meters from floorplans + +### Floorplan OCR + +`rec/floorplan.py` uses pytesseract with image preprocessing (adaptive thresholding) to extract square meter values from floorplan images. + +### Repository Pattern + +`repositories/listing_repository.py` handles database operations with SQLModel sessions. + +## Environment Variables + +- `DB_CONNECTION_STRING`: Database URL (SQLite default: `sqlite:///data/wrongmove.db`) +- `CELERY_BROKER_URL` / `CELERY_RESULT_BACKEND`: Redis URLs +- `ROUTING_API_KEY`: Google Maps API key for transit routing + +### Scraper Configuration + +These control the query splitting behavior (see `.env.sample` for defaults): + +| Variable | Default | Description | +|----------|---------|-------------| +| `RIGHTMOVE_MAX_CONCURRENT` | 5 | Max concurrent HTTP requests | +| `RIGHTMOVE_REQUEST_DELAY_MS` | 100 | Delay between requests (ms) | +| `RIGHTMOVE_SPLIT_THRESHOLD` | 1200 | Split query when results exceed this | +| `RIGHTMOVE_MIN_PRICE_BAND` | 100 | Minimum price band width (won't split below) | +| `RIGHTMOVE_MAX_PAGES` | 60 | Max pages per subquery (60 × 25 = 1500) | +| `RIGHTMOVE_PROXY_URL` | - | SOCKS proxy URL (e.g., `socks5://localhost:9050` for Tor) | + +## Project Structure + +- `main.py`: CLI entry point +- `api/`: FastAPI application with auth middleware +- `config/`: Configuration modules (scraper settings, scheduled tasks) +- `models/`: SQLModel database entities +- `repositories/`: Database access layer +- `rec/`: Core business logic (query, floorplan OCR, routing, districts) +- `services/`: Service layer modules (listing_fetcher, image_fetcher, floorplan_detector, route_calculator, query_splitter) +- `tasks/`: Celery background tasks +- `frontend/`: React/Vite frontend with Caddy proxy +- `alembic/`: Database migrations +- `tests/`: Test suite (unit and integration tests) + +## Type Checking + +The project uses strict mypy configuration with `disallow_untyped_defs=true`. Run `mypy .` to check types. + +## Exploration Preferences + +- Always ignore `node_modules` directory when exploring the codebase + +## Git Workflow + +**IMPORTANT**: After completing work items, always create separate commits for each logical change: +- Keep each commit focused on one feature/fix +- Do not include unrelated files +- Use descriptive commit messages +- Group related files together (e.g., tests with the code they test) + diff --git a/crawler/config/__init__.py b/crawler/config/__init__.py index 315e8c3..b82264c 100644 --- a/crawler/config/__init__.py +++ b/crawler/config/__init__.py @@ -1,4 +1,5 @@ """Configuration modules.""" from config.schedule_config import ScheduleConfig, SchedulesConfig +from config.scraper_config import ScraperConfig -__all__ = ["ScheduleConfig", "SchedulesConfig"] +__all__ = ["ScheduleConfig", "SchedulesConfig", "ScraperConfig"] diff --git a/crawler/config/scraper_config.py b/crawler/config/scraper_config.py new file mode 100644 index 0000000..e84c1d5 --- /dev/null +++ b/crawler/config/scraper_config.py @@ -0,0 +1,65 @@ +"""Scraper configuration with environment variable loading.""" +from __future__ import annotations + +import os +from dataclasses import dataclass +from typing import Self + + +@dataclass(frozen=True) +class ScraperConfig: + """Configuration for the Rightmove scraper. + + Attributes: + max_concurrent_requests: Maximum number of concurrent HTTP requests. + request_delay_ms: Delay between requests in milliseconds. + result_cap: Maximum results Rightmove returns per query (their limit). + split_threshold: When results exceed this, split the query further. + min_price_band: Minimum width of a price band (won't split below this). + max_pages_per_query: Maximum pages to fetch per subquery (60 * 25 = 1500). + proxy_url: Optional SOCKS proxy URL (e.g., socks5://localhost:9050 for Tor). + """ + + max_concurrent_requests: int = 5 + request_delay_ms: int = 100 + result_cap: int = 1500 + split_threshold: int = 1200 # Split when approaching cap + min_price_band: int = 100 # Minimum band width in currency units + max_pages_per_query: int = 60 # 60 * 25 = 1500 results max + proxy_url: str | None = None + + @classmethod + def from_env(cls) -> Self: + """Load configuration from environment variables. + + Environment variables: + RIGHTMOVE_MAX_CONCURRENT: Max concurrent requests (default: 5) + RIGHTMOVE_REQUEST_DELAY_MS: Request delay in ms (default: 100) + RIGHTMOVE_RESULT_CAP: Result cap per query (default: 1500) + RIGHTMOVE_SPLIT_THRESHOLD: Split threshold (default: 1200) + RIGHTMOVE_MIN_PRICE_BAND: Minimum price band width (default: 100) + RIGHTMOVE_MAX_PAGES: Max pages per query (default: 60) + RIGHTMOVE_PROXY_URL: SOCKS proxy URL (default: None) + + Returns: + ScraperConfig instance with values from environment or defaults. + """ + return cls( + max_concurrent_requests=int( + os.environ.get("RIGHTMOVE_MAX_CONCURRENT", "5") + ), + request_delay_ms=int( + os.environ.get("RIGHTMOVE_REQUEST_DELAY_MS", "100") + ), + result_cap=int(os.environ.get("RIGHTMOVE_RESULT_CAP", "1500")), + split_threshold=int( + os.environ.get("RIGHTMOVE_SPLIT_THRESHOLD", "1200") + ), + min_price_band=int( + os.environ.get("RIGHTMOVE_MIN_PRICE_BAND", "100") + ), + max_pages_per_query=int( + os.environ.get("RIGHTMOVE_MAX_PAGES", "60") + ), + proxy_url=os.environ.get("RIGHTMOVE_PROXY_URL") or None, + ) diff --git a/crawler/poetry.lock b/crawler/poetry.lock index 87eabca..c822ef9 100644 --- a/crawler/poetry.lock +++ b/crawler/poetry.lock @@ -6,7 +6,7 @@ version = "2.6.1" description = "Happy Eyeballs for asyncio" optional = false python-versions = ">=3.9" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "aiohappyeyeballs-2.6.1-py3-none-any.whl", hash = "sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8"}, {file = "aiohappyeyeballs-2.6.1.tar.gz", hash = "sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558"}, @@ -18,7 +18,7 @@ version = "3.12.13" description = "Async http client/server framework (asyncio)" optional = false python-versions = ">=3.9" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "aiohttp-3.12.13-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5421af8f22a98f640261ee48aae3a37f0c41371e99412d55eaf2f8a46d5dad29"}, {file = "aiohttp-3.12.13-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0fcda86f6cb318ba36ed8f1396a6a4a3fd8f856f84d426584392083d10da4de0"}, @@ -120,13 +120,29 @@ yarl = ">=1.17.0,<2.0" [package.extras] speedups = ["Brotli ; platform_python_implementation == \"CPython\"", "aiodns (>=3.3.0)", "brotlicffi ; platform_python_implementation != \"CPython\""] +[[package]] +name = "aioresponses" +version = "0.7.8" +description = "Mock out requests made by ClientSession from aiohttp package" +optional = false +python-versions = "*" +groups = ["dev"] +files = [ + {file = "aioresponses-0.7.8-py2.py3-none-any.whl", hash = "sha256:b73bd4400d978855e55004b23a3a84cb0f018183bcf066a85ad392800b5b9a94"}, + {file = "aioresponses-0.7.8.tar.gz", hash = "sha256:b861cdfe5dc58f3b8afac7b0a6973d5d7b2cb608dd0f6253d16b8ee8eaf6df11"}, +] + +[package.dependencies] +aiohttp = ">=3.3.0,<4.0.0" +packaging = ">=22.0" + [[package]] name = "aiosignal" version = "1.3.2" description = "aiosignal: a list of registered asynchronous callbacks" optional = false python-versions = ">=3.9" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "aiosignal-1.3.2-py2.py3-none-any.whl", hash = "sha256:45cde58e409a301715980c2b01d0c28bdde3770d8290b5eb2173759d9acb31a5"}, {file = "aiosignal-1.3.2.tar.gz", hash = "sha256:a8c255c66fafb1e499c9351d0bf32ff2d8a0321595ebac3b93713656d2436f54"}, @@ -400,7 +416,7 @@ version = "5.0.1" description = "Timeout context manager for asyncio programs" optional = false python-versions = ">=3.8" -groups = ["main"] +groups = ["main", "dev"] markers = "python_version == \"3.11\" and python_full_version < \"3.11.3\"" files = [ {file = "async_timeout-5.0.1-py3-none-any.whl", hash = "sha256:39e3809566ff85354557ec2398b55e096c8364bacac9405a7a1fa429e77fe76c"}, @@ -932,13 +948,118 @@ mypy = ["bokeh", "contourpy[bokeh,docs]", "docutils-stubs", "mypy (==1.15.0)", " test = ["Pillow", "contourpy[test-no-images]", "matplotlib"] test-no-images = ["pytest", "pytest-cov", "pytest-rerunfailures", "pytest-xdist", "wurlitzer"] +[[package]] +name = "coverage" +version = "7.13.2" +description = "Code coverage measurement for Python" +optional = false +python-versions = ">=3.10" +groups = ["dev"] +files = [ + {file = "coverage-7.13.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f4af3b01763909f477ea17c962e2cca8f39b350a4e46e3a30838b2c12e31b81b"}, + {file = "coverage-7.13.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:36393bd2841fa0b59498f75466ee9bdec4f770d3254f031f23e8fd8e140ffdd2"}, + {file = "coverage-7.13.2-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:9cc7573518b7e2186bd229b1a0fe24a807273798832c27032c4510f47ffdb896"}, + {file = "coverage-7.13.2-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:ca9566769b69a5e216a4e176d54b9df88f29d750c5b78dbb899e379b4e14b30c"}, + {file = "coverage-7.13.2-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c9bdea644e94fd66d75a6f7e9a97bb822371e1fe7eadae2cacd50fcbc28e4dc"}, + {file = "coverage-7.13.2-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:5bd447332ec4f45838c1ad42268ce21ca87c40deb86eabd59888859b66be22a5"}, + {file = "coverage-7.13.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:7c79ad5c28a16a1277e1187cf83ea8dafdcc689a784228a7d390f19776db7c31"}, + {file = "coverage-7.13.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:76e06ccacd1fb6ada5d076ed98a8c6f66e2e6acd3df02819e2ee29fd637b76ad"}, + {file = "coverage-7.13.2-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:49d49e9a5e9f4dc3d3dac95278a020afa6d6bdd41f63608a76fa05a719d5b66f"}, + {file = "coverage-7.13.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:ed2bce0e7bfa53f7b0b01c722da289ef6ad4c18ebd52b1f93704c21f116360c8"}, + {file = "coverage-7.13.2-cp310-cp310-win32.whl", hash = "sha256:1574983178b35b9af4db4a9f7328a18a14a0a0ce76ffaa1c1bacb4cc82089a7c"}, + {file = "coverage-7.13.2-cp310-cp310-win_amd64.whl", hash = "sha256:a360a8baeb038928ceb996f5623a4cd508728f8f13e08d4e96ce161702f3dd99"}, + {file = "coverage-7.13.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:060ebf6f2c51aff5ba38e1f43a2095e087389b1c69d559fde6049a4b0001320e"}, + {file = "coverage-7.13.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c1ea8ca9db5e7469cd364552985e15911548ea5b69c48a17291f0cac70484b2e"}, + {file = "coverage-7.13.2-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:b780090d15fd58f07cf2011943e25a5f0c1c894384b13a216b6c86c8a8a7c508"}, + {file = "coverage-7.13.2-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:88a800258d83acb803c38175b4495d293656d5fac48659c953c18e5f539a274b"}, + {file = "coverage-7.13.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6326e18e9a553e674d948536a04a80d850a5eeefe2aae2e6d7cf05d54046c01b"}, + {file = "coverage-7.13.2-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:59562de3f797979e1ff07c587e2ac36ba60ca59d16c211eceaa579c266c5022f"}, + {file = "coverage-7.13.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:27ba1ed6f66b0e2d61bfa78874dffd4f8c3a12f8e2b5410e515ab345ba7bc9c3"}, + {file = "coverage-7.13.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8be48da4d47cc68754ce643ea50b3234557cbefe47c2f120495e7bd0a2756f2b"}, + {file = "coverage-7.13.2-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:2a47a4223d3361b91176aedd9d4e05844ca67d7188456227b6bf5e436630c9a1"}, + {file = "coverage-7.13.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:c6f141b468740197d6bd38f2b26ade124363228cc3f9858bd9924ab059e00059"}, + {file = "coverage-7.13.2-cp311-cp311-win32.whl", hash = "sha256:89567798404af067604246e01a49ef907d112edf2b75ef814b1364d5ce267031"}, + {file = "coverage-7.13.2-cp311-cp311-win_amd64.whl", hash = "sha256:21dd57941804ae2ac7e921771a5e21bbf9aabec317a041d164853ad0a96ce31e"}, + {file = "coverage-7.13.2-cp311-cp311-win_arm64.whl", hash = "sha256:10758e0586c134a0bafa28f2d37dd2cdb5e4a90de25c0fc0c77dabbad46eca28"}, + {file = "coverage-7.13.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f106b2af193f965d0d3234f3f83fc35278c7fb935dfbde56ae2da3dd2c03b84d"}, + {file = "coverage-7.13.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:78f45d21dc4d5d6bd29323f0320089ef7eae16e4bef712dff79d184fa7330af3"}, + {file = "coverage-7.13.2-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:fae91dfecd816444c74531a9c3d6ded17a504767e97aa674d44f638107265b99"}, + {file = "coverage-7.13.2-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:264657171406c114787b441484de620e03d8f7202f113d62fcd3d9688baa3e6f"}, + {file = "coverage-7.13.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ae47d8dcd3ded0155afbb59c62bd8ab07ea0fd4902e1c40567439e6db9dcaf2f"}, + {file = "coverage-7.13.2-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:8a0b33e9fd838220b007ce8f299114d406c1e8edb21336af4c97a26ecfd185aa"}, + {file = "coverage-7.13.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:b3becbea7f3ce9a2d4d430f223ec15888e4deb31395840a79e916368d6004cce"}, + {file = "coverage-7.13.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:f819c727a6e6eeb8711e4ce63d78c620f69630a2e9d53bc95ca5379f57b6ba94"}, + {file = "coverage-7.13.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:4f7b71757a3ab19f7ba286e04c181004c1d61be921795ee8ba6970fd0ec91da5"}, + {file = "coverage-7.13.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b7fc50d2afd2e6b4f6f2f403b70103d280a8e0cb35320cbbe6debcda02a1030b"}, + {file = "coverage-7.13.2-cp312-cp312-win32.whl", hash = "sha256:292250282cf9bcf206b543d7608bda17ca6fc151f4cbae949fc7e115112fbd41"}, + {file = "coverage-7.13.2-cp312-cp312-win_amd64.whl", hash = "sha256:eeea10169fac01549a7921d27a3e517194ae254b542102267bef7a93ed38c40e"}, + {file = "coverage-7.13.2-cp312-cp312-win_arm64.whl", hash = "sha256:2a5b567f0b635b592c917f96b9a9cb3dbd4c320d03f4bf94e9084e494f2e8894"}, + {file = "coverage-7.13.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ed75de7d1217cf3b99365d110975f83af0528c849ef5180a12fd91b5064df9d6"}, + {file = "coverage-7.13.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:97e596de8fa9bada4d88fde64a3f4d37f1b6131e4faa32bad7808abc79887ddc"}, + {file = "coverage-7.13.2-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:68c86173562ed4413345410c9480a8d64864ac5e54a5cda236748031e094229f"}, + {file = "coverage-7.13.2-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:7be4d613638d678b2b3773b8f687537b284d7074695a43fe2fbbfc0e31ceaed1"}, + {file = "coverage-7.13.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d7f63ce526a96acd0e16c4af8b50b64334239550402fb1607ce6a584a6d62ce9"}, + {file = "coverage-7.13.2-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:406821f37f864f968e29ac14c3fccae0fec9fdeba48327f0341decf4daf92d7c"}, + {file = "coverage-7.13.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ee68e5a4e3e5443623406b905db447dceddffee0dceb39f4e0cd9ec2a35004b5"}, + {file = "coverage-7.13.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:2ee0e58cca0c17dd9c6c1cdde02bb705c7b3fbfa5f3b0b5afeda20d4ebff8ef4"}, + {file = "coverage-7.13.2-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:6e5bbb5018bf76a56aabdb64246b5288d5ae1b7d0dd4d0534fe86df2c2992d1c"}, + {file = "coverage-7.13.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a55516c68ef3e08e134e818d5e308ffa6b1337cc8b092b69b24287bf07d38e31"}, + {file = "coverage-7.13.2-cp313-cp313-win32.whl", hash = "sha256:5b20211c47a8abf4abc3319d8ce2464864fa9f30c5fcaf958a3eed92f4f1fef8"}, + {file = "coverage-7.13.2-cp313-cp313-win_amd64.whl", hash = "sha256:14f500232e521201cf031549fb1ebdfc0a40f401cf519157f76c397e586c3beb"}, + {file = "coverage-7.13.2-cp313-cp313-win_arm64.whl", hash = "sha256:9779310cb5a9778a60c899f075a8514c89fa6d10131445c2207fc893e0b14557"}, + {file = "coverage-7.13.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:e64fa5a1e41ce5df6b547cbc3d3699381c9e2c2c369c67837e716ed0f549d48e"}, + {file = "coverage-7.13.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:b01899e82a04085b6561eb233fd688474f57455e8ad35cd82286463ba06332b7"}, + {file = "coverage-7.13.2-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:838943bea48be0e2768b0cf7819544cdedc1bbb2f28427eabb6eb8c9eb2285d3"}, + {file = "coverage-7.13.2-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:93d1d25ec2b27e90bcfef7012992d1f5121b51161b8bffcda756a816cf13c2c3"}, + {file = "coverage-7.13.2-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:93b57142f9621b0d12349c43fc7741fe578e4bc914c1e5a54142856cfc0bf421"}, + {file = "coverage-7.13.2-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f06799ae1bdfff7ccb8665d75f8291c69110ba9585253de254688aa8a1ccc6c5"}, + {file = "coverage-7.13.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:7f9405ab4f81d490811b1d91c7a20361135a2df4c170e7f0b747a794da5b7f23"}, + {file = "coverage-7.13.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:f9ab1d5b86f8fbc97a5b3cd6280a3fd85fef3b028689d8a2c00918f0d82c728c"}, + {file = "coverage-7.13.2-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:f674f59712d67e841525b99e5e2b595250e39b529c3bda14764e4f625a3fa01f"}, + {file = "coverage-7.13.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c6cadac7b8ace1ba9144feb1ae3cb787a6065ba6d23ffc59a934b16406c26573"}, + {file = "coverage-7.13.2-cp313-cp313t-win32.whl", hash = "sha256:14ae4146465f8e6e6253eba0cccd57423e598a4cb925958b240c805300918343"}, + {file = "coverage-7.13.2-cp313-cp313t-win_amd64.whl", hash = "sha256:9074896edd705a05769e3de0eac0a8388484b503b68863dd06d5e473f874fd47"}, + {file = "coverage-7.13.2-cp313-cp313t-win_arm64.whl", hash = "sha256:69e526e14f3f854eda573d3cf40cffd29a1a91c684743d904c33dbdcd0e0f3e7"}, + {file = "coverage-7.13.2-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:387a825f43d680e7310e6f325b2167dd093bc8ffd933b83e9aa0983cf6e0a2ef"}, + {file = "coverage-7.13.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:f0d7fea9d8e5d778cd5a9e8fc38308ad688f02040e883cdc13311ef2748cb40f"}, + {file = "coverage-7.13.2-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:e080afb413be106c95c4ee96b4fffdc9e2fa56a8bbf90b5c0918e5c4449412f5"}, + {file = "coverage-7.13.2-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:a7fc042ba3c7ce25b8a9f097eb0f32a5ce1ccdb639d9eec114e26def98e1f8a4"}, + {file = "coverage-7.13.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d0ba505e021557f7f8173ee8cd6b926373d8653e5ff7581ae2efce1b11ef4c27"}, + {file = "coverage-7.13.2-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:7de326f80e3451bd5cc7239ab46c73ddb658fe0b7649476bc7413572d36cd548"}, + {file = "coverage-7.13.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:abaea04f1e7e34841d4a7b343904a3f59481f62f9df39e2cd399d69a187a9660"}, + {file = "coverage-7.13.2-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:9f93959ee0c604bccd8e0697be21de0887b1f73efcc3aa73a3ec0fd13feace92"}, + {file = "coverage-7.13.2-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:13fe81ead04e34e105bf1b3c9f9cdf32ce31736ee5d90a8d2de02b9d3e1bcb82"}, + {file = "coverage-7.13.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d6d16b0f71120e365741bca2cb473ca6fe38930bc5431c5e850ba949f708f892"}, + {file = "coverage-7.13.2-cp314-cp314-win32.whl", hash = "sha256:9b2f4714bb7d99ba3790ee095b3b4ac94767e1347fe424278a0b10acb3ff04fe"}, + {file = "coverage-7.13.2-cp314-cp314-win_amd64.whl", hash = "sha256:e4121a90823a063d717a96e0a0529c727fb31ea889369a0ee3ec00ed99bf6859"}, + {file = "coverage-7.13.2-cp314-cp314-win_arm64.whl", hash = "sha256:6873f0271b4a15a33e7590f338d823f6f66f91ed147a03938d7ce26efd04eee6"}, + {file = "coverage-7.13.2-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:f61d349f5b7cd95c34017f1927ee379bfbe9884300d74e07cf630ccf7a610c1b"}, + {file = "coverage-7.13.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:a43d34ce714f4ca674c0d90beb760eb05aad906f2c47580ccee9da8fe8bfb417"}, + {file = "coverage-7.13.2-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:bff1b04cb9d4900ce5c56c4942f047dc7efe57e2608cb7c3c8936e9970ccdbee"}, + {file = "coverage-7.13.2-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:6ae99e4560963ad8e163e819e5d77d413d331fd00566c1e0856aa252303552c1"}, + {file = "coverage-7.13.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e79a8c7d461820257d9aa43716c4efc55366d7b292e46b5b37165be1d377405d"}, + {file = "coverage-7.13.2-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:060ee84f6a769d40c492711911a76811b4befb6fba50abb450371abb720f5bd6"}, + {file = "coverage-7.13.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:3bca209d001fd03ea2d978f8a4985093240a355c93078aee3f799852c23f561a"}, + {file = "coverage-7.13.2-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:6b8092aa38d72f091db61ef83cb66076f18f02da3e1a75039a4f218629600e04"}, + {file = "coverage-7.13.2-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:4a3158dc2dcce5200d91ec28cd315c999eebff355437d2765840555d765a6e5f"}, + {file = "coverage-7.13.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3973f353b2d70bd9796cc12f532a05945232ccae966456c8ed7034cb96bbfd6f"}, + {file = "coverage-7.13.2-cp314-cp314t-win32.whl", hash = "sha256:79f6506a678a59d4ded048dc72f1859ebede8ec2b9a2d509ebe161f01c2879d3"}, + {file = "coverage-7.13.2-cp314-cp314t-win_amd64.whl", hash = "sha256:196bfeabdccc5a020a57d5a368c681e3a6ceb0447d153aeccc1ab4d70a5032ba"}, + {file = "coverage-7.13.2-cp314-cp314t-win_arm64.whl", hash = "sha256:69269ab58783e090bfbf5b916ab3d188126e22d6070bbfc93098fdd474ef937c"}, + {file = "coverage-7.13.2-py3-none-any.whl", hash = "sha256:40ce1ea1e25125556d8e76bd0b61500839a07944cc287ac21d5626f3e620cad5"}, + {file = "coverage-7.13.2.tar.gz", hash = "sha256:044c6951ec37146b72a50cc81ef02217d27d4c3640efd2640311393cbbf143d3"}, +] + +[package.extras] +toml = ["tomli ; python_full_version <= \"3.11.0a6\""] + [[package]] name = "cryptography" version = "45.0.4" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." optional = false python-versions = "!=3.9.0,!=3.9.1,>=3.7" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "cryptography-45.0.4-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:425a9a6ac2823ee6e46a76a21a4e8342d8fa5c01e08b823c1f19a8b74f096069"}, {file = "cryptography-45.0.4-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:680806cf63baa0039b920f4976f5f31b10e772de42f16310a6839d9f21a26b0d"}, @@ -1132,6 +1253,30 @@ files = [ [package.extras] tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipython", "littleutils", "pytest", "rich ; python_version >= \"3.11\""] +[[package]] +name = "fakeredis" +version = "2.33.0" +description = "Python implementation of redis API, can be used for testing purposes." +optional = false +python-versions = ">=3.7" +groups = ["dev"] +files = [ + {file = "fakeredis-2.33.0-py3-none-any.whl", hash = "sha256:de535f3f9ccde1c56672ab2fdd6a8efbc4f2619fc2f1acc87b8737177d71c965"}, + {file = "fakeredis-2.33.0.tar.gz", hash = "sha256:d7bc9a69d21df108a6451bbffee23b3eba432c21a654afc7ff2d295428ec5770"}, +] + +[package.dependencies] +redis = {version = ">=4.3", markers = "python_version > \"3.8\""} +sortedcontainers = ">=2" + +[package.extras] +bf = ["pyprobables (>=0.6)"] +cf = ["pyprobables (>=0.6)"] +json = ["jsonpath-ng (>=1.6)"] +lua = ["lupa (>=2.1)"] +probabilistic = ["pyprobables (>=0.6)"] +valkey = ["valkey (>=6) ; python_version >= \"3.8\""] + [[package]] name = "fastapi" version = "0.115.13" @@ -1309,7 +1454,7 @@ version = "1.7.0" description = "A list-like structure which implements collections.abc.MutableSequence" optional = false python-versions = ">=3.9" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "frozenlist-1.7.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cc4df77d638aa2ed703b878dd093725b72a824c3c546c076e8fdf276f78ee84a"}, {file = "frozenlist-1.7.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:716a9973a2cc963160394f701964fe25012600f3d311f60c790400b00e568b61"}, @@ -1676,14 +1821,14 @@ test = ["Cython (>=0.29.24)"] [[package]] name = "httpx" -version = "0.28.1" +version = "0.27.2" description = "The next generation HTTP client." optional = false python-versions = ">=3.8" groups = ["main", "dev"] files = [ - {file = "httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad"}, - {file = "httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc"}, + {file = "httpx-0.27.2-py3-none-any.whl", hash = "sha256:7bb2708e112d8fdd7829cd4243970f0c223274051cb35ee80c03301ee29a3df0"}, + {file = "httpx-0.27.2.tar.gz", hash = "sha256:f7c2be1d2f3c3c3160d441802406b206c2b76f5947b11115e6df10c6c65e66c2"}, ] [package.dependencies] @@ -1691,6 +1836,7 @@ anyio = "*" certifi = "*" httpcore = "==1.*" idna = "*" +sniffio = "*" [package.extras] brotli = ["brotli ; platform_python_implementation == \"CPython\"", "brotlicffi ; platform_python_implementation != \"CPython\""] @@ -1777,6 +1923,18 @@ perf = ["ipython"] test = ["flufl.flake8", "importlib_resources (>=1.3) ; python_version < \"3.9\"", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-perf (>=0.9.2)"] type = ["pytest-mypy"] +[[package]] +name = "iniconfig" +version = "2.3.0" +description = "brain-dead simple config-ini parsing" +optional = false +python-versions = ">=3.10" +groups = ["dev"] +files = [ + {file = "iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12"}, + {file = "iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730"}, +] + [[package]] name = "ipdb" version = "0.13.13" @@ -2360,6 +2518,93 @@ interegular = ["interegular (>=0.3.1,<0.4.0)"] nearley = ["js2py"] regex = ["regex"] +[[package]] +name = "librt" +version = "0.7.8" +description = "Mypyc runtime library" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +markers = "platform_python_implementation != \"PyPy\"" +files = [ + {file = "librt-0.7.8-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b45306a1fc5f53c9330fbee134d8b3227fe5da2ab09813b892790400aa49352d"}, + {file = "librt-0.7.8-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:864c4b7083eeee250ed55135d2127b260d7eb4b5e953a9e5df09c852e327961b"}, + {file = "librt-0.7.8-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:6938cc2de153bc927ed8d71c7d2f2ae01b4e96359126c602721340eb7ce1a92d"}, + {file = "librt-0.7.8-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:66daa6ac5de4288a5bbfbe55b4caa7bf0cd26b3269c7a476ffe8ce45f837f87d"}, + {file = "librt-0.7.8-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4864045f49dc9c974dadb942ac56a74cd0479a2aafa51ce272c490a82322ea3c"}, + {file = "librt-0.7.8-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a36515b1328dc5b3ffce79fe204985ca8572525452eacabee2166f44bb387b2c"}, + {file = "librt-0.7.8-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:b7e7f140c5169798f90b80d6e607ed2ba5059784968a004107c88ad61fb3641d"}, + {file = "librt-0.7.8-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:ff71447cb778a4f772ddc4ce360e6ba9c95527ed84a52096bd1bbf9fee2ec7c0"}, + {file = "librt-0.7.8-cp310-cp310-win32.whl", hash = "sha256:047164e5f68b7a8ebdf9fae91a3c2161d3192418aadd61ddd3a86a56cbe3dc85"}, + {file = "librt-0.7.8-cp310-cp310-win_amd64.whl", hash = "sha256:d6f254d096d84156a46a84861183c183d30734e52383602443292644d895047c"}, + {file = "librt-0.7.8-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ff3e9c11aa260c31493d4b3197d1e28dd07768594a4f92bec4506849d736248f"}, + {file = "librt-0.7.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ddb52499d0b3ed4aa88746aaf6f36a08314677d5c346234c3987ddc506404eac"}, + {file = "librt-0.7.8-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:e9c0afebbe6ce177ae8edba0c7c4d626f2a0fc12c33bb993d163817c41a7a05c"}, + {file = "librt-0.7.8-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:631599598e2c76ded400c0a8722dec09217c89ff64dc54b060f598ed68e7d2a8"}, + {file = "librt-0.7.8-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9c1ba843ae20db09b9d5c80475376168feb2640ce91cd9906414f23cc267a1ff"}, + {file = "librt-0.7.8-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b5b007bb22ea4b255d3ee39dfd06d12534de2fcc3438567d9f48cdaf67ae1ae3"}, + {file = "librt-0.7.8-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:dbd79caaf77a3f590cbe32dc2447f718772d6eea59656a7dcb9311161b10fa75"}, + {file = "librt-0.7.8-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:87808a8d1e0bd62a01cafc41f0fd6818b5a5d0ca0d8a55326a81643cdda8f873"}, + {file = "librt-0.7.8-cp311-cp311-win32.whl", hash = "sha256:31724b93baa91512bd0a376e7cf0b59d8b631ee17923b1218a65456fa9bda2e7"}, + {file = "librt-0.7.8-cp311-cp311-win_amd64.whl", hash = "sha256:978e8b5f13e52cf23a9e80f3286d7546baa70bc4ef35b51d97a709d0b28e537c"}, + {file = "librt-0.7.8-cp311-cp311-win_arm64.whl", hash = "sha256:20e3946863d872f7cabf7f77c6c9d370b8b3d74333d3a32471c50d3a86c0a232"}, + {file = "librt-0.7.8-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:9b6943885b2d49c48d0cff23b16be830ba46b0152d98f62de49e735c6e655a63"}, + {file = "librt-0.7.8-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:46ef1f4b9b6cc364b11eea0ecc0897314447a66029ee1e55859acb3dd8757c93"}, + {file = "librt-0.7.8-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:907ad09cfab21e3c86e8f1f87858f7049d1097f77196959c033612f532b4e592"}, + {file = "librt-0.7.8-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2991b6c3775383752b3ca0204842743256f3ad3deeb1d0adc227d56b78a9a850"}, + {file = "librt-0.7.8-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:03679b9856932b8c8f674e87aa3c55ea11c9274301f76ae8dc4d281bda55cf62"}, + {file = "librt-0.7.8-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3968762fec1b2ad34ce57458b6de25dbb4142713e9ca6279a0d352fa4e9f452b"}, + {file = "librt-0.7.8-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:bb7a7807523a31f03061288cc4ffc065d684c39db7644c676b47d89553c0d714"}, + {file = "librt-0.7.8-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ad64a14b1e56e702e19b24aae108f18ad1bf7777f3af5fcd39f87d0c5a814449"}, + {file = "librt-0.7.8-cp312-cp312-win32.whl", hash = "sha256:0241a6ed65e6666236ea78203a73d800dbed896cf12ae25d026d75dc1fcd1dac"}, + {file = "librt-0.7.8-cp312-cp312-win_amd64.whl", hash = "sha256:6db5faf064b5bab9675c32a873436b31e01d66ca6984c6f7f92621656033a708"}, + {file = "librt-0.7.8-cp312-cp312-win_arm64.whl", hash = "sha256:57175aa93f804d2c08d2edb7213e09276bd49097611aefc37e3fa38d1fb99ad0"}, + {file = "librt-0.7.8-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4c3995abbbb60b3c129490fa985dfe6cac11d88fc3c36eeb4fb1449efbbb04fc"}, + {file = "librt-0.7.8-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:44e0c2cbc9bebd074cf2cdbe472ca185e824be4e74b1c63a8e934cea674bebf2"}, + {file = "librt-0.7.8-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:4d2f1e492cae964b3463a03dc77a7fe8742f7855d7258c7643f0ee32b6651dd3"}, + {file = "librt-0.7.8-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:451e7ffcef8f785831fdb791bd69211f47e95dc4c6ddff68e589058806f044c6"}, + {file = "librt-0.7.8-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3469e1af9f1380e093ae06bedcbdd11e407ac0b303a56bbe9afb1d6824d4982d"}, + {file = "librt-0.7.8-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f11b300027ce19a34f6d24ebb0a25fd0e24a9d53353225a5c1e6cadbf2916b2e"}, + {file = "librt-0.7.8-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:4adc73614f0d3c97874f02f2c7fd2a27854e7e24ad532ea6b965459c5b757eca"}, + {file = "librt-0.7.8-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:60c299e555f87e4c01b2eca085dfccda1dde87f5a604bb45c2906b8305819a93"}, + {file = "librt-0.7.8-cp313-cp313-win32.whl", hash = "sha256:b09c52ed43a461994716082ee7d87618096851319bf695d57ec123f2ab708951"}, + {file = "librt-0.7.8-cp313-cp313-win_amd64.whl", hash = "sha256:f8f4a901a3fa28969d6e4519deceab56c55a09d691ea7b12ca830e2fa3461e34"}, + {file = "librt-0.7.8-cp313-cp313-win_arm64.whl", hash = "sha256:43d4e71b50763fcdcf64725ac680d8cfa1706c928b844794a7aa0fa9ac8e5f09"}, + {file = "librt-0.7.8-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:be927c3c94c74b05128089a955fba86501c3b544d1d300282cc1b4bd370cb418"}, + {file = "librt-0.7.8-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:7b0803e9008c62a7ef79058233db7ff6f37a9933b8f2573c05b07ddafa226611"}, + {file = "librt-0.7.8-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:79feb4d00b2a4e0e05c9c56df707934f41fcb5fe53fd9efb7549068d0495b758"}, + {file = "librt-0.7.8-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b9122094e3f24aa759c38f46bd8863433820654927370250f460ae75488b66ea"}, + {file = "librt-0.7.8-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7e03bea66af33c95ce3addf87a9bf1fcad8d33e757bc479957ddbc0e4f7207ac"}, + {file = "librt-0.7.8-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:f1ade7f31675db00b514b98f9ab9a7698c7282dad4be7492589109471852d398"}, + {file = "librt-0.7.8-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:a14229ac62adcf1b90a15992f1ab9c69ae8b99ffb23cb64a90878a6e8a2f5b81"}, + {file = "librt-0.7.8-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5bcaaf624fd24e6a0cb14beac37677f90793a96864c67c064a91458611446e83"}, + {file = "librt-0.7.8-cp314-cp314-win32.whl", hash = "sha256:7aa7d5457b6c542ecaed79cec4ad98534373c9757383973e638ccced0f11f46d"}, + {file = "librt-0.7.8-cp314-cp314-win_amd64.whl", hash = "sha256:3d1322800771bee4a91f3b4bd4e49abc7d35e65166821086e5afd1e6c0d9be44"}, + {file = "librt-0.7.8-cp314-cp314-win_arm64.whl", hash = "sha256:5363427bc6a8c3b1719f8f3845ea53553d301382928a86e8fab7984426949bce"}, + {file = "librt-0.7.8-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:ca916919793a77e4a98d4a1701e345d337ce53be4a16620f063191f7322ac80f"}, + {file = "librt-0.7.8-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:54feb7b4f2f6706bb82325e836a01be805770443e2400f706e824e91f6441dde"}, + {file = "librt-0.7.8-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:39a4c76fee41007070f872b648cc2f711f9abf9a13d0c7162478043377b52c8e"}, + {file = "librt-0.7.8-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ac9c8a458245c7de80bc1b9765b177055efff5803f08e548dd4bb9ab9a8d789b"}, + {file = "librt-0.7.8-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:95b67aa7eff150f075fda09d11f6bfb26edffd300f6ab1666759547581e8f666"}, + {file = "librt-0.7.8-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:535929b6eff670c593c34ff435d5440c3096f20fa72d63444608a5aef64dd581"}, + {file = "librt-0.7.8-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:63937bd0f4d1cb56653dc7ae900d6c52c41f0015e25aaf9902481ee79943b33a"}, + {file = "librt-0.7.8-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:cf243da9e42d914036fd362ac3fa77d80a41cadcd11ad789b1b5eec4daaf67ca"}, + {file = "librt-0.7.8-cp314-cp314t-win32.whl", hash = "sha256:171ca3a0a06c643bd0a2f62a8944e1902c94aa8e5da4db1ea9a8daf872685365"}, + {file = "librt-0.7.8-cp314-cp314t-win_amd64.whl", hash = "sha256:445b7304145e24c60288a2f172b5ce2ca35c0f81605f5299f3fa567e189d2e32"}, + {file = "librt-0.7.8-cp314-cp314t-win_arm64.whl", hash = "sha256:8766ece9de08527deabcd7cb1b4f1a967a385d26e33e536d6d8913db6ef74f06"}, + {file = "librt-0.7.8-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c7e8f88f79308d86d8f39c491773cbb533d6cb7fa6476f35d711076ee04fceb6"}, + {file = "librt-0.7.8-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:389bd25a0db916e1d6bcb014f11aa9676cedaa485e9ec3752dfe19f196fd377b"}, + {file = "librt-0.7.8-cp39-cp39-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:73fd300f501a052f2ba52ede721232212f3b06503fa12665408ecfc9d8fd149c"}, + {file = "librt-0.7.8-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6d772edc6a5f7835635c7562f6688e031f0b97e31d538412a852c49c9a6c92d5"}, + {file = "librt-0.7.8-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bfde8a130bd0f239e45503ab39fab239ace094d63ee1d6b67c25a63d741c0f71"}, + {file = "librt-0.7.8-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:fdec6e2368ae4f796fc72fad7fd4bd1753715187e6d870932b0904609e7c878e"}, + {file = "librt-0.7.8-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:00105e7d541a8f2ee5be52caacea98a005e0478cfe78c8080fbb7b5d2b340c63"}, + {file = "librt-0.7.8-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:c6f8947d3dfd7f91066c5b4385812c18be26c9d5a99ca56667547f2c39149d94"}, + {file = "librt-0.7.8-cp39-cp39-win32.whl", hash = "sha256:41d7bb1e07916aeb12ae4a44e3025db3691c4149ab788d0315781b4d29b86afb"}, + {file = "librt-0.7.8-cp39-cp39-win_amd64.whl", hash = "sha256:e90a8e237753c83b8e484d478d9a996dc5e39fd5bd4c6ce32563bc8123f132be"}, + {file = "librt-0.7.8.tar.gz", hash = "sha256:1a4ede613941d9c3470b0368be851df6bb78ab218635512d0370b27a277a0862"}, +] + [[package]] name = "mako" version = "1.3.10" @@ -2595,7 +2840,7 @@ version = "6.5.0" description = "multidict implementation" optional = false python-versions = ">=3.9" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "multidict-6.5.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:2e118a202904623b1d2606d1c8614e14c9444b59d64454b0c355044058066469"}, {file = "multidict-6.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a42995bdcaff4e22cb1280ae7752c3ed3fbb398090c6991a2797a4a0e5ed16a9"}, @@ -2709,6 +2954,79 @@ files = [ {file = "multidict-6.5.0.tar.gz", hash = "sha256:942bd8002492ba819426a8d7aefde3189c1b87099cdf18aaaefefcf7f3f7b6d2"}, ] +[[package]] +name = "mypy" +version = "1.19.1" +description = "Optional static typing for Python" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "mypy-1.19.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5f05aa3d375b385734388e844bc01733bd33c644ab48e9684faa54e5389775ec"}, + {file = "mypy-1.19.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:022ea7279374af1a5d78dfcab853fe6a536eebfda4b59deab53cd21f6cd9f00b"}, + {file = "mypy-1.19.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ee4c11e460685c3e0c64a4c5de82ae143622410950d6be863303a1c4ba0e36d6"}, + {file = "mypy-1.19.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:de759aafbae8763283b2ee5869c7255391fbc4de3ff171f8f030b5ec48381b74"}, + {file = "mypy-1.19.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:ab43590f9cd5108f41aacf9fca31841142c786827a74ab7cc8a2eacb634e09a1"}, + {file = "mypy-1.19.1-cp310-cp310-win_amd64.whl", hash = "sha256:2899753e2f61e571b3971747e302d5f420c3fd09650e1951e99f823bc3089dac"}, + {file = "mypy-1.19.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d8dfc6ab58ca7dda47d9237349157500468e404b17213d44fc1cb77bce532288"}, + {file = "mypy-1.19.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e3f276d8493c3c97930e354b2595a44a21348b320d859fb4a2b9f66da9ed27ab"}, + {file = "mypy-1.19.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2abb24cf3f17864770d18d673c85235ba52456b36a06b6afc1e07c1fdcd3d0e6"}, + {file = "mypy-1.19.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a009ffa5a621762d0c926a078c2d639104becab69e79538a494bcccb62cc0331"}, + {file = "mypy-1.19.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f7cee03c9a2e2ee26ec07479f38ea9c884e301d42c6d43a19d20fb014e3ba925"}, + {file = "mypy-1.19.1-cp311-cp311-win_amd64.whl", hash = "sha256:4b84a7a18f41e167f7995200a1d07a4a6810e89d29859df936f1c3923d263042"}, + {file = "mypy-1.19.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:a8174a03289288c1f6c46d55cef02379b478bfbc8e358e02047487cad44c6ca1"}, + {file = "mypy-1.19.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ffcebe56eb09ff0c0885e750036a095e23793ba6c2e894e7e63f6d89ad51f22e"}, + {file = "mypy-1.19.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b64d987153888790bcdb03a6473d321820597ab8dd9243b27a92153c4fa50fd2"}, + {file = "mypy-1.19.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c35d298c2c4bba75feb2195655dfea8124d855dfd7343bf8b8c055421eaf0cf8"}, + {file = "mypy-1.19.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:34c81968774648ab5ac09c29a375fdede03ba253f8f8287847bd480782f73a6a"}, + {file = "mypy-1.19.1-cp312-cp312-win_amd64.whl", hash = "sha256:b10e7c2cd7870ba4ad9b2d8a6102eb5ffc1f16ca35e3de6bfa390c1113029d13"}, + {file = "mypy-1.19.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e3157c7594ff2ef1634ee058aafc56a82db665c9438fd41b390f3bde1ab12250"}, + {file = "mypy-1.19.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:bdb12f69bcc02700c2b47e070238f42cb87f18c0bc1fc4cdb4fb2bc5fd7a3b8b"}, + {file = "mypy-1.19.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f859fb09d9583a985be9a493d5cfc5515b56b08f7447759a0c5deaf68d80506e"}, + {file = "mypy-1.19.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c9a6538e0415310aad77cb94004ca6482330fece18036b5f360b62c45814c4ef"}, + {file = "mypy-1.19.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:da4869fc5e7f62a88f3fe0b5c919d1d9f7ea3cef92d3689de2823fd27e40aa75"}, + {file = "mypy-1.19.1-cp313-cp313-win_amd64.whl", hash = "sha256:016f2246209095e8eda7538944daa1d60e1e8134d98983b9fc1e92c1fc0cb8dd"}, + {file = "mypy-1.19.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:06e6170bd5836770e8104c8fdd58e5e725cfeb309f0a6c681a811f557e97eac1"}, + {file = "mypy-1.19.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:804bd67b8054a85447c8954215a906d6eff9cabeabe493fb6334b24f4bfff718"}, + {file = "mypy-1.19.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:21761006a7f497cb0d4de3d8ef4ca70532256688b0523eee02baf9eec895e27b"}, + {file = "mypy-1.19.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:28902ee51f12e0f19e1e16fbe2f8f06b6637f482c459dd393efddd0ec7f82045"}, + {file = "mypy-1.19.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:481daf36a4c443332e2ae9c137dfee878fcea781a2e3f895d54bd3002a900957"}, + {file = "mypy-1.19.1-cp314-cp314-win_amd64.whl", hash = "sha256:8bb5c6f6d043655e055be9b542aa5f3bdd30e4f3589163e85f93f3640060509f"}, + {file = "mypy-1.19.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7bcfc336a03a1aaa26dfce9fff3e287a3ba99872a157561cbfcebe67c13308e3"}, + {file = "mypy-1.19.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b7951a701c07ea584c4fe327834b92a30825514c868b1f69c30445093fdd9d5a"}, + {file = "mypy-1.19.1-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b13cfdd6c87fc3efb69ea4ec18ef79c74c3f98b4e5498ca9b85ab3b2c2329a67"}, + {file = "mypy-1.19.1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4f28f99c824ecebcdaa2e55d82953e38ff60ee5ec938476796636b86afa3956e"}, + {file = "mypy-1.19.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:c608937067d2fc5a4dd1a5ce92fd9e1398691b8c5d012d66e1ddd430e9244376"}, + {file = "mypy-1.19.1-cp39-cp39-win_amd64.whl", hash = "sha256:409088884802d511ee52ca067707b90c883426bd95514e8cfda8281dc2effe24"}, + {file = "mypy-1.19.1-py3-none-any.whl", hash = "sha256:f1235f5ea01b7db5468d53ece6aaddf1ad0b88d9e7462b86ef96fe04995d7247"}, + {file = "mypy-1.19.1.tar.gz", hash = "sha256:19d88bb05303fe63f71dd2c6270daca27cb9401c4ca8255fe50d1d920e0eb9ba"}, +] + +[package.dependencies] +librt = {version = ">=0.6.2", markers = "platform_python_implementation != \"PyPy\""} +mypy_extensions = ">=1.0.0" +pathspec = ">=0.9.0" +typing_extensions = ">=4.6.0" + +[package.extras] +dmypy = ["psutil (>=4.0)"] +faster-cache = ["orjson"] +install-types = ["pip"] +mypyc = ["setuptools (>=50)"] +reports = ["lxml"] + +[[package]] +name = "mypy-extensions" +version = "1.1.0" +description = "Type system extensions for programs checked with the mypy type checker." +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "mypy_extensions-1.1.0-py3-none-any.whl", hash = "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505"}, + {file = "mypy_extensions-1.1.0.tar.gz", hash = "sha256:52e68efc3284861e772bbcd66823fde5ae21fd2fdb51c62a211403730b916558"}, +] + [[package]] name = "mysqlclient" version = "2.2.7" @@ -3226,6 +3544,24 @@ files = [ qa = ["flake8 (==5.0.4)", "mypy (==0.971)", "types-setuptools (==67.2.0.1)"] testing = ["docopt", "pytest"] +[[package]] +name = "pathspec" +version = "1.0.4" +description = "Utility library for gitignore style pattern matching of file paths." +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "pathspec-1.0.4-py3-none-any.whl", hash = "sha256:fb6ae2fd4e7c921a165808a552060e722767cfa526f99ca5156ed2ce45a5c723"}, + {file = "pathspec-1.0.4.tar.gz", hash = "sha256:0210e2ae8a21a9137c0d470578cb0e595af87edaa6ebf12ff176f14a02e0e645"}, +] + +[package.extras] +hyperscan = ["hyperscan (>=0.7)"] +optional = ["typing-extensions (>=4)"] +re2 = ["google-re2 (>=1.1)"] +tests = ["pytest (>=9)", "typing-extensions (>=4.15)"] + [[package]] name = "pexpect" version = "4.9.0" @@ -3357,6 +3693,22 @@ docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.1.3)", "sphinx-a test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.4)", "pytest-cov (>=6)", "pytest-mock (>=3.14)"] type = ["mypy (>=1.14.1)"] +[[package]] +name = "pluggy" +version = "1.6.0" +description = "plugin and hook calling mechanisms for python" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746"}, + {file = "pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3"}, +] + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["coverage", "pytest", "pytest-benchmark"] + [[package]] name = "podman-compose" version = "1.5.0" @@ -3412,7 +3764,7 @@ version = "0.3.2" description = "Accelerated property cache" optional = false python-versions = ">=3.9" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "propcache-0.3.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:22d9962a358aedbb7a2e36187ff273adeaab9743373a272976d2e348d08c7770"}, {file = "propcache-0.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0d0fda578d1dc3f77b6b5a5dce3b9ad69a8250a891760a548df850a5e8da87f3"}, @@ -3777,6 +4129,66 @@ files = [ packaging = ">=21.3" Pillow = ">=8.0.0" +[[package]] +name = "pytest" +version = "8.4.2" +description = "pytest: simple powerful testing with Python" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "pytest-8.4.2-py3-none-any.whl", hash = "sha256:872f880de3fc3a5bdc88a11b39c9710c3497a547cfa9320bc3c5e62fbf272e79"}, + {file = "pytest-8.4.2.tar.gz", hash = "sha256:86c0d0b93306b961d58d62a4db4879f27fe25513d4b969df351abdddb3c30e01"}, +] + +[package.dependencies] +colorama = {version = ">=0.4", markers = "sys_platform == \"win32\""} +iniconfig = ">=1" +packaging = ">=20" +pluggy = ">=1.5,<2" +pygments = ">=2.7.2" + +[package.extras] +dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "requests", "setuptools", "xmlschema"] + +[[package]] +name = "pytest-asyncio" +version = "0.23.8" +description = "Pytest support for asyncio" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "pytest_asyncio-0.23.8-py3-none-any.whl", hash = "sha256:50265d892689a5faefb84df80819d1ecef566eb3549cf915dfb33569359d1ce2"}, + {file = "pytest_asyncio-0.23.8.tar.gz", hash = "sha256:759b10b33a6dc61cce40a8bd5205e302978bbbcc00e279a8b61d9a6a3c82e4d3"}, +] + +[package.dependencies] +pytest = ">=7.0.0,<9" + +[package.extras] +docs = ["sphinx (>=5.3)", "sphinx-rtd-theme (>=1.0)"] +testing = ["coverage (>=6.2)", "hypothesis (>=5.7.1)"] + +[[package]] +name = "pytest-cov" +version = "4.1.0" +description = "Pytest plugin for measuring coverage." +optional = false +python-versions = ">=3.7" +groups = ["dev"] +files = [ + {file = "pytest-cov-4.1.0.tar.gz", hash = "sha256:3904b13dfbfec47f003b8e77fd5b589cd11904a21ddf1ab38a64f204d6a10ef6"}, + {file = "pytest_cov-4.1.0-py3-none-any.whl", hash = "sha256:6ba70b9e97e69fcc3fb45bfeab2d0a138fb65c4d0d6a41ef33983ad114be8c3a"}, +] + +[package.dependencies] +coverage = {version = ">=5.2.1", extras = ["toml"]} +pytest = ">=4.6" + +[package.extras] +testing = ["fields", "hunter", "process-tests", "pytest-xdist", "six", "virtualenv"] + [[package]] name = "python-dateutil" version = "2.9.0.post0" @@ -4069,7 +4481,7 @@ version = "6.2.0" description = "Python client for Redis database and key-value store" optional = false python-versions = ">=3.9" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "redis-6.2.0-py3-none-any.whl", hash = "sha256:c8ddf316ee0aab65f04a11229e94a64b2618451dab7a67cb2f77eb799d872d5e"}, {file = "redis-6.2.0.tar.gz", hash = "sha256:e821f129b75dde6cb99dd35e5c76e8c49512a5a0d8dfdc560b2fbd44b85ca977"}, @@ -4603,6 +5015,18 @@ files = [ {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, ] +[[package]] +name = "sortedcontainers" +version = "2.4.0" +description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set" +optional = false +python-versions = "*" +groups = ["dev"] +files = [ + {file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"}, + {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, +] + [[package]] name = "soupsieve" version = "2.8" @@ -5005,6 +5429,37 @@ rich = ">=10.11.0" shellingham = ">=1.3.0" typing-extensions = ">=3.7.4.3" +[[package]] +name = "types-cffi" +version = "1.17.0.20250915" +description = "Typing stubs for cffi" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "types_cffi-1.17.0.20250915-py3-none-any.whl", hash = "sha256:cef4af1116c83359c11bb4269283c50f0688e9fc1d7f0eeb390f3661546da52c"}, + {file = "types_cffi-1.17.0.20250915.tar.gz", hash = "sha256:4362e20368f78dabd5c56bca8004752cc890e07a71605d9e0d9e069dbaac8c06"}, +] + +[package.dependencies] +types-setuptools = "*" + +[[package]] +name = "types-pyopenssl" +version = "24.1.0.20240722" +description = "Typing stubs for pyOpenSSL" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "types-pyOpenSSL-24.1.0.20240722.tar.gz", hash = "sha256:47913b4678a01d879f503a12044468221ed8576263c1540dcb0484ca21b08c39"}, + {file = "types_pyOpenSSL-24.1.0.20240722-py3-none-any.whl", hash = "sha256:6a7a5d2ec042537934cfb4c9d4deb0e16c4c6250b09358df1f083682fe6fda54"}, +] + +[package.dependencies] +cryptography = ">=35.0.0" +types-cffi = "*" + [[package]] name = "types-python-dateutil" version = "2.9.0.20250822" @@ -5017,6 +5472,49 @@ files = [ {file = "types_python_dateutil-2.9.0.20250822.tar.gz", hash = "sha256:84c92c34bd8e68b117bff742bc00b692a1e8531262d4507b33afcc9f7716cd53"}, ] +[[package]] +name = "types-redis" +version = "4.6.0.20241004" +description = "Typing stubs for redis" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "types-redis-4.6.0.20241004.tar.gz", hash = "sha256:5f17d2b3f9091ab75384153bfa276619ffa1cf6a38da60e10d5e6749cc5b902e"}, + {file = "types_redis-4.6.0.20241004-py3-none-any.whl", hash = "sha256:ef5da68cb827e5f606c8f9c0b49eeee4c2669d6d97122f301d3a55dc6a63f6ed"}, +] + +[package.dependencies] +cryptography = ">=35.0.0" +types-pyOpenSSL = "*" + +[[package]] +name = "types-requests" +version = "2.32.4.20260107" +description = "Typing stubs for requests" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "types_requests-2.32.4.20260107-py3-none-any.whl", hash = "sha256:b703fe72f8ce5b31ef031264fe9395cac8f46a04661a79f7ed31a80fb308730d"}, + {file = "types_requests-2.32.4.20260107.tar.gz", hash = "sha256:018a11ac158f801bfa84857ddec1650750e393df8a004a8a9ae2a9bec6fcb24f"}, +] + +[package.dependencies] +urllib3 = ">=2" + +[[package]] +name = "types-setuptools" +version = "80.10.0.20260124" +description = "Typing stubs for setuptools" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "types_setuptools-80.10.0.20260124-py3-none-any.whl", hash = "sha256:efed7e044f01adb9c2806c7a8e1b6aa3656b8e382379b53d5f26ee3db24d4c01"}, + {file = "types_setuptools-80.10.0.20260124.tar.gz", hash = "sha256:1b86d9f0368858663276a0cbe5fe5a9722caf94b5acde8aba0399a6e90680f20"}, +] + [[package]] name = "typing-extensions" version = "4.14.0" @@ -5569,7 +6067,7 @@ version = "1.20.1" description = "Yet another URL library" optional = false python-versions = ">=3.9" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "yarl-1.20.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:6032e6da6abd41e4acda34d75a816012717000fa6839f37124a47fcefc49bec4"}, {file = "yarl-1.20.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2c7b34d804b8cf9b214f05015c4fee2ebe7ed05cf581e7192c06555c71f4446a"}, @@ -5705,4 +6203,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = ">3.11" -content-hash = "5f53cec7fc3cc93d494341e9fd6562076c1a8952f83075f671a3507c50fcb334" +content-hash = "10a74594d9f695ab1077ff992bcd012b93b174b25c3f2ca681d6308653abbd14" diff --git a/crawler/pyproject.toml b/crawler/pyproject.toml index 7e2d1a1..4028cf9 100644 --- a/crawler/pyproject.toml +++ b/crawler/pyproject.toml @@ -20,6 +20,7 @@ matplotlib = "^3.10.0" opencv-python = "^4.11.0.86" click = "^8.2.0" aiohttp = "^3.11.18" +aiohttp-socks = "^0.8.4" sqlmodel = "^0.0.24" alembic = "^1.16.1" sqlalchemy = {extras = ["asyncio"], version = "^2.0.41"} @@ -42,6 +43,15 @@ mysqlclient = "^2.2.7" ipdb = "^0.13.13" jupyterlab = "^4.4.7" podman-compose = "^1.5.0" +pytest = "^8.0.0" +pytest-asyncio = "^0.23.0" +pytest-cov = "^4.1.0" +httpx = "^0.27.0" +aioresponses = "^0.7.6" +fakeredis = "^2.21.0" +mypy = "^1.8.0" +types-requests = "^2.31.0" +types-redis = "^4.6.0" [build-system] requires = ["poetry-core>=1.0.0"] @@ -52,4 +62,23 @@ build-backend = "poetry.core.masonry.api" lint.ignore = [ "E741", # Ambigious name ] -exclude = ["*.ipynb"] \ No newline at end of file +exclude = ["*.ipynb"] + +[tool.pytest.ini_options] +asyncio_mode = "auto" +testpaths = ["tests"] +asyncio_default_fixture_loop_scope = "function" + +[tool.mypy] +python_version = "3.11" +warn_return_any = true +warn_unused_ignores = true +disallow_untyped_defs = true +disallow_incomplete_defs = true +check_untyped_defs = true +strict_optional = true +plugins = ["pydantic.mypy"] + +[[tool.mypy.overrides]] +module = ["transformers.*", "pytesseract.*", "cv2.*", "celery.*", "tqdm.*", "aiohttp.*", "aiohttp_socks.*", "tenacity.*", "geopy.*", "pandas.*", "numpy.*", "PIL.*", "sqlmodel.*", "sqlalchemy.*", "alembic.*", "apprise.*", "opentelemetry.*"] +ignore_missing_imports = true \ No newline at end of file diff --git a/crawler/rec/query.py b/crawler/rec/query.py index b589876..a2526a6 100644 --- a/crawler/rec/query.py +++ b/crawler/rec/query.py @@ -1,16 +1,19 @@ import enum from typing import Any +from contextlib import asynccontextmanager +from collections.abc import AsyncIterator + import aiohttp from models.listing import FurnishType, ListingType from rec import districts from tenacity import retry, stop_after_attempt, wait_random +from config.scraper_config import ScraperConfig -headers = { +DEFAULT_HEADERS = { "Host": "api.rightmove.co.uk", - # 'Accept-Encoding': 'gzip, deflate, br', - "User-Agent": "okhttp/4.10.0", - "Connection": "close", + "User-Agent": "okhttp/4.12.0", + "Connection": "keep-alive", } @@ -24,15 +27,66 @@ class PropertyType(enum.StrEnum): TERRACED = "terraced" +@asynccontextmanager +async def create_session( + config: ScraperConfig | None = None, +) -> AsyncIterator[aiohttp.ClientSession]: + """Create an aiohttp session with optional proxy support. + + Args: + config: Scraper configuration. Loads from environment if not provided. + + Yields: + Configured aiohttp ClientSession. + """ + if config is None: + config = ScraperConfig.from_env() + + connector = None + if config.proxy_url: + try: + from aiohttp_socks import ProxyConnector + + connector = ProxyConnector.from_url(config.proxy_url) + except ImportError: + raise ImportError( + "aiohttp-socks is required for proxy support. " + "Install with: pip install aiohttp-socks" + ) + + session = aiohttp.ClientSession( + trust_env=True, + connector=connector, + headers=DEFAULT_HEADERS, + ) + try: + yield session + finally: + await session.close() + + @retry(wait=wait_random(min=1, max=2), stop=stop_after_attempt(3)) -async def detail_query(detail_id: int) -> dict[str, Any]: +async def detail_query( + detail_id: int, + session: aiohttp.ClientSession | None = None, +) -> dict[str, Any]: + """Fetch detailed property information. + + Args: + detail_id: The property identifier. + session: Optional aiohttp session. Creates new one if not provided. + + Returns: + Property details as a dictionary. + """ params = { "apiApplication": "ANDROID", "appVersion": "3.70.0", } url = f"https://api.rightmove.co.uk/api/property/{detail_id}" - async with aiohttp.ClientSession(trust_env=True) as session: - async with session.get(url, params=params, headers=headers) as response: + + async def do_request(s: aiohttp.ClientSession) -> dict[str, Any]: + async with s.get(url, params=params, headers=DEFAULT_HEADERS) as response: if response.status != 200: raise Exception( f"""id: {detail_id}. Status Code: {response.status}.""" @@ -40,6 +94,12 @@ async def detail_query(detail_id: int) -> dict[str, Any]: ) return await response.json() + if session: + return await do_request(session) + else: + async with aiohttp.ClientSession(trust_env=True) as new_session: + return await do_request(new_session) + @retry(wait=wait_random(min=1, max=60), stop=stop_after_attempt(3)) async def listing_query( @@ -57,7 +117,29 @@ async def listing_query( property_type: list[PropertyType] = [], page_size: int = 25, furnish_types: list[FurnishType] = [], + session: aiohttp.ClientSession | None = None, ) -> dict[str, Any]: + """Execute a listing search query. + + Args: + page: Page number to fetch (1-indexed). + channel: Listing type (BUY or RENT). + min_bedrooms: Minimum number of bedrooms. + max_bedrooms: Maximum number of bedrooms. + radius: Search radius. + min_price: Minimum price. + max_price: Maximum price. + district: District identifier string. + mustNewHome: Filter for new homes only (BUY only). + max_days_since_added: Maximum days since listing was added (BUY only). + property_type: List of property types to filter (BUY only). + page_size: Number of results per page (default 25). + furnish_types: List of furnish types to filter (RENT only). + session: Optional aiohttp session. Creates new one if not provided. + + Returns: + API response as a dictionary. + """ params: dict[str, str] = { "locationIdentifier": districts.get_districts()[district], "channel": str(channel).upper(), @@ -95,19 +177,105 @@ async def listing_query( if furnish_types: params["furnishTypes"] = ",".join(furnish_types) - headers = { + request_headers = { "Host": "api.rightmove.co.uk", "Accept-Encoding": "gzip, deflate, br", "User-Agent": "okhttp/4.12.0", "Connection": "keep-alive", } - async with aiohttp.ClientSession(trust_env=True) as session: - async with session.get( + async def do_request(s: aiohttp.ClientSession) -> dict[str, Any]: + async with s.get( "https://api.rightmove.co.uk/api/property-listing", params=params, - headers=headers, + headers=request_headers, ) as response: if response.status != 200: raise Exception(f"Failed due to: {await response.text()}") return await response.json() + + if session: + return await do_request(session) + else: + async with aiohttp.ClientSession(trust_env=True) as new_session: + return await do_request(new_session) + + +@retry(wait=wait_random(min=1, max=10), stop=stop_after_attempt(3)) +async def probe_query( + *, + session: aiohttp.ClientSession, + channel: ListingType, + min_bedrooms: int, + max_bedrooms: int, + radius: float, + min_price: int, + max_price: int, + district: str, + max_days_since_added: int = 30, + furnish_types: list[FurnishType] = [], +) -> dict[str, Any]: + """Probe the API to get result count without fetching full results. + + Makes a minimal request (page_size=1) to efficiently get totalAvailableResults. + + Args: + session: aiohttp session for making requests. + channel: Listing type (BUY or RENT). + min_bedrooms: Minimum number of bedrooms. + max_bedrooms: Maximum number of bedrooms. + radius: Search radius. + min_price: Minimum price. + max_price: Maximum price. + district: District identifier string. + max_days_since_added: Maximum days since listing was added (BUY only). + furnish_types: List of furnish types to filter (RENT only). + + Returns: + API response containing totalAvailableResults. + """ + params: dict[str, str] = { + "locationIdentifier": districts.get_districts()[district], + "channel": str(channel).upper(), + "page": "1", + "numberOfPropertiesPerPage": "1", # Minimal page size for probing + "radius": str(radius), + "sortBy": "distance", + "includeUnavailableProperties": "false", + "minPrice": str(min_price), + "maxPrice": str(max_price), + "minBedrooms": str(min_bedrooms), + "maxBedrooms": str(max_bedrooms), + "apiApplication": "ANDROID", + "appVersion": "4.28.0", + } + + if channel is ListingType.BUY: + params["dontShow"] = "sharedOwnership,retirement" + if max_days_since_added is not None and max_days_since_added in [ + 1, + 3, + 7, + 14, + ]: + params["maxDaysSinceAdded"] = str(max_days_since_added) + + if channel is ListingType.RENT: + if furnish_types: + params["furnishTypes"] = ",".join(furnish_types) + + request_headers = { + "Host": "api.rightmove.co.uk", + "Accept-Encoding": "gzip, deflate, br", + "User-Agent": "okhttp/4.12.0", + "Connection": "keep-alive", + } + + async with session.get( + "https://api.rightmove.co.uk/api/property-listing", + params=params, + headers=request_headers, + ) as response: + if response.status != 200: + raise Exception(f"Probe failed: {await response.text()}") + return await response.json() diff --git a/crawler/services/listing_fetcher.py b/crawler/services/listing_fetcher.py new file mode 100644 index 0000000..a94f3e0 --- /dev/null +++ b/crawler/services/listing_fetcher.py @@ -0,0 +1,146 @@ +"""Listing fetcher service - fetches listing data from Rightmove API.""" +import asyncio +import logging +from typing import Any + +from config.scraper_config import ScraperConfig +from listing_processor import ListingProcessor +from rec.query import create_session, listing_query +from models.listing import QueryParameters +from repositories import ListingRepository +from tqdm.asyncio import tqdm +from models import Listing as modelListing +from services.query_splitter import QuerySplitter, SubQuery + +logger = logging.getLogger("uvicorn.error") + + +async def dump_listings_full( + parameters: QueryParameters, + repository: ListingRepository, +) -> list[modelListing]: + """Fetches all listings, images as well as detects floorplans.""" + new_listings = await dump_listings(parameters, repository) + logger.debug(f"Upserted {len(new_listings)} new listings") + # refresh listings + listings = await repository.get_listings(parameters) # this can be better + new_listings = [x for x in listings if x.id in new_listings] + return new_listings + + +async def dump_listings( + parameters: QueryParameters, + repository: ListingRepository, +) -> list[modelListing]: + """Fetch listings from Rightmove API and process them. + + Uses intelligent query splitting to maximize data extraction + while respecting Rightmove's result caps. + """ + config = ScraperConfig.from_env() + splitter = QuerySplitter(config) + + async with create_session(config) as session: + # Phase 1 & 2: Split and probe queries + logger.info("Splitting query and probing result counts...") + subqueries = await splitter.split(parameters, session) + + total_estimated = splitter.calculate_total_estimated_results(subqueries) + logger.info( + f"Split into {len(subqueries)} subqueries, " + f"estimated {total_estimated} total results" + ) + + # Phase 3: Fetch all pages for each subquery + semaphore = asyncio.Semaphore(config.max_concurrent_requests) + + async def fetch_subquery(sq: SubQuery) -> list[dict[str, Any]]: + """Fetch all pages for a single subquery.""" + results: list[dict[str, Any]] = [] + + estimated = sq.estimated_results or 0 + if estimated == 0: + return results + + page_size = parameters.page_size + max_pages = min( + config.max_pages_per_query, + (estimated // page_size) + 1, + ) + + for page_id in range(1, max_pages + 1): + async with semaphore: + await asyncio.sleep(config.request_delay_ms / 1000) + try: + result = await listing_query( + page=page_id, + channel=parameters.listing_type, + min_bedrooms=sq.min_bedrooms, + max_bedrooms=sq.max_bedrooms, + radius=parameters.radius, + min_price=sq.min_price, + max_price=sq.max_price, + district=sq.district, + page_size=page_size, + max_days_since_added=parameters.max_days_since_added, + furnish_types=parameters.furnish_types or [], + session=session, + ) + results.append(result) + + properties = result.get("properties", []) + if len(properties) < page_size: + break + + except Exception as e: + if "GENERIC_ERROR" in str(e): + logger.debug( + f"Max page for {sq.district}: {page_id - 1}" + ) + break + logger.warning( + f"Error fetching page {page_id} for {sq.district}: {e}" + ) + break + + return results + + # Fetch all subqueries with progress bar + all_results = await tqdm.gather( + *[fetch_subquery(sq) for sq in subqueries], + desc="Fetching listings", + ) + + # Extract listing identifiers from results + listing_ids: list[int] = [] + for subquery_results in all_results: + for response_json in subquery_results: + if not response_json: + continue + if response_json.get("totalAvailableResults", 0) == 0: + continue + for property_data in response_json.get("properties", []): + identifier = property_data.get("identifier") + if identifier: + listing_ids.append(identifier) + + logger.info(f"Found {len(listing_ids)} total listings") + + # Deduplicate + unique_ids = list(set(listing_ids)) + logger.info(f"After deduplication: {len(unique_ids)} unique listings") + + # Filter out listings already in database + all_listing_ids = [x.id for x in await repository.get_listings()] + missing_ids = [ + listing_id for listing_id in unique_ids if listing_id not in all_listing_ids + ] + + listing_processor = ListingProcessor(repository) + logger.info(f"Starting processing {len(missing_ids)} new listings") + processed_listings = await tqdm.gather( + *[listing_processor.process_listing(id) for id in missing_ids] + ) + filtered_listings = [x for x in processed_listings if x is not None] + + return filtered_listings diff --git a/crawler/services/query_splitter.py b/crawler/services/query_splitter.py new file mode 100644 index 0000000..0609634 --- /dev/null +++ b/crawler/services/query_splitter.py @@ -0,0 +1,303 @@ +"""Query splitting service for handling Rightmove's result cap. + +This module provides intelligent query splitting to work around Rightmove's +~1,500 listing cap per search. It adaptively splits queries by price bands +based on actual result counts. +""" +from __future__ import annotations + +import asyncio +import logging +from dataclasses import dataclass, replace +from typing import Any + +import aiohttp + +from config.scraper_config import ScraperConfig +from models.listing import ListingType, QueryParameters +from rec.districts import get_districts + +logger = logging.getLogger("uvicorn.error") + + +@dataclass +class SubQuery: + """Represents a single query subdivision. + + Attributes: + district: District identifier string. + min_bedrooms: Minimum number of bedrooms. + max_bedrooms: Maximum number of bedrooms. + min_price: Minimum price in currency units. + max_price: Maximum price in currency units. + estimated_results: Cached result count from probing (None if not probed). + """ + + district: str + min_bedrooms: int + max_bedrooms: int + min_price: int + max_price: int + estimated_results: int | None = None + + @property + def price_range(self) -> int: + """Returns the width of the price band.""" + return self.max_price - self.min_price + + +class QuerySplitter: + """Splits large queries into smaller subqueries to avoid result caps. + + Uses adaptive binary search on price ranges to find optimal subdivisions + that keep each subquery under the result threshold. + """ + + def __init__(self, config: ScraperConfig | None = None) -> None: + """Initialize the splitter with configuration. + + Args: + config: Scraper configuration. Loads from environment if not provided. + """ + self.config = config or ScraperConfig.from_env() + + def create_initial_subqueries( + self, + parameters: QueryParameters, + districts: dict[str, str], + ) -> list[SubQuery]: + """Create initial subqueries by splitting on district and bedrooms. + + This creates the initial split before probing for result counts. + Each bedroom count gets its own subquery to enable finer-grained splitting. + + Args: + parameters: Original query parameters. + districts: Dictionary of district name to location ID. + + Returns: + List of initial SubQuery objects. + """ + subqueries: list[SubQuery] = [] + + for district in districts.keys(): + for num_bedrooms in range( + parameters.min_bedrooms, parameters.max_bedrooms + 1 + ): + subqueries.append( + SubQuery( + district=district, + min_bedrooms=num_bedrooms, + max_bedrooms=num_bedrooms, + min_price=parameters.min_price, + max_price=parameters.max_price, + ) + ) + + return subqueries + + async def probe_result_count( + self, + subquery: SubQuery, + session: aiohttp.ClientSession, + parameters: QueryParameters, + ) -> int: + """Probe the API to get the total result count for a subquery. + + Makes a minimal request (page_size=1) to get totalAvailableResults. + + Args: + subquery: The subquery to probe. + session: aiohttp session for making requests. + parameters: Original query parameters for additional settings. + + Returns: + Total available results for this subquery. + """ + from rec.query import probe_query + + try: + result = await probe_query( + session=session, + channel=parameters.listing_type, + min_bedrooms=subquery.min_bedrooms, + max_bedrooms=subquery.max_bedrooms, + radius=parameters.radius, + min_price=subquery.min_price, + max_price=subquery.max_price, + district=subquery.district, + max_days_since_added=parameters.max_days_since_added, + furnish_types=parameters.furnish_types or [], + ) + return result.get("totalAvailableResults", 0) + except Exception as e: + logger.warning(f"Failed to probe subquery {subquery}: {e}") + return 0 + + def split_by_price(self, subquery: SubQuery) -> list[SubQuery]: + """Split a subquery into two by halving the price range. + + Args: + subquery: The subquery to split. + + Returns: + List of two subqueries covering the same price range. + """ + mid_price = (subquery.min_price + subquery.max_price) // 2 + + return [ + replace( + subquery, + max_price=mid_price, + estimated_results=None, + ), + replace( + subquery, + min_price=mid_price, + estimated_results=None, + ), + ] + + async def adaptive_split( + self, + subquery: SubQuery, + session: aiohttp.ClientSession, + parameters: QueryParameters, + semaphore: asyncio.Semaphore, + ) -> list[SubQuery]: + """Recursively split a subquery until all parts are under threshold. + + Uses binary search on price range to find optimal splits. + + Args: + subquery: The subquery to split. + session: aiohttp session for making requests. + parameters: Original query parameters. + semaphore: Semaphore for rate limiting. + + Returns: + List of subqueries that are all under the split threshold. + """ + # Check if we can split further + if subquery.price_range <= self.config.min_price_band: + logger.warning( + f"Cannot split further, price band at minimum: {subquery}" + ) + return [subquery] + + # Split into two halves + halves = self.split_by_price(subquery) + result: list[SubQuery] = [] + + for half in halves: + async with semaphore: + await asyncio.sleep(self.config.request_delay_ms / 1000) + count = await self.probe_result_count(half, session, parameters) + + half = replace(half, estimated_results=count) + + if count > self.config.split_threshold: + # Need to split further + result.extend( + await self.adaptive_split( + half, session, parameters, semaphore + ) + ) + else: + result.append(half) + + return result + + async def split( + self, + parameters: QueryParameters, + session: aiohttp.ClientSession, + on_progress: Any = None, + ) -> list[SubQuery]: + """Split query parameters into optimized subqueries. + + Performs the full splitting algorithm: + 1. Create initial splits by district and bedroom count + 2. Probe each to get result counts + 3. Adaptively split any that exceed the threshold + + Args: + parameters: Original query parameters to split. + session: aiohttp session for making requests. + on_progress: Optional callback for progress updates. + + Returns: + List of SubQuery objects, each under the result threshold. + """ + # Get valid districts + if parameters.district_names: + districts = { + district: locid + for district, locid in get_districts().items() + if district in parameters.district_names + } + else: + districts = get_districts() + + # Phase 1: Create initial subqueries + initial_subqueries = self.create_initial_subqueries(parameters, districts) + logger.info(f"Created {len(initial_subqueries)} initial subqueries") + + if on_progress: + on_progress( + phase="splitting", + message=f"Created {len(initial_subqueries)} initial subqueries", + ) + + # Phase 2: Probe and adaptively split + semaphore = asyncio.Semaphore(self.config.max_concurrent_requests) + refined_subqueries: list[SubQuery] = [] + + # Probe all initial subqueries in parallel + async def probe_and_split(sq: SubQuery) -> list[SubQuery]: + async with semaphore: + await asyncio.sleep(self.config.request_delay_ms / 1000) + count = await self.probe_result_count(sq, session, parameters) + + sq = replace(sq, estimated_results=count) + + if count > self.config.split_threshold: + logger.info( + f"Subquery {sq.district}/{sq.min_bedrooms}BR " + f"has {count} results, splitting..." + ) + return await self.adaptive_split( + sq, session, parameters, semaphore + ) + return [sq] + + tasks = [probe_and_split(sq) for sq in initial_subqueries] + results = await asyncio.gather(*tasks) + + for subquery_list in results: + refined_subqueries.extend(subquery_list) + + logger.info( + f"Refined to {len(refined_subqueries)} subqueries after splitting" + ) + + if on_progress: + on_progress( + phase="splitting_complete", + message=f"Refined to {len(refined_subqueries)} subqueries", + ) + + return refined_subqueries + + def calculate_total_estimated_results( + self, subqueries: list[SubQuery] + ) -> int: + """Calculate total estimated results across all subqueries. + + Args: + subqueries: List of subqueries with estimated_results set. + + Returns: + Sum of all estimated results. + """ + return sum(sq.estimated_results or 0 for sq in subqueries) diff --git a/crawler/tasks/listing_tasks.py b/crawler/tasks/listing_tasks.py index f86f89e..1fb3041 100644 --- a/crawler/tasks/listing_tasks.py +++ b/crawler/tasks/listing_tasks.py @@ -1,18 +1,17 @@ import asyncio -import itertools import logging from typing import Any from celery import Task from celery.schedules import crontab from celery_app import app from config.schedule_config import SchedulesConfig +from config.scraper_config import ScraperConfig from listing_processor import ListingProcessor from models.listing import Listing, QueryParameters -from rec.districts import get_districts -from rec.query import listing_query +from rec.query import create_session, listing_query from repositories.listing_repository import ListingRepository from database import engine -from services import image_fetcher, floorplan_detector +from services.query_splitter import QuerySplitter, SubQuery from utils.redis_lock import redis_lock logger = logging.getLogger("uvicorn.error") @@ -134,106 +133,138 @@ async def get_ids_to_process( repository: ListingRepository, task: Task, ) -> set[int]: - semaphore = asyncio.Semaphore(5) # if too high, rightmove drops connections - districts = await get_valid_districts_to_scrape(parameters.district_names) - task.update_state(state="Fetching listings to scrape", meta={"progress": 0}) - json_responses: list[list[dict[str, Any]]] = await asyncio.gather( - *[ - _fetch_listings_with_semaphore( - task=task, semaphore=semaphore, parameters=parameters, district=district - ) - for district in districts.keys() - ], - ) - json_responses_flat = list(itertools.chain.from_iterable(json_responses)) - logger.debug(f"Total listings fetched {len(json_responses_flat)}") + """Fetch all listing IDs using intelligent query splitting. - identifiers: set[int] = set() - for response_json in json_responses_flat: - if response_json == {}: - continue - if response_json["totalAvailableResults"] == 0: - continue - for property in response_json["properties"]: - identifier = property["identifier"] - identifiers.add(identifier) + Uses the QuerySplitter to adaptively split large queries and maximize + data extraction while respecting Rightmove's result caps. - # if listing is already in db, do not fetch details again - all_listing_ids = {l.id for l in await repository.get_listings()} - new_ids = identifiers - all_listing_ids - return new_ids + Args: + parameters: Query parameters for the search. + repository: Repository for checking existing listings. + task: Celery task for progress updates. + Returns: + Set of new listing IDs that need to be processed. + """ + config = ScraperConfig.from_env() + splitter = QuerySplitter(config) -async def get_valid_districts_to_scrape( - district_names: set[str] | None, -) -> dict[str, str]: - if district_names: - districts = { - district: locid - for district, locid in get_districts().items() - if district in district_names - } - else: - districts = get_districts() - return districts + def on_progress(phase: str, message: str) -> None: + task.update_state(state=message, meta={"phase": phase}) - -async def _fetch_listings_with_semaphore( - *, - task: Task, - semaphore: asyncio.Semaphore, - parameters: QueryParameters, - district: str, -) -> list[dict[str, Any]]: - result = [] - # split the price in N bands to avoid the 1.5k capping by rightmove - # basically instead of 1 query with price between 1k and 5k that is capped at 1500 results - # we do 10 queries each with an increment in price range so we send more queries but each - # has a smaller chance of returning more than 1.5k results - - number_of_steps = 10 - price_step = parameters.max_price // number_of_steps - - for step in range(number_of_steps): + async with create_session(config) as session: + # Phase 1 & 2: Split and probe queries task.update_state( - state=f"Fetching listings ({step} out of {number_of_steps})", - meta={"progress": step / number_of_steps}, + state="Analyzing query and splitting by price bands...", + meta={"phase": "splitting", "progress": 0}, ) - min_price = step * price_step - max_price = (step + 1) * price_step - logger.debug( - f"Step {step} of {number_of_steps} with {min_price=} and {max_price=}" + subqueries = await splitter.split(parameters, session, on_progress) + + total_estimated = splitter.calculate_total_estimated_results(subqueries) + logger.info( + f"Split into {len(subqueries)} subqueries, " + f"estimated {total_estimated} total results" ) - for num_bedrooms in range(parameters.min_bedrooms, parameters.max_bedrooms + 1): - for page_id in range( - 1, - 3, # seems like all searches stop at 1500 entries (page_id * page_size) - ): - logger.debug(f"Processing {page_id=} for {district=}") + # Phase 3: Fetch all pages for each subquery + task.update_state( + state=f"Fetching listings from {len(subqueries)} subqueries...", + meta={ + "phase": "fetching", + "subqueries": len(subqueries), + "estimated_results": total_estimated, + }, + ) + semaphore = asyncio.Semaphore(config.max_concurrent_requests) + identifiers: set[int] = set() + + async def fetch_subquery(sq: SubQuery) -> list[dict[str, Any]]: + """Fetch all pages for a single subquery.""" + results: list[dict[str, Any]] = [] + + # Calculate how many pages we need based on estimated results + estimated = sq.estimated_results or 0 + if estimated == 0: + return results + + # Fetch pages up to max_pages_per_query or until no more results + page_size = parameters.page_size + max_pages = min( + config.max_pages_per_query, + (estimated // page_size) + 1, + ) + + for page_id in range(1, max_pages + 1): async with semaphore: + await asyncio.sleep(config.request_delay_ms / 1000) try: - listing_query_result = await listing_query( + result = await listing_query( page=page_id, channel=parameters.listing_type, - # min_bedrooms=parameters.min_bedrooms, - # max_bedrooms=parameters.max_bedrooms, - min_bedrooms=num_bedrooms, - max_bedrooms=num_bedrooms, + min_bedrooms=sq.min_bedrooms, + max_bedrooms=sq.max_bedrooms, radius=parameters.radius, - min_price=min_price, - max_price=max_price, - district=district, - page_size=parameters.page_size, + min_price=sq.min_price, + max_price=sq.max_price, + district=sq.district, + page_size=page_size, max_days_since_added=parameters.max_days_since_added, furnish_types=parameters.furnish_types or [], + session=session, ) + results.append(result) + + # Check if we've received all results + properties = result.get("properties", []) + if len(properties) < page_size: + # No more results on next page + break except Exception as e: - if "GENERIC_ERROR" in str(e): # Too big page id - logger.debug(f"Max page id for {district=}: {page_id-1}") + if "GENERIC_ERROR" in str(e): + # Reached end of results + logger.debug( + f"Max page for {sq.district}: {page_id - 1}" + ) break - raise e - result.append(listing_query_result) - return result + logger.warning( + f"Error fetching page {page_id} for {sq.district}: {e}" + ) + break + + return results + + # Fetch all subqueries concurrently + all_results = await asyncio.gather( + *[fetch_subquery(sq) for sq in subqueries] + ) + + # Extract identifiers from all results + for subquery_results in all_results: + for response_json in subquery_results: + if not response_json: + continue + if response_json.get("totalAvailableResults", 0) == 0: + continue + for property_data in response_json.get("properties", []): + identifier = property_data.get("identifier") + if identifier: + identifiers.add(identifier) + + logger.info(f"Found {len(identifiers)} unique listings") + + # Filter out listings already in the database + all_listing_ids = {l.id for l in await repository.get_listings()} + new_ids = identifiers - all_listing_ids + + task.update_state( + state=f"Found {len(new_ids)} new listings to process", + meta={ + "phase": "filtering", + "total_found": len(identifiers), + "new_listings": len(new_ids), + }, + ) + + return new_ids diff --git a/crawler/tests/unit/test_query_splitter.py b/crawler/tests/unit/test_query_splitter.py new file mode 100644 index 0000000..467f234 --- /dev/null +++ b/crawler/tests/unit/test_query_splitter.py @@ -0,0 +1,374 @@ +"""Unit tests for QuerySplitter service.""" +import pytest +from unittest.mock import AsyncMock, patch + +from config.scraper_config import ScraperConfig +from models.listing import ListingType, QueryParameters +from services.query_splitter import QuerySplitter, SubQuery + + +class TestScraperConfig: + """Tests for the ScraperConfig dataclass.""" + + def test_default_values(self) -> None: + """Test that default values are set correctly.""" + config = ScraperConfig() + assert config.max_concurrent_requests == 5 + assert config.request_delay_ms == 100 + assert config.result_cap == 1500 + assert config.split_threshold == 1200 + assert config.min_price_band == 100 + assert config.max_pages_per_query == 60 + assert config.proxy_url is None + + def test_from_env(self) -> None: + """Test loading configuration from environment variables.""" + with patch.dict( + "os.environ", + { + "RIGHTMOVE_MAX_CONCURRENT": "10", + "RIGHTMOVE_REQUEST_DELAY_MS": "200", + "RIGHTMOVE_SPLIT_THRESHOLD": "1000", + "RIGHTMOVE_MIN_PRICE_BAND": "50", + "RIGHTMOVE_MAX_PAGES": "30", + "RIGHTMOVE_PROXY_URL": "socks5://localhost:9050", + }, + ): + config = ScraperConfig.from_env() + assert config.max_concurrent_requests == 10 + assert config.request_delay_ms == 200 + assert config.split_threshold == 1000 + assert config.min_price_band == 50 + assert config.max_pages_per_query == 30 + assert config.proxy_url == "socks5://localhost:9050" + + def test_from_env_empty_proxy(self) -> None: + """Test that empty proxy URL is converted to None.""" + with patch.dict( + "os.environ", + { + "RIGHTMOVE_PROXY_URL": "", + }, + clear=False, + ): + config = ScraperConfig.from_env() + assert config.proxy_url is None + + +class TestSubQuery: + """Tests for the SubQuery dataclass.""" + + def test_price_range_calculation(self) -> None: + """Test that price_range is calculated correctly.""" + sq = SubQuery( + district="Kings Cross", + min_bedrooms=2, + max_bedrooms=2, + min_price=1000, + max_price=2000, + ) + assert sq.price_range == 1000 + + +class TestQuerySplitter: + """Tests for the QuerySplitter class.""" + + @pytest.fixture + def config(self) -> ScraperConfig: + """Create a test configuration.""" + return ScraperConfig( + max_concurrent_requests=5, + request_delay_ms=10, # Faster for testing + result_cap=1500, + split_threshold=1200, + min_price_band=100, + max_pages_per_query=60, + proxy_url=None, + ) + + @pytest.fixture + def splitter(self, config: ScraperConfig) -> QuerySplitter: + """Create a QuerySplitter instance.""" + return QuerySplitter(config) + + @pytest.fixture + def parameters(self) -> QueryParameters: + """Create test query parameters.""" + return QueryParameters( + listing_type=ListingType.RENT, + min_bedrooms=2, + max_bedrooms=3, + min_price=1000, + max_price=5000, + district_names={"Kings Cross", "Angel"}, + ) + + def test_create_initial_subqueries( + self, splitter: QuerySplitter, parameters: QueryParameters + ) -> None: + """Test that initial subqueries are created correctly.""" + districts = {"Kings Cross": "STATION^5168", "Angel": "STATION^1234"} + + subqueries = splitter.create_initial_subqueries(parameters, districts) + + # 2 districts × 2 bedroom counts (2,3) = 4 subqueries + assert len(subqueries) == 4 + + # Check first subquery + assert subqueries[0].district == "Kings Cross" + assert subqueries[0].min_bedrooms == 2 + assert subqueries[0].max_bedrooms == 2 + assert subqueries[0].min_price == 1000 + assert subqueries[0].max_price == 5000 + + def test_split_by_price(self, splitter: QuerySplitter) -> None: + """Test that price splitting works correctly.""" + sq = SubQuery( + district="Kings Cross", + min_bedrooms=2, + max_bedrooms=2, + min_price=1000, + max_price=5000, + ) + + halves = splitter.split_by_price(sq) + + assert len(halves) == 2 + assert halves[0].min_price == 1000 + assert halves[0].max_price == 3000 # midpoint + assert halves[1].min_price == 3000 + assert halves[1].max_price == 5000 + + # Both should have same bedroom range and district + for half in halves: + assert half.district == "Kings Cross" + assert half.min_bedrooms == 2 + assert half.max_bedrooms == 2 + + @pytest.mark.asyncio + async def test_probe_result_count( + self, splitter: QuerySplitter, parameters: QueryParameters + ) -> None: + """Test probing API for result count.""" + sq = SubQuery( + district="Kings Cross", + min_bedrooms=2, + max_bedrooms=2, + min_price=1000, + max_price=5000, + ) + + mock_session = AsyncMock() + + # Mock the probe_query function + with patch("services.query_splitter.probe_query") as mock_probe: + mock_probe.return_value = {"totalAvailableResults": 800} + + count = await splitter.probe_result_count(sq, mock_session, parameters) + + assert count == 800 + mock_probe.assert_called_once() + + @pytest.mark.asyncio + async def test_probe_result_count_handles_error( + self, splitter: QuerySplitter, parameters: QueryParameters + ) -> None: + """Test that probe_result_count handles errors gracefully.""" + sq = SubQuery( + district="Kings Cross", + min_bedrooms=2, + max_bedrooms=2, + min_price=1000, + max_price=5000, + ) + + mock_session = AsyncMock() + + with patch("services.query_splitter.probe_query") as mock_probe: + mock_probe.side_effect = Exception("API error") + + count = await splitter.probe_result_count(sq, mock_session, parameters) + + # Should return 0 on error + assert count == 0 + + @pytest.mark.asyncio + async def test_adaptive_split_no_split_needed( + self, splitter: QuerySplitter, parameters: QueryParameters + ) -> None: + """Test adaptive split when results are below threshold.""" + sq = SubQuery( + district="Kings Cross", + min_bedrooms=2, + max_bedrooms=2, + min_price=1000, + max_price=2000, + ) + + mock_session = AsyncMock() + mock_semaphore = AsyncMock() + + with patch("services.query_splitter.probe_query") as mock_probe: + # First half has 600 results, second half has 500 + mock_probe.side_effect = [ + {"totalAvailableResults": 600}, + {"totalAvailableResults": 500}, + ] + + result = await splitter.adaptive_split( + sq, mock_session, parameters, mock_semaphore + ) + + # Both halves are under threshold (1200), so we get 2 subqueries back + assert len(result) == 2 + assert result[0].estimated_results == 600 + assert result[1].estimated_results == 500 + + @pytest.mark.asyncio + async def test_adaptive_split_recursive_splitting( + self, splitter: QuerySplitter, parameters: QueryParameters + ) -> None: + """Test adaptive split performs recursive splitting when needed.""" + sq = SubQuery( + district="Kings Cross", + min_bedrooms=2, + max_bedrooms=2, + min_price=1000, + max_price=5000, + ) + + mock_session = AsyncMock() + mock_semaphore = AsyncMock() + + with patch("services.query_splitter.probe_query") as mock_probe: + # First split: 1000-3000 has 1300 (over threshold), 3000-5000 has 800 + # Second split of 1000-3000: 1000-2000 has 700, 2000-3000 has 600 + mock_probe.side_effect = [ + {"totalAvailableResults": 1300}, # First half - needs more splitting + {"totalAvailableResults": 800}, # Second half - OK + {"totalAvailableResults": 700}, # First quarter - OK + {"totalAvailableResults": 600}, # Second quarter - OK + ] + + result = await splitter.adaptive_split( + sq, mock_session, parameters, mock_semaphore + ) + + # Should get 3 subqueries: [1000-2000 (700), 2000-3000 (600), 3000-5000 (800)] + assert len(result) == 3 + + @pytest.mark.asyncio + async def test_adaptive_split_respects_min_price_band( + self, splitter: QuerySplitter, parameters: QueryParameters + ) -> None: + """Test that adaptive split stops at min_price_band.""" + sq = SubQuery( + district="Kings Cross", + min_bedrooms=2, + max_bedrooms=2, + min_price=1000, + max_price=1050, # Only 50 range, below min_price_band of 100 + estimated_results=1500, # Over threshold but can't split + ) + + mock_session = AsyncMock() + mock_semaphore = AsyncMock() + + result = await splitter.adaptive_split( + sq, mock_session, parameters, mock_semaphore + ) + + # Can't split below min_price_band, should return original + assert len(result) == 1 + assert result[0].min_price == 1000 + assert result[0].max_price == 1050 + + def test_calculate_total_estimated_results( + self, splitter: QuerySplitter + ) -> None: + """Test calculation of total estimated results.""" + subqueries = [ + SubQuery( + district="Kings Cross", + min_bedrooms=2, + max_bedrooms=2, + min_price=1000, + max_price=2000, + estimated_results=500, + ), + SubQuery( + district="Kings Cross", + min_bedrooms=3, + max_bedrooms=3, + min_price=1000, + max_price=2000, + estimated_results=300, + ), + SubQuery( + district="Angel", + min_bedrooms=2, + max_bedrooms=2, + min_price=1000, + max_price=2000, + estimated_results=None, # Not probed + ), + ] + + total = splitter.calculate_total_estimated_results(subqueries) + assert total == 800 # 500 + 300 + 0 + + @pytest.mark.asyncio + async def test_split_integration( + self, splitter: QuerySplitter, parameters: QueryParameters + ) -> None: + """Integration test for the full split workflow.""" + mock_session = AsyncMock() + mock_districts = {"Kings Cross": "STATION^5168", "Angel": "STATION^1234"} + + with patch("services.query_splitter.get_districts", return_value=mock_districts): + with patch("services.query_splitter.probe_query") as mock_probe: + # Mock probe results for each initial subquery + # 2 districts × 2 bedroom counts = 4 initial subqueries + mock_probe.side_effect = [ + {"totalAvailableResults": 500}, # KC 2BR - OK + {"totalAvailableResults": 1300}, # KC 3BR - needs split + {"totalAvailableResults": 600}, # Angel 2BR - OK + {"totalAvailableResults": 800}, # Angel 3BR - OK + # Split KC 3BR + {"totalAvailableResults": 700}, # KC 3BR first half + {"totalAvailableResults": 600}, # KC 3BR second half + ] + + result = await splitter.split(parameters, mock_session) + + # Should have 5 subqueries total: + # KC 2BR (500), KC 3BR split into 2 (700+600), Angel 2BR (600), Angel 3BR (800) + assert len(result) == 5 + + # Verify total estimated results + total = splitter.calculate_total_estimated_results(result) + assert total == 3200 # 500 + 700 + 600 + 600 + 800 + + @pytest.mark.asyncio + async def test_split_with_on_progress_callback( + self, splitter: QuerySplitter, parameters: QueryParameters + ) -> None: + """Test that on_progress callback is called during split.""" + mock_session = AsyncMock() + mock_districts = {"Kings Cross": "STATION^5168", "Angel": "STATION^1234"} + progress_calls = [] + + def on_progress(phase: str, message: str) -> None: + progress_calls.append((phase, message)) + + with patch("services.query_splitter.get_districts", return_value=mock_districts): + with patch("services.query_splitter.probe_query") as mock_probe: + mock_probe.return_value = {"totalAvailableResults": 500} + + await splitter.split(parameters, mock_session, on_progress) + + # Should have received at least 2 progress updates + assert len(progress_calls) >= 2 + phases = [call[0] for call in progress_calls] + assert "splitting" in phases + assert "splitting_complete" in phases From f880664a98a0b1b0096d8f59d3acbfb8d4ba49f5 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Mon, 2 Feb 2026 22:50:19 +0000 Subject: [PATCH 3/5] Add throttling detection and circuit breaker for Rightmove scraper --- crawler/.env.sample | 6 + crawler/config/scraper_config.py | 24 ++ crawler/rec/circuit_breaker.py | 137 +++++++ crawler/rec/exceptions.py | 74 ++++ crawler/rec/query.py | 239 +++++++++++-- crawler/rec/throttle_detector.py | 232 ++++++++++++ crawler/services/listing_fetcher.py | 144 ++++---- crawler/services/query_splitter.py | 13 + .../integration/test_throttle_integration.py | 311 ++++++++++++++++ crawler/tests/unit/test_throttle_detection.py | 334 ++++++++++++++++++ 10 files changed, 1428 insertions(+), 86 deletions(-) create mode 100644 crawler/rec/circuit_breaker.py create mode 100644 crawler/rec/exceptions.py create mode 100644 crawler/rec/throttle_detector.py create mode 100644 crawler/tests/integration/test_throttle_integration.py create mode 100644 crawler/tests/unit/test_throttle_detection.py diff --git a/crawler/.env.sample b/crawler/.env.sample index aa1c93d..b709cde 100644 --- a/crawler/.env.sample +++ b/crawler/.env.sample @@ -16,6 +16,12 @@ RIGHTMOVE_MIN_PRICE_BAND=100 # Minimum price band width (won't split below RIGHTMOVE_MAX_PAGES=60 # Max pages per subquery (60 * 25 = 1500 max results) RIGHTMOVE_PROXY_URL= # Optional SOCKS proxy URL (e.g., socks5://localhost:9050 for Tor) +# Throttling detection and circuit breaker +RIGHTMOVE_SLOW_RESPONSE_THRESHOLD=10.0 # Response time threshold in seconds +RIGHTMOVE_ENABLE_CIRCUIT_BREAKER=true # Enable circuit breaker protection +RIGHTMOVE_CIRCUIT_BREAKER_FAILURES=5 # Consecutive failures to open circuit +RIGHTMOVE_CIRCUIT_BREAKER_TIMEOUT=60.0 # Seconds to wait before recovery attempt + # Periodic scraping schedules (JSON array) # Each schedule has: name, enabled, hour, minute, day_of_week, listing_type, min/max_bedrooms, min/max_price, district_names, furnish_types # Cron fields: minute (0-59), hour (0-23), day_of_week (0-6, 0=Sunday) diff --git a/crawler/config/scraper_config.py b/crawler/config/scraper_config.py index e84c1d5..860d343 100644 --- a/crawler/config/scraper_config.py +++ b/crawler/config/scraper_config.py @@ -18,6 +18,10 @@ class ScraperConfig: min_price_band: Minimum width of a price band (won't split below this). max_pages_per_query: Maximum pages to fetch per subquery (60 * 25 = 1500). proxy_url: Optional SOCKS proxy URL (e.g., socks5://localhost:9050 for Tor). + slow_response_threshold: Response time threshold in seconds for throttle detection. + enable_circuit_breaker: Whether to enable circuit breaker protection. + circuit_breaker_failure_threshold: Number of consecutive failures to open circuit. + circuit_breaker_recovery_timeout: Seconds to wait before testing recovery. """ max_concurrent_requests: int = 5 @@ -27,6 +31,10 @@ class ScraperConfig: min_price_band: int = 100 # Minimum band width in currency units max_pages_per_query: int = 60 # 60 * 25 = 1500 results max proxy_url: str | None = None + slow_response_threshold: float = 10.0 # seconds + enable_circuit_breaker: bool = True + circuit_breaker_failure_threshold: int = 5 + circuit_breaker_recovery_timeout: float = 60.0 @classmethod def from_env(cls) -> Self: @@ -40,6 +48,10 @@ class ScraperConfig: RIGHTMOVE_MIN_PRICE_BAND: Minimum price band width (default: 100) RIGHTMOVE_MAX_PAGES: Max pages per query (default: 60) RIGHTMOVE_PROXY_URL: SOCKS proxy URL (default: None) + RIGHTMOVE_SLOW_RESPONSE_THRESHOLD: Slow response threshold in seconds (default: 10.0) + RIGHTMOVE_ENABLE_CIRCUIT_BREAKER: Enable circuit breaker (default: True) + RIGHTMOVE_CIRCUIT_BREAKER_FAILURES: Failures to open circuit (default: 5) + RIGHTMOVE_CIRCUIT_BREAKER_TIMEOUT: Recovery timeout in seconds (default: 60.0) Returns: ScraperConfig instance with values from environment or defaults. @@ -62,4 +74,16 @@ class ScraperConfig: os.environ.get("RIGHTMOVE_MAX_PAGES", "60") ), proxy_url=os.environ.get("RIGHTMOVE_PROXY_URL") or None, + slow_response_threshold=float( + os.environ.get("RIGHTMOVE_SLOW_RESPONSE_THRESHOLD", "10.0") + ), + enable_circuit_breaker=os.environ.get( + "RIGHTMOVE_ENABLE_CIRCUIT_BREAKER", "true" + ).lower() in ("true", "1", "yes"), + circuit_breaker_failure_threshold=int( + os.environ.get("RIGHTMOVE_CIRCUIT_BREAKER_FAILURES", "5") + ), + circuit_breaker_recovery_timeout=float( + os.environ.get("RIGHTMOVE_CIRCUIT_BREAKER_TIMEOUT", "60.0") + ), ) diff --git a/crawler/rec/circuit_breaker.py b/crawler/rec/circuit_breaker.py new file mode 100644 index 0000000..27bf12f --- /dev/null +++ b/crawler/rec/circuit_breaker.py @@ -0,0 +1,137 @@ +"""Circuit breaker pattern for protecting against cascading failures.""" +from __future__ import annotations + +import enum +import logging +import time +from dataclasses import dataclass + +from rec.exceptions import CircuitBreakerOpenError + +logger = logging.getLogger("uvicorn.error") + + +class CircuitState(enum.Enum): + """Circuit breaker states.""" + + CLOSED = "closed" # Normal operation + OPEN = "open" # Too many failures, blocking requests + HALF_OPEN = "half_open" # Testing if service recovered + + +@dataclass +class CircuitBreaker: + """Circuit breaker for protecting against cascading failures. + + Implements the circuit breaker pattern: + - CLOSED: Requests pass through normally, failures are counted + - OPEN: After N consecutive failures, circuit opens and blocks all requests + - HALF_OPEN: After recovery timeout, allow one request to test if service recovered + + Attributes: + failure_threshold: Number of consecutive failures before opening. + recovery_timeout: Seconds to wait before attempting half-open state. + state: Current circuit state. + failure_count: Count of consecutive failures. + last_failure_time: Timestamp of last failure. + last_state_change: Timestamp of last state change. + """ + + failure_threshold: int + recovery_timeout: float + state: CircuitState = CircuitState.CLOSED + failure_count: int = 0 + last_failure_time: float = 0.0 + last_state_change: float = 0.0 + + def __post_init__(self) -> None: + """Initialize state change timestamp.""" + self.last_state_change = time.time() + + def call(self) -> None: + """Check if a request should be allowed. + + Raises: + CircuitBreakerOpenError: If circuit is open and blocking requests. + """ + current_time = time.time() + + if self.state == CircuitState.OPEN: + # Check if we should transition to half-open + if current_time - self.last_failure_time >= self.recovery_timeout: + self._transition_to_half_open() + else: + raise CircuitBreakerOpenError( + f"Circuit breaker is open. " + f"Waiting {self.recovery_timeout - (current_time - self.last_failure_time):.1f}s " + f"before retry." + ) + + # Allow request to proceed (CLOSED or HALF_OPEN) + + def record_success(self) -> None: + """Record a successful request.""" + if self.state == CircuitState.HALF_OPEN: + # Service has recovered, close the circuit + self._transition_to_closed() + + # Reset failure count on success + self.failure_count = 0 + + def record_failure(self) -> None: + """Record a failed request.""" + self.failure_count += 1 + self.last_failure_time = time.time() + + if self.state == CircuitState.HALF_OPEN: + # Test request failed, reopen circuit + self._transition_to_open() + elif self.state == CircuitState.CLOSED: + # Check if we should open the circuit + if self.failure_count >= self.failure_threshold: + self._transition_to_open() + + def _transition_to_open(self) -> None: + """Transition to OPEN state.""" + self.state = CircuitState.OPEN + self.last_state_change = time.time() + logger.warning( + f"Circuit breaker OPENED after {self.failure_count} consecutive failures. " + f"Will retry in {self.recovery_timeout}s" + ) + + def _transition_to_half_open(self) -> None: + """Transition to HALF_OPEN state.""" + self.state = CircuitState.HALF_OPEN + self.last_state_change = time.time() + logger.info("Circuit breaker entering HALF_OPEN state, testing service recovery") + + def _transition_to_closed(self) -> None: + """Transition to CLOSED state.""" + self.state = CircuitState.CLOSED + self.last_state_change = time.time() + self.failure_count = 0 + logger.info("Circuit breaker CLOSED, service recovered") + + def reset(self) -> None: + """Manually reset the circuit breaker to CLOSED state.""" + self.state = CircuitState.CLOSED + self.failure_count = 0 + self.last_failure_time = 0.0 + self.last_state_change = time.time() + logger.info("Circuit breaker manually reset to CLOSED state") + + @property + def is_open(self) -> bool: + """Check if circuit is currently open.""" + return self.state == CircuitState.OPEN + + @property + def is_closed(self) -> bool: + """Check if circuit is currently closed.""" + return self.state == CircuitState.CLOSED + + @property + def is_half_open(self) -> bool: + """Check if circuit is currently half-open.""" + return self.state == CircuitState.HALF_OPEN diff --git a/crawler/rec/exceptions.py b/crawler/rec/exceptions.py new file mode 100644 index 0000000..9d24e94 --- /dev/null +++ b/crawler/rec/exceptions.py @@ -0,0 +1,74 @@ +"""Custom exceptions for Rightmove API errors.""" + + +class RightmoveAPIError(Exception): + """Base exception for all Rightmove API errors.""" + + pass + + +class ThrottlingError(RightmoveAPIError): + """Base exception for throttling-related errors. + + Indicates that Rightmove is limiting our requests and we should back off. + """ + + pass + + +class RateLimitError(ThrottlingError): + """HTTP 429 - Too Many Requests. + + Rightmove is explicitly rate limiting our requests. + """ + + pass + + +class ServiceUnavailableError(ThrottlingError): + """HTTP 503 - Service Unavailable. + + Rightmove's service is temporarily unavailable, possibly due to overload. + """ + + pass + + +class IPBlockedError(ThrottlingError): + """HTTP 403 - Forbidden (IP blocked). + + Our IP may be blocked or blacklisted by Rightmove. + """ + + pass + + +class SlowResponseError(ThrottlingError): + """Response time exceeded threshold. + + API is responding very slowly, indicating potential throttling or overload. + """ + + pass + + +class UnexpectedEmptyResponseError(RightmoveAPIError): + """Empty response received when data was expected.""" + + pass + + +class InvalidResponseError(RightmoveAPIError): + """Response contains error messages or invalid data.""" + + pass + + +class CircuitBreakerOpenError(RightmoveAPIError): + """Circuit breaker is open, requests are being blocked. + + The circuit breaker has detected too many failures and is preventing + further requests to allow the service to recover. + """ + + pass diff --git a/crawler/rec/query.py b/crawler/rec/query.py index a2526a6..4de8a1a 100644 --- a/crawler/rec/query.py +++ b/crawler/rec/query.py @@ -1,4 +1,6 @@ import enum +import logging +import time from typing import Any from contextlib import asynccontextmanager from collections.abc import AsyncIterator @@ -6,9 +8,26 @@ from collections.abc import AsyncIterator import aiohttp from models.listing import FurnishType, ListingType from rec import districts -from tenacity import retry, stop_after_attempt, wait_random +from rec.exceptions import ( + CircuitBreakerOpenError, + ThrottlingError, +) +from rec.throttle_detector import get_throttle_metrics, validate_response +from rec.circuit_breaker import CircuitBreaker +from tenacity import ( + retry, + retry_if_exception_type, + stop_after_attempt, + wait_exponential, + wait_random, +) from config.scraper_config import ScraperConfig +logger = logging.getLogger("uvicorn.error") + +# Global circuit breaker instance +_circuit_breaker: CircuitBreaker | None = None + DEFAULT_HEADERS = { "Host": "api.rightmove.co.uk", @@ -65,20 +84,81 @@ async def create_session( await session.close() -@retry(wait=wait_random(min=1, max=2), stop=stop_after_attempt(3)) +def get_circuit_breaker(config: ScraperConfig | None = None) -> CircuitBreaker | None: + """Get the global circuit breaker instance. + + Args: + config: Configuration for initializing the circuit breaker. + + Returns: + CircuitBreaker instance if enabled, None otherwise. + """ + global _circuit_breaker + if config is None: + config = ScraperConfig.from_env() + + if not config.enable_circuit_breaker: + return None + + if _circuit_breaker is None: + _circuit_breaker = CircuitBreaker( + failure_threshold=config.circuit_breaker_failure_threshold, + recovery_timeout=config.circuit_breaker_recovery_timeout, + ) + return _circuit_breaker + + +def reset_circuit_breaker() -> None: + """Reset the global circuit breaker.""" + global _circuit_breaker + if _circuit_breaker is not None: + _circuit_breaker.reset() + + +def check_circuit_breaker(config: ScraperConfig | None = None) -> None: + """Check if the circuit breaker allows requests. + + Args: + config: Configuration for the circuit breaker. + + Raises: + CircuitBreakerOpenError: If the circuit is open. + """ + cb = get_circuit_breaker(config) + if cb is not None: + cb.call() + + +@retry( + retry=retry_if_exception_type(ThrottlingError), + wait=wait_exponential(multiplier=2, min=2, max=120), + stop=stop_after_attempt(5), +) async def detail_query( detail_id: int, session: aiohttp.ClientSession | None = None, + config: ScraperConfig | None = None, ) -> dict[str, Any]: """Fetch detailed property information. Args: detail_id: The property identifier. session: Optional aiohttp session. Creates new one if not provided. + config: Scraper configuration. Loads from environment if not provided. Returns: Property details as a dictionary. + + Raises: + CircuitBreakerOpenError: If the circuit breaker is open. + ThrottlingError: If the request is throttled. """ + if config is None: + config = ScraperConfig.from_env() + + check_circuit_breaker(config) + cb = get_circuit_breaker(config) + params = { "apiApplication": "ANDROID", "appVersion": "3.70.0", @@ -86,13 +166,38 @@ async def detail_query( url = f"https://api.rightmove.co.uk/api/property/{detail_id}" async def do_request(s: aiohttp.ClientSession) -> dict[str, Any]: - async with s.get(url, params=params, headers=DEFAULT_HEADERS) as response: - if response.status != 200: - raise Exception( - f"""id: {detail_id}. Status Code: {response.status}.""" - f"""Failed due to: {await response.text()}""" + start_time = time.time() + try: + async with s.get(url, params=params, headers=DEFAULT_HEADERS) as response: + response_time = time.time() - start_time + body = await response.json() if response.status == 200 else None + + # Validate response for throttling + validate_response( + response, + response_time, + body, + config.slow_response_threshold, + expect_data=True, ) - return await response.json() + + if response.status != 200: + raise Exception( + f"""id: {detail_id}. Status Code: {response.status}.""" + f"""Failed due to: {await response.text()}""" + ) + + if cb is not None: + cb.record_success() + return body # type: ignore + except ThrottlingError: + if cb is not None: + cb.record_failure() + raise + except Exception as e: + if cb is not None: + cb.record_failure() + raise e if session: return await do_request(session) @@ -101,7 +206,11 @@ async def detail_query( return await do_request(new_session) -@retry(wait=wait_random(min=1, max=60), stop=stop_after_attempt(3)) +@retry( + retry=retry_if_exception_type(ThrottlingError), + wait=wait_exponential(multiplier=2, min=2, max=120), + stop=stop_after_attempt(5), +) async def listing_query( *, page: int, @@ -118,6 +227,7 @@ async def listing_query( page_size: int = 25, furnish_types: list[FurnishType] = [], session: aiohttp.ClientSession | None = None, + config: ScraperConfig | None = None, ) -> dict[str, Any]: """Execute a listing search query. @@ -136,10 +246,21 @@ async def listing_query( page_size: Number of results per page (default 25). furnish_types: List of furnish types to filter (RENT only). session: Optional aiohttp session. Creates new one if not provided. + config: Scraper configuration. Loads from environment if not provided. Returns: API response as a dictionary. + + Raises: + CircuitBreakerOpenError: If the circuit breaker is open. + ThrottlingError: If the request is throttled. """ + if config is None: + config = ScraperConfig.from_env() + + check_circuit_breaker(config) + cb = get_circuit_breaker(config) + params: dict[str, str] = { "locationIdentifier": districts.get_districts()[district], "channel": str(channel).upper(), @@ -185,14 +306,39 @@ async def listing_query( } async def do_request(s: aiohttp.ClientSession) -> dict[str, Any]: - async with s.get( - "https://api.rightmove.co.uk/api/property-listing", - params=params, - headers=request_headers, - ) as response: - if response.status != 200: - raise Exception(f"Failed due to: {await response.text()}") - return await response.json() + start_time = time.time() + try: + async with s.get( + "https://api.rightmove.co.uk/api/property-listing", + params=params, + headers=request_headers, + ) as response: + response_time = time.time() - start_time + body = await response.json() if response.status == 200 else None + + # Validate response for throttling + validate_response( + response, + response_time, + body, + config.slow_response_threshold, + expect_data=(page == 1), # Only expect data on first page + ) + + if response.status != 200: + raise Exception(f"Failed due to: {await response.text()}") + + if cb is not None: + cb.record_success() + return body # type: ignore + except ThrottlingError: + if cb is not None: + cb.record_failure() + raise + except Exception as e: + if cb is not None: + cb.record_failure() + raise e if session: return await do_request(session) @@ -201,7 +347,11 @@ async def listing_query( return await do_request(new_session) -@retry(wait=wait_random(min=1, max=10), stop=stop_after_attempt(3)) +@retry( + retry=retry_if_exception_type(ThrottlingError), + wait=wait_exponential(multiplier=2, min=2, max=60), + stop=stop_after_attempt(5), +) async def probe_query( *, session: aiohttp.ClientSession, @@ -214,6 +364,7 @@ async def probe_query( district: str, max_days_since_added: int = 30, furnish_types: list[FurnishType] = [], + config: ScraperConfig | None = None, ) -> dict[str, Any]: """Probe the API to get result count without fetching full results. @@ -230,10 +381,21 @@ async def probe_query( district: District identifier string. max_days_since_added: Maximum days since listing was added (BUY only). furnish_types: List of furnish types to filter (RENT only). + config: Scraper configuration. Loads from environment if not provided. Returns: API response containing totalAvailableResults. + + Raises: + CircuitBreakerOpenError: If the circuit breaker is open. + ThrottlingError: If the request is throttled. """ + if config is None: + config = ScraperConfig.from_env() + + check_circuit_breaker(config) + cb = get_circuit_breaker(config) + params: dict[str, str] = { "locationIdentifier": districts.get_districts()[district], "channel": str(channel).upper(), @@ -271,11 +433,36 @@ async def probe_query( "Connection": "keep-alive", } - async with session.get( - "https://api.rightmove.co.uk/api/property-listing", - params=params, - headers=request_headers, - ) as response: - if response.status != 200: - raise Exception(f"Probe failed: {await response.text()}") - return await response.json() + start_time = time.time() + try: + async with session.get( + "https://api.rightmove.co.uk/api/property-listing", + params=params, + headers=request_headers, + ) as response: + response_time = time.time() - start_time + body = await response.json() if response.status == 200 else None + + # Validate response for throttling + validate_response( + response, + response_time, + body, + config.slow_response_threshold, + expect_data=False, # Probe doesn't need data, just count + ) + + if response.status != 200: + raise Exception(f"Probe failed: {await response.text()}") + + if cb is not None: + cb.record_success() + return body # type: ignore + except ThrottlingError: + if cb is not None: + cb.record_failure() + raise + except Exception as e: + if cb is not None: + cb.record_failure() + raise e diff --git a/crawler/rec/throttle_detector.py b/crawler/rec/throttle_detector.py new file mode 100644 index 0000000..dc999ed --- /dev/null +++ b/crawler/rec/throttle_detector.py @@ -0,0 +1,232 @@ +"""Throttling detection and metrics for Rightmove API.""" +from __future__ import annotations + +import time +from dataclasses import dataclass, field +from typing import Any + +import aiohttp + +from rec.exceptions import ( + InvalidResponseError, + IPBlockedError, + RateLimitError, + ServiceUnavailableError, + SlowResponseError, + UnexpectedEmptyResponseError, +) + + +@dataclass +class ThrottleMetrics: + """Tracks throttling events and metrics. + + Attributes: + rate_limit_count: Number of HTTP 429 errors. + service_unavailable_count: Number of HTTP 503 errors. + ip_blocked_count: Number of HTTP 403 errors. + slow_response_count: Number of slow responses. + empty_response_count: Number of unexpected empty responses. + invalid_response_count: Number of invalid/error responses. + total_requests: Total number of requests made. + total_response_time: Cumulative response time in seconds. + """ + + rate_limit_count: int = 0 + service_unavailable_count: int = 0 + ip_blocked_count: int = 0 + slow_response_count: int = 0 + empty_response_count: int = 0 + invalid_response_count: int = 0 + total_requests: int = 0 + total_response_time: float = 0.0 + _start_time: float = field(default_factory=time.time) + + def record_rate_limit(self) -> None: + """Record a rate limit error (HTTP 429).""" + self.rate_limit_count += 1 + + def record_service_unavailable(self) -> None: + """Record a service unavailable error (HTTP 503).""" + self.service_unavailable_count += 1 + + def record_ip_blocked(self) -> None: + """Record an IP blocked error (HTTP 403).""" + self.ip_blocked_count += 1 + + def record_slow_response(self, response_time: float) -> None: + """Record a slow response. + + Args: + response_time: Response time in seconds. + """ + self.slow_response_count += 1 + self.total_response_time += response_time + self.total_requests += 1 + + def record_empty_response(self) -> None: + """Record an unexpected empty response.""" + self.empty_response_count += 1 + + def record_invalid_response(self) -> None: + """Record an invalid or error response.""" + self.invalid_response_count += 1 + + def record_request(self, response_time: float) -> None: + """Record a successful request. + + Args: + response_time: Response time in seconds. + """ + self.total_requests += 1 + self.total_response_time += response_time + + @property + def average_response_time(self) -> float: + """Calculate average response time in seconds.""" + if self.total_requests == 0: + return 0.0 + return self.total_response_time / self.total_requests + + @property + def total_throttling_events(self) -> int: + """Total number of throttling events.""" + return ( + self.rate_limit_count + + self.service_unavailable_count + + self.ip_blocked_count + + self.slow_response_count + ) + + @property + def throttle_rate(self) -> float: + """Percentage of requests that were throttled.""" + if self.total_requests == 0: + return 0.0 + return (self.total_throttling_events / self.total_requests) * 100 + + @property + def elapsed_time(self) -> float: + """Time elapsed since metrics started tracking.""" + return time.time() - self._start_time + + def summary(self) -> str: + """Generate a summary of throttling metrics.""" + return ( + f"Throttle Metrics Summary:\n" + f" Total Requests: {self.total_requests}\n" + f" Total Throttling Events: {self.total_throttling_events}\n" + f" Throttle Rate: {self.throttle_rate:.2f}%\n" + f" Rate Limit (429): {self.rate_limit_count}\n" + f" Service Unavailable (503): {self.service_unavailable_count}\n" + f" IP Blocked (403): {self.ip_blocked_count}\n" + f" Slow Responses: {self.slow_response_count}\n" + f" Empty Responses: {self.empty_response_count}\n" + f" Invalid Responses: {self.invalid_response_count}\n" + f" Average Response Time: {self.average_response_time:.2f}s\n" + f" Elapsed Time: {self.elapsed_time:.2f}s" + ) + + +# Global metrics instance +_global_metrics: ThrottleMetrics | None = None + + +def get_throttle_metrics() -> ThrottleMetrics: + """Get the global throttle metrics instance. + + Returns: + Global ThrottleMetrics instance. + """ + global _global_metrics + if _global_metrics is None: + _global_metrics = ThrottleMetrics() + return _global_metrics + + +def reset_throttle_metrics() -> None: + """Reset the global throttle metrics.""" + global _global_metrics + _global_metrics = ThrottleMetrics() + + +def validate_response( + response: aiohttp.ClientResponse, + response_time: float, + response_body: dict[str, Any] | None, + slow_response_threshold: float, + expect_data: bool = True, +) -> None: + """Validate an API response and raise appropriate exceptions for throttling. + + Args: + response: The aiohttp response object. + response_time: Time taken for the request in seconds. + response_body: Parsed JSON response body (if available). + slow_response_threshold: Threshold in seconds for slow responses. + expect_data: Whether we expect data in the response. + + Raises: + RateLimitError: If HTTP 429 is returned. + ServiceUnavailableError: If HTTP 503 is returned. + IPBlockedError: If HTTP 403 is returned. + SlowResponseError: If response time exceeds threshold. + UnexpectedEmptyResponseError: If response is empty when data is expected. + InvalidResponseError: If response contains error messages. + """ + metrics = get_throttle_metrics() + + # Check HTTP status codes + if response.status == 429: + metrics.record_rate_limit() + raise RateLimitError( + f"Rate limit exceeded (HTTP 429). " + f"Response time: {response_time:.2f}s" + ) + + if response.status == 503: + metrics.record_service_unavailable() + raise ServiceUnavailableError( + f"Service unavailable (HTTP 503). " + f"Response time: {response_time:.2f}s" + ) + + if response.status == 403: + metrics.record_ip_blocked() + raise IPBlockedError( + f"Access forbidden, possible IP block (HTTP 403). " + f"Response time: {response_time:.2f}s" + ) + + # Check response time + if response_time > slow_response_threshold: + metrics.record_slow_response(response_time) + raise SlowResponseError( + f"Slow response detected: {response_time:.2f}s " + f"(threshold: {slow_response_threshold}s)" + ) + + # Check response body if available + if response_body is not None: + # Check for error messages + if "error" in response_body or "GENERIC_ERROR" in str(response_body): + metrics.record_invalid_response() + raise InvalidResponseError( + f"Error in response body: {response_body}" + ) + + # Check for unexpected empty responses + if expect_data: + properties = response_body.get("properties", []) + total_results = response_body.get("totalAvailableResults", 0) + + # If we expect data but got none (and total shows there should be some) + if total_results > 0 and len(properties) == 0: + metrics.record_empty_response() + raise UnexpectedEmptyResponseError( + f"Expected data but got empty response. " + f"Total available: {total_results}" + ) + + # Record successful request + metrics.record_request(response_time) diff --git a/crawler/services/listing_fetcher.py b/crawler/services/listing_fetcher.py index a94f3e0..2674c9d 100644 --- a/crawler/services/listing_fetcher.py +++ b/crawler/services/listing_fetcher.py @@ -6,6 +6,8 @@ from typing import Any from config.scraper_config import ScraperConfig from listing_processor import ListingProcessor from rec.query import create_session, listing_query +from rec.exceptions import CircuitBreakerOpenError, ThrottlingError +from rec.throttle_detector import get_throttle_metrics, reset_throttle_metrics from models.listing import QueryParameters from repositories import ListingRepository from tqdm.asyncio import tqdm @@ -40,76 +42,98 @@ async def dump_listings( config = ScraperConfig.from_env() splitter = QuerySplitter(config) - async with create_session(config) as session: - # Phase 1 & 2: Split and probe queries - logger.info("Splitting query and probing result counts...") - subqueries = await splitter.split(parameters, session) + # Reset throttle metrics at start + reset_throttle_metrics() - total_estimated = splitter.calculate_total_estimated_results(subqueries) - logger.info( - f"Split into {len(subqueries)} subqueries, " - f"estimated {total_estimated} total results" - ) + try: + async with create_session(config) as session: + # Phase 1 & 2: Split and probe queries + logger.info("Splitting query and probing result counts...") + subqueries = await splitter.split(parameters, session) - # Phase 3: Fetch all pages for each subquery - semaphore = asyncio.Semaphore(config.max_concurrent_requests) - - async def fetch_subquery(sq: SubQuery) -> list[dict[str, Any]]: - """Fetch all pages for a single subquery.""" - results: list[dict[str, Any]] = [] - - estimated = sq.estimated_results or 0 - if estimated == 0: - return results - - page_size = parameters.page_size - max_pages = min( - config.max_pages_per_query, - (estimated // page_size) + 1, + total_estimated = splitter.calculate_total_estimated_results(subqueries) + logger.info( + f"Split into {len(subqueries)} subqueries, " + f"estimated {total_estimated} total results" ) - for page_id in range(1, max_pages + 1): - async with semaphore: - await asyncio.sleep(config.request_delay_ms / 1000) - try: - result = await listing_query( - page=page_id, - channel=parameters.listing_type, - min_bedrooms=sq.min_bedrooms, - max_bedrooms=sq.max_bedrooms, - radius=parameters.radius, - min_price=sq.min_price, - max_price=sq.max_price, - district=sq.district, - page_size=page_size, - max_days_since_added=parameters.max_days_since_added, - furnish_types=parameters.furnish_types or [], - session=session, - ) - results.append(result) + # Phase 3: Fetch all pages for each subquery + semaphore = asyncio.Semaphore(config.max_concurrent_requests) - properties = result.get("properties", []) - if len(properties) < page_size: + async def fetch_subquery(sq: SubQuery) -> list[dict[str, Any]]: + """Fetch all pages for a single subquery.""" + results: list[dict[str, Any]] = [] + + estimated = sq.estimated_results or 0 + if estimated == 0: + return results + + page_size = parameters.page_size + max_pages = min( + config.max_pages_per_query, + (estimated // page_size) + 1, + ) + + for page_id in range(1, max_pages + 1): + async with semaphore: + await asyncio.sleep(config.request_delay_ms / 1000) + try: + result = await listing_query( + page=page_id, + channel=parameters.listing_type, + min_bedrooms=sq.min_bedrooms, + max_bedrooms=sq.max_bedrooms, + radius=parameters.radius, + min_price=sq.min_price, + max_price=sq.max_price, + district=sq.district, + page_size=page_size, + max_days_since_added=parameters.max_days_since_added, + furnish_types=parameters.furnish_types or [], + session=session, + config=config, + ) + results.append(result) + + properties = result.get("properties", []) + if len(properties) < page_size: + break + + except CircuitBreakerOpenError as e: + logger.error(f"Circuit breaker open: {e}") break - - except Exception as e: - if "GENERIC_ERROR" in str(e): - logger.debug( - f"Max page for {sq.district}: {page_id - 1}" + except ThrottlingError as e: + logger.warning( + f"Throttling error on page {page_id} for {sq.district}: {e}" + ) + break + except Exception as e: + if "GENERIC_ERROR" in str(e): + logger.debug( + f"Max page for {sq.district}: {page_id - 1}" + ) + break + logger.warning( + f"Error fetching page {page_id} for {sq.district}: {e}" ) break - logger.warning( - f"Error fetching page {page_id} for {sq.district}: {e}" - ) - break - return results + return results - # Fetch all subqueries with progress bar - all_results = await tqdm.gather( - *[fetch_subquery(sq) for sq in subqueries], - desc="Fetching listings", - ) + # Fetch all subqueries with progress bar + all_results = await tqdm.gather( + *[fetch_subquery(sq) for sq in subqueries], + desc="Fetching listings", + ) + except CircuitBreakerOpenError as e: + logger.error(f"Circuit breaker prevented listing fetch: {e}") + logger.info(get_throttle_metrics().summary()) + return [] + finally: + # Log throttle metrics at end + metrics = get_throttle_metrics() + if metrics.total_requests > 0: + logger.info("\n" + metrics.summary()) # Extract listing identifiers from results listing_ids: list[int] = [] diff --git a/crawler/services/query_splitter.py b/crawler/services/query_splitter.py index 0609634..b183ac2 100644 --- a/crawler/services/query_splitter.py +++ b/crawler/services/query_splitter.py @@ -16,6 +16,7 @@ import aiohttp from config.scraper_config import ScraperConfig from models.listing import ListingType, QueryParameters from rec.districts import get_districts +from rec.exceptions import CircuitBreakerOpenError, ThrottlingError logger = logging.getLogger("uvicorn.error") @@ -113,6 +114,9 @@ class QuerySplitter: Returns: Total available results for this subquery. + + Raises: + CircuitBreakerOpenError: If the circuit breaker is open. """ from rec.query import probe_query @@ -128,8 +132,17 @@ class QuerySplitter: district=subquery.district, max_days_since_added=parameters.max_days_since_added, furnish_types=parameters.furnish_types or [], + config=self.config, ) return result.get("totalAvailableResults", 0) + except CircuitBreakerOpenError: + logger.error("Circuit breaker is open, stopping probe operations") + raise + except ThrottlingError as e: + logger.warning( + f"Throttling detected during probe for {subquery.district}: {e}" + ) + return 0 except Exception as e: logger.warning(f"Failed to probe subquery {subquery}: {e}") return 0 diff --git a/crawler/tests/integration/test_throttle_integration.py b/crawler/tests/integration/test_throttle_integration.py new file mode 100644 index 0000000..39be15b --- /dev/null +++ b/crawler/tests/integration/test_throttle_integration.py @@ -0,0 +1,311 @@ +"""Integration tests for throttle detection and circuit breaker.""" +import asyncio +import pytest +from unittest.mock import AsyncMock, MagicMock, patch +from aiohttp import ClientResponse + +from config.scraper_config import ScraperConfig +from rec.exceptions import ( + CircuitBreakerOpenError, + RateLimitError, + ServiceUnavailableError, + ThrottlingError, +) +from rec.query import ( + detail_query, + listing_query, + probe_query, + get_circuit_breaker, + reset_circuit_breaker, +) +from rec.throttle_detector import reset_throttle_metrics, get_throttle_metrics +from rec.circuit_breaker import CircuitBreaker, CircuitState +from models.listing import ListingType + + +@pytest.fixture +def config() -> ScraperConfig: + """Create a test configuration.""" + return ScraperConfig( + max_concurrent_requests=5, + request_delay_ms=10, + slow_response_threshold=2.0, + enable_circuit_breaker=True, + circuit_breaker_failure_threshold=3, + circuit_breaker_recovery_timeout=0.5, + ) + + +@pytest.fixture(autouse=True) +def reset_globals() -> None: + """Reset global state before each test.""" + reset_throttle_metrics() + reset_circuit_breaker() + + +class MockResponse: + """Mock aiohttp response.""" + + def __init__( + self, + status: int = 200, + json_data: dict | None = None, + text: str = "", + ): + self.status = status + self._json_data = json_data or {} + self._text = text + + async def json(self) -> dict: + return self._json_data + + async def text(self) -> str: + return self._text + + async def __aenter__(self) -> "MockResponse": + return self + + async def __aexit__(self, *args: object) -> None: + pass + + +class TestThrottlingRetryBehavior: + """Test retry behavior for throttling errors.""" + + @pytest.mark.asyncio + async def test_rate_limit_triggers_retry(self, config: ScraperConfig) -> None: + """Test that 429 responses trigger retry with backoff.""" + call_count = 0 + + async def mock_get(*args: object, **kwargs: object) -> MockResponse: + nonlocal call_count + call_count += 1 + if call_count < 3: + return MockResponse(status=429) + return MockResponse( + status=200, + json_data={"totalAvailableResults": 10, "properties": []}, + ) + + mock_session = MagicMock() + mock_session.get = mock_get + + # Mock district lookup + with patch("rec.query.districts.get_districts", return_value={"Test": "LOC1"}): + # The retry decorator will catch RateLimitError and retry + # We need to patch the tenacity wait to speed up the test + with patch("tenacity.wait_exponential.__call__", return_value=0): + result = await probe_query( + session=mock_session, + channel=ListingType.RENT, + min_bedrooms=1, + max_bedrooms=2, + radius=1.0, + min_price=1000, + max_price=2000, + district="Test", + config=config, + ) + + assert result["totalAvailableResults"] == 10 + assert call_count == 3 + + @pytest.mark.asyncio + async def test_service_unavailable_triggers_retry( + self, config: ScraperConfig + ) -> None: + """Test that 503 responses trigger retry.""" + call_count = 0 + + async def mock_get(*args: object, **kwargs: object) -> MockResponse: + nonlocal call_count + call_count += 1 + if call_count < 2: + return MockResponse(status=503) + return MockResponse( + status=200, + json_data={"totalAvailableResults": 5, "properties": []}, + ) + + mock_session = MagicMock() + mock_session.get = mock_get + + with patch("rec.query.districts.get_districts", return_value={"Test": "LOC1"}): + with patch("tenacity.wait_exponential.__call__", return_value=0): + result = await probe_query( + session=mock_session, + channel=ListingType.RENT, + min_bedrooms=1, + max_bedrooms=2, + radius=1.0, + min_price=1000, + max_price=2000, + district="Test", + config=config, + ) + + assert call_count == 2 + + +class TestCircuitBreakerIntegration: + """Test circuit breaker integration with queries.""" + + @pytest.mark.asyncio + async def test_circuit_breaker_opens_after_failures( + self, config: ScraperConfig + ) -> None: + """Test that circuit breaker opens after consecutive failures.""" + call_count = 0 + + async def mock_get(*args: object, **kwargs: object) -> MockResponse: + nonlocal call_count + call_count += 1 + return MockResponse(status=429) + + mock_session = MagicMock() + mock_session.get = mock_get + + with patch("rec.query.districts.get_districts", return_value={"Test": "LOC1"}): + # After enough failures, circuit should open + with pytest.raises((RateLimitError, CircuitBreakerOpenError)): + with patch("tenacity.wait_exponential.__call__", return_value=0): + await probe_query( + session=mock_session, + channel=ListingType.RENT, + min_bedrooms=1, + max_bedrooms=2, + radius=1.0, + min_price=1000, + max_price=2000, + district="Test", + config=config, + ) + + # Check circuit breaker state + cb = get_circuit_breaker(config) + assert cb is not None + # After many failures, the circuit should be open + assert cb.failure_count >= config.circuit_breaker_failure_threshold + + @pytest.mark.asyncio + async def test_circuit_breaker_blocks_requests_when_open( + self, config: ScraperConfig + ) -> None: + """Test that open circuit breaker blocks requests immediately.""" + # Force open the circuit breaker + cb = get_circuit_breaker(config) + assert cb is not None + for _ in range(config.circuit_breaker_failure_threshold): + cb.record_failure() + + assert cb.is_open + + mock_session = MagicMock() + + with patch("rec.query.districts.get_districts", return_value={"Test": "LOC1"}): + with pytest.raises(CircuitBreakerOpenError): + await probe_query( + session=mock_session, + channel=ListingType.RENT, + min_bedrooms=1, + max_bedrooms=2, + radius=1.0, + min_price=1000, + max_price=2000, + district="Test", + config=config, + ) + + +class TestMetricsTracking: + """Test throttle metrics are properly tracked.""" + + @pytest.mark.asyncio + async def test_metrics_tracked_on_rate_limit(self, config: ScraperConfig) -> None: + """Test that rate limit errors are tracked in metrics.""" + async def mock_get(*args: object, **kwargs: object) -> MockResponse: + return MockResponse(status=429) + + mock_session = MagicMock() + mock_session.get = mock_get + + with patch("rec.query.districts.get_districts", return_value={"Test": "LOC1"}): + with pytest.raises(RateLimitError): + with patch("tenacity.wait_exponential.__call__", return_value=0): + await probe_query( + session=mock_session, + channel=ListingType.RENT, + min_bedrooms=1, + max_bedrooms=2, + radius=1.0, + min_price=1000, + max_price=2000, + district="Test", + config=config, + ) + + metrics = get_throttle_metrics() + assert metrics.rate_limit_count > 0 + + @pytest.mark.asyncio + async def test_metrics_tracked_on_success(self, config: ScraperConfig) -> None: + """Test that successful requests are tracked in metrics.""" + async def mock_get(*args: object, **kwargs: object) -> MockResponse: + return MockResponse( + status=200, + json_data={"totalAvailableResults": 10, "properties": []}, + ) + + mock_session = MagicMock() + mock_session.get = mock_get + + with patch("rec.query.districts.get_districts", return_value={"Test": "LOC1"}): + await probe_query( + session=mock_session, + channel=ListingType.RENT, + min_bedrooms=1, + max_bedrooms=2, + radius=1.0, + min_price=1000, + max_price=2000, + district="Test", + config=config, + ) + + metrics = get_throttle_metrics() + assert metrics.total_requests == 1 + assert metrics.total_throttling_events == 0 + + +class TestConfigIntegration: + """Test configuration integration.""" + + def test_config_from_env_includes_throttle_settings(self) -> None: + """Test that config loads throttle settings from environment.""" + import os + + original_env = os.environ.copy() + try: + os.environ["RIGHTMOVE_SLOW_RESPONSE_THRESHOLD"] = "5.0" + os.environ["RIGHTMOVE_ENABLE_CIRCUIT_BREAKER"] = "false" + os.environ["RIGHTMOVE_CIRCUIT_BREAKER_FAILURES"] = "10" + os.environ["RIGHTMOVE_CIRCUIT_BREAKER_TIMEOUT"] = "120.0" + + config = ScraperConfig.from_env() + + assert config.slow_response_threshold == 5.0 + assert config.enable_circuit_breaker is False + assert config.circuit_breaker_failure_threshold == 10 + assert config.circuit_breaker_recovery_timeout == 120.0 + finally: + os.environ.clear() + os.environ.update(original_env) + + def test_circuit_breaker_disabled_returns_none(self) -> None: + """Test that disabled circuit breaker returns None.""" + config = ScraperConfig( + enable_circuit_breaker=False, + ) + reset_circuit_breaker() + cb = get_circuit_breaker(config) + assert cb is None diff --git a/crawler/tests/unit/test_throttle_detection.py b/crawler/tests/unit/test_throttle_detection.py new file mode 100644 index 0000000..2e786e9 --- /dev/null +++ b/crawler/tests/unit/test_throttle_detection.py @@ -0,0 +1,334 @@ +"""Unit tests for throttle detection and circuit breaker.""" +import pytest +from unittest.mock import MagicMock, AsyncMock +import time + +from rec.exceptions import ( + RightmoveAPIError, + ThrottlingError, + RateLimitError, + ServiceUnavailableError, + IPBlockedError, + SlowResponseError, + UnexpectedEmptyResponseError, + InvalidResponseError, + CircuitBreakerOpenError, +) +from rec.throttle_detector import ( + ThrottleMetrics, + validate_response, + get_throttle_metrics, + reset_throttle_metrics, +) +from rec.circuit_breaker import CircuitBreaker, CircuitState + + +class TestExceptionHierarchy: + """Test custom exception hierarchy.""" + + def test_rightmove_api_error_is_exception(self) -> None: + assert issubclass(RightmoveAPIError, Exception) + + def test_throttling_error_is_rightmove_api_error(self) -> None: + assert issubclass(ThrottlingError, RightmoveAPIError) + + def test_rate_limit_error_is_throttling_error(self) -> None: + assert issubclass(RateLimitError, ThrottlingError) + + def test_service_unavailable_error_is_throttling_error(self) -> None: + assert issubclass(ServiceUnavailableError, ThrottlingError) + + def test_ip_blocked_error_is_throttling_error(self) -> None: + assert issubclass(IPBlockedError, ThrottlingError) + + def test_slow_response_error_is_throttling_error(self) -> None: + assert issubclass(SlowResponseError, ThrottlingError) + + def test_unexpected_empty_response_error_is_rightmove_api_error(self) -> None: + assert issubclass(UnexpectedEmptyResponseError, RightmoveAPIError) + assert not issubclass(UnexpectedEmptyResponseError, ThrottlingError) + + def test_invalid_response_error_is_rightmove_api_error(self) -> None: + assert issubclass(InvalidResponseError, RightmoveAPIError) + assert not issubclass(InvalidResponseError, ThrottlingError) + + def test_circuit_breaker_open_error_is_rightmove_api_error(self) -> None: + assert issubclass(CircuitBreakerOpenError, RightmoveAPIError) + + def test_exception_messages(self) -> None: + error = RateLimitError("Too many requests") + assert str(error) == "Too many requests" + + +class TestThrottleMetrics: + """Test ThrottleMetrics class.""" + + def test_initial_state(self) -> None: + metrics = ThrottleMetrics() + assert metrics.rate_limit_count == 0 + assert metrics.service_unavailable_count == 0 + assert metrics.ip_blocked_count == 0 + assert metrics.slow_response_count == 0 + assert metrics.empty_response_count == 0 + assert metrics.invalid_response_count == 0 + assert metrics.total_requests == 0 + assert metrics.total_response_time == 0.0 + + def test_record_rate_limit(self) -> None: + metrics = ThrottleMetrics() + metrics.record_rate_limit() + assert metrics.rate_limit_count == 1 + metrics.record_rate_limit() + assert metrics.rate_limit_count == 2 + + def test_record_service_unavailable(self) -> None: + metrics = ThrottleMetrics() + metrics.record_service_unavailable() + assert metrics.service_unavailable_count == 1 + + def test_record_ip_blocked(self) -> None: + metrics = ThrottleMetrics() + metrics.record_ip_blocked() + assert metrics.ip_blocked_count == 1 + + def test_record_slow_response(self) -> None: + metrics = ThrottleMetrics() + metrics.record_slow_response(15.0) + assert metrics.slow_response_count == 1 + assert metrics.total_response_time == 15.0 + assert metrics.total_requests == 1 + + def test_record_empty_response(self) -> None: + metrics = ThrottleMetrics() + metrics.record_empty_response() + assert metrics.empty_response_count == 1 + + def test_record_invalid_response(self) -> None: + metrics = ThrottleMetrics() + metrics.record_invalid_response() + assert metrics.invalid_response_count == 1 + + def test_record_request(self) -> None: + metrics = ThrottleMetrics() + metrics.record_request(0.5) + assert metrics.total_requests == 1 + assert metrics.total_response_time == 0.5 + + def test_average_response_time(self) -> None: + metrics = ThrottleMetrics() + metrics.record_request(1.0) + metrics.record_request(2.0) + metrics.record_request(3.0) + assert metrics.average_response_time == 2.0 + + def test_average_response_time_zero_requests(self) -> None: + metrics = ThrottleMetrics() + assert metrics.average_response_time == 0.0 + + def test_total_throttling_events(self) -> None: + metrics = ThrottleMetrics() + metrics.record_rate_limit() + metrics.record_service_unavailable() + metrics.record_ip_blocked() + metrics.record_slow_response(15.0) + assert metrics.total_throttling_events == 4 + + def test_throttle_rate(self) -> None: + metrics = ThrottleMetrics() + metrics.record_request(0.5) # 1 normal request + metrics.record_request(0.5) # 2 normal requests + metrics.record_rate_limit() + metrics.record_request(0.5) # 3 normal requests (rate limit doesn't count as request) + # 1 throttling event, 3 requests = 33.33% + assert metrics.throttle_rate == pytest.approx(33.33, rel=0.01) + + def test_throttle_rate_zero_requests(self) -> None: + metrics = ThrottleMetrics() + assert metrics.throttle_rate == 0.0 + + def test_elapsed_time(self) -> None: + metrics = ThrottleMetrics() + time.sleep(0.1) + assert metrics.elapsed_time >= 0.1 + + def test_summary(self) -> None: + metrics = ThrottleMetrics() + metrics.record_request(1.0) + metrics.record_rate_limit() + summary = metrics.summary() + assert "Total Requests:" in summary + assert "Rate Limit (429):" in summary + assert "1" in summary + + +class TestGlobalMetrics: + """Test global metrics accessor.""" + + def test_get_throttle_metrics_singleton(self) -> None: + reset_throttle_metrics() + m1 = get_throttle_metrics() + m2 = get_throttle_metrics() + assert m1 is m2 + + def test_reset_throttle_metrics(self) -> None: + reset_throttle_metrics() + metrics = get_throttle_metrics() + metrics.record_rate_limit() + assert metrics.rate_limit_count == 1 + reset_throttle_metrics() + new_metrics = get_throttle_metrics() + assert new_metrics.rate_limit_count == 0 + + +class TestValidateResponse: + """Test validate_response function.""" + + def setup_method(self) -> None: + reset_throttle_metrics() + + def create_mock_response(self, status: int) -> MagicMock: + response = MagicMock() + response.status = status + return response + + def test_rate_limit_error(self) -> None: + response = self.create_mock_response(429) + with pytest.raises(RateLimitError): + validate_response(response, 0.5, None, 10.0) + assert get_throttle_metrics().rate_limit_count == 1 + + def test_service_unavailable_error(self) -> None: + response = self.create_mock_response(503) + with pytest.raises(ServiceUnavailableError): + validate_response(response, 0.5, None, 10.0) + assert get_throttle_metrics().service_unavailable_count == 1 + + def test_ip_blocked_error(self) -> None: + response = self.create_mock_response(403) + with pytest.raises(IPBlockedError): + validate_response(response, 0.5, None, 10.0) + assert get_throttle_metrics().ip_blocked_count == 1 + + def test_slow_response_error(self) -> None: + response = self.create_mock_response(200) + body = {"totalAvailableResults": 0, "properties": []} + with pytest.raises(SlowResponseError): + validate_response(response, 15.0, body, 10.0) + assert get_throttle_metrics().slow_response_count == 1 + + def test_slow_response_just_under_threshold(self) -> None: + response = self.create_mock_response(200) + body = {"totalAvailableResults": 0, "properties": []} + # Should not raise + validate_response(response, 9.9, body, 10.0) + assert get_throttle_metrics().slow_response_count == 0 + + def test_error_in_response_body(self) -> None: + response = self.create_mock_response(200) + body = {"error": "Something went wrong"} + with pytest.raises(InvalidResponseError): + validate_response(response, 0.5, body, 10.0) + assert get_throttle_metrics().invalid_response_count == 1 + + def test_generic_error_in_body(self) -> None: + response = self.create_mock_response(200) + body = {"message": "GENERIC_ERROR occurred"} + with pytest.raises(InvalidResponseError): + validate_response(response, 0.5, body, 10.0) + + def test_unexpected_empty_response(self) -> None: + response = self.create_mock_response(200) + body = {"totalAvailableResults": 100, "properties": []} + with pytest.raises(UnexpectedEmptyResponseError): + validate_response(response, 0.5, body, 10.0, expect_data=True) + assert get_throttle_metrics().empty_response_count == 1 + + def test_empty_response_when_not_expecting_data(self) -> None: + response = self.create_mock_response(200) + body = {"totalAvailableResults": 100, "properties": []} + # Should not raise when expect_data=False + validate_response(response, 0.5, body, 10.0, expect_data=False) + assert get_throttle_metrics().empty_response_count == 0 + + def test_valid_response(self) -> None: + response = self.create_mock_response(200) + body = { + "totalAvailableResults": 10, + "properties": [{"id": 1}, {"id": 2}], + } + validate_response(response, 0.5, body, 10.0, expect_data=True) + assert get_throttle_metrics().total_requests == 1 + assert get_throttle_metrics().total_throttling_events == 0 + + +class TestCircuitBreaker: + """Test CircuitBreaker class.""" + + def test_initial_state_is_closed(self) -> None: + cb = CircuitBreaker(failure_threshold=3, recovery_timeout=10.0) + assert cb.state == CircuitState.CLOSED + assert cb.is_closed + assert not cb.is_open + assert not cb.is_half_open + + def test_allows_requests_when_closed(self) -> None: + cb = CircuitBreaker(failure_threshold=3, recovery_timeout=10.0) + # Should not raise + cb.call() + + def test_opens_after_threshold_failures(self) -> None: + cb = CircuitBreaker(failure_threshold=3, recovery_timeout=10.0) + cb.record_failure() + cb.record_failure() + assert cb.is_closed + cb.record_failure() + assert cb.is_open + + def test_blocks_requests_when_open(self) -> None: + cb = CircuitBreaker(failure_threshold=1, recovery_timeout=60.0) + cb.record_failure() + assert cb.is_open + with pytest.raises(CircuitBreakerOpenError): + cb.call() + + def test_success_resets_failure_count(self) -> None: + cb = CircuitBreaker(failure_threshold=3, recovery_timeout=10.0) + cb.record_failure() + cb.record_failure() + assert cb.failure_count == 2 + cb.record_success() + assert cb.failure_count == 0 + + def test_transitions_to_half_open_after_timeout(self) -> None: + cb = CircuitBreaker(failure_threshold=1, recovery_timeout=0.1) + cb.record_failure() + assert cb.is_open + time.sleep(0.15) + cb.call() # Should transition to half-open + assert cb.is_half_open + + def test_half_open_success_closes_circuit(self) -> None: + cb = CircuitBreaker(failure_threshold=1, recovery_timeout=0.1) + cb.record_failure() + time.sleep(0.15) + cb.call() # Transition to half-open + assert cb.is_half_open + cb.record_success() + assert cb.is_closed + + def test_half_open_failure_reopens_circuit(self) -> None: + cb = CircuitBreaker(failure_threshold=1, recovery_timeout=0.1) + cb.record_failure() + time.sleep(0.15) + cb.call() # Transition to half-open + assert cb.is_half_open + cb.record_failure() + assert cb.is_open + + def test_reset(self) -> None: + cb = CircuitBreaker(failure_threshold=1, recovery_timeout=60.0) + cb.record_failure() + assert cb.is_open + cb.reset() + assert cb.is_closed + assert cb.failure_count == 0 From c4b11ccfe955c90b238060af1b4e4883f37c789a Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Mon, 2 Feb 2026 23:01:13 +0000 Subject: [PATCH 4/5] Add comprehensive logging to Celery tasks and listing processor --- crawler/listing_processor.py | 60 +++++-- crawler/tasks/listing_tasks.py | 316 +++++++++++++++++++++++---------- 2 files changed, 269 insertions(+), 107 deletions(-) diff --git a/crawler/listing_processor.py b/crawler/listing_processor.py index 020016f..e1151ca 100644 --- a/crawler/listing_processor.py +++ b/crawler/listing_processor.py @@ -13,6 +13,9 @@ from repositories.listing_repository import ListingRepository logger = logging.getLogger("uvicorn.error") +# Also use celery task logger for visibility in worker output +celery_logger = logging.getLogger("celery.task") + class ListingProcessor: semaphore: asyncio.Semaphore @@ -36,15 +39,16 @@ class ListingProcessor: for step in self.process_steps: if await step.needs_processing(listing_id): async with self.semaphore: + step_name = step.__class__.__name__ try: listing = await step.process(listing_id) + logger.debug(f"[{listing_id}] {step_name} completed") except Exception as e: - logger.error(f"Failed to process {listing_id=}: {e}") + logger.error(f"[{listing_id}] {step_name} failed: {e}") + celery_logger.error(f"[{listing_id}] {step_name} failed: {e}") return None return listing - async def listing_exists(self, listing_id: int) -> bool: ... - class Step: listing_repository: ListingRepository @@ -65,19 +69,23 @@ class FetchListingDetailsStep(Step): existing_listings = await self.listing_repository.get_listings( only_ids=[listing_id] ) - if (existing_listings) == 0: + if len(existing_listings) == 0: return True return False async def process(self, listing_id: int) -> Listing: - logger.debug(f"Fetching details for {listing_id=}") + logger.debug(f"[{listing_id}] Fetching property details from API") + celery_logger.info(f"[{listing_id}] Fetching details...") + existing_listings = await self.listing_repository.get_listings( only_ids=[listing_id] ) now = datetime.now() if len(existing_listings) > 0: # listing exists, do not refresh + logger.debug(f"[{listing_id}] Already exists, skipping refresh") return existing_listings[0] + listing_details = await detail_query(listing_id) furnish_type_str = listing_details["property"].get("letFurnishType", "unknown") @@ -124,7 +132,12 @@ class FetchListingDetailsStep(Step): additional_info=listing_details, ) await self.listing_repository.upsert_listings([listing]) - logger.debug(f"Completed fetching details for {listing_id=}") + + celery_logger.info( + f"[{listing_id}] Details fetched: £{listing.price}, " + f"{listing.number_of_bedrooms}BR, {listing.agency}" + ) + logger.debug(f"[{listing_id}] Details fetch complete") # TODO: dump to filesystem return listing @@ -140,7 +153,8 @@ class FetchImagesStep(Step): return len(listing.floorplan_image_paths) == 0 async def process(self, listing_id: int) -> Listing: - logger.debug(f"Fetching images for {listing_id=}") + logger.debug(f"[{listing_id}] Fetching floorplan images") + existing_listings = await self.listing_repository.get_listings( only_ids=[listing_id] ) @@ -152,6 +166,12 @@ class FetchImagesStep(Step): all_floorplans = listing.additional_info.get("property", {}).get( "floorplans", [] ) + + if len(all_floorplans) == 0: + logger.debug(f"[{listing_id}] No floorplans available") + return listing + + downloaded = 0 client_timeout = aiohttp.ClientTimeout(total=30) for floorplan_obj in all_floorplans: url = floorplan_obj["url"] @@ -169,8 +189,12 @@ class FetchImagesStep(Step): with open(floorplan_path, "wb") as f: f.write(await response.read()) listing.floorplan_image_paths.append(str(floorplan_path)) + downloaded += 1 + await self.listing_repository.upsert_listings([listing]) - logger.debug(f"Completed fetching images for {listing_id=}") + + celery_logger.info(f"[{listing_id}] Downloaded {downloaded} floorplan images") + logger.debug(f"[{listing_id}] Image fetch complete") return listing @@ -188,11 +212,19 @@ class DetectFloorplanStep(Step): return listings[0].square_meters is None async def process(self, listing_id: int) -> Listing: - logger.debug(f"Running floorplan detection for {listing_id=}") + logger.debug(f"[{listing_id}] Running OCR on floorplans") + listings = await self.listing_repository.get_listings(only_ids=[listing_id]) if len(listings) == 0: raise ValueError(f"Listing {listing_id} does not exist") listing = listings[0] + + if len(listing.floorplan_image_paths) == 0: + logger.debug(f"[{listing_id}] No floorplan images to process") + listing.square_meters = 0 + await self.listing_repository.upsert_listings([listing]) + return listing + sqms = [] for floorplan_path in listing.floorplan_image_paths: async with self.ocr_semaphore: @@ -201,9 +233,15 @@ class DetectFloorplanStep(Step): ) if estimated_sqm is not None: sqms.append(estimated_sqm) + max_sqm = max(sqms, default=0) # try once, if we fail, keep as 0 - # if max_sqm is not None: listing.square_meters = max_sqm await self.listing_repository.upsert_listings([listing]) - logger.debug(f"Completed running floorplan detection for {listing_id=}") + + if max_sqm > 0: + celery_logger.info(f"[{listing_id}] OCR detected {max_sqm} sqm") + else: + logger.debug(f"[{listing_id}] OCR: no square meters detected") + + logger.debug(f"[{listing_id}] OCR complete") return listing diff --git a/crawler/tasks/listing_tasks.py b/crawler/tasks/listing_tasks.py index 1fb3041..713a56d 100644 --- a/crawler/tasks/listing_tasks.py +++ b/crawler/tasks/listing_tasks.py @@ -1,5 +1,6 @@ import asyncio import logging +import time from typing import Any from celery import Task from celery.schedules import crontab @@ -9,6 +10,8 @@ from config.scraper_config import ScraperConfig from listing_processor import ListingProcessor from models.listing import Listing, QueryParameters from rec.query import create_session, listing_query +from rec.exceptions import CircuitBreakerOpenError, ThrottlingError +from rec.throttle_detector import get_throttle_metrics, reset_throttle_metrics from repositories.listing_repository import ListingRepository from database import engine from services.query_splitter import QuerySplitter, SubQuery @@ -16,6 +19,16 @@ from utils.redis_lock import redis_lock logger = logging.getLogger("uvicorn.error") +# Also configure a celery-specific logger that always outputs to stdout +celery_logger = logging.getLogger("celery.task") +if not celery_logger.handlers: + handler = logging.StreamHandler() + handler.setFormatter(logging.Formatter( + "%(asctime)s [%(levelname)s] %(name)s: %(message)s" + )) + celery_logger.addHandler(handler) + celery_logger.setLevel(logging.INFO) + SCRAPE_LOCK_NAME = "scrape_listings" @@ -23,12 +36,18 @@ SCRAPE_LOCK_NAME = "scrape_listings" def dump_listings_task(self: Task, parameters_json: str) -> dict[str, Any]: with redis_lock(SCRAPE_LOCK_NAME) as acquired: if not acquired: - logger.warning("Another scrape job is already running, skipping this execution") + msg = "Another scrape job is already running, skipping this execution" + logger.warning(msg) + celery_logger.warning(msg) self.update_state(state="SKIPPED", meta={"reason": "Another scrape job is running"}) return {"status": "skipped", "reason": "another_job_running"} + celery_logger.info(f"Acquired lock: {SCRAPE_LOCK_NAME}") logger.info(f"Acquired lock: {SCRAPE_LOCK_NAME}") + parsed_parameters = QueryParameters.model_validate_json(parameters_json) + celery_logger.info(f"Starting scrape with parameters: {parsed_parameters}") + self.update_state(state="Starting...", meta={"progress": 0}) asyncio.run(dump_listings_full(task=self, parameters=parsed_parameters)) return {"progress": 0} @@ -50,46 +69,91 @@ async def dump_listings_full( *, task: Task, parameters: QueryParameters ) -> list[Listing]: """Fetches all listings, images as well as detects floorplans""" + start_time = time.time() + celery_logger.info("=" * 60) + celery_logger.info("PHASE 1: Initializing listing fetch") + celery_logger.info("=" * 60) + repository = ListingRepository(engine) task.update_state(state="Identifying missing listings", meta={"progress": 0}) + celery_logger.info("Querying Rightmove API to identify new listings...") ids_to_process = await get_ids_to_process( parameters=parameters, repository=repository, task=task ) + + celery_logger.info(f"Found {len(ids_to_process)} new listings to process") logger.info(f"Found {len(ids_to_process)} listings to process") if len(ids_to_process) == 0: + elapsed = time.time() - start_time + celery_logger.info(f"No new listings found. Completed in {elapsed:.1f}s") task.update_state( state="No new listings found", meta={"progress": 1, "processed": 0, "total": 0, "message": "All listings are up to date"}, ) return [] + celery_logger.info("=" * 60) + celery_logger.info("PHASE 2: Processing listings (fetch details, images, OCR)") + celery_logger.info("=" * 60) + listing_processor = ListingProcessor(repository) + celery_logger.info(f"Starting processing {len(ids_to_process)} listings") logger.info(f"Starting processing {len(ids_to_process)} listings") - return await dump_listings_and_monitor( + + result = await dump_listings_and_monitor( task=task, listing_processor=listing_processor, missing_ids=ids_to_process ) + elapsed = time.time() - start_time + celery_logger.info("=" * 60) + celery_logger.info(f"COMPLETED: Processed {len(result)} listings in {elapsed:.1f}s") + celery_logger.info("=" * 60) + + return result + async def dump_listings_and_monitor( *, task: Task, listing_processor: ListingProcessor, missing_ids: set[int] ) -> list[Listing]: task_progress = {missing_id: 0 for missing_id in missing_ids} + processed_count = 0 + failed_count = 0 + start_time = time.time() async def process(missing_id: int) -> Listing | None: + nonlocal processed_count, failed_count listing = await listing_processor.process_listing(missing_id) task_progress[missing_id] = 1 + if listing is not None: + processed_count += 1 + else: + failed_count += 1 return listing async def monitor() -> None: + last_progress = 0 while (progress := sum(task_progress.values())) < len(missing_ids): progress_ratio = round(progress / len(missing_ids), 2) - logger.error( - f"Task progress: {progress_ratio * 100}% ({progress} out of {len(missing_ids)})" - ) + + # Log every 10% progress or at least every update + if progress_ratio >= last_progress + 0.1 or progress == 1: + elapsed = time.time() - start_time + rate = progress / elapsed if elapsed > 0 else 0 + eta = (len(missing_ids) - progress) / rate if rate > 0 else 0 + + celery_logger.info( + f"Progress: {progress_ratio * 100:.0f}% " + f"({progress}/{len(missing_ids)}) " + f"| Elapsed: {elapsed:.0f}s " + f"| Rate: {rate:.1f}/s " + f"| ETA: {eta:.0f}s" + ) + last_progress = progress_ratio + task.update_state( - state=f"Progress: {progress_ratio * 100}% ({progress} out of {len(missing_ids)})", + state=f"Processing: {progress_ratio * 100:.0f}% ({progress}/{len(missing_ids)})", meta={"progress": progress_ratio, "processed": progress, "total": len(missing_ids)}, ) await asyncio.sleep(1) @@ -97,7 +161,11 @@ async def dump_listings_and_monitor( processed_listings = await asyncio.gather( *[process(id) for id in missing_ids], *[monitor()] ) - filtered_listings = [l for l in processed_listings if l is not None] + filtered_listings = [listing for listing in processed_listings if listing is not None] + + celery_logger.info( + f"Processing complete: {processed_count} successful, {failed_count} failed" + ) return filtered_listings @@ -149,115 +217,171 @@ async def get_ids_to_process( config = ScraperConfig.from_env() splitter = QuerySplitter(config) + # Reset throttle metrics + reset_throttle_metrics() + def on_progress(phase: str, message: str) -> None: task.update_state(state=message, meta={"phase": phase}) + celery_logger.info(f"[{phase}] {message}") - async with create_session(config) as session: - # Phase 1 & 2: Split and probe queries - task.update_state( - state="Analyzing query and splitting by price bands...", - meta={"phase": "splitting", "progress": 0}, - ) - subqueries = await splitter.split(parameters, session, on_progress) + celery_logger.info("Starting query splitting and probing...") - total_estimated = splitter.calculate_total_estimated_results(subqueries) - logger.info( - f"Split into {len(subqueries)} subqueries, " - f"estimated {total_estimated} total results" - ) + try: + async with create_session(config) as session: + # Phase 1 & 2: Split and probe queries + task.update_state( + state="Analyzing query and splitting by price bands...", + meta={"phase": "splitting", "progress": 0}, + ) + subqueries = await splitter.split(parameters, session, on_progress) - # Phase 3: Fetch all pages for each subquery - task.update_state( - state=f"Fetching listings from {len(subqueries)} subqueries...", - meta={ - "phase": "fetching", - "subqueries": len(subqueries), - "estimated_results": total_estimated, - }, - ) - - semaphore = asyncio.Semaphore(config.max_concurrent_requests) - identifiers: set[int] = set() - - async def fetch_subquery(sq: SubQuery) -> list[dict[str, Any]]: - """Fetch all pages for a single subquery.""" - results: list[dict[str, Any]] = [] - - # Calculate how many pages we need based on estimated results - estimated = sq.estimated_results or 0 - if estimated == 0: - return results - - # Fetch pages up to max_pages_per_query or until no more results - page_size = parameters.page_size - max_pages = min( - config.max_pages_per_query, - (estimated // page_size) + 1, + total_estimated = splitter.calculate_total_estimated_results(subqueries) + celery_logger.info( + f"Query split complete: {len(subqueries)} subqueries, " + f"~{total_estimated} estimated total results" + ) + logger.info( + f"Split into {len(subqueries)} subqueries, " + f"estimated {total_estimated} total results" ) - for page_id in range(1, max_pages + 1): - async with semaphore: - await asyncio.sleep(config.request_delay_ms / 1000) - try: - result = await listing_query( - page=page_id, - channel=parameters.listing_type, - min_bedrooms=sq.min_bedrooms, - max_bedrooms=sq.max_bedrooms, - radius=parameters.radius, - min_price=sq.min_price, - max_price=sq.max_price, - district=sq.district, - page_size=page_size, - max_days_since_added=parameters.max_days_since_added, - furnish_types=parameters.furnish_types or [], - session=session, - ) - results.append(result) + # Phase 3: Fetch all pages for each subquery + task.update_state( + state=f"Fetching listings from {len(subqueries)} subqueries...", + meta={ + "phase": "fetching", + "subqueries": len(subqueries), + "estimated_results": total_estimated, + }, + ) - # Check if we've received all results - properties = result.get("properties", []) - if len(properties) < page_size: - # No more results on next page + celery_logger.info(f"Fetching pages from {len(subqueries)} subqueries...") + + semaphore = asyncio.Semaphore(config.max_concurrent_requests) + identifiers: set[int] = set() + completed_subqueries = 0 + total_pages_fetched = 0 + + async def fetch_subquery(sq: SubQuery) -> list[dict[str, Any]]: + """Fetch all pages for a single subquery.""" + nonlocal completed_subqueries, total_pages_fetched + results: list[dict[str, Any]] = [] + + # Calculate how many pages we need based on estimated results + estimated = sq.estimated_results or 0 + if estimated == 0: + completed_subqueries += 1 + return results + + # Fetch pages up to max_pages_per_query or until no more results + page_size = parameters.page_size + max_pages = min( + config.max_pages_per_query, + (estimated // page_size) + 1, + ) + + for page_id in range(1, max_pages + 1): + async with semaphore: + await asyncio.sleep(config.request_delay_ms / 1000) + try: + result = await listing_query( + page=page_id, + channel=parameters.listing_type, + min_bedrooms=sq.min_bedrooms, + max_bedrooms=sq.max_bedrooms, + radius=parameters.radius, + min_price=sq.min_price, + max_price=sq.max_price, + district=sq.district, + page_size=page_size, + max_days_since_added=parameters.max_days_since_added, + furnish_types=parameters.furnish_types or [], + session=session, + config=config, + ) + results.append(result) + total_pages_fetched += 1 + + # Check if we've received all results + properties = result.get("properties", []) + if len(properties) < page_size: + # No more results on next page + break + + except CircuitBreakerOpenError as e: + celery_logger.error(f"Circuit breaker open: {e}") break - - except Exception as e: - if "GENERIC_ERROR" in str(e): - # Reached end of results - logger.debug( - f"Max page for {sq.district}: {page_id - 1}" + except ThrottlingError as e: + celery_logger.warning( + f"Throttling on {sq.district} page {page_id}: {e}" + ) + break + except Exception as e: + if "GENERIC_ERROR" in str(e): + # Reached end of results + logger.debug( + f"Max page for {sq.district}: {page_id - 1}" + ) + break + logger.warning( + f"Error fetching page {page_id} for {sq.district}: {e}" ) break - logger.warning( - f"Error fetching page {page_id} for {sq.district}: {e}" - ) - break - return results + completed_subqueries += 1 + return results - # Fetch all subqueries concurrently - all_results = await asyncio.gather( - *[fetch_subquery(sq) for sq in subqueries] - ) + # Fetch all subqueries concurrently + all_results = await asyncio.gather( + *[fetch_subquery(sq) for sq in subqueries] + ) - # Extract identifiers from all results - for subquery_results in all_results: - for response_json in subquery_results: - if not response_json: - continue - if response_json.get("totalAvailableResults", 0) == 0: - continue - for property_data in response_json.get("properties", []): - identifier = property_data.get("identifier") - if identifier: - identifiers.add(identifier) + celery_logger.info( + f"Fetch complete: {total_pages_fetched} pages from " + f"{completed_subqueries} subqueries" + ) + # Extract identifiers from all results + for subquery_results in all_results: + for response_json in subquery_results: + if not response_json: + continue + if response_json.get("totalAvailableResults", 0) == 0: + continue + for property_data in response_json.get("properties", []): + identifier = property_data.get("identifier") + if identifier: + identifiers.add(identifier) + + except CircuitBreakerOpenError as e: + celery_logger.error(f"Circuit breaker prevented query: {e}") + # Log throttle metrics + metrics = get_throttle_metrics() + if metrics.total_requests > 0: + celery_logger.info(metrics.summary()) + return set() + finally: + # Log throttle metrics + metrics = get_throttle_metrics() + if metrics.total_requests > 0: + celery_logger.info(f"API Stats: {metrics.total_requests} requests, " + f"avg {metrics.average_response_time:.2f}s, " + f"{metrics.total_throttling_events} throttled") + + celery_logger.info(f"Found {len(identifiers)} unique listing IDs from API") logger.info(f"Found {len(identifiers)} unique listings") # Filter out listings already in the database - all_listing_ids = {l.id for l in await repository.get_listings()} + celery_logger.info("Checking database for existing listings...") + all_listing_ids = {listing.id for listing in await repository.get_listings()} new_ids = identifiers - all_listing_ids + celery_logger.info( + f"Filtering: {len(identifiers)} total, " + f"{len(all_listing_ids)} existing in DB, " + f"{len(new_ids)} new to process" + ) + task.update_state( state=f"Found {len(new_ids)} new listings to process", meta={ From 5514fa638135f51e36f47b2e702a3b4cf669af5c Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Fri, 6 Feb 2026 20:34:50 +0000 Subject: [PATCH 5/5] Remove 1000-result limit, add Redis caching and virtual scrolling - Remove hard-coded limit=1000 default from listing_geojson and streaming endpoints, allowing all matching results to be returned - Add Redis caching service (db=2, 30min TTL) that caches query results as Redis Lists for fast re-queries with reduced DB load - Integrate cache into streaming endpoint: serve from cache on hit, populate cache on miss during DB streaming - Invalidate cache after scrape completes (both success and no-new-listings) - Replace ScrollArea with react-virtuoso in ListView for virtual scrolling, keeping only ~20-30 DOM nodes regardless of list size - Handle metadata streaming message to show "0 / N" progress from start - Throttle frontend state updates with requestAnimationFrame to prevent UI jank from rapid re-renders during cached response streaming --- crawler/api/app.py | 100 ++++--- crawler/frontend/package-lock.json | 252 +++++++++++++++--- crawler/frontend/package.json | 6 +- crawler/frontend/src/App.tsx | 24 +- crawler/frontend/src/components/ListView.tsx | 151 +++++++++++ .../frontend/src/services/streamingService.ts | 137 ++++++++++ crawler/services/listing_cache.py | 99 +++++++ crawler/tasks/listing_tasks.py | 4 + 8 files changed, 695 insertions(+), 78 deletions(-) create mode 100644 crawler/frontend/src/components/ListView.tsx create mode 100644 crawler/frontend/src/services/streamingService.ts create mode 100644 crawler/services/listing_cache.py diff --git a/crawler/api/app.py b/crawler/api/app.py index 96f8446..9dd3988 100644 --- a/crawler/api/app.py +++ b/crawler/api/app.py @@ -18,6 +18,11 @@ from fastapi.middleware.cors import CORSMiddleware from ui_exporter import convert_to_geojson_feature, convert_row_to_geojson from services import listing_service, export_service, district_service, task_service +from services.listing_cache import ( + get_cached_count, + get_cached_features, + cache_features_batch, +) from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor from api.metrics import metrics_app from opentelemetry.metrics import get_meter @@ -101,7 +106,7 @@ async def get_listing( async def get_listing_geojson( user: Annotated[User, Depends(get_current_user)], query_parameters: Annotated[QueryParameters, Depends(get_query_parameters)], - limit: int = 1000, # Default limit to prevent timeout + limit: int | None = None, ) -> dict: """Get listings as GeoJSON for map display.""" repository = ListingRepository(engine) @@ -118,7 +123,7 @@ async def stream_listing_geojson( user: Annotated[User, Depends(get_current_user)], query_parameters: Annotated[QueryParameters, Depends(get_query_parameters)], batch_size: int = 50, - limit: int = 1000, + limit: int | None = None, ) -> StreamingResponse: """Stream listings as NDJSON for progressive map loading. @@ -128,38 +133,67 @@ async def stream_listing_geojson( - complete: Final message with total count """ async def generate(): - repository = ListingRepository(engine) + # Check cache first + cached_count = get_cached_count(query_parameters) - # Phase 1: Fast count for progress estimation - total = repository.count_listings(query_parameters) - effective_total = min(limit, total) if limit else total + if cached_count is not None and cached_count > 0: + # Cache HIT + effective_total = min(limit, cached_count) if limit else cached_count - yield json.dumps({ - "type": "metadata", - "batch_size": batch_size, - "total_expected": effective_total, - }) + "\n" + yield json.dumps({ + "type": "metadata", + "batch_size": batch_size, + "total_expected": effective_total, + "cached": True, + }) + "\n" - # Phase 2: Stream with column projection and keyset pagination - count = 0 - batch = [] - for row in repository.stream_listings_optimized( - query_parameters, limit=limit, page_size=batch_size - ): - feature = convert_row_to_geojson(row, query_parameters.listing_type.value) - batch.append(feature) - count += 1 + count = 0 + for feature_batch in get_cached_features(query_parameters, batch_size=batch_size): + if limit and count + len(feature_batch) > limit: + feature_batch = feature_batch[:limit - count] + count += len(feature_batch) + yield json.dumps({"type": "batch", "features": feature_batch}) + "\n" + if limit and count >= limit: + break - if len(batch) >= batch_size: + yield json.dumps({"type": "complete", "total": count}) + "\n" + else: + # Cache MISS - query DB and populate cache + repository = ListingRepository(engine) + + # Phase 1: Fast count for progress estimation + total = repository.count_listings(query_parameters) + effective_total = min(limit, total) if limit else total + + yield json.dumps({ + "type": "metadata", + "batch_size": batch_size, + "total_expected": effective_total, + "cached": False, + }) + "\n" + + # Phase 2: Stream with column projection and keyset pagination + count = 0 + batch = [] + for row in repository.stream_listings_optimized( + query_parameters, limit=limit, page_size=batch_size + ): + feature = convert_row_to_geojson(row, query_parameters.listing_type.value) + batch.append(feature) + count += 1 + + if len(batch) >= batch_size: + cache_features_batch(query_parameters, batch) + yield json.dumps({"type": "batch", "features": batch}) + "\n" + batch = [] + + # Send remaining + if batch: + cache_features_batch(query_parameters, batch) yield json.dumps({"type": "batch", "features": batch}) + "\n" - batch = [] - # Send remaining - if batch: - yield json.dumps({"type": "batch", "features": batch}) + "\n" - - # Final message - yield json.dumps({"type": "complete", "total": count}) + "\n" + # Final message + yield json.dumps({"type": "complete", "total": count}) + "\n" return StreamingResponse( generate(), @@ -200,13 +234,19 @@ async def refresh_listings( async def get_task_status( user: Annotated[User, Depends(get_current_user)], task_id: str, -) -> dict[str, str]: +) -> dict[str, str | int | float | None]: """Get the status of a background task.""" status = task_service.get_task_status(task_id) return { "task_id": status.task_id, "status": status.status, - "result": json.dumps(status.result) if status.result else "", + "result": json.dumps(status.result) if status.result else None, + "progress": status.progress, + "processed": status.processed, + "total": status.total, + "message": status.message, + "error": status.error, + "traceback": status.traceback, } diff --git a/crawler/frontend/package-lock.json b/crawler/frontend/package-lock.json index c3cbbde..07c0944 100644 --- a/crawler/frontend/package-lock.json +++ b/crawler/frontend/package-lock.json @@ -9,7 +9,9 @@ "version": "0.0.0", "dependencies": { "@hookform/resolvers": "^5.1.1", + "@radix-ui/react-accordion": "^1.2.12", "@radix-ui/react-alert-dialog": "^1.1.14", + "@radix-ui/react-checkbox": "^1.3.3", "@radix-ui/react-dialog": "^1.1.14", "@radix-ui/react-hover-card": "^1.1.14", "@radix-ui/react-label": "^2.1.7", @@ -18,6 +20,7 @@ "@radix-ui/react-scroll-area": "^1.2.9", "@radix-ui/react-select": "^2.2.5", "@radix-ui/react-separator": "^1.1.7", + "@radix-ui/react-slider": "^1.3.6", "@radix-ui/react-slot": "^1.2.3", "@radix-ui/react-tooltip": "^1.2.7", "@tabler/icons-react": "^3.34.0", @@ -40,7 +43,7 @@ "react-dom": "^19.1.0", "react-hook-form": "^7.58.1", "react-oidc-context": "^3.3.0", - "rivets": "^0.9.6", + "react-virtuoso": "^4.18.1", "tailwind-merge": "^3.3.1", "tailwindcss": "^4.1.10", "zod": "^3.25.67" @@ -50,7 +53,6 @@ "@types/node": "^24.0.1", "@types/react": "^19.1.2", "@types/react-dom": "^19.1.2", - "@types/rivets": "^0.9.5", "@vitejs/plugin-react-swc": "^3.9.0", "eslint": "^9.25.0", "eslint-plugin-react-hooks": "^5.2.0", @@ -924,6 +926,43 @@ "integrity": "sha512-XnbHrrprsNqZKQhStrSwgRUQzoCI1glLzdw79xiZPoofhGICeZRSQ3dIxAKH1gb3OHfNf4d6f+vAv3kil2eggA==", "license": "MIT" }, + "node_modules/@radix-ui/react-accordion": { + "version": "1.2.12", + "resolved": "https://registry.npmjs.org/@radix-ui/react-accordion/-/react-accordion-1.2.12.tgz", + "integrity": "sha512-T4nygeh9YE9dLRPhAHSeOZi7HBXo+0kYIPJXayZfvWOWA0+n3dESrZbjfDPUABkUNym6Hd+f2IR113To8D2GPA==", + "license": "MIT", + "dependencies": { + "@radix-ui/primitive": "1.1.3", + "@radix-ui/react-collapsible": "1.1.12", + "@radix-ui/react-collection": "1.1.7", + "@radix-ui/react-compose-refs": "1.1.2", + "@radix-ui/react-context": "1.1.2", + "@radix-ui/react-direction": "1.1.1", + "@radix-ui/react-id": "1.1.1", + "@radix-ui/react-primitive": "2.1.3", + "@radix-ui/react-use-controllable-state": "1.2.2" + }, + "peerDependencies": { + "@types/react": "*", + "@types/react-dom": "*", + "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc", + "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + }, + "@types/react-dom": { + "optional": true + } + } + }, + "node_modules/@radix-ui/react-accordion/node_modules/@radix-ui/primitive": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/@radix-ui/primitive/-/primitive-1.1.3.tgz", + "integrity": "sha512-JTF99U/6XIjCBo0wqkU5sK10glYe27MRRsfwoiq5zzOEZLHU3A3KCMa5X/azekYRCJ0HlwI0crAXS/5dEHTzDg==", + "license": "MIT" + }, "node_modules/@radix-ui/react-alert-dialog": { "version": "1.1.14", "resolved": "https://registry.npmjs.org/@radix-ui/react-alert-dialog/-/react-alert-dialog-1.1.14.tgz", @@ -975,6 +1014,126 @@ } } }, + "node_modules/@radix-ui/react-checkbox": { + "version": "1.3.3", + "resolved": "https://registry.npmjs.org/@radix-ui/react-checkbox/-/react-checkbox-1.3.3.tgz", + "integrity": "sha512-wBbpv+NQftHDdG86Qc0pIyXk5IR3tM8Vd0nWLKDcX8nNn4nXFOFwsKuqw2okA/1D/mpaAkmuyndrPJTYDNZtFw==", + "license": "MIT", + "dependencies": { + "@radix-ui/primitive": "1.1.3", + "@radix-ui/react-compose-refs": "1.1.2", + "@radix-ui/react-context": "1.1.2", + "@radix-ui/react-presence": "1.1.5", + "@radix-ui/react-primitive": "2.1.3", + "@radix-ui/react-use-controllable-state": "1.2.2", + "@radix-ui/react-use-previous": "1.1.1", + "@radix-ui/react-use-size": "1.1.1" + }, + "peerDependencies": { + "@types/react": "*", + "@types/react-dom": "*", + "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc", + "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + }, + "@types/react-dom": { + "optional": true + } + } + }, + "node_modules/@radix-ui/react-checkbox/node_modules/@radix-ui/primitive": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/@radix-ui/primitive/-/primitive-1.1.3.tgz", + "integrity": "sha512-JTF99U/6XIjCBo0wqkU5sK10glYe27MRRsfwoiq5zzOEZLHU3A3KCMa5X/azekYRCJ0HlwI0crAXS/5dEHTzDg==", + "license": "MIT" + }, + "node_modules/@radix-ui/react-checkbox/node_modules/@radix-ui/react-presence": { + "version": "1.1.5", + "resolved": "https://registry.npmjs.org/@radix-ui/react-presence/-/react-presence-1.1.5.tgz", + "integrity": "sha512-/jfEwNDdQVBCNvjkGit4h6pMOzq8bHkopq458dPt2lMjx+eBQUohZNG9A7DtO/O5ukSbxuaNGXMjHicgwy6rQQ==", + "license": "MIT", + "dependencies": { + "@radix-ui/react-compose-refs": "1.1.2", + "@radix-ui/react-use-layout-effect": "1.1.1" + }, + "peerDependencies": { + "@types/react": "*", + "@types/react-dom": "*", + "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc", + "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + }, + "@types/react-dom": { + "optional": true + } + } + }, + "node_modules/@radix-ui/react-collapsible": { + "version": "1.1.12", + "resolved": "https://registry.npmjs.org/@radix-ui/react-collapsible/-/react-collapsible-1.1.12.tgz", + "integrity": "sha512-Uu+mSh4agx2ib1uIGPP4/CKNULyajb3p92LsVXmH2EHVMTfZWpll88XJ0j4W0z3f8NK1eYl1+Mf/szHPmcHzyA==", + "license": "MIT", + "dependencies": { + "@radix-ui/primitive": "1.1.3", + "@radix-ui/react-compose-refs": "1.1.2", + "@radix-ui/react-context": "1.1.2", + "@radix-ui/react-id": "1.1.1", + "@radix-ui/react-presence": "1.1.5", + "@radix-ui/react-primitive": "2.1.3", + "@radix-ui/react-use-controllable-state": "1.2.2", + "@radix-ui/react-use-layout-effect": "1.1.1" + }, + "peerDependencies": { + "@types/react": "*", + "@types/react-dom": "*", + "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc", + "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + }, + "@types/react-dom": { + "optional": true + } + } + }, + "node_modules/@radix-ui/react-collapsible/node_modules/@radix-ui/primitive": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/@radix-ui/primitive/-/primitive-1.1.3.tgz", + "integrity": "sha512-JTF99U/6XIjCBo0wqkU5sK10glYe27MRRsfwoiq5zzOEZLHU3A3KCMa5X/azekYRCJ0HlwI0crAXS/5dEHTzDg==", + "license": "MIT" + }, + "node_modules/@radix-ui/react-collapsible/node_modules/@radix-ui/react-presence": { + "version": "1.1.5", + "resolved": "https://registry.npmjs.org/@radix-ui/react-presence/-/react-presence-1.1.5.tgz", + "integrity": "sha512-/jfEwNDdQVBCNvjkGit4h6pMOzq8bHkopq458dPt2lMjx+eBQUohZNG9A7DtO/O5ukSbxuaNGXMjHicgwy6rQQ==", + "license": "MIT", + "dependencies": { + "@radix-ui/react-compose-refs": "1.1.2", + "@radix-ui/react-use-layout-effect": "1.1.1" + }, + "peerDependencies": { + "@types/react": "*", + "@types/react-dom": "*", + "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc", + "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + }, + "@types/react-dom": { + "optional": true + } + } + }, "node_modules/@radix-ui/react-collection": { "version": "1.1.7", "resolved": "https://registry.npmjs.org/@radix-ui/react-collection/-/react-collection-1.1.7.tgz", @@ -1482,6 +1641,45 @@ } } }, + "node_modules/@radix-ui/react-slider": { + "version": "1.3.6", + "resolved": "https://registry.npmjs.org/@radix-ui/react-slider/-/react-slider-1.3.6.tgz", + "integrity": "sha512-JPYb1GuM1bxfjMRlNLE+BcmBC8onfCi60Blk7OBqi2MLTFdS+8401U4uFjnwkOr49BLmXxLC6JHkvAsx5OJvHw==", + "license": "MIT", + "dependencies": { + "@radix-ui/number": "1.1.1", + "@radix-ui/primitive": "1.1.3", + "@radix-ui/react-collection": "1.1.7", + "@radix-ui/react-compose-refs": "1.1.2", + "@radix-ui/react-context": "1.1.2", + "@radix-ui/react-direction": "1.1.1", + "@radix-ui/react-primitive": "2.1.3", + "@radix-ui/react-use-controllable-state": "1.2.2", + "@radix-ui/react-use-layout-effect": "1.1.1", + "@radix-ui/react-use-previous": "1.1.1", + "@radix-ui/react-use-size": "1.1.1" + }, + "peerDependencies": { + "@types/react": "*", + "@types/react-dom": "*", + "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc", + "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + }, + "@types/react-dom": { + "optional": true + } + } + }, + "node_modules/@radix-ui/react-slider/node_modules/@radix-ui/primitive": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/@radix-ui/primitive/-/primitive-1.1.3.tgz", + "integrity": "sha512-JTF99U/6XIjCBo0wqkU5sK10glYe27MRRsfwoiq5zzOEZLHU3A3KCMa5X/azekYRCJ0HlwI0crAXS/5dEHTzDg==", + "license": "MIT" + }, "node_modules/@radix-ui/react-slot": { "version": "1.2.3", "resolved": "https://registry.npmjs.org/@radix-ui/react-slot/-/react-slot-1.2.3.tgz", @@ -2773,16 +2971,6 @@ "@types/geojson": "*" } }, - "node_modules/@types/jquery": { - "version": "3.5.32", - "resolved": "https://registry.npmjs.org/@types/jquery/-/jquery-3.5.32.tgz", - "integrity": "sha512-b9Xbf4CkMqS02YH8zACqN1xzdxc3cO735Qe5AbSUFmyOiaWAbcpqh9Wna+Uk0vgACvoQHpWDg2rGdHkYPLmCiQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/sizzle": "*" - } - }, "node_modules/@types/json-schema": { "version": "7.0.15", "resolved": "https://registry.npmjs.org/@types/json-schema/-/json-schema-7.0.15.tgz", @@ -2852,23 +3040,6 @@ "@types/react": "^19.0.0" } }, - "node_modules/@types/rivets": { - "version": "0.9.5", - "resolved": "https://registry.npmjs.org/@types/rivets/-/rivets-0.9.5.tgz", - "integrity": "sha512-spCtZoSOrS8kNTJNOXamCCQurqOdF1Piak8bUQVqHQNRoTLoID6O6xVX41P5W2vvlxc9UpSG75zl4CRra0l3Eg==", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/jquery": "*" - } - }, - "node_modules/@types/sizzle": { - "version": "2.3.9", - "resolved": "https://registry.npmjs.org/@types/sizzle/-/sizzle-2.3.9.tgz", - "integrity": "sha512-xzLEyKB50yqCUPUJkIsrVvoWNfFUbIZI+RspLWt8u+tIW/BetMBZtgV2LY/2o+tYH8dRvQ+eoPf3NdhQCcLE2w==", - "dev": true, - "license": "MIT" - }, "node_modules/@types/supercluster": { "version": "7.1.3", "resolved": "https://registry.npmjs.org/@types/supercluster/-/supercluster-7.1.3.tgz", @@ -5348,6 +5519,16 @@ } } }, + "node_modules/react-virtuoso": { + "version": "4.18.1", + "resolved": "https://registry.npmjs.org/react-virtuoso/-/react-virtuoso-4.18.1.tgz", + "integrity": "sha512-KF474cDwaSb9+SJ380xruBB4P+yGWcVkcu26HtMqYNMTYlYbrNy8vqMkE+GpAApPPufJqgOLMoWMFG/3pJMXUA==", + "license": "MIT", + "peerDependencies": { + "react": ">=16 || >=17 || >= 18 || >= 19", + "react-dom": ">=16 || >=17 || >= 18 || >=19" + } + }, "node_modules/resolve-from": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/resolve-from/-/resolve-from-4.0.0.tgz", @@ -5378,14 +5559,6 @@ "node": ">=0.10.0" } }, - "node_modules/rivets": { - "version": "0.9.6", - "resolved": "https://registry.npmjs.org/rivets/-/rivets-0.9.6.tgz", - "integrity": "sha512-KfdMjLRWw4+38ej9bRXegKZVfYo0jEacwadA5z6NTKya+YohwGemwdbxvJ52WCXODkTnR4Q8UmUC6HVxsdzkxA==", - "dependencies": { - "sightglass": "~0.2.4" - } - }, "node_modules/robust-predicates": { "version": "3.0.2", "resolved": "https://registry.npmjs.org/robust-predicates/-/robust-predicates-3.0.2.tgz", @@ -5524,11 +5697,6 @@ "node": ">=8" } }, - "node_modules/sightglass": { - "version": "0.2.6", - "resolved": "https://registry.npmjs.org/sightglass/-/sightglass-0.2.6.tgz", - "integrity": "sha512-t1fgbuhURcWc8VgZk8kJQ3QmmZk3kghDcf0wpsN8I8RaV05IUkc2b195KpGqgocKT/q8+vKk6EcB2c7N2lAd6A==" - }, "node_modules/source-map-js": { "version": "1.2.1", "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz", diff --git a/crawler/frontend/package.json b/crawler/frontend/package.json index 466d325..a9c66a3 100644 --- a/crawler/frontend/package.json +++ b/crawler/frontend/package.json @@ -11,7 +11,9 @@ }, "dependencies": { "@hookform/resolvers": "^5.1.1", + "@radix-ui/react-accordion": "^1.2.12", "@radix-ui/react-alert-dialog": "^1.1.14", + "@radix-ui/react-checkbox": "^1.3.3", "@radix-ui/react-dialog": "^1.1.14", "@radix-ui/react-hover-card": "^1.1.14", "@radix-ui/react-label": "^2.1.7", @@ -20,6 +22,7 @@ "@radix-ui/react-scroll-area": "^1.2.9", "@radix-ui/react-select": "^2.2.5", "@radix-ui/react-separator": "^1.1.7", + "@radix-ui/react-slider": "^1.3.6", "@radix-ui/react-slot": "^1.2.3", "@radix-ui/react-tooltip": "^1.2.7", "@tabler/icons-react": "^3.34.0", @@ -42,7 +45,7 @@ "react-dom": "^19.1.0", "react-hook-form": "^7.58.1", "react-oidc-context": "^3.3.0", - "rivets": "^0.9.6", + "react-virtuoso": "^4.18.1", "tailwind-merge": "^3.3.1", "tailwindcss": "^4.1.10", "zod": "^3.25.67" @@ -52,7 +55,6 @@ "@types/node": "^24.0.1", "@types/react": "^19.1.2", "@types/react-dom": "^19.1.2", - "@types/rivets": "^0.9.5", "@vitejs/plugin-react-swc": "^3.9.0", "eslint": "^9.25.0", "eslint-plugin-react-hooks": "^5.2.0", diff --git a/crawler/frontend/src/App.tsx b/crawler/frontend/src/App.tsx index 5e90bdf..8933d56 100644 --- a/crawler/frontend/src/App.tsx +++ b/crawler/frontend/src/App.tsx @@ -67,16 +67,32 @@ function App() { setStreamingProgress({ count: 0 }); setListingData(null); + let updateScheduled = false; + + const flushUpdate = () => { + updateScheduled = false; + setListingData({ + type: 'FeatureCollection', + features: [...accumulatedFeaturesRef.current] + }); + }; + + const scheduleUpdate = () => { + if (!updateScheduled) { + updateScheduled = true; + requestAnimationFrame(flushUpdate); + } + }; + try { for await (const batch of streamListingGeoJSON(user, parameters, (progress) => { setStreamingProgress(progress); })) { accumulatedFeaturesRef.current.push(...batch); - setListingData({ - type: 'FeatureCollection', - features: [...accumulatedFeaturesRef.current] - }); + scheduleUpdate(); } + // Final flush to ensure all data is rendered + flushUpdate(); } catch (error) { if (error instanceof Error) { setSubmitError(error.message); diff --git a/crawler/frontend/src/components/ListView.tsx b/crawler/frontend/src/components/ListView.tsx new file mode 100644 index 0000000..790146a --- /dev/null +++ b/crawler/frontend/src/components/ListView.tsx @@ -0,0 +1,151 @@ +import { useState, useMemo, useCallback } from 'react'; +import { ArrowUpDown, ArrowUp, ArrowDown } from 'lucide-react'; +import { Virtuoso } from 'react-virtuoso'; +import { Button } from './ui/button'; +import { PropertyCard } from './PropertyCard'; +import type { GeoJSONFeatureCollection, PropertyFeature, PropertyProperties } from '@/types'; + +type SortField = 'total_price' | 'qmprice' | 'qm' | 'rooms' | 'last_seen'; +type SortOrder = 'asc' | 'desc'; + +interface ListViewProps { + listingData: GeoJSONFeatureCollection; + onPropertyClick?: (property: PropertyProperties, coordinates: [number, number]) => void; + highlightedPropertyUrl?: string | null; +} + +interface SortConfig { + field: SortField; + order: SortOrder; +} + +const SORT_OPTIONS: { field: SortField; label: string }[] = [ + { field: 'total_price', label: 'Price' }, + { field: 'qmprice', label: '£/m²' }, + { field: 'qm', label: 'Size' }, + { field: 'rooms', label: 'Beds' }, + { field: 'last_seen', label: 'Last Seen' }, +]; + +export function ListView({ listingData, onPropertyClick, highlightedPropertyUrl }: ListViewProps) { + const [sortConfig, setSortConfig] = useState({ field: 'qmprice', order: 'asc' }); + + // Calculate average price per sqm for "good deal" indicator + const avgPricePerSqm = useMemo(() => { + const validPrices = listingData.features + .map((f) => f.properties.qmprice) + .filter((p): p is number => typeof p === 'number' && p > 0); + return validPrices.length > 0 + ? validPrices.reduce((a, b) => a + b, 0) / validPrices.length + : 0; + }, [listingData]); + + // Sort features + const sortedFeatures = useMemo(() => { + const features = [...listingData.features]; + + features.sort((a, b) => { + let aValue: number | string; + let bValue: number | string; + + switch (sortConfig.field) { + case 'total_price': + aValue = a.properties.total_price || 0; + bValue = b.properties.total_price || 0; + break; + case 'qmprice': + aValue = a.properties.qmprice || 0; + bValue = b.properties.qmprice || 0; + break; + case 'qm': + aValue = a.properties.qm || 0; + bValue = b.properties.qm || 0; + break; + case 'rooms': + aValue = a.properties.rooms || 0; + bValue = b.properties.rooms || 0; + break; + case 'last_seen': + aValue = new Date(a.properties.last_seen).getTime(); + bValue = new Date(b.properties.last_seen).getTime(); + break; + default: + return 0; + } + + if (typeof aValue === 'number' && typeof bValue === 'number') { + return sortConfig.order === 'asc' ? aValue - bValue : bValue - aValue; + } + return 0; + }); + + return features; + }, [listingData.features, sortConfig]); + + const handleSort = (field: SortField) => { + setSortConfig((prev) => ({ + field, + order: prev.field === field && prev.order === 'asc' ? 'desc' : 'asc', + })); + }; + + const handlePropertyClick = useCallback((feature: PropertyFeature) => { + if (onPropertyClick) { + onPropertyClick(feature.properties, feature.geometry.coordinates); + } + }, [onPropertyClick]); + + const SortIcon = ({ field }: { field: SortField }) => { + if (sortConfig.field !== field) { + return ; + } + return sortConfig.order === 'asc' + ? + : ; + }; + + return ( +
+ {/* Sort controls */} +
+ Sort: + {SORT_OPTIONS.map((option) => ( + + ))} +
+ + {/* Listing count */} +
+ Showing {sortedFeatures.length.toLocaleString()} properties +
+ + {/* Property list */} + ( +
+ handlePropertyClick(feature)} + /> +
+ )} + /> +
+ ); +} diff --git a/crawler/frontend/src/services/streamingService.ts b/crawler/frontend/src/services/streamingService.ts new file mode 100644 index 0000000..1c1d69f --- /dev/null +++ b/crawler/frontend/src/services/streamingService.ts @@ -0,0 +1,137 @@ +// Streaming service for progressive listing data loading + +import type { User } from 'oidc-client-ts'; +import type { PropertyFeature } from '@/types'; +import type { ParameterValues } from '@/components/FilterPanel'; +import { ApiError } from '@/types'; +import { API_ENDPOINTS } from '@/constants'; + +/** + * Build query string from parameters object + */ +function buildQueryString(params: Record): string { + const queryString = new URLSearchParams(); + + for (const [key, value] of Object.entries(params)) { + if (value !== undefined && value !== null && value !== '') { + if (value instanceof Date) { + queryString.append(key, value.toISOString()); + } else { + queryString.append(key, String(value)); + } + } + } + + return queryString.toString(); +} + +/** + * Build listing query parameters from form values + */ +function buildListingParams(parameters: ParameterValues): Record { + return { + listing_type: parameters.listing_type, + min_bedrooms: parameters.min_bedrooms, + max_bedrooms: parameters.max_bedrooms, + max_price: parameters.max_price, + min_price: parameters.min_price, + min_sqm: parameters.min_sqm, + max_sqm: parameters.max_sqm, + min_price_per_sqm: parameters.min_price_per_sqm, + max_price_per_sqm: parameters.max_price_per_sqm, + last_seen_days: parameters.last_seen_days, + let_date_available_from: parameters.available_from, + district_names: parameters.district || undefined, + furnish_types: parameters.furnish_types?.join(',') || undefined, + }; +} + +export interface StreamMessage { + type: 'metadata' | 'batch' | 'complete'; + features?: PropertyFeature[]; + total?: number; + total_expected?: number; + batch_size?: number; + cached?: boolean; +} + +export interface StreamingProgress { + count: number; + total?: number; +} + +/** + * Stream listing GeoJSON data as an async generator. + * Yields batches of features as they arrive from the server. + */ +export async function* streamListingGeoJSON( + user: User, + parameters: ParameterValues, + onProgress?: (progress: StreamingProgress) => void +): AsyncGenerator { + const params = buildListingParams(parameters); + const queryString = buildQueryString(params); + const url = queryString + ? `${API_ENDPOINTS.LISTING_GEOJSON_STREAM}?${queryString}` + : API_ENDPOINTS.LISTING_GEOJSON_STREAM; + + const response = await fetch(url, { + headers: { + Authorization: `Bearer ${user.access_token}`, + }, + }); + + if (!response.ok) { + throw new ApiError(`Error: ${response.status}`, response.status); + } + + if (!response.body) { + throw new Error('No response body'); + } + + const reader = response.body.getReader(); + const decoder = new TextDecoder(); + let buffer = ''; + let totalCount = 0; + + while (true) { + const { done, value } = await reader.read(); + if (done) break; + + buffer += decoder.decode(value, { stream: true }); + const lines = buffer.split('\n'); + buffer = lines.pop() || ''; // Keep incomplete line in buffer + + for (const line of lines) { + if (!line.trim()) continue; + + try { + const message: StreamMessage = JSON.parse(line); + + if (message.type === 'metadata') { + onProgress?.({ count: 0, total: message.total_expected }); + } else if (message.type === 'batch' && message.features) { + totalCount += message.features.length; + onProgress?.({ count: totalCount }); + yield message.features; + } else if (message.type === 'complete') { + onProgress?.({ count: message.total ?? totalCount, total: message.total }); + } + } catch (e) { + console.error('Failed to parse streaming message:', e); + } + } + } + + // Process any remaining data in the buffer + if (buffer.trim()) { + try { + const message: StreamMessage = JSON.parse(buffer); + if (message.type === 'batch' && message.features) { + yield message.features; + } + } catch (e) { + console.error('Failed to parse final streaming message:', e); + } + } +} diff --git a/crawler/services/listing_cache.py b/crawler/services/listing_cache.py new file mode 100644 index 0000000..c77adfc --- /dev/null +++ b/crawler/services/listing_cache.py @@ -0,0 +1,99 @@ +"""Redis-based caching for listing GeoJSON query results.""" +import hashlib +import json +import logging +import os +from typing import Generator + +import redis + +from models.listing import QueryParameters + +logger = logging.getLogger("uvicorn.error") + +CACHE_PREFIX = "listings:geojson:" +CACHE_TTL_SECONDS = 30 * 60 # 30 minutes +CACHE_DB = 2 + + +def _get_redis_client() -> redis.Redis: + """Get Redis client using Celery broker URL but overriding to db=2.""" + broker_url = os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0") + # Replace the db number in the URL + base_url = broker_url.rsplit("/", 1)[0] + return redis.from_url(f"{base_url}/{CACHE_DB}", decode_responses=True) + + +def make_cache_key(query_params: QueryParameters) -> str: + """Generate a cache key from query parameters.""" + params_json = query_params.model_dump_json() + hash_suffix = hashlib.sha256(params_json.encode()).hexdigest()[:16] + return f"{CACHE_PREFIX}{hash_suffix}" + + +def get_cached_count(query_params: QueryParameters) -> int | None: + """Return the number of cached features for a query, or None if not cached.""" + try: + client = _get_redis_client() + key = make_cache_key(query_params) + if not client.exists(key): + return None + return client.llen(key) + except redis.RedisError as e: + logger.warning(f"Redis cache read error: {e}") + return None + + +def get_cached_features( + query_params: QueryParameters, batch_size: int = 50 +) -> Generator[list[dict], None, None]: + """Yield batches of cached GeoJSON features.""" + try: + client = _get_redis_client() + key = make_cache_key(query_params) + total = client.llen(key) + + for start in range(0, total, batch_size): + end = start + batch_size - 1 + items = client.lrange(key, start, end) + batch = [json.loads(item) for item in items] + if batch: + yield batch + except redis.RedisError as e: + logger.warning(f"Redis cache read error during streaming: {e}") + + +def cache_features_batch(query_params: QueryParameters, features: list[dict]) -> None: + """Append a batch of features to the cache list.""" + if not features: + return + try: + client = _get_redis_client() + key = make_cache_key(query_params) + pipeline = client.pipeline() + for feature in features: + pipeline.rpush(key, json.dumps(feature)) + # Set/refresh TTL + pipeline.expire(key, CACHE_TTL_SECONDS) + pipeline.execute() + except redis.RedisError as e: + logger.warning(f"Redis cache write error: {e}") + + +def invalidate_cache() -> None: + """Delete all listing GeoJSON cache entries.""" + try: + client = _get_redis_client() + cursor = 0 + deleted = 0 + while True: + cursor, keys = client.scan(cursor, match=f"{CACHE_PREFIX}*", count=100) + if keys: + client.delete(*keys) + deleted += len(keys) + if cursor == 0: + break + if deleted: + logger.info(f"Invalidated {deleted} listing cache entries") + except redis.RedisError as e: + logger.warning(f"Redis cache invalidation error: {e}") diff --git a/crawler/tasks/listing_tasks.py b/crawler/tasks/listing_tasks.py index 713a56d..60bf2e6 100644 --- a/crawler/tasks/listing_tasks.py +++ b/crawler/tasks/listing_tasks.py @@ -16,6 +16,7 @@ from repositories.listing_repository import ListingRepository from database import engine from services.query_splitter import QuerySplitter, SubQuery from utils.redis_lock import redis_lock +from services.listing_cache import invalidate_cache logger = logging.getLogger("uvicorn.error") @@ -88,6 +89,7 @@ async def dump_listings_full( if len(ids_to_process) == 0: elapsed = time.time() - start_time celery_logger.info(f"No new listings found. Completed in {elapsed:.1f}s") + invalidate_cache() task.update_state( state="No new listings found", meta={"progress": 1, "processed": 0, "total": 0, "message": "All listings are up to date"}, @@ -111,6 +113,8 @@ async def dump_listings_full( celery_logger.info(f"COMPLETED: Processed {len(result)} listings in {elapsed:.1f}s") celery_logger.info("=" * 60) + invalidate_cache() + return result