wrongmove/crawler/rec/throttle_detector.py

232 lines
7.7 KiB
Python

"""Throttling detection and metrics for Rightmove API."""
from __future__ import annotations
import time
from dataclasses import dataclass, field
from typing import Any
import aiohttp
from rec.exceptions import (
InvalidResponseError,
IPBlockedError,
RateLimitError,
ServiceUnavailableError,
SlowResponseError,
UnexpectedEmptyResponseError,
)
@dataclass
class ThrottleMetrics:
    """Tracks throttling events and metrics.

    Attributes:
        rate_limit_count: Number of HTTP 429 errors.
        service_unavailable_count: Number of HTTP 503 errors.
        ip_blocked_count: Number of HTTP 403 errors.
        slow_response_count: Number of slow responses.
        empty_response_count: Number of unexpected empty responses.
        invalid_response_count: Number of invalid/error responses.
        total_requests: Number of requests whose response time was recorded
            (incremented by ``record_request`` and ``record_slow_response``
            only).
        total_response_time: Cumulative response time in seconds.
    """

    rate_limit_count: int = 0
    service_unavailable_count: int = 0
    ip_blocked_count: int = 0
    slow_response_count: int = 0
    empty_response_count: int = 0
    invalid_response_count: int = 0
    total_requests: int = 0
    total_response_time: float = 0.0
    _start_time: float = field(default_factory=time.time)

    def record_rate_limit(self) -> None:
        """Record a rate limit error (HTTP 429)."""
        self.rate_limit_count += 1

    def record_service_unavailable(self) -> None:
        """Record a service unavailable error (HTTP 503)."""
        self.service_unavailable_count += 1

    def record_ip_blocked(self) -> None:
        """Record an IP blocked error (HTTP 403)."""
        self.ip_blocked_count += 1

    def record_slow_response(self, response_time: float) -> None:
        """Record a slow response.

        The response time is also added to the timing totals, so slow
        responses contribute to ``average_response_time``.

        Args:
            response_time: Response time in seconds.
        """
        self.slow_response_count += 1
        self.total_response_time += response_time
        self.total_requests += 1

    def record_empty_response(self) -> None:
        """Record an unexpected empty response."""
        self.empty_response_count += 1

    def record_invalid_response(self) -> None:
        """Record an invalid or error response."""
        self.invalid_response_count += 1

    def record_request(self, response_time: float) -> None:
        """Record a successful request.

        Args:
            response_time: Response time in seconds.
        """
        self.total_requests += 1
        self.total_response_time += response_time

    @property
    def average_response_time(self) -> float:
        """Average response time in seconds over the timed requests.

        Returns 0.0 when no timed request has been recorded.
        """
        if self.total_requests == 0:
            return 0.0
        return self.total_response_time / self.total_requests

    @property
    def total_throttling_events(self) -> int:
        """Total number of throttling events (429, 503, 403 and slow)."""
        return (
            self.rate_limit_count
            + self.service_unavailable_count
            + self.ip_blocked_count
            + self.slow_response_count
        )

    @property
    def total_attempts(self) -> int:
        """Total number of recorded outcomes, successful or not.

        ``total_requests`` is only incremented by ``record_request`` and
        ``record_slow_response``; the remaining ``record_*`` methods bump
        their own counter alone, so those counters are added here.
        """
        return (
            self.total_requests
            + self.rate_limit_count
            + self.service_unavailable_count
            + self.ip_blocked_count
            + self.empty_response_count
            + self.invalid_response_count
        )

    @property
    def throttle_rate(self) -> float:
        """Percentage of recorded attempts that were throttled.

        Uses ``total_attempts`` as the denominator: dividing by
        ``total_requests`` would ignore 429/503/403 outcomes, yielding 0%
        when only errors were seen or values above 100% otherwise.
        """
        attempts = self.total_attempts
        if attempts == 0:
            return 0.0
        return (self.total_throttling_events / attempts) * 100

    @property
    def elapsed_time(self) -> float:
        """Seconds elapsed since this metrics object was created."""
        return time.time() - self._start_time

    def summary(self) -> str:
        """Generate a multi-line, human-readable summary of the metrics."""
        return (
            "Throttle Metrics Summary:\n"
            f" Total Requests: {self.total_requests}\n"
            f" Total Throttling Events: {self.total_throttling_events}\n"
            f" Throttle Rate: {self.throttle_rate:.2f}%\n"
            f" Rate Limit (429): {self.rate_limit_count}\n"
            f" Service Unavailable (503): {self.service_unavailable_count}\n"
            f" IP Blocked (403): {self.ip_blocked_count}\n"
            f" Slow Responses: {self.slow_response_count}\n"
            f" Empty Responses: {self.empty_response_count}\n"
            f" Invalid Responses: {self.invalid_response_count}\n"
            f" Average Response Time: {self.average_response_time:.2f}s\n"
            f" Elapsed Time: {self.elapsed_time:.2f}s"
        )
# Module-level metrics instance shared by the functions below.
# Starts as None, is created on first use by get_throttle_metrics(), and is
# replaced with a fresh instance by reset_throttle_metrics().
_global_metrics: ThrottleMetrics | None = None
def get_throttle_metrics() -> ThrottleMetrics:
    """Return the shared ThrottleMetrics instance, creating it on first use.

    Returns:
        The module-level ThrottleMetrics instance.
    """
    global _global_metrics
    if _global_metrics is not None:
        return _global_metrics
    # First call (nothing stored yet): build the instance and remember it.
    _global_metrics = ThrottleMetrics()
    return _global_metrics
def reset_throttle_metrics() -> None:
    """Replace the shared metrics with a brand-new ThrottleMetrics.

    The module-level name is rebound to a new object; a ThrottleMetrics
    reference obtained before this call still points at the old instance.
    """
    global _global_metrics
    fresh_metrics = ThrottleMetrics()
    _global_metrics = fresh_metrics
def validate_response(
    response: aiohttp.ClientResponse,
    response_time: float,
    response_body: dict[str, Any] | None,
    slow_response_threshold: float,
    expect_data: bool = True,
) -> None:
    """Validate an API response and raise appropriate exceptions for throttling.

    Checks run in order: HTTP status (429, 503, 403), response time, then
    the parsed body. The first failing check records its event on the
    global metrics and raises; only a response that passes every check is
    recorded via ``record_request``. Status codes other than 429, 503 and
    403 are not inspected here.

    Args:
        response: The aiohttp response object.
        response_time: Time taken for the request in seconds.
        response_body: Parsed JSON response body (if available).
        slow_response_threshold: Threshold in seconds for slow responses.
        expect_data: Whether we expect data in the response.

    Raises:
        RateLimitError: If HTTP 429 is returned.
        ServiceUnavailableError: If HTTP 503 is returned.
        IPBlockedError: If HTTP 403 is returned.
        SlowResponseError: If response time exceeds threshold.
        UnexpectedEmptyResponseError: If response is empty when data is expected.
        InvalidResponseError: If response contains error messages.
    """
    metrics = get_throttle_metrics()

    # Check HTTP status codes
    if response.status == 429:
        metrics.record_rate_limit()
        raise RateLimitError(
            "Rate limit exceeded (HTTP 429). "
            f"Response time: {response_time:.2f}s"
        )
    if response.status == 503:
        metrics.record_service_unavailable()
        raise ServiceUnavailableError(
            "Service unavailable (HTTP 503). "
            f"Response time: {response_time:.2f}s"
        )
    if response.status == 403:
        metrics.record_ip_blocked()
        raise IPBlockedError(
            "Access forbidden, possible IP block (HTTP 403). "
            f"Response time: {response_time:.2f}s"
        )

    # Check response time
    if response_time > slow_response_threshold:
        metrics.record_slow_response(response_time)
        raise SlowResponseError(
            f"Slow response detected: {response_time:.2f}s "
            f"(threshold: {slow_response_threshold}s)"
        )

    # Check response body if available
    if response_body is not None:
        # Check for error messages. The GENERIC_ERROR marker is searched in
        # the stringified body, so it matches at any nesting depth.
        if "error" in response_body or "GENERIC_ERROR" in str(response_body):
            metrics.record_invalid_response()
            raise InvalidResponseError(
                f"Error in response body: {response_body}"
            )

        # Check for unexpected empty responses
        if expect_data:
            # ``.get(key, default)`` only falls back when the key is absent;
            # a JSON null arrives as None and would break len() / ``>``, so
            # coalesce falsy values to the same defaults.
            properties = response_body.get("properties") or []
            total_results = response_body.get("totalAvailableResults") or 0

            # If we expect data but got none (and total shows there should be some)
            if total_results > 0 and not properties:
                metrics.record_empty_response()
                raise UnexpectedEmptyResponseError(
                    "Expected data but got empty response. "
                    f"Total available: {total_results}"
                )

    # Record successful request
    metrics.record_request(response_time)