Structured logging via JsonFormatter replaces uvicorn's default format so Loki can parse timestamps and fields. Fourteen business metrics (scrape stats, throttle events, circuit breaker state, cache hit rate, OCR success rate, Celery task lifecycle) are defined in a shared metrics module and instrumented across the scraper pipeline, API, and workers. Celery workers expose a Prometheus HTTP endpoint on configurable ports.
247 lines
8.3 KiB
Python
247 lines
8.3 KiB
Python
"""Throttling detection and metrics for Rightmove API."""
|
|
from __future__ import annotations
|
|
|
|
import time
|
|
from dataclasses import dataclass, field
|
|
from typing import Any
|
|
|
|
import aiohttp
|
|
|
|
from rec.exceptions import (
|
|
InvalidResponseError,
|
|
IPBlockedError,
|
|
RateLimitError,
|
|
ServiceUnavailableError,
|
|
SlowResponseError,
|
|
UnexpectedEmptyResponseError,
|
|
)
|
|
|
|
|
|
@dataclass
class ThrottleMetrics:
    """Accumulates counters for throttling signals and request timings.

    Attributes:
        rate_limit_count: Number of HTTP 429 errors.
        service_unavailable_count: Number of HTTP 503 errors.
        ip_blocked_count: Number of HTTP 403 errors.
        slow_response_count: Number of slow responses.
        empty_response_count: Number of unexpected empty responses.
        invalid_response_count: Number of invalid/error responses.
        total_requests: Number of requests that contributed a response
            time, i.e. those recorded through ``record_request`` or
            ``record_slow_response``. The other ``record_*`` methods do
            not touch this counter.
        total_response_time: Cumulative response time in seconds of the
            requests counted in ``total_requests``.
    """

    rate_limit_count: int = 0
    service_unavailable_count: int = 0
    ip_blocked_count: int = 0
    slow_response_count: int = 0
    empty_response_count: int = 0
    invalid_response_count: int = 0
    total_requests: int = 0
    total_response_time: float = 0.0
    # Wall-clock timestamp taken when the instance is created; used only
    # by ``elapsed_time``.
    _start_time: float = field(default_factory=time.time)

    def record_rate_limit(self) -> None:
        """Record a rate limit error (HTTP 429)."""
        self.rate_limit_count += 1
        _increment_throttle_metric("rate_limit")

    def record_service_unavailable(self) -> None:
        """Record a service unavailable error (HTTP 503)."""
        self.service_unavailable_count += 1
        _increment_throttle_metric("service_unavailable")

    def record_ip_blocked(self) -> None:
        """Record an IP blocked error (HTTP 403)."""
        self.ip_blocked_count += 1
        _increment_throttle_metric("ip_blocked")

    def record_slow_response(self, response_time: float) -> None:
        """Record a slow response.

        Unlike the status-code recorders, a slow response also counts as
        a timed request: it bumps ``total_requests`` and adds to
        ``total_response_time``.

        Args:
            response_time: Response time in seconds.
        """
        self.slow_response_count += 1
        self.total_response_time += response_time
        self.total_requests += 1
        _increment_throttle_metric("slow_response")

    def record_empty_response(self) -> None:
        """Record an unexpected empty response."""
        self.empty_response_count += 1
        _increment_throttle_metric("empty_response")

    def record_invalid_response(self) -> None:
        """Record an invalid or error response."""
        self.invalid_response_count += 1
        _increment_throttle_metric("invalid_response")

    def record_request(self, response_time: float) -> None:
        """Record a successful request.

        Args:
            response_time: Response time in seconds.
        """
        self.total_requests += 1
        self.total_response_time += response_time

    @property
    def average_response_time(self) -> float:
        """Average response time in seconds over ``total_requests``.

        Returns 0.0 when no timed request has been recorded.
        """
        if self.total_requests == 0:
            return 0.0
        return self.total_response_time / self.total_requests

    @property
    def total_throttling_events(self) -> int:
        """Total number of throttling events.

        Sums the 429, 503, 403 and slow-response counters; empty and
        invalid responses are not treated as throttling.
        """
        return (
            self.rate_limit_count
            + self.service_unavailable_count
            + self.ip_blocked_count
            + self.slow_response_count
        )

    @property
    def throttle_rate(self) -> float:
        """Percentage (0-100) of recorded request outcomes that were throttled.

        The denominator is every outcome this object has recorded, not
        just ``total_requests``: the 429/503/403/empty/invalid recorders
        never increment ``total_requests``, so dividing by it alone
        reported 0% when every request was throttled and could exceed
        100% otherwise.

        NOTE(review): assumes a failed request is recorded through exactly
        one ``record_*`` method and is not additionally passed to
        ``record_request`` -- this matches ``validate_response``; confirm
        for any other callers.
        """
        observed_outcomes = (
            self.total_requests  # successes + slow responses
            + self.rate_limit_count
            + self.service_unavailable_count
            + self.ip_blocked_count
            + self.empty_response_count
            + self.invalid_response_count
        )
        if observed_outcomes == 0:
            return 0.0
        return (self.total_throttling_events / observed_outcomes) * 100

    @property
    def elapsed_time(self) -> float:
        """Seconds elapsed since this metrics object was created."""
        return time.time() - self._start_time

    def summary(self) -> str:
        """Generate a multi-line, human-readable summary of all counters."""
        return (
            f"Throttle Metrics Summary:\n"
            f" Total Requests: {self.total_requests}\n"
            f" Total Throttling Events: {self.total_throttling_events}\n"
            f" Throttle Rate: {self.throttle_rate:.2f}%\n"
            f" Rate Limit (429): {self.rate_limit_count}\n"
            f" Service Unavailable (503): {self.service_unavailable_count}\n"
            f" IP Blocked (403): {self.ip_blocked_count}\n"
            f" Slow Responses: {self.slow_response_count}\n"
            f" Empty Responses: {self.empty_response_count}\n"
            f" Invalid Responses: {self.invalid_response_count}\n"
            f" Average Response Time: {self.average_response_time:.2f}s\n"
            f" Elapsed Time: {self.elapsed_time:.2f}s"
        )
|
|
|
|
|
|
# Module-level singleton; remains None until first requested.
_global_metrics: ThrottleMetrics | None = None


def get_throttle_metrics() -> ThrottleMetrics:
    """Return the shared ThrottleMetrics object, creating it on first use.

    Returns:
        The module-wide ThrottleMetrics instance.
    """
    global _global_metrics

    # Fast path: the singleton already exists.
    if _global_metrics is not None:
        return _global_metrics

    _global_metrics = ThrottleMetrics()
    return _global_metrics
|
|
|
|
|
|
def reset_throttle_metrics() -> None:
    """Replace the shared metrics object with a brand-new, zeroed one.

    The previous instance is not mutated; anything still holding a
    reference to it keeps the old counters.
    """
    global _global_metrics
    fresh_metrics = ThrottleMetrics()
    _global_metrics = fresh_metrics
|
|
|
|
|
|
def _increment_throttle_metric(event_type: str) -> None:
|
|
"""Safely increment the OTel throttle counter if metrics are initialised."""
|
|
try:
|
|
from api.metrics import throttle_events_total
|
|
throttle_events_total.add(1, {"type": event_type})
|
|
except Exception:
|
|
pass # Metrics not yet initialised (e.g. during tests)
|
|
|
|
|
|
def validate_response(
    response: aiohttp.ClientResponse,
    response_time: float,
    response_body: dict[str, Any] | None,
    slow_response_threshold: float,
    expect_data: bool = True,
) -> None:
    """Validate an API response and raise appropriate exceptions for throttling.

    Checks run in a fixed order and the first failing check wins: HTTP
    status (429, 503, 403), then response time, then error markers in the
    body, then an unexpectedly empty result set. Every failure is recorded
    on the global throttle metrics before its exception is raised; if all
    checks pass the request is recorded as successful.

    Args:
        response: The aiohttp response object (only ``status`` is read).
        response_time: Time taken for the request in seconds.
        response_body: Parsed JSON response body, or None to skip the
            body checks.
        slow_response_threshold: Threshold in seconds for slow responses.
        expect_data: Whether we expect data in the response.

    Raises:
        RateLimitError: If HTTP 429 is returned.
        ServiceUnavailableError: If HTTP 503 is returned.
        IPBlockedError: If HTTP 403 is returned.
        SlowResponseError: If response time exceeds threshold.
        UnexpectedEmptyResponseError: If response is empty when data is expected.
        InvalidResponseError: If response contains error messages.
    """
    metrics = get_throttle_metrics()

    # Check HTTP status codes.
    # NOTE(review): any other status (e.g. 500/502) falls through and can
    # end up recorded as a successful request -- confirm that is intended.
    if response.status == 429:
        metrics.record_rate_limit()
        raise RateLimitError(
            f"Rate limit exceeded (HTTP 429). "
            f"Response time: {response_time:.2f}s"
        )

    if response.status == 503:
        metrics.record_service_unavailable()
        raise ServiceUnavailableError(
            f"Service unavailable (HTTP 503). "
            f"Response time: {response_time:.2f}s"
        )

    if response.status == 403:
        metrics.record_ip_blocked()
        raise IPBlockedError(
            f"Access forbidden, possible IP block (HTTP 403). "
            f"Response time: {response_time:.2f}s"
        )

    # Check response time (strictly greater than the threshold is slow).
    if response_time > slow_response_threshold:
        metrics.record_slow_response(response_time)
        raise SlowResponseError(
            f"Slow response detected: {response_time:.2f}s "
            f"(threshold: {slow_response_threshold}s)"
        )

    # Check response body if available
    if response_body is not None:
        # An "error" key, or the GENERIC_ERROR marker anywhere in the
        # stringified body, marks the payload as invalid.
        if "error" in response_body or "GENERIC_ERROR" in str(response_body):
            metrics.record_invalid_response()
            raise InvalidResponseError(
                f"Error in response body: {response_body}"
            )

        # Check for unexpected empty responses
        if expect_data:
            # dict.get() only uses its default when the key is missing; a
            # key present with a JSON null yields None, which would crash
            # len() and the > comparison. Coerce falsy values explicitly.
            properties = response_body.get("properties") or []
            total_results = response_body.get("totalAvailableResults") or 0

            # The server claims results exist but delivered none.
            if total_results > 0 and not properties:
                metrics.record_empty_response()
                raise UnexpectedEmptyResponseError(
                    f"Expected data but got empty response. "
                    f"Total available: {total_results}"
                )

    # Record successful request
    metrics.record_request(response_time)
|