wrongmove/rec/throttle_detector.py
Viktor Barzin d6edb747d2
Add structured JSON logging, OTel business metrics, and Grafana dashboard
Structured logging via JsonFormatter replaces uvicorn's default format so
Loki can parse timestamps and fields.  14 business metrics (scrape stats,
throttle events, circuit breaker state, cache hit rate, OCR success rate,
Celery task lifecycle) are defined in a shared metrics module and
instrumented across the scraper pipeline, API, and workers.  Celery
workers expose a Prometheus HTTP endpoint on configurable ports.
2026-02-14 10:59:12 +00:00

247 lines
8.3 KiB
Python

"""Throttling detection and metrics for Rightmove API."""
from __future__ import annotations
import time
from dataclasses import dataclass, field
from typing import Any
import aiohttp
from rec.exceptions import (
InvalidResponseError,
IPBlockedError,
RateLimitError,
ServiceUnavailableError,
SlowResponseError,
UnexpectedEmptyResponseError,
)
@dataclass
class ThrottleMetrics:
    """Tracks throttling events and request timing.

    Only ``record_request`` and ``record_slow_response`` add to
    ``total_requests`` and ``total_response_time``; every other ``record_*``
    method bumps just its own counter. ``throttle_rate`` therefore uses the
    sum of all recorded outcomes as its denominator, so the percentage stays
    meaningful when requests fail before being counted as "total".

    Attributes:
        rate_limit_count: Number of HTTP 429 errors.
        service_unavailable_count: Number of HTTP 503 errors.
        ip_blocked_count: Number of HTTP 403 errors.
        slow_response_count: Number of slow responses.
        empty_response_count: Number of unexpected empty responses.
        invalid_response_count: Number of invalid/error responses.
        total_requests: Number of requests recorded through
            ``record_request`` or ``record_slow_response``.
        total_response_time: Cumulative response time in seconds for the
            requests counted in ``total_requests``.
    """

    rate_limit_count: int = 0
    service_unavailable_count: int = 0
    ip_blocked_count: int = 0
    slow_response_count: int = 0
    empty_response_count: int = 0
    invalid_response_count: int = 0
    total_requests: int = 0
    total_response_time: float = 0.0
    # Wall-clock timestamp captured when the instance is created.
    _start_time: float = field(default_factory=time.time)

    def record_rate_limit(self) -> None:
        """Record a rate limit error (HTTP 429)."""
        self.rate_limit_count += 1
        _increment_throttle_metric("rate_limit")

    def record_service_unavailable(self) -> None:
        """Record a service unavailable error (HTTP 503)."""
        self.service_unavailable_count += 1
        _increment_throttle_metric("service_unavailable")

    def record_ip_blocked(self) -> None:
        """Record an IP blocked error (HTTP 403)."""
        self.ip_blocked_count += 1
        _increment_throttle_metric("ip_blocked")

    def record_slow_response(self, response_time: float) -> None:
        """Record a slow response.

        Unlike the status-code recorders, this also counts the request in
        ``total_requests`` and adds its duration to ``total_response_time``.

        Args:
            response_time: Response time in seconds.
        """
        self.slow_response_count += 1
        self.total_response_time += response_time
        self.total_requests += 1
        _increment_throttle_metric("slow_response")

    def record_empty_response(self) -> None:
        """Record an unexpected empty response."""
        self.empty_response_count += 1
        _increment_throttle_metric("empty_response")

    def record_invalid_response(self) -> None:
        """Record an invalid or error response."""
        self.invalid_response_count += 1
        _increment_throttle_metric("invalid_response")

    def record_request(self, response_time: float) -> None:
        """Record a successful request.

        Args:
            response_time: Response time in seconds.
        """
        self.total_requests += 1
        self.total_response_time += response_time

    @property
    def average_response_time(self) -> float:
        """Average response time in seconds over ``total_requests``.

        Returns 0.0 when no request has been counted yet.
        """
        if self.total_requests == 0:
            return 0.0
        return self.total_response_time / self.total_requests

    @property
    def total_throttling_events(self) -> int:
        """Total number of throttling events.

        Sums rate-limit, service-unavailable, IP-blocked and slow-response
        counts; empty and invalid responses are not included.
        """
        return (
            self.rate_limit_count
            + self.service_unavailable_count
            + self.ip_blocked_count
            + self.slow_response_count
        )

    @property
    def _observed_requests(self) -> int:
        """Number of request outcomes recorded by any ``record_*`` method.

        ``total_requests`` already contains successful and slow requests, so
        only the counters whose recorders do not touch ``total_requests`` are
        added on top.
        NOTE(review): assumes each request is reported through exactly one
        ``record_*`` call -- confirm against callers outside this module.
        """
        return (
            self.total_requests
            + self.rate_limit_count
            + self.service_unavailable_count
            + self.ip_blocked_count
            + self.empty_response_count
            + self.invalid_response_count
        )

    @property
    def throttle_rate(self) -> float:
        """Percentage of recorded requests that were throttled.

        The denominator is every recorded outcome rather than
        ``total_requests`` alone: 429/503/403 events never increment
        ``total_requests``, so dividing by it reported 0% when every request
        was throttled and could exceed 100% otherwise.
        """
        observed = self._observed_requests
        if observed == 0:
            return 0.0
        return (self.total_throttling_events / observed) * 100

    @property
    def elapsed_time(self) -> float:
        """Seconds elapsed since this metrics instance was created."""
        return time.time() - self._start_time

    def summary(self) -> str:
        """Generate a multi-line, human-readable summary of the metrics."""
        return (
            f"Throttle Metrics Summary:\n"
            f" Total Requests: {self.total_requests}\n"
            f" Total Throttling Events: {self.total_throttling_events}\n"
            f" Throttle Rate: {self.throttle_rate:.2f}%\n"
            f" Rate Limit (429): {self.rate_limit_count}\n"
            f" Service Unavailable (503): {self.service_unavailable_count}\n"
            f" IP Blocked (403): {self.ip_blocked_count}\n"
            f" Slow Responses: {self.slow_response_count}\n"
            f" Empty Responses: {self.empty_response_count}\n"
            f" Invalid Responses: {self.invalid_response_count}\n"
            f" Average Response Time: {self.average_response_time:.2f}s\n"
            f" Elapsed Time: {self.elapsed_time:.2f}s"
        )
# Process-wide metrics singleton; created lazily on first access.
_global_metrics: ThrottleMetrics | None = None


def get_throttle_metrics() -> ThrottleMetrics:
    """Return the shared ThrottleMetrics instance, creating it if needed.

    Returns:
        The module-level ThrottleMetrics instance.
    """
    global _global_metrics
    if _global_metrics is not None:
        return _global_metrics
    # First access since import: build the instance.
    _global_metrics = ThrottleMetrics()
    return _global_metrics
def reset_throttle_metrics() -> None:
    """Replace the global throttle metrics with a fresh, zeroed instance."""
    global _global_metrics
    fresh_metrics = ThrottleMetrics()
    _global_metrics = fresh_metrics
def _increment_throttle_metric(event_type: str) -> None:
    """Increment the OTel throttle counter on a best-effort basis.

    Imports ``throttle_events_total`` from ``api.metrics`` at call time and
    adds 1 with a ``{"type": event_type}`` attribute. Any failure is
    swallowed so that metrics problems never break the caller, but the cause
    is logged at DEBUG level instead of being silently discarded.

    Args:
        event_type: Label stored under the ``"type"`` attribute of the
            counter increment.
    """
    try:
        from api.metrics import throttle_events_total

        throttle_events_total.add(1, {"type": event_type})
    except Exception:
        # Metrics not yet initialised (e.g. during tests). Deliberately
        # best-effort: record why the increment was skipped, never raise.
        import logging

        logging.getLogger(__name__).debug(
            "Skipping throttle metric increment for %r",
            event_type,
            exc_info=True,
        )
def validate_response(
    response: aiohttp.ClientResponse,
    response_time: float,
    response_body: dict[str, Any] | None,
    slow_response_threshold: float,
    expect_data: bool = True,
) -> None:
    """Validate an API response and raise appropriate exceptions for throttling.

    Checks run in a fixed order: HTTP status, response time, error markers
    in the body, then unexpectedly empty data. The first failing check
    records its event on the global throttle metrics and raises; when every
    check passes, the request is recorded as successful.

    Args:
        response: The aiohttp response object; only ``status`` is read.
        response_time: Time taken for the request in seconds.
        response_body: Parsed JSON response body, or None to skip the body
            checks entirely.
        slow_response_threshold: Threshold in seconds for slow responses.
        expect_data: Whether we expect data in the response.

    Raises:
        RateLimitError: If HTTP 429 is returned.
        ServiceUnavailableError: If HTTP 503 is returned.
        IPBlockedError: If HTTP 403 is returned.
        SlowResponseError: If response time exceeds threshold.
        UnexpectedEmptyResponseError: If response is empty when data is expected.
        InvalidResponseError: If response contains error messages.
    """
    metrics = get_throttle_metrics()

    # Check HTTP status codes
    if response.status == 429:
        metrics.record_rate_limit()
        raise RateLimitError(
            f"Rate limit exceeded (HTTP 429). "
            f"Response time: {response_time:.2f}s"
        )
    if response.status == 503:
        metrics.record_service_unavailable()
        raise ServiceUnavailableError(
            f"Service unavailable (HTTP 503). "
            f"Response time: {response_time:.2f}s"
        )
    if response.status == 403:
        metrics.record_ip_blocked()
        raise IPBlockedError(
            f"Access forbidden, possible IP block (HTTP 403). "
            f"Response time: {response_time:.2f}s"
        )

    # Check response time
    if response_time > slow_response_threshold:
        metrics.record_slow_response(response_time)
        raise SlowResponseError(
            f"Slow response detected: {response_time:.2f}s "
            f"(threshold: {slow_response_threshold}s)"
        )

    # Check response body if available
    if response_body is not None:
        # Check for error messages: a top-level "error" key, or the marker
        # text anywhere in the stringified body.
        if "error" in response_body or "GENERIC_ERROR" in str(response_body):
            metrics.record_invalid_response()
            raise InvalidResponseError(
                f"Error in response body: {response_body}"
            )

        # Check for unexpected empty responses
        if expect_data:
            # A key that is present but null parses to None, which the
            # .get() default does not cover; coalesce to an empty/zero value
            # so len()/comparison cannot raise TypeError.
            properties = response_body.get("properties") or []
            total_results = response_body.get("totalAvailableResults") or 0

            # If we expect data but got none (and total shows there should be some)
            if total_results > 0 and not properties:
                metrics.record_empty_response()
                raise UnexpectedEmptyResponseError(
                    f"Expected data but got empty response. "
                    f"Total available: {total_results}"
                )

    # Record successful request
    metrics.record_request(response_time)