Flatten repo structure: move crawler/ to root, remove vqa/ and immoweb/
The crawler subdirectory was the only active project. Moving it to the repo root simplifies paths and removes the unnecessary nesting. The vqa/ and immoweb/ directories were legacy/unused and have been removed. Updated .drone.yml, .gitignore, .claude/ docs, and skills to reflect the new flat structure.
This commit is contained in:
parent
e2247be700
commit
eafbc1ac52
221 changed files with 70 additions and 146140 deletions
232
rec/throttle_detector.py
Normal file
232
rec/throttle_detector.py
Normal file
|
|
@ -0,0 +1,232 @@
|
|||
"""Throttling detection and metrics for Rightmove API."""
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
import aiohttp
|
||||
|
||||
from rec.exceptions import (
|
||||
InvalidResponseError,
|
||||
IPBlockedError,
|
||||
RateLimitError,
|
||||
ServiceUnavailableError,
|
||||
SlowResponseError,
|
||||
UnexpectedEmptyResponseError,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
class ThrottleMetrics:
    """Counters describing how often requests were throttled or failed.

    Each ``record_*`` method registers one outcome. Only ``record_request``
    and ``record_slow_response`` carry a response time, so only those two
    feed ``total_requests`` and ``total_response_time``.

    Attributes:
        rate_limit_count: Number of HTTP 429 errors.
        service_unavailable_count: Number of HTTP 503 errors.
        ip_blocked_count: Number of HTTP 403 errors.
        slow_response_count: Number of slow responses.
        empty_response_count: Number of unexpected empty responses.
        invalid_response_count: Number of invalid/error responses.
        total_requests: Number of requests recorded together with a response
            time (successful and slow requests).
        total_response_time: Cumulative response time in seconds.
    """

    rate_limit_count: int = 0
    service_unavailable_count: int = 0
    ip_blocked_count: int = 0
    slow_response_count: int = 0
    empty_response_count: int = 0
    invalid_response_count: int = 0
    total_requests: int = 0
    total_response_time: float = 0.0
    # Wall-clock timestamp taken when this instance is created.
    _start_time: float = field(default_factory=time.time)

    def record_rate_limit(self) -> None:
        """Record a rate limit error (HTTP 429)."""
        self.rate_limit_count += 1

    def record_service_unavailable(self) -> None:
        """Record a service unavailable error (HTTP 503)."""
        self.service_unavailable_count += 1

    def record_ip_blocked(self) -> None:
        """Record an IP blocked error (HTTP 403)."""
        self.ip_blocked_count += 1

    def record_slow_response(self, response_time: float) -> None:
        """Record a slow response.

        The request also counts towards ``total_requests`` and the
        cumulative response time.

        Args:
            response_time: Response time in seconds.
        """
        self.slow_response_count += 1
        self.total_response_time += response_time
        self.total_requests += 1

    def record_empty_response(self) -> None:
        """Record an unexpected empty response."""
        self.empty_response_count += 1

    def record_invalid_response(self) -> None:
        """Record an invalid or error response."""
        self.invalid_response_count += 1

    def record_request(self, response_time: float) -> None:
        """Record a successful request.

        Args:
            response_time: Response time in seconds.
        """
        self.total_requests += 1
        self.total_response_time += response_time

    @property
    def average_response_time(self) -> float:
        """Average response time in seconds, or 0.0 if nothing was timed."""
        if self.total_requests == 0:
            return 0.0
        return self.total_response_time / self.total_requests

    @property
    def total_throttling_events(self) -> int:
        """Total number of throttling events (429, 503, 403 and slow)."""
        return (
            self.rate_limit_count
            + self.service_unavailable_count
            + self.ip_blocked_count
            + self.slow_response_count
        )

    @property
    def _observed_requests(self) -> int:
        """Number of outcomes recorded through any ``record_*`` method.

        ``total_requests`` already contains the successful and slow
        requests; the remaining outcome counters never touch it, so they
        are added here.
        """
        return (
            self.total_requests
            + self.rate_limit_count
            + self.service_unavailable_count
            + self.ip_blocked_count
            + self.empty_response_count
            + self.invalid_response_count
        )

    @property
    def throttle_rate(self) -> float:
        """Percentage of recorded requests that were throttled (0-100)."""
        # Dividing by total_requests alone would leave 429/503/403 outcomes
        # out of the denominator: the rate would be 0% when every request
        # was rate-limited and could exceed 100% otherwise.
        observed = self._observed_requests
        if observed == 0:
            return 0.0
        return (self.total_throttling_events / observed) * 100

    @property
    def elapsed_time(self) -> float:
        """Seconds elapsed since this metrics instance was created."""
        return time.time() - self._start_time

    def summary(self) -> str:
        """Generate a multi-line, human-readable summary of the metrics."""
        return (
            f"Throttle Metrics Summary:\n"
            f" Total Requests: {self.total_requests}\n"
            f" Total Throttling Events: {self.total_throttling_events}\n"
            f" Throttle Rate: {self.throttle_rate:.2f}%\n"
            f" Rate Limit (429): {self.rate_limit_count}\n"
            f" Service Unavailable (503): {self.service_unavailable_count}\n"
            f" IP Blocked (403): {self.ip_blocked_count}\n"
            f" Slow Responses: {self.slow_response_count}\n"
            f" Empty Responses: {self.empty_response_count}\n"
            f" Invalid Responses: {self.invalid_response_count}\n"
            f" Average Response Time: {self.average_response_time:.2f}s\n"
            f" Elapsed Time: {self.elapsed_time:.2f}s"
        )
|
||||
|
||||
|
||||
# Module-wide metrics instance; stays None until first requested.
_global_metrics: ThrottleMetrics | None = None


def get_throttle_metrics() -> ThrottleMetrics:
    """Return the shared ThrottleMetrics, creating it on first use.

    Returns:
        The module-wide ThrottleMetrics instance.
    """
    global _global_metrics
    current = _global_metrics
    if current is None:
        # First access: build the instance and remember it for later calls.
        current = ThrottleMetrics()
        _global_metrics = current
    return current
|
||||
|
||||
|
||||
def reset_throttle_metrics() -> None:
    """Replace the global throttle metrics with a brand-new instance.

    All counters start again from their defaults; the previous instance is
    no longer referenced by this module.
    """
    global _global_metrics
    fresh_metrics = ThrottleMetrics()
    _global_metrics = fresh_metrics
|
||||
|
||||
|
||||
def validate_response(
    response: aiohttp.ClientResponse,
    response_time: float,
    response_body: dict[str, Any] | None,
    slow_response_threshold: float,
    expect_data: bool = True,
) -> None:
    """Validate an API response and raise appropriate exceptions for throttling.

    Checks run in order: HTTP status, response time, error markers in the
    body, then unexpectedly empty data. The first failing check updates the
    global throttle metrics and raises; if none fails, the request is
    recorded as successful.

    Args:
        response: The aiohttp response object; only ``status`` is read.
        response_time: Time taken for the request in seconds.
        response_body: Parsed JSON response body (if available).
        slow_response_threshold: Threshold in seconds for slow responses.
        expect_data: Whether we expect data in the response.

    Raises:
        RateLimitError: If HTTP 429 is returned.
        ServiceUnavailableError: If HTTP 503 is returned.
        IPBlockedError: If HTTP 403 is returned.
        SlowResponseError: If response time exceeds threshold.
        UnexpectedEmptyResponseError: If response is empty when data is expected.
        InvalidResponseError: If response contains error messages.
    """
    metrics = get_throttle_metrics()

    # Check HTTP status codes
    if response.status == 429:
        metrics.record_rate_limit()
        raise RateLimitError(
            f"Rate limit exceeded (HTTP 429). "
            f"Response time: {response_time:.2f}s"
        )

    if response.status == 503:
        metrics.record_service_unavailable()
        raise ServiceUnavailableError(
            f"Service unavailable (HTTP 503). "
            f"Response time: {response_time:.2f}s"
        )

    if response.status == 403:
        metrics.record_ip_blocked()
        raise IPBlockedError(
            f"Access forbidden, possible IP block (HTTP 403). "
            f"Response time: {response_time:.2f}s"
        )

    # Check response time
    if response_time > slow_response_threshold:
        metrics.record_slow_response(response_time)
        raise SlowResponseError(
            f"Slow response detected: {response_time:.2f}s "
            f"(threshold: {slow_response_threshold}s)"
        )

    # Check response body if available
    if response_body is not None:
        # An "error" key, or GENERIC_ERROR anywhere in the payload's string
        # form, marks the response as invalid.
        if "error" in response_body or "GENERIC_ERROR" in str(response_body):
            metrics.record_invalid_response()
            raise InvalidResponseError(
                f"Error in response body: {response_body}"
            )

        # Check for unexpected empty responses
        if expect_data:
            # `or` fallbacks cover both a missing key and an explicit JSON
            # null, which would otherwise raise TypeError in the check below.
            properties = response_body.get("properties") or []
            total_results = response_body.get("totalAvailableResults") or 0

            # If we expect data but got none (and total shows there should be some)
            if total_results > 0 and len(properties) == 0:
                metrics.record_empty_response()
                raise UnexpectedEmptyResponseError(
                    f"Expected data but got empty response. "
                    f"Total available: {total_results}"
                )

    # Record successful request
    metrics.record_request(response_time)
|
||||
Loading…
Add table
Add a link
Reference in a new issue