Instrument DB query timing (11 operations across 3 repositories), streaming lifecycle (TTFB, duration, feature count), cache operation latency, listing detail step breakdown, and frontend page load / time-to-first-listing / stream download / detail load metrics. Adds 16 new OTel instruments, extends the perf ingestion endpoint with 4 new frontend metrics, and adds ~20 Grafana dashboard panels across 5 new rows (DB Query Performance, Streaming Performance, Listing Detail Breakdown, Cache Performance, Frontend Navigation).
306 lines
11 KiB
Python
306 lines
11 KiB
Python
"""OpenTelemetry metrics with Prometheus export.
|
|
|
|
Provides ``init_metrics()`` to lazily initialise the MeterProvider and all
|
|
business metric instruments. Safe to call from both the API and Celery
|
|
workers — the provider is created at most once per process.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
from opentelemetry.metrics import (
|
|
Counter,
|
|
Histogram,
|
|
Meter,
|
|
UpDownCounter,
|
|
get_meter,
|
|
set_meter_provider,
|
|
)
|
|
from opentelemetry.sdk.metrics import MeterProvider
|
|
from opentelemetry.sdk.resources import SERVICE_NAME, Resource
|
|
from opentelemetry.exporter.prometheus import PrometheusMetricReader
|
|
from prometheus_client import make_asgi_app
|
|
|
|
# Module state populated by init_metrics(); both remain None until it has run.
_reader: PrometheusMetricReader | None = None
_meter: Meter | None = None

# Everything below is an annotation-only declaration: the names are bound to
# real instruments inside init_metrics() and are not assigned before that.

# --- Scrape metrics ---------------------------------------------------------
scrape_listings_found: Counter  # listings discovered during scrape runs
scrape_listings_processed: Counter  # listings successfully processed
scrape_listings_failed: Counter  # listings that failed processing
scrape_duration_seconds: Histogram  # duration of a full scrape run
scrape_pages_fetched: Counter  # API pages fetched during scraping
scrape_subqueries_total: Counter  # subqueries executed after query splitting

# --- Throttle / circuit-breaker metrics -------------------------------------
throttle_events_total: Counter  # throttling events by type
# circuit_breaker_state is registered as an ObservableGauge in circuit_breaker.py

# --- API / cache metrics ----------------------------------------------------
geojson_cache_operations: Counter  # GeoJSON cache operations (hit/miss)

# --- OCR metrics ------------------------------------------------------------
ocr_attempts: Counter  # OCR detection attempts
ocr_successes: Counter  # OCR detections that found square meters

# --- Celery task metrics ----------------------------------------------------
celery_tasks_total: Counter  # Celery tasks by name and status
celery_task_duration_seconds: Histogram  # duration of Celery tasks
celery_tasks_active: UpDownCounter  # currently active Celery tasks

# --- Frontend performance metrics -------------------------------------------
frontend_worker_roundtrip: Histogram  # browser worker message round-trip time
frontend_worker_compute: Histogram  # computation time inside the web worker
frontend_main_thread: Histogram  # main-thread blocking operation duration
frontend_feature_count: Histogram  # number of features per heatmap load

# --- Database query metrics -------------------------------------------------
db_query_duration_seconds: Histogram  # duration of individual database queries
db_query_rows_returned: Histogram  # rows returned per database query

# --- Streaming lifecycle metrics --------------------------------------------
stream_time_to_first_byte_seconds: Histogram  # handler entry to first NDJSON line
stream_total_duration_seconds: Histogram  # wall-clock time of a streaming response
stream_features_total: Counter  # GeoJSON features streamed to clients
stream_requests_total: Counter  # streaming requests served

# --- Cache performance metrics ----------------------------------------------
cache_operation_duration_seconds: Histogram  # Redis cache operation latency
cache_repopulation_total: Counter  # cache repopulation events by result
cache_stale_serves_total: Counter  # stale cache served during repopulation

# --- Listing detail metrics -------------------------------------------------
listing_detail_step_duration_seconds: Histogram  # per-step timing in listing detail

# --- Frontend navigation/usage metrics --------------------------------------
frontend_page_load: Histogram  # full page or filter load to data rendered
frontend_time_to_first_listing: Histogram  # load trigger to first listing batch
frontend_stream_download: Histogram  # client-side total stream download duration
frontend_listing_detail_load: Histogram  # click to listing detail data rendered
|
|
|
|
|
|
def init_metrics(service_name: str = "realestate-crawler") -> PrometheusMetricReader:
    """Set up the process-wide OTel MeterProvider and create every instrument.

    The first call builds a ``PrometheusMetricReader``, installs a
    ``MeterProvider`` whose resource carries *service_name*, obtains a meter
    for this module and binds each module-level instrument name. Any later
    call returns the reader created by the first call without touching the
    provider or the instruments, so *service_name* is ignored on repeat calls.

    Args:
        service_name: Value stored under ``SERVICE_NAME`` on the provider's
            resource.

    Returns:
        The ``PrometheusMetricReader`` attached to the provider, so the API
        can mount the ASGI app.
    """
    global _reader, _meter
    # Scrape
    global scrape_listings_found, scrape_listings_processed, scrape_listings_failed
    global scrape_duration_seconds, scrape_pages_fetched, scrape_subqueries_total
    # Throttle, cache, OCR
    global throttle_events_total
    global geojson_cache_operations
    global ocr_attempts, ocr_successes
    # Celery
    global celery_tasks_total, celery_task_duration_seconds, celery_tasks_active
    # Frontend performance
    global frontend_worker_roundtrip, frontend_worker_compute
    global frontend_main_thread, frontend_feature_count
    # Database, streaming, cache performance, listing detail
    global db_query_duration_seconds, db_query_rows_returned
    global stream_time_to_first_byte_seconds, stream_total_duration_seconds
    global stream_features_total, stream_requests_total
    global cache_operation_duration_seconds, cache_repopulation_total
    global cache_stale_serves_total
    global listing_detail_step_duration_seconds
    # Frontend navigation/usage
    global frontend_page_load, frontend_time_to_first_listing
    global frontend_stream_download, frontend_listing_detail_load

    # Already initialised in this process: hand back the existing reader.
    if _reader is not None:
        return _reader

    _reader = PrometheusMetricReader()
    resource = Resource.create({SERVICE_NAME: service_name})
    set_meter_provider(MeterProvider(metric_readers=[_reader], resource=resource))
    _meter = get_meter(__name__)

    # Short aliases for the three factory methods used below.
    new_counter = _meter.create_counter
    new_histogram = _meter.create_histogram
    new_up_down_counter = _meter.create_up_down_counter

    # -- Scrape --
    scrape_listings_found = new_counter(
        "scrape_listings_found_total",
        description="Total listings discovered during scrape runs",
    )
    scrape_listings_processed = new_counter(
        "scrape_listings_processed_total",
        description="Total listings successfully processed",
    )
    scrape_listings_failed = new_counter(
        "scrape_listings_failed_total",
        description="Total listings that failed processing",
    )
    scrape_duration_seconds = new_histogram(
        "scrape_duration_seconds",
        description="Duration of a full scrape run in seconds",
    )
    scrape_pages_fetched = new_counter(
        "scrape_pages_fetched_total",
        description="Total API pages fetched during scraping",
    )
    scrape_subqueries_total = new_counter(
        "scrape_subqueries_total",
        description="Total subqueries executed after query splitting",
    )

    # -- Throttle --
    throttle_events_total = new_counter(
        "throttle_events_total",
        description="Total throttling events by type",
    )

    # -- Cache --
    geojson_cache_operations = new_counter(
        "geojson_cache_operations_total",
        description="GeoJSON cache operations (hit/miss)",
    )

    # -- OCR --
    ocr_attempts = new_counter(
        "ocr_attempts_total",
        description="Total OCR detection attempts",
    )
    ocr_successes = new_counter(
        "ocr_successes_total",
        description="Total OCR detections that found square meters",
    )

    # -- Celery --
    celery_tasks_total = new_counter(
        "celery_tasks_total",
        description="Total Celery tasks by name and status",
    )
    celery_task_duration_seconds = new_histogram(
        "celery_task_duration_seconds",
        description="Duration of Celery tasks in seconds",
    )
    celery_tasks_active = new_up_down_counter(
        "celery_tasks_active",
        description="Currently active Celery tasks",
    )

    # -- Frontend performance --
    frontend_worker_roundtrip = new_histogram(
        "frontend_worker_roundtrip_seconds",
        description="Browser worker message round-trip time",
    )
    frontend_worker_compute = new_histogram(
        "frontend_worker_compute_seconds",
        description="Computation time inside the web worker",
    )
    frontend_main_thread = new_histogram(
        "frontend_main_thread_seconds",
        description="Main-thread blocking operation duration",
    )
    frontend_feature_count = new_histogram(
        "frontend_feature_count",
        description="Number of features per heatmap load",
    )

    # -- Database query timing --
    db_query_duration_seconds = new_histogram(
        "db_query_duration_seconds",
        description="Duration of individual database queries in seconds",
    )
    db_query_rows_returned = new_histogram(
        "db_query_rows_returned",
        description="Number of rows returned per database query",
    )

    # -- Streaming lifecycle --
    stream_time_to_first_byte_seconds = new_histogram(
        "stream_time_to_first_byte_seconds",
        description="Time from handler entry to first NDJSON line",
    )
    stream_total_duration_seconds = new_histogram(
        "stream_total_duration_seconds",
        description="Total wall-clock time for a streaming response",
    )
    stream_features_total = new_counter(
        "stream_features_total",
        description="Total GeoJSON features streamed to clients",
    )
    stream_requests_total = new_counter(
        "stream_requests_total",
        description="Total streaming requests served",
    )

    # -- Cache performance --
    cache_operation_duration_seconds = new_histogram(
        "cache_operation_duration_seconds",
        description="Redis cache operation latency in seconds",
    )
    cache_repopulation_total = new_counter(
        "cache_repopulation_total",
        description="Cache repopulation events by result",
    )
    cache_stale_serves_total = new_counter(
        "cache_stale_serves_total",
        description="Number of times stale cache was served during repopulation",
    )

    # -- Listing detail --
    listing_detail_step_duration_seconds = new_histogram(
        "listing_detail_step_duration_seconds",
        description="Per-step timing in listing detail endpoint",
    )

    # -- Frontend navigation/usage --
    frontend_page_load = new_histogram(
        "frontend_page_load_seconds",
        description="Full page or filter load to data rendered",
    )
    frontend_time_to_first_listing = new_histogram(
        "frontend_time_to_first_listing_seconds",
        description="Time from load trigger to first listing batch on screen",
    )
    frontend_stream_download = new_histogram(
        "frontend_stream_download_seconds",
        description="Client-side total stream download duration",
    )
    frontend_listing_detail_load = new_histogram(
        "frontend_listing_detail_load_seconds",
        description="Time from click to listing detail data rendered",
    )

    return _reader
|
|
|
|
|
|
def record_db_query(
    operation: str,
    model: str,
    duration: float,
    rows: int | None = None,
) -> None:
    """Record timing, and optionally row count, for one database query.

    Does nothing when ``init_metrics()`` has not run in this process (the
    module-level meter is still ``None``), so callers such as CLI code can
    invoke it unconditionally.

    Args:
        operation: Attribute value recorded as ``operation`` on both histograms.
        model: Attribute value recorded as ``model`` on the duration histogram.
        duration: Value recorded on ``db_query_duration_seconds``.
        rows: Value recorded on ``db_query_rows_returned``; skipped when ``None``.
    """
    # Instruments only exist once init_metrics() has bound them.
    if _meter is None:
        return

    timing_attributes = {"operation": operation, "model": model}
    db_query_duration_seconds.record(duration, timing_attributes)

    if rows is None:
        return

    # NOTE(review): the row-count histogram carries only ``operation``, not
    # ``model`` like the duration histogram above — confirm this is intended.
    db_query_rows_returned.record(rows, {"operation": operation})
|
|
|
|
|
|
def get_metrics_asgi_app():  # type: ignore[no-untyped-def]
    """Build the Prometheus ASGI application intended for mounting at /metrics."""
    metrics_app = make_asgi_app()
    return metrics_app
|