Add navigation & usage metrics for end-user experience visibility

Instrument DB query timing (11 operations across 3 repositories),
streaming lifecycle (TTFB, duration, feature count), cache operation
latency, listing detail step breakdown, and frontend page load /
time-to-first-listing / stream download / detail load metrics.

Adds 16 new OTel instruments, extends the perf ingestion endpoint
with 4 new frontend metrics, and adds ~20 Grafana dashboard panels
across 5 new rows (DB Query Performance, Streaming Performance,
Listing Detail Breakdown, Cache Performance, Frontend Navigation).
This commit is contained in:
Viktor Barzin 2026-02-23 20:28:42 +00:00
parent 1ae00b7cbf
commit 35f1987ac1
No known key found for this signature in database
GPG key ID: 0EB088298288D958
11 changed files with 1236 additions and 26 deletions

View file

@ -64,6 +64,40 @@ frontend_worker_compute: Histogram
frontend_main_thread: Histogram
frontend_feature_count: Histogram
# NOTE: every name in this section is an annotation-only declaration, so it
# carries no value at import time. init_metrics() lists the same names in its
# ``global`` statements and assigns the real instruments created from the meter.
# ---------------------------------------------------------------------------
# Database query metrics
# ---------------------------------------------------------------------------
# Duration of individual database queries, in seconds.
db_query_duration_seconds: Histogram
# Number of rows returned per database query.
db_query_rows_returned: Histogram
# ---------------------------------------------------------------------------
# Streaming lifecycle metrics
# ---------------------------------------------------------------------------
# Time from handler entry to the first NDJSON line.
stream_time_to_first_byte_seconds: Histogram
# Total wall-clock time of a streaming response.
stream_total_duration_seconds: Histogram
# Running total of GeoJSON features streamed to clients.
stream_features_total: Counter
# Running total of streaming requests served.
stream_requests_total: Counter
# ---------------------------------------------------------------------------
# Cache performance metrics
# ---------------------------------------------------------------------------
# Redis cache operation latency, in seconds.
cache_operation_duration_seconds: Histogram
# Cache repopulation events, broken down by result.
cache_repopulation_total: Counter
# Number of times stale cache content was served during repopulation.
cache_stale_serves_total: Counter
# ---------------------------------------------------------------------------
# Listing detail metrics
# ---------------------------------------------------------------------------
# Per-step timing inside the listing detail endpoint.
listing_detail_step_duration_seconds: Histogram
# ---------------------------------------------------------------------------
# Frontend navigation/usage metrics
# ---------------------------------------------------------------------------
# Full page or filter load until data is rendered
# (registered under the instrument name "frontend_page_load_seconds").
frontend_page_load: Histogram
# Time from load trigger to the first listing batch on screen
# (instrument name "frontend_time_to_first_listing_seconds").
frontend_time_to_first_listing: Histogram
# Client-side total stream download duration
# (instrument name "frontend_stream_download_seconds").
frontend_stream_download: Histogram
# Time from click to listing detail data rendered
# (instrument name "frontend_listing_detail_load_seconds").
frontend_listing_detail_load: Histogram
def init_metrics(service_name: str = "realestate-crawler") -> PrometheusMetricReader:
"""Initialise the OTel MeterProvider and define all instruments.
@ -80,6 +114,14 @@ def init_metrics(service_name: str = "realestate-crawler") -> PrometheusMetricRe
global celery_tasks_total, celery_task_duration_seconds, celery_tasks_active
global frontend_worker_roundtrip, frontend_worker_compute
global frontend_main_thread, frontend_feature_count
global db_query_duration_seconds, db_query_rows_returned
global stream_time_to_first_byte_seconds, stream_total_duration_seconds
global stream_features_total, stream_requests_total
global cache_operation_duration_seconds, cache_repopulation_total
global cache_stale_serves_total
global listing_detail_step_duration_seconds
global frontend_page_load, frontend_time_to_first_listing
global frontend_stream_download, frontend_listing_detail_load
if _reader is not None:
return _reader
@ -172,9 +214,93 @@ def init_metrics(service_name: str = "realestate-crawler") -> PrometheusMetricRe
description="Number of features per heatmap load",
)
# -- Database query timing --
db_query_duration_seconds = _meter.create_histogram(
"db_query_duration_seconds",
description="Duration of individual database queries in seconds",
)
db_query_rows_returned = _meter.create_histogram(
"db_query_rows_returned",
description="Number of rows returned per database query",
)
# -- Streaming lifecycle --
stream_time_to_first_byte_seconds = _meter.create_histogram(
"stream_time_to_first_byte_seconds",
description="Time from handler entry to first NDJSON line",
)
stream_total_duration_seconds = _meter.create_histogram(
"stream_total_duration_seconds",
description="Total wall-clock time for a streaming response",
)
stream_features_total = _meter.create_counter(
"stream_features_total",
description="Total GeoJSON features streamed to clients",
)
stream_requests_total = _meter.create_counter(
"stream_requests_total",
description="Total streaming requests served",
)
# -- Cache performance --
cache_operation_duration_seconds = _meter.create_histogram(
"cache_operation_duration_seconds",
description="Redis cache operation latency in seconds",
)
cache_repopulation_total = _meter.create_counter(
"cache_repopulation_total",
description="Cache repopulation events by result",
)
cache_stale_serves_total = _meter.create_counter(
"cache_stale_serves_total",
description="Number of times stale cache was served during repopulation",
)
# -- Listing detail --
listing_detail_step_duration_seconds = _meter.create_histogram(
"listing_detail_step_duration_seconds",
description="Per-step timing in listing detail endpoint",
)
# -- Frontend navigation/usage --
frontend_page_load = _meter.create_histogram(
"frontend_page_load_seconds",
description="Full page or filter load to data rendered",
)
frontend_time_to_first_listing = _meter.create_histogram(
"frontend_time_to_first_listing_seconds",
description="Time from load trigger to first listing batch on screen",
)
frontend_stream_download = _meter.create_histogram(
"frontend_stream_download_seconds",
description="Client-side total stream download duration",
)
frontend_listing_detail_load = _meter.create_histogram(
"frontend_listing_detail_load_seconds",
description="Time from click to listing detail data rendered",
)
return _reader
def record_db_query(
    operation: str,
    model: str,
    duration: float,
    rows: int | None = None,
) -> None:
    """Record timing (and optionally row count) for one database query.

    The call is a silent no-op while the module-level ``_meter`` is ``None``,
    i.e. when ``init_metrics()`` has not been called (for example during CLI
    usage), so callers never need to guard it themselves.

    Args:
        operation: Label stored as the ``operation`` attribute on both
            histograms.
        model: Label stored as the ``model`` attribute on the duration
            histogram only.
        duration: Query duration; recorded on ``db_query_duration_seconds``.
        rows: Row count for the query. When ``None`` the row histogram is
            left untouched.
    """
    if _meter is None:
        return

    timing_labels = {"operation": operation, "model": model}
    db_query_duration_seconds.record(duration, timing_labels)

    if rows is None:
        return

    # The row-count histogram is labelled by operation only (no model).
    db_query_rows_returned.record(rows, {"operation": operation})
def get_metrics_asgi_app():  # type: ignore[no-untyped-def]
    """Build the Prometheus ASGI application.

    Returns:
        The object produced by ``make_asgi_app()``, intended to be mounted
        at ``/metrics``.
    """
    metrics_app = make_asgi_app()
    return metrics_app