Add navigation & usage metrics for end-user experience visibility

Instrument DB query timing (11 operations across 3 repositories), streaming lifecycle (TTFB, duration, feature count), cache operation latency, listing detail step breakdown, and frontend page load / time-to-first-listing / stream download / detail load metrics. Adds 16 new OTel instruments, extends the perf ingestion endpoint with 4 new frontend metrics, and adds ~20 Grafana dashboard panels across 5 new rows (DB Query Performance, Streaming Performance, Listing Detail Breakdown, Cache Performance, Frontend Navigation).
2026-02-23 20:28:42 +00:00 · 2026-02-23 20:28:42 +00:00 · 35f1987ac1
commit 35f1987ac1
parent 1ae00b7cbf
11 changed files with 1236 additions and 26 deletions
--- a/services/listing_cache.py
+++ b/services/listing_cache.py
@@ -3,16 +3,25 @@ import hashlib
 import json
 import logging
 import os
+import time
 import uuid
 from typing import Generator
 from urllib.parse import urlparse, urlunparse

 import redis

+import api.metrics as app_metrics
 from models.listing import QueryParameters

 logger = logging.getLogger(__name__)

+
+def _record_cache_op(operation: str, duration: float) -> None:
+    """Record a cache operation timing metric, no-op if metrics aren't initialized."""
+    if app_metrics._meter is None:
+        return
+    app_metrics.cache_operation_duration_seconds.record(duration, {"operation": operation})
+
 CACHE_PREFIX = "listings:geojson:"
 STAGING_PREFIX = "listings:geojson:staging:"
 CACHE_TTL_SECONDS = 24 * 60 * 60  # 24 hours
@@ -40,11 +49,15 @@ def make_cache_key(query_params: QueryParameters) -> str:
 def get_cached_count(query_params: QueryParameters) -> int | None:
    """Return the number of cached features for a query, or None if not cached."""
    try:
+        t0 = time.monotonic()
        client = _get_redis_client()
        key = make_cache_key(query_params)
        if not client.exists(key):
+            _record_cache_op("check", time.monotonic() - t0)
            return None
-        return client.llen(key)
+        count = client.llen(key)
+        _record_cache_op("check", time.monotonic() - t0)
+        return count
    except redis.RedisError as e:
        logger.warning(f"Redis cache read error: {e}")
        return None
@@ -61,7 +74,9 @@ def get_cached_features(

        for start in range(0, total, batch_size):
            end = start + batch_size - 1
+            t0 = time.monotonic()
            items = client.lrange(key, start, end)
+            _record_cache_op("read_batch", time.monotonic() - t0)
            batch = [json.loads(item) for item in items]
            if batch:
                yield batch
@@ -100,12 +115,14 @@ def cache_features_batch_staged(staging_key: str, features: list[dict]) -> None:
    if not features:
        return
    try:
+        t0 = time.monotonic()
        client = _get_redis_client()
        pipeline = client.pipeline()
        for feature in features:
            pipeline.rpush(staging_key, json.dumps(feature))
        pipeline.expire(staging_key, STAGING_TTL_SECONDS)
        pipeline.execute()
+        _record_cache_op("write_batch", time.monotonic() - t0)
    except redis.RedisError as e:
        logger.warning(f"Redis staged cache write error: {e}")

@@ -113,11 +130,13 @@ def cache_features_batch_staged(staging_key: str, features: list[dict]) -> None:
 def finalize_cache_population(staging_key: str, query_params: QueryParameters) -> None:
    """Atomically rename the staging key to the live cache key and set TTL."""
    try:
+        t0 = time.monotonic()
        client = _get_redis_client()
        live_key = make_cache_key(query_params)
        # RENAME is atomic — replaces the live key in one operation
        client.rename(staging_key, live_key)
        client.expire(live_key, CACHE_TTL_SECONDS)
+        _record_cache_op("finalize", time.monotonic() - t0)
        logger.debug(f"Finalized cache population for {live_key}")
    except redis.RedisError as e:
        logger.warning(f"Redis cache finalize error: {e}")