Add navigation & usage metrics for end-user experience visibility

Instrument DB query timing (11 operations across 3 repositories),
streaming lifecycle (TTFB, duration, feature count), cache operation
latency, listing detail step breakdown, and frontend page load /
time-to-first-listing / stream download / detail load metrics.

Adds 16 new OTel instruments, extends the perf ingestion endpoint
with 4 new frontend metrics, and adds ~20 Grafana dashboard panels
across 5 new rows (DB Query Performance, Streaming Performance,
Listing Detail Breakdown, Cache Performance, Frontend Navigation).
This commit is contained in:
Viktor Barzin 2026-02-23 20:28:42 +00:00
parent 1ae00b7cbf
commit 35f1987ac1
No known key found for this signature in database
GPG key ID: 0EB088298288D958
11 changed files with 1236 additions and 26 deletions

View file

@ -3,16 +3,25 @@ import hashlib
import json
import logging
import os
import time
import uuid
from typing import Generator
from urllib.parse import urlparse, urlunparse
import redis
import api.metrics as app_metrics
from models.listing import QueryParameters
logger = logging.getLogger(__name__)
def _record_cache_op(operation: str, duration: float) -> None:
    """Report how long a single cache operation took.

    Args:
        operation: Label stored in the metric's ``operation`` attribute
            (callers in this module pass values such as ``"check"`` or
            ``"finalize"``).
        duration: Elapsed time to record; callers pass a difference of
            ``time.monotonic()`` readings.

    Does nothing when ``app_metrics._meter`` is ``None``, i.e. before the
    metrics module has been set up.
    """
    if app_metrics._meter is not None:
        attributes = {"operation": operation}
        app_metrics.cache_operation_duration_seconds.record(duration, attributes)
# Key namespace for cached GeoJSON feature lists.
CACHE_PREFIX = "listings:geojson:"
# Key namespace for in-progress (staged) cache writes; note that it lives
# underneath CACHE_PREFIX, so both share the same leading segment.
STAGING_PREFIX = "listings:geojson:staging:"
# Lifetime of a finalized cache entry: 24 hours, expressed in seconds.
CACHE_TTL_SECONDS = 60 * 60 * 24
@ -40,11 +49,15 @@ def make_cache_key(query_params: QueryParameters) -> str:
def get_cached_count(query_params: QueryParameters) -> int | None:
    """Return the number of cached features for a query, or None if not cached.

    The elapsed time of the lookup is reported through ``_record_cache_op``
    under the ``"check"`` operation for both the hit and the miss outcome.

    Args:
        query_params: Query whose cache key is derived via ``make_cache_key``.

    Returns:
        The length of the cached list when the key exists, otherwise ``None``.
        ``None`` is also returned (after logging a warning) when Redis raises
        a ``redis.RedisError``; no timing is recorded in that case.
    """
    try:
        t0 = time.monotonic()
        client = _get_redis_client()
        key = make_cache_key(query_params)
        # EXISTS is consulted first so a missing key is reported as None
        # instead of the 0 that LLEN returns for absent keys.
        count = client.llen(key) if client.exists(key) else None
        # Record once, after the Redis round trips, for hit and miss alike.
        # A leftover early `return client.llen(key)` previously made the
        # hit-path recording unreachable.
        _record_cache_op("check", time.monotonic() - t0)
        return count
    except redis.RedisError as e:
        logger.warning(f"Redis cache read error: {e}")
        return None
@ -61,7 +74,9 @@ def get_cached_features(
for start in range(0, total, batch_size):
end = start + batch_size - 1
t0 = time.monotonic()
items = client.lrange(key, start, end)
_record_cache_op("read_batch", time.monotonic() - t0)
batch = [json.loads(item) for item in items]
if batch:
yield batch
@ -100,12 +115,14 @@ def cache_features_batch_staged(staging_key: str, features: list[dict]) -> None:
if not features:
return
try:
t0 = time.monotonic()
client = _get_redis_client()
pipeline = client.pipeline()
for feature in features:
pipeline.rpush(staging_key, json.dumps(feature))
pipeline.expire(staging_key, STAGING_TTL_SECONDS)
pipeline.execute()
_record_cache_op("write_batch", time.monotonic() - t0)
except redis.RedisError as e:
logger.warning(f"Redis staged cache write error: {e}")
@ -113,11 +130,13 @@ def cache_features_batch_staged(staging_key: str, features: list[dict]) -> None:
def finalize_cache_population(staging_key: str, query_params: QueryParameters) -> None:
    """Promote a fully written staging key to the live cache key.

    The staging key is renamed onto the key produced by ``make_cache_key``
    and the live key is then given a TTL of ``CACHE_TTL_SECONDS``. The
    elapsed time is reported under the ``"finalize"`` operation.

    Args:
        staging_key: Redis key that holds the staged feature list.
        query_params: Query whose live cache key should receive the data.

    Any ``redis.RedisError`` is logged as a warning and swallowed; no timing
    is recorded in that case.
    """
    try:
        started = time.monotonic()
        redis_client = _get_redis_client()
        live_key = make_cache_key(query_params)
        # A single RENAME swaps the staged list in place of any existing
        # live entry, so readers never observe a partially written list.
        redis_client.rename(staging_key, live_key)
        redis_client.expire(live_key, CACHE_TTL_SECONDS)
        elapsed = time.monotonic() - started
        _record_cache_op("finalize", elapsed)
        logger.debug(f"Finalized cache population for {live_key}")
    except redis.RedisError as e:
        logger.warning(f"Redis cache finalize error: {e}")