Add navigation & usage metrics for end-user experience visibility

Instrument DB query timing (11 operations across 3 repositories),
streaming lifecycle (TTFB, duration, feature count), cache operation
latency, listing detail step breakdown, and frontend page load /
time-to-first-listing / stream download / detail load metrics.

Adds 16 new OTel instruments, extends the perf ingestion endpoint
with 4 new frontend metrics, and adds ~20 Grafana dashboard panels
across 5 new rows (DB Query Performance, Streaming Performance,
Listing Detail Breakdown, Cache Performance, Frontend Navigation).
This commit is contained in:
Viktor Barzin 2026-02-23 20:28:42 +00:00
parent 1ae00b7cbf
commit 35f1987ac1
No known key found for this signature in database
GPG key ID: 0EB088298288D958
11 changed files with 1236 additions and 26 deletions

View file

@ -440,8 +440,10 @@ async def _stream_from_db(
async def _repopulate_cache_background(query_parameters: QueryParameters) -> None:
"""Repopulate the cache from DB in the background (fire-and-forget)."""
if not acquire_repopulation_lock(query_parameters):
app_metrics.cache_repopulation_total.add(1, {"result": "skipped"})
logger.debug("Skipping background repopulation — already in progress")
return
app_metrics.cache_repopulation_total.add(1, {"result": "started"})
try:
logger.info("Starting background cache repopulation for stale entry")
repository = ListingRepository(engine)
@ -453,14 +455,46 @@ async def _repopulate_cache_background(query_parameters: QueryParameters) -> Non
feature = convert_row_to_geojson(row, query_parameters.listing_type.value)
cache_features_batch_staged(staging_key, [feature])
finalize_cache_population(staging_key, query_parameters)
app_metrics.cache_repopulation_total.add(1, {"result": "completed"})
logger.info("Background cache repopulation completed")
except Exception:
delete_staging_key(staging_key)
raise
except Exception:
app_metrics.cache_repopulation_total.add(1, {"result": "failed"})
logger.exception("Background cache repopulation failed")
def _count_batch_features(chunk: str) -> int:
    """Return the number of features carried by one streamed chunk.

    A chunk contributes only when it parses as a JSON object whose ``type``
    is ``"batch"`` and whose ``features`` value is a list. Anything else
    (non-JSON text, JSON that is not an object, other message types) counts
    as zero, so metrics collection can never interrupt the stream.
    """
    try:
        msg = json.loads(chunk)
    except (ValueError, TypeError):
        # json.JSONDecodeError subclasses ValueError; TypeError covers a
        # chunk that is not str/bytes.
        return 0
    # Valid JSON need not be an object: a list or scalar has no ``.get``.
    if not isinstance(msg, dict) or msg.get("type") != "batch":
        return 0
    features = msg.get("features")
    return len(features) if isinstance(features, list) else 0


async def _instrumented_stream(
    inner: AsyncGenerator[str, None],
    source: str,
) -> AsyncGenerator[str, None]:
    """Wrap a streaming generator to record TTFB, total duration, and feature count.

    Every chunk produced by ``inner`` is re-yielded unchanged. Metrics are
    tagged with ``{"source": source}``:

    * time-to-first-byte is recorded once, just before the first chunk is
      yielded (never recorded if ``inner`` produces nothing);
    * total duration, the summed feature count of ``batch`` messages, and a
      request counter are recorded in ``finally`` so they are emitted even
      when the consumer stops early or ``inner`` raises.

    Args:
        inner: The async generator producing the response chunks.
        source: Label identifying where the stream's data comes from.

    Yields:
        The chunks of ``inner``, in order and unmodified.
    """
    t0 = time.monotonic()
    first_yielded = False
    feature_count = 0
    try:
        async for chunk in inner:
            if not first_yielded:
                app_metrics.stream_time_to_first_byte_seconds.record(
                    time.monotonic() - t0, {"source": source}
                )
                first_yielded = True
            # Count features from batch messages
            feature_count += _count_batch_features(chunk)
            yield chunk
    finally:
        try:
            duration = time.monotonic() - t0
            app_metrics.stream_total_duration_seconds.record(duration, {"source": source})
            app_metrics.stream_features_total.add(feature_count, {"source": source})
            app_metrics.stream_requests_total.add(1, {"source": source})
        finally:
            # ``async for`` does not close the wrapped generator when this
            # wrapper is closed early, so close it explicitly to run its
            # cleanup now rather than at garbage collection. This is a no-op
            # if ``inner`` already finished or raised.
            await inner.aclose()
@app.get("/api/listing_geojson/stream")
async def stream_listing_geojson(
user: Annotated[User, Depends(get_current_user)],
@ -501,21 +535,28 @@ async def stream_listing_geojson(
app_metrics.geojson_cache_operations.add(1, {"result": "hit"})
stale = is_cache_stale(query_parameters)
if stale:
app_metrics.cache_stale_serves_total.add(1)
# Fire-and-forget background repopulation
asyncio.create_task(_repopulate_cache_background(query_parameters))
generator = _stream_from_cache(
query_parameters, batch_size, limit,
user_email=user.email,
decision_filter=decision_filter,
stale=stale,
generator = _instrumented_stream(
_stream_from_cache(
query_parameters, batch_size, limit,
user_email=user.email,
decision_filter=decision_filter,
stale=stale,
),
source="cache",
)
else:
app_metrics.geojson_cache_operations.add(1, {"result": "miss"})
generator = _stream_from_db(
query_parameters, batch_size, limit, poi_distances_lookup,
skip_cache=include_poi_distances,
user_email=user.email,
decision_filter=decision_filter,
generator = _instrumented_stream(
_stream_from_db(
query_parameters, batch_size, limit, poi_distances_lookup,
skip_cache=include_poi_distances,
user_email=user.email,
decision_filter=decision_filter,
),
source="db",
)
return StreamingResponse(
@ -660,9 +701,13 @@ async def get_listing_detail(
"""Get detailed information for a single listing."""
repository = ListingRepository(engine)
lt = ListingType(listing_type)
t_step = time.monotonic()
listings = await repository.get_listings(
only_ids=[listing_id], listing_type=lt
)
app_metrics.listing_detail_step_duration_seconds.record(
time.monotonic() - t_step, {"step": "fetch_listing"}
)
if not listings:
raise HTTPException(status_code=404, detail="Listing not found")
@ -737,6 +782,7 @@ async def get_listing_detail(
furnish_type_val = str(ft)
# Load user's decision for this listing
t_step = time.monotonic()
decision_val: str | None = None
user_id = _get_user_id_safe(user.email)
if user_id is not None:
@ -746,8 +792,12 @@ async def get_listing_detail(
if d.listing_id == listing_id and d.listing_type == listing_type:
decision_val = d.decision
break
app_metrics.listing_detail_step_duration_seconds.record(
time.monotonic() - t_step, {"step": "load_decision"}
)
# Load POI distances
t_step = time.monotonic()
poi_distances_list: list[dict] = []
if user_id is not None:
poi_repo = POIRepository(engine)
@ -765,6 +815,9 @@ async def get_listing_detail(
"duration_seconds": d.duration_seconds,
"distance_meters": d.distance_meters,
})
app_metrics.listing_detail_step_duration_seconds.record(
time.monotonic() - t_step, {"step": "load_poi_distances"}
)
return ListingDetailResponse(
id=listing.id,