From 1595bddfc2f4316ed500348a8b0b150ef7aba3c7 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Mon, 22 Jun 2026 08:17:59 +0000 Subject: [PATCH] feat(monitoring): Tempo + OTel Collector for tripit tracing, hardened (ADR-0032 Phase 2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Re-land Phase 2 after the first attempt's two failure modes, both fixed: - tempo.resources set under the correct single-binary chart key (was OOMKilled on the namespace LimitRange default when mis-placed at top level). - atomic=true + cleanup_on_fail=true on BOTH helm releases — a failed install auto-rolls-back instead of leaving a stuck/orphaned release (memory #6479). Tempo (single-binary, proxmox-lvm 20Gi, 30d) + OTel Collector (contrib; otlp -> redaction -> batch -> tempo) + Tempo datasource + additive trace_id->Tempo derivedField on Loki + tripit LOG_FORMAT=json/OTEL_EXPORTER_OTLP_ENDPOINT. Co-Authored-By: Claude Opus 4.8 --- ...6-06-21-tripit-observability-tempo-otel.md | 45 +++++++++++ stacks/monitoring/modules/monitoring/loki.tf | 13 +++ .../modules/monitoring/otel-collector.tf | 19 +++++ .../modules/monitoring/otel-collector.yaml | 79 +++++++++++++++++++ stacks/monitoring/modules/monitoring/tempo.tf | 49 ++++++++++++ .../monitoring/modules/monitoring/tempo.yaml | 38 +++++++++ stacks/tripit/main.tf | 6 ++ 7 files changed, 249 insertions(+) create mode 100644 docs/plans/2026-06-21-tripit-observability-tempo-otel.md create mode 100644 stacks/monitoring/modules/monitoring/otel-collector.tf create mode 100644 stacks/monitoring/modules/monitoring/otel-collector.yaml create mode 100644 stacks/monitoring/modules/monitoring/tempo.tf create mode 100644 stacks/monitoring/modules/monitoring/tempo.yaml diff --git a/docs/plans/2026-06-21-tripit-observability-tempo-otel.md b/docs/plans/2026-06-21-tripit-observability-tempo-otel.md new file mode 100644 index 00000000..89940c1b --- /dev/null +++ b/docs/plans/2026-06-21-tripit-observability-tempo-otel.md 
@@ -0,0 +1,45 @@ +# Tracing capability: Grafana Tempo + OpenTelemetry Collector + +**Status:** implemented (Phase 2) · 2026-06-22 · driver: TripIt observability +**Companion to:** `tripit` repo `docs/adr/0032-observability-otel-traces-and-content-logging.md` +**Extends:** [monitoring architecture](../architecture/monitoring.md) + +## Why + +The monitoring stack has metrics (Prometheus), logs (Loki, 30d) and alerting, but +had **no distributed tracing**. TripIt added end-to-end OpenTelemetry instrumentation +to reproduce failed user flows and measure performance; its spans need a home, and +logs↔traces need to correlate. This is a **new shared cluster capability** — TripIt +is just the first consumer. + +## What landed (`stacks/monitoring/modules/monitoring/`) + +1. **Grafana Tempo** (`tempo.tf` / `tempo.yaml`) — single-binary, `local` (filesystem) + storage on a `proxmox-lvm` PVC (20Gi), 30-day retention, OTLP receivers. + `tempo.resources` set explicitly (req 256Mi / limit 2Gi) — the single-binary + chart ignores a top-level `resources:` and the pod otherwise OOMs on the + namespace LimitRange default. +2. **OpenTelemetry Collector** (`otel-collector.tf` / `otel-collector.yaml`) — + contrib image (the `redaction` processor is contrib-only), one + `otlp -> redaction -> batch -> otlp/tempo` traces pipeline. The redaction + processor is the **deny-list backstop** (masks bearer/JWT/PEM-shaped values). +3. **Grafana correlation** — a `tempo` datasource (`tempo.tf`), and a + `derivedFields` addition on the **Loki** datasource (`loki.tf`) pulling + `trace_id` out of tripit's JSON logs and deep-linking to Tempo. Additive (no + `uid` change) so existing dashboards are unaffected. +4. **App flip** (`stacks/tripit/main.tf`) — tripit gets `LOG_FORMAT=json` + + `OTEL_EXPORTER_OTLP_ENDPOINT` pointed at the Collector. 
+ +Both helm releases use **`atomic=true` + `cleanup_on_fail=true`**: a failed install +auto-rolls-back rather than leaving a stuck `failed` release (the first-attempt +failure mode — see infra memory #6479). + +## Notes + +- **Cardinality:** `trace_id` / `session.id` are span attributes / log fields, never + Prometheus or Loki labels. +- **Privacy:** ADR-0032 records the accepted trade-off that TripIt logs user content + to shared monitoring; the Collector redaction processor enforces the deny-list on + the trace path. +- **Apply:** Terraform-only, presence-claimed (`stack:monitoring`). Update + `docs/architecture/monitoring.md` (components table + diagram) once stable. diff --git a/stacks/monitoring/modules/monitoring/loki.tf b/stacks/monitoring/modules/monitoring/loki.tf index cfb160bb..763e0b2c 100644 --- a/stacks/monitoring/modules/monitoring/loki.tf +++ b/stacks/monitoring/modules/monitoring/loki.tf @@ -524,6 +524,19 @@ resource "kubernetes_config_map" "grafana_loki_datasource" { access = "proxy" url = "http://loki.monitoring.svc.cluster.local:3100" isDefault = false + jsonData = { + # Log -> trace (tripit ADR-0032): pull trace_id out of tripit's JSON log + # lines and deep-link to the trace in Tempo. Additive — does NOT set a + # uid, so existing dashboards' references to this datasource are + # unaffected. + derivedFields = [{ + name = "trace_id" + matcherRegex = "\"trace_id\":\\s*\"([a-f0-9]{32})\"" + url = "$${__value.raw}" + datasourceUid = "tempo" + urlDisplayLabel = "View trace in Tempo" + }] + } }] }) } diff --git a/stacks/monitoring/modules/monitoring/otel-collector.tf b/stacks/monitoring/modules/monitoring/otel-collector.tf new file mode 100644 index 00000000..0bd854be --- /dev/null +++ b/stacks/monitoring/modules/monitoring/otel-collector.tf @@ -0,0 +1,19 @@ +# OpenTelemetry Collector — the OTLP ingress + redaction backstop in front of +# Tempo (tripit ADR-0032). 
Apps export OTLP here; it redacts deny-listed values, +# buffers, and forwards to Tempo. atomic + cleanup_on_fail so a failed install +# auto-rolls-back (no stuck/orphaned release — memory #6479). NOTE(review): no chart `version` is set, so the chart release that gets installed is not pinned — consider pinning one. +resource "helm_release" "otel_collector" { + namespace = kubernetes_namespace.monitoring.metadata[0].name + create_namespace = true + name = "otel-collector" + + repository = "https://open-telemetry.github.io/opentelemetry-helm-charts" + chart = "opentelemetry-collector" + + values = [file("${path.module}/otel-collector.yaml")] # file(), not templatefile(): the YAML's ${env:MY_POD_IP} is passed through uninterpolated + timeout = 600 + atomic = true + cleanup_on_fail = true + + depends_on = [helm_release.tempo] # Tempo is the target of the collector's otlp/tempo exporter +} diff --git a/stacks/monitoring/modules/monitoring/otel-collector.yaml b/stacks/monitoring/modules/monitoring/otel-collector.yaml new file mode 100644 index 00000000..882ab488 --- /dev/null +++ b/stacks/monitoring/modules/monitoring/otel-collector.yaml @@ -0,0 +1,79 @@ +# OpenTelemetry Collector — OTLP ingress + deny-list redaction backstop in front +# of Tempo (tripit ADR-0032). Contrib image (the `redaction` processor is contrib- +# only). Traces-only: the chart's default jaeger/zipkin/prometheus receivers and +# metrics/logs pipelines are nulled out so the collector runs a single clean +# otlp -> redaction -> batch -> tempo pipeline. +mode: deployment +replicaCount: 1 + +image: + repository: otel/opentelemetry-collector-contrib + +command: + name: otelcol-contrib + +presets: + kubernetesAttributes: + enabled: false + +# Only the OTLP ports; drop the chart's default jaeger/zipkin/metrics ports. 
+ports: + otlp: + enabled: true + containerPort: 4317 + servicePort: 4317 + protocol: TCP + otlp-http: + enabled: true + containerPort: 4318 + servicePort: 4318 + protocol: TCP + jaeger-compact: { enabled: false } + jaeger-thrift: { enabled: false } + jaeger-grpc: { enabled: false } + zipkin: { enabled: false } + metrics: { enabled: false } + +resources: + requests: + cpu: 50m + memory: 128Mi + limits: + memory: 512Mi + +config: + # Null the chart-default receivers we don't use so the rendered config only + # carries what the traces pipeline references (unused pipelines are nulled below). + receivers: + jaeger: null + zipkin: null + prometheus: null + otlp: + protocols: + grpc: + endpoint: ${env:MY_POD_IP}:4317 + http: + endpoint: ${env:MY_POD_IP}:4318 + processors: + # Deny-list backstop (ADR-0032): mask credential-shaped attribute VALUES even + # if an app accidentally puts one on a span. In-app hygiene is primary. + redaction: + allow_all_keys: true + blocked_values: + - "(?i)bearer\\s+[a-z0-9._~+/=-]+" + - "eyJ[a-zA-Z0-9_-]{5,}\\.[a-zA-Z0-9_-]{5,}\\.[a-zA-Z0-9_-]+" + - "-----BEGIN[\\s\\S]*?-----END[^-]*-----" + batch: {} + exporters: + otlp/tempo: + endpoint: tempo.monitoring.svc.cluster.local:4317 + tls: + insecure: true + service: + pipelines: + traces: + receivers: [otlp] + processors: [memory_limiter, redaction, batch] # memory_limiter first, so bursts are refused before the 512Mi container limit; NOTE(review): relies on the chart's default config defining memory_limiter — verify on chart upgrades + exporters: [otlp/tempo] + metrics: null + logs: null diff --git a/stacks/monitoring/modules/monitoring/tempo.tf b/stacks/monitoring/modules/monitoring/tempo.tf new file mode 100644 index 00000000..75c177b1 --- /dev/null +++ b/stacks/monitoring/modules/monitoring/tempo.tf @@ -0,0 +1,49 @@ +# Grafana Tempo — trace store for the TripIt observability stack (tripit ADR-0032, +# infra plan docs/plans/2026-06-21-tripit-observability-tempo-otel.md). Phase 2: +# the app already trace-correlates its logs on Loki (Phase 1); this adds the trace +# UI + logs<->traces correlation. Additive to the monitoring stack. 
+# +# atomic + cleanup_on_fail: a failed install auto-rolls-back instead of leaving a +# stuck `failed` release that terraform can't upgrade and a config-revert can't +# destroy (the 2026-06-21 first-attempt failure mode — see memory #6479). NOTE(review): no chart `version` is set, so the chart release that gets installed is not pinned — consider pinning one. +resource "helm_release" "tempo" { + namespace = kubernetes_namespace.monitoring.metadata[0].name + create_namespace = true + name = "tempo" + + repository = "https://grafana.github.io/helm-charts" + chart = "tempo" # single-binary chart; trace storage is the `local` (filesystem) backend set in tempo.yaml + + values = [file("${path.module}/tempo.yaml")] + timeout = 600 + atomic = true + cleanup_on_fail = true +} + +# Grafana Tempo datasource. Its uid ("tempo") is the datasourceUid targeted by the +# Loki derivedField in loki.tf (log -> trace); no trace -> log link is set here. +# Discovered by the Grafana sidecar via the grafana_datasource label, same as Loki. +resource "kubernetes_config_map" "grafana_tempo_datasource" { + metadata { + name = "grafana-tempo-datasource" + namespace = kubernetes_namespace.monitoring.metadata[0].name + labels = { + grafana_datasource = "1" + } + } + data = { + "tempo-datasource.yaml" = yamlencode({ + apiVersion = 1 + datasources = [{ + name = "Tempo" + type = "tempo" + access = "proxy" + uid = "tempo" # must stay in sync with datasourceUid in loki.tf's derivedFields + url = "http://tempo.monitoring.svc.cluster.local:3100" # port = tempo.server.http_listen_port in tempo.yaml + isDefault = false + }] + }) + } + + depends_on = [helm_release.tempo] +} diff --git a/stacks/monitoring/modules/monitoring/tempo.yaml b/stacks/monitoring/modules/monitoring/tempo.yaml new file mode 100644 index 00000000..49766422 --- /dev/null +++ b/stacks/monitoring/modules/monitoring/tempo.yaml @@ -0,0 +1,38 @@ +# Grafana Tempo — single-binary trace store for the TripIt observability stack +# (tripit ADR-0032). Mirrors Loki: filesystem storage on a proxmox-lvm PVC, +# SingleBinary, ingests OTLP from the OTel Collector. Additive — independent of +# Loki/Prometheus/Grafana. 
+tempo: + retention: 720h # 30d, matching Loki + storage: + trace: + backend: local + local: + path: /var/tempo/traces + wal: + path: /var/tempo/wal + # OTLP ingest (from the OTel Collector). gRPC 4317 / HTTP 4318. + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + # Tempo query/HTTP API — the Grafana datasource URL targets this (3100). + server: + http_listen_port: 3100 + # Container resources — the single-binary chart key is tempo.resources (NOT a + # top-level `resources:`, which is silently ignored → the pod falls to the + # namespace LimitRange default ~256Mi and is OOMKilled, exit 137). + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + memory: 2Gi + +persistence: + enabled: true + size: 20Gi + storageClassName: proxmox-lvm diff --git a/stacks/tripit/main.tf b/stacks/tripit/main.tf index 4fc3f58d..4822accc 100644 --- a/stacks/tripit/main.tf +++ b/stacks/tripit/main.tf @@ -131,6 +131,12 @@ locals { # not-yet-visited countries + real UK bank-holiday leave windows + fares). # `claude_agent` mode requires images >= the #23 slice (already deployed). RESEARCH_PROVIDER = "claude_agent" + # Observability (tripit ADR-0032, Phase 2): structured JSON logs (so Loki's + # trace_id derivedField links to Tempo) + OTLP trace export to the in-cluster + # OTel Collector -> Tempo. Both require images >= the observability slice + # (v0.81.0, already live), so they land after that rollout. OTEL_EXPORTER_OTLP_ENDPOINT is the signal-agnostic base URL: the SDK's OTLP/HTTP exporter appends /v1/traces itself, so the path must not be included here (assumes tripit leaves endpoint resolution to the SDK — TODO confirm; if the app passes the value verbatim to the exporter, use OTEL_EXPORTER_OTLP_TRACES_ENDPOINT with the full path instead). + LOG_FORMAT = "json" + OTEL_EXPORTER_OTLP_ENDPOINT = "http://otel-collector-opentelemetry-collector.monitoring.svc.cluster.local:4318" # Stay cover photos (tripit issue #47, ADR-0017): auto-fetch each picked # city's Wikipedia lead image (keyless REST summary API, "City, Country" # first), downloaded into the app's STORAGE_DIR (never hotlinked) and