From 7513468a2d025a350919bd1c5516d98045a69f9a Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Mon, 22 Jun 2026 06:31:11 +0000 Subject: [PATCH] feat(monitoring): Tempo + OTel Collector for tripit tracing (ADR-0032 Phase 2) Stand up the cluster's first trace store + OTLP ingress so tripit's OpenTelemetry spans (Phase 1, already live in prod) export and correlate with logs: - Grafana Tempo (single-binary, filesystem on proxmox-lvm 20Gi, 30d) - OTel Collector (contrib; otlp -> redaction deny-list backstop -> batch -> tempo) - Grafana: a Tempo datasource + an ADDITIVE trace_id->Tempo derivedField on the Loki datasource (no uid change, so existing dashboards are unaffected) - tripit deployment: LOG_FORMAT=json + OTEL_EXPORTER_OTLP_ENDPOINT -> the Collector Additive (new helm releases; Loki/Prometheus/Grafana untouched). Offline 'terraform validate' clean; full plan+apply runs in CI (locked git-crypt blocks a local plan as non-admin). Co-Authored-By: Claude Opus 4.8 --- ...6-06-21-tripit-observability-tempo-otel.md | 49 ++++++++++++ stacks/monitoring/modules/monitoring/loki.tf | 13 +++ .../modules/monitoring/otel-collector.tf | 16 ++++ .../modules/monitoring/otel-collector.yaml | 79 +++++++++++++++++++ stacks/monitoring/modules/monitoring/tempo.tf | 45 +++++++++++ .../monitoring/modules/monitoring/tempo.yaml | 36 +++++++++ stacks/tripit/main.tf | 8 ++ 7 files changed, 246 insertions(+) create mode 100644 docs/plans/2026-06-21-tripit-observability-tempo-otel.md create mode 100644 stacks/monitoring/modules/monitoring/otel-collector.tf create mode 100644 stacks/monitoring/modules/monitoring/otel-collector.yaml create mode 100644 stacks/monitoring/modules/monitoring/tempo.tf create mode 100644 stacks/monitoring/modules/monitoring/tempo.yaml diff --git a/docs/plans/2026-06-21-tripit-observability-tempo-otel.md b/docs/plans/2026-06-21-tripit-observability-tempo-otel.md new file mode 100644 index 00000000..306ca9da --- /dev/null +++ 
b/docs/plans/2026-06-21-tripit-observability-tempo-otel.md @@ -0,0 +1,49 @@ +# Tracing capability: Grafana Tempo + OpenTelemetry Collector + +**Status:** implemented (Phase 2) · 2026-06-21 · driver: TripIt observability +**Companion to:** `tripit` repo `docs/adr/0032-observability-otel-traces-and-content-logging.md` +**Extends:** [monitoring architecture](../architecture/monitoring.md) + +## Why + +The monitoring stack has metrics (Prometheus), logs (Loki, 30d) and alerting, but +had **no distributed tracing**. TripIt added end-to-end OpenTelemetry instrumentation +to reproduce failed user flows and measure performance; its spans need a home, and +logs↔traces need to correlate. This is a **new shared cluster capability** — TripIt +is just the first consumer (the monorepo already has OTel-instrumented apps: +`realestate-crawler`, `trading-bot`, previously metrics-only). + +## What landed (`stacks/monitoring/modules/monitoring/`) + +1. **Grafana Tempo** (`tempo.tf` / `tempo.yaml`) — single-binary, `filesystem` + storage on a `proxmox-lvm` PVC (20Gi), 30-day retention, OTLP receivers. Same + helm-release pattern as Loki. +2. **OpenTelemetry Collector** (`otel-collector.tf` / `otel-collector.yaml`) — + contrib image (the `redaction` processor is contrib-only), a single + `otlp -> redaction -> batch -> otlp/tempo` traces pipeline. The redaction + processor is the **deny-list backstop**: it masks credential-shaped attribute + values (bearer tokens, JWTs, PEM blocks) before storage. In-app span hygiene is + primary; this is defense-in-depth. +3. **Grafana correlation** — a `tempo` datasource ConfigMap (`tempo.tf`), and a + `derivedFields` addition on the **Loki** datasource (`loki.tf`) that pulls + `trace_id` out of tripit's JSON logs and deep-links to the trace in Tempo. The + Loki edit is additive (no `uid` change) so existing dashboards are unaffected. +4. 
**App flip** (`stacks/tripit/main.tf`) — tripit gets `LOG_FORMAT=json` + + `OTEL_EXPORTER_OTLP_ENDPOINT` pointed at the Collector, turning Phase-1's + in-process spans into exported traces. + +In-cluster apps export OTLP to `otel-collector-opentelemetry-collector.monitoring`; +no browser-facing OTLP ingress is exposed (TripIt's frontend stays +propagate-and-flush per ADR-0032). + +## Notes + +- **Cardinality:** `trace_id` / `session.id` are span attributes / log fields, never + Prometheus or Loki labels. +- **Metrics unchanged:** annotation-based Prometheus scraping stays; tracing is + additive. +- **Privacy:** ADR-0032 records the owner's accepted trade-off that TripIt logs user + content (incl. external users') to shared monitoring; the Collector redaction + processor enforces the hard-never deny-list for the **trace** path. +- **Apply:** Terraform-only, presence-claimed (`stack:monitoring`), `proxmox-lvm` + storage. Update `docs/architecture/monitoring.md` (components table + diagram). diff --git a/stacks/monitoring/modules/monitoring/loki.tf b/stacks/monitoring/modules/monitoring/loki.tf index cfb160bb..763e0b2c 100644 --- a/stacks/monitoring/modules/monitoring/loki.tf +++ b/stacks/monitoring/modules/monitoring/loki.tf @@ -524,6 +524,19 @@ resource "kubernetes_config_map" "grafana_loki_datasource" { access = "proxy" url = "http://loki.monitoring.svc.cluster.local:3100" isDefault = false + jsonData = { + # Log -> trace (tripit ADR-0032): pull trace_id out of tripit's JSON log + # lines and deep-link to the trace in Tempo. Additive — does NOT set a + # uid, so existing dashboards' references to this datasource are + # unaffected. 
+          derivedFields = [{
+            name            = "trace_id"
+            matcherRegex    = "\"trace_id\":\\s*\"([a-f0-9]{32})\""
+            url             = "$$${__value.raw}" # renders as $${__value.raw}; Grafana provisioning expands ${...} as an env var unless the $ is doubled
+            datasourceUid   = "tempo"
+            urlDisplayLabel = "View trace in Tempo"
+          }]
+        }
       }]
     })
   }
diff --git a/stacks/monitoring/modules/monitoring/otel-collector.tf b/stacks/monitoring/modules/monitoring/otel-collector.tf
new file mode 100644
index 00000000..ab305db2
--- /dev/null
+++ b/stacks/monitoring/modules/monitoring/otel-collector.tf
@@ -0,0 +1,16 @@
+# OpenTelemetry Collector — the OTLP ingress + redaction backstop in front of
+# Tempo (tripit ADR-0032). Apps export OTLP here; it redacts deny-listed values,
+# buffers, and forwards to Tempo. Same helm-release pattern as loki.tf/tempo.tf.
+resource "helm_release" "otel_collector" {
+  namespace        = kubernetes_namespace.monitoring.metadata[0].name
+  create_namespace = true
+  name             = "otel-collector"
+
+  repository = "https://open-telemetry.github.io/opentelemetry-helm-charts"
+  chart      = "opentelemetry-collector"
+
+  values  = [file("${path.module}/otel-collector.yaml")]
+  timeout = 600
+
+  depends_on = [helm_release.tempo]
+}
diff --git a/stacks/monitoring/modules/monitoring/otel-collector.yaml b/stacks/monitoring/modules/monitoring/otel-collector.yaml
new file mode 100644
index 00000000..882ab488
--- /dev/null
+++ b/stacks/monitoring/modules/monitoring/otel-collector.yaml
@@ -0,0 +1,79 @@
+# OpenTelemetry Collector — OTLP ingress + deny-list redaction backstop in front
+# of Tempo (tripit ADR-0032). Contrib image (the `redaction` processor is contrib-
+# only). Traces-only: the chart's default jaeger/zipkin/prometheus receivers and
+# metrics/logs pipelines are nulled out so the collector runs a single clean
+# otlp -> redaction -> batch -> tempo pipeline.
+mode: deployment
+replicaCount: 1
+
+image:
+  repository: otel/opentelemetry-collector-contrib
+
+command:
+  name: otelcol-contrib
+
+presets:
+  kubernetesAttributes:
+    enabled: false
+
+# Only the OTLP ports; drop the chart's default jaeger/zipkin/metrics ports.
+ports:
+  otlp:
+    enabled: true
+    containerPort: 4317
+    servicePort: 4317
+    protocol: TCP
+  otlp-http:
+    enabled: true
+    containerPort: 4318
+    servicePort: 4318
+    protocol: TCP
+  jaeger-compact: { enabled: false }
+  jaeger-thrift: { enabled: false }
+  jaeger-grpc: { enabled: false }
+  zipkin: { enabled: false }
+  metrics: { enabled: false }
+
+resources:
+  requests:
+    cpu: 50m
+    memory: 128Mi
+  limits:
+    memory: 512Mi
+
+config:
+  # Null the chart-default receivers this collector does not use, so the rendered
+  # config only carries what the traces pipeline below references.
+  receivers:
+    jaeger: null
+    zipkin: null
+    prometheus: null
+    otlp:
+      protocols:
+        grpc:
+          endpoint: ${env:MY_POD_IP}:4317
+        http:
+          endpoint: ${env:MY_POD_IP}:4318
+  processors:
+    # Deny-list backstop (ADR-0032): mask credential-shaped attribute VALUES even
+    # if an app accidentally puts one on a span. In-app hygiene is primary.
+    redaction:
+      allow_all_keys: true
+      blocked_values:
+        - "(?i)bearer\\s+[a-z0-9._~+/=-]+"
+        - "eyJ[a-zA-Z0-9_-]{5,}\\.[a-zA-Z0-9_-]{5,}\\.[a-zA-Z0-9_-]+"
+        - "-----BEGIN[\\s\\S]*?-----END[^-]*-----"
+    batch: {}
+  exporters:
+    otlp/tempo:
+      endpoint: tempo.monitoring.svc.cluster.local:4317
+      tls:
+        insecure: true
+  service:
+    pipelines:
+      traces:
+        receivers: [otlp]
+        processors: [memory_limiter, redaction, batch] # memory_limiter comes from the chart's default config; kept first so the 512Mi limit is respected
+        exporters: [otlp/tempo]
+      metrics: null
+      logs: null
diff --git a/stacks/monitoring/modules/monitoring/tempo.tf b/stacks/monitoring/modules/monitoring/tempo.tf
new file mode 100644
index 00000000..c6e6c7bb
--- /dev/null
+++ b/stacks/monitoring/modules/monitoring/tempo.tf
@@ -0,0 +1,45 @@
+# Grafana Tempo — trace store for the TripIt observability stack (tripit ADR-0032,
+# infra plan docs/plans/2026-06-21-tripit-observability-tempo-otel.md).
Phase 2:
+# tripit's logs already carry trace ids in Loki (Phase 1); this release adds the
+# trace store behind them. It only adds to the monitoring stack and follows the
+# helm-release pattern used in loki.tf.
+resource "helm_release" "tempo" {
+  name             = "tempo"
+  namespace        = kubernetes_namespace.monitoring.metadata[0].name
+  create_namespace = true
+
+  chart      = "tempo" # single-binary chart, filesystem-backed (see tempo.yaml)
+  repository = "https://grafana.github.io/helm-charts"
+
+  timeout = 600
+  values  = [file("${path.module}/tempo.yaml")]
+}
+
+# Grafana datasource for Tempo, provisioned with the fixed uid "tempo" so other
+# datasources can link to it (the Loki derivedField in loki.tf is the log -> trace
+# direction). No trace -> logs settings are defined on this datasource. The
+# ConfigMap carries the grafana_datasource label used for datasource discovery.
+resource "kubernetes_config_map" "grafana_tempo_datasource" {
+  metadata {
+    namespace = kubernetes_namespace.monitoring.metadata[0].name
+    name      = "grafana-tempo-datasource"
+    labels = {
+      grafana_datasource = "1"
+    }
+  }
+  data = {
+    "tempo-datasource.yaml" = yamlencode({
+      apiVersion = 1
+      datasources = [{
+        uid       = "tempo"
+        name      = "Tempo"
+        type      = "tempo"
+        access    = "proxy"
+        url       = "http://tempo.monitoring.svc.cluster.local:3100"
+        isDefault = false
+      }]
+    })
+  }
+
+  depends_on = [helm_release.tempo]
+}
diff --git a/stacks/monitoring/modules/monitoring/tempo.yaml b/stacks/monitoring/modules/monitoring/tempo.yaml
new file mode 100644
index 00000000..b3132fab
--- /dev/null
+++ b/stacks/monitoring/modules/monitoring/tempo.yaml
@@ -0,0 +1,36 @@
+# Helm values for Grafana Tempo, the single-binary trace store introduced for
+# TripIt observability (tripit ADR-0032). Like Loki it keeps its data on local
+# filesystem storage backed by a proxmox-lvm PVC, and it receives OTLP from the
+# OTel Collector. Additive: Loki, Prometheus and Grafana are left untouched.
+tempo:
+  retention: 720h # 30d, matching Loki
+  storage:
+    trace:
+      backend: local
+      local:
+        path: /var/tempo/traces
+      wal:
+        path: /var/tempo/wal
+  # OTLP ingest (from the OTel Collector). gRPC 4317 / HTTP 4318.
+  receivers:
+    otlp:
+      protocols:
+        grpc:
+          endpoint: 0.0.0.0:4317
+        http:
+          endpoint: 0.0.0.0:4318
+  # Tempo query/HTTP API — the Grafana datasource URL targets this (3100).
+  server:
+    http_listen_port: 3100
+  # The grafana/tempo chart templates container resources from tempo.resources.
+  resources:
+    requests:
+      cpu: 100m
+      memory: 256Mi
+    limits:
+      memory: 1Gi
+
+persistence:
+  enabled: true
+  size: 20Gi
+  storageClassName: proxmox-lvm
diff --git a/stacks/tripit/main.tf b/stacks/tripit/main.tf
index 4fc3f58d..8e956339 100644
--- a/stacks/tripit/main.tf
+++ b/stacks/tripit/main.tf
@@ -131,6 +131,19 @@ locals {
     # not-yet-visited countries + real UK bank-holiday leave windows + fares).
     # `claude_agent` mode requires images >= the #23 slice (already deployed).
     RESEARCH_PROVIDER = "claude_agent"
+    # Observability (tripit ADR-0032, Phase 2): structured JSON logs (so Loki's
+    # trace_id derivedField links to Tempo) + OTLP trace export to the in-cluster
+    # OTel Collector -> Tempo. Both require images >= the observability slice
+    # (v0.81.0, already live), so they land after that rollout — same image-first
+    # hold-order as the provider modes around here. Empty endpoint = no export
+    # (Phase 1 default); set here flips Phase 2 on.
+    # NOTE(review): per the OTel exporter spec, SDKs treat the generic
+    # OTEL_EXPORTER_OTLP_ENDPOINT as a base URL and append /v1/traces, while
+    # OTEL_EXPORTER_OTLP_TRACES_ENDPOINT is used verbatim. The per-signal variable
+    # is set as well so SDK env auto-configuration does not POST to
+    # /v1/traces/v1/traces — confirm which variable tripit actually reads.
+    LOG_FORMAT                         = "json"
+    OTEL_EXPORTER_OTLP_ENDPOINT        = "http://otel-collector-opentelemetry-collector.monitoring.svc.cluster.local:4318/v1/traces"
+    OTEL_EXPORTER_OTLP_TRACES_ENDPOINT = "http://otel-collector-opentelemetry-collector.monitoring.svc.cluster.local:4318/v1/traces"
     # Stay cover photos (tripit issue #47, ADR-0017): auto-fetch each picked
     # city's Wikipedia lead image (keyless REST summary API, "City, Country"
     # first), downloaded into the app's STORAGE_DIR (never hotlinked) and