diff --git a/docs/plans/2026-06-21-tripit-observability-tempo-otel.md b/docs/plans/2026-06-21-tripit-observability-tempo-otel.md deleted file mode 100644 index 89940c1b..00000000 --- a/docs/plans/2026-06-21-tripit-observability-tempo-otel.md +++ /dev/null @@ -1,45 +0,0 @@ -# Tracing capability: Grafana Tempo + OpenTelemetry Collector - -**Status:** implemented (Phase 2) · 2026-06-22 · driver: TripIt observability -**Companion to:** `tripit` repo `docs/adr/0032-observability-otel-traces-and-content-logging.md` -**Extends:** [monitoring architecture](../architecture/monitoring.md) - -## Why - -The monitoring stack has metrics (Prometheus), logs (Loki, 30d) and alerting, but -had **no distributed tracing**. TripIt added end-to-end OpenTelemetry instrumentation -to reproduce failed user flows and measure performance; its spans need a home, and -logs↔traces need to correlate. This is a **new shared cluster capability** — TripIt -is just the first consumer. - -## What landed (`stacks/monitoring/modules/monitoring/`) - -1. **Grafana Tempo** (`tempo.tf` / `tempo.yaml`) — single-binary, `filesystem` - storage on a `proxmox-lvm` PVC (20Gi), 30-day retention, OTLP receivers. - `tempo.resources` set explicitly (req 256Mi / limit 2Gi) — the single-binary - chart ignores a top-level `resources:` and the pod otherwise OOMs on the - namespace LimitRange default. -2. **OpenTelemetry Collector** (`otel-collector.tf` / `otel-collector.yaml`) — - contrib image (the `redaction` processor is contrib-only), one - `otlp -> redaction -> batch -> otlp/tempo` traces pipeline. The redaction - processor is the **deny-list backstop** (drops bearer/JWT/PEM-shaped values). -3. **Grafana correlation** — a `tempo` datasource (`tempo.tf`), and a - `derivedFields` addition on the **Loki** datasource (`loki.tf`) pulling - `trace_id` out of tripit's JSON logs and deep-linking to Tempo. Additive (no - `uid` change) so existing dashboards are unaffected. -4. **App flip** (`stacks/tripit/main.tf`) — tripit gets `LOG_FORMAT=json` + - `OTEL_EXPORTER_OTLP_ENDPOINT` pointed at the Collector. - -Both helm releases use **`atomic=true` + `cleanup_on_fail=true`**: a failed install -auto-rolls-back rather than leaving a stuck `failed` release (the first-attempt -failure mode — see infra memory #6479). - -## Notes - -- **Cardinality:** `trace_id` / `session.id` are span attributes / log fields, never - Prometheus or Loki labels. -- **Privacy:** ADR-0032 records the accepted trade-off that TripIt logs user content - to shared monitoring; the Collector redaction processor enforces the deny-list on - the trace path. -- **Apply:** Terraform-only, presence-claimed (`stack:monitoring`). Update - `docs/architecture/monitoring.md` (components table + diagram) once stable. diff --git a/stacks/monitoring/modules/monitoring/loki.tf b/stacks/monitoring/modules/monitoring/loki.tf index 763e0b2c..cfb160bb 100644 --- a/stacks/monitoring/modules/monitoring/loki.tf +++ b/stacks/monitoring/modules/monitoring/loki.tf @@ -524,19 +524,6 @@ resource "kubernetes_config_map" "grafana_loki_datasource" { access = "proxy" url = "http://loki.monitoring.svc.cluster.local:3100" isDefault = false - jsonData = { - # Log -> trace (tripit ADR-0032): pull trace_id out of tripit's JSON log - # lines and deep-link to the trace in Tempo. Additive — does NOT set a - # uid, so existing dashboards' references to this datasource are - # unaffected. - derivedFields = [{ - name = "trace_id" - matcherRegex = "\"trace_id\":\\s*\"([a-f0-9]{32})\"" - url = "$${__value.raw}" - datasourceUid = "tempo" - urlDisplayLabel = "View trace in Tempo" - }] - } }] }) } diff --git a/stacks/monitoring/modules/monitoring/otel-collector.tf b/stacks/monitoring/modules/monitoring/otel-collector.tf deleted file mode 100644 index 0bd854be..00000000 --- a/stacks/monitoring/modules/monitoring/otel-collector.tf +++ /dev/null @@ -1,19 +0,0 @@ -# OpenTelemetry Collector — the OTLP ingress + redaction backstop in front of -# Tempo (tripit ADR-0032). Apps export OTLP here; it redacts deny-listed values, -# buffers, and forwards to Tempo. atomic + cleanup_on_fail so a failed install -# auto-rolls-back (no stuck/orphaned release — memory #6479). -resource "helm_release" "otel_collector" { - namespace = kubernetes_namespace.monitoring.metadata[0].name - create_namespace = true - name = "otel-collector" - - repository = "https://open-telemetry.github.io/opentelemetry-helm-charts" - chart = "opentelemetry-collector" - - values = [file("${path.module}/otel-collector.yaml")] - timeout = 600 - atomic = true - cleanup_on_fail = true - - depends_on = [helm_release.tempo] -} diff --git a/stacks/monitoring/modules/monitoring/otel-collector.yaml b/stacks/monitoring/modules/monitoring/otel-collector.yaml deleted file mode 100644 index 882ab488..00000000 --- a/stacks/monitoring/modules/monitoring/otel-collector.yaml +++ /dev/null @@ -1,79 +0,0 @@ -# OpenTelemetry Collector — OTLP ingress + deny-list redaction backstop in front -# of Tempo (tripit ADR-0032). Contrib image (the `redaction` processor is contrib- -# only). Traces-only: the chart's default jaeger/zipkin/prometheus receivers and -# metrics/logs pipelines are nulled out so the collector runs a single clean -# otlp -> redaction -> batch -> tempo pipeline. -mode: deployment -replicaCount: 1 - -image: - repository: otel/opentelemetry-collector-contrib - -command: - name: otelcol-contrib - -presets: - kubernetesAttributes: - enabled: false - -# Only the OTLP ports; drop the chart's default jaeger/zipkin/metrics ports. -ports: - otlp: - enabled: true - containerPort: 4317 - servicePort: 4317 - protocol: TCP - otlp-http: - enabled: true - containerPort: 4318 - servicePort: 4318 - protocol: TCP - jaeger-compact: { enabled: false } - jaeger-thrift: { enabled: false } - jaeger-grpc: { enabled: false } - zipkin: { enabled: false } - metrics: { enabled: false } - -resources: - requests: - cpu: 50m - memory: 128Mi - limits: - memory: 512Mi - -config: - # Null the chart defaults we don't use (configured-but-unused components fail - # collector startup). - receivers: - jaeger: null - zipkin: null - prometheus: null - otlp: - protocols: - grpc: - endpoint: ${env:MY_POD_IP}:4317 - http: - endpoint: ${env:MY_POD_IP}:4318 - processors: - # Deny-list backstop (ADR-0032): drop credential-shaped attribute VALUES even - # if an app accidentally puts one on a span. In-app hygiene is primary. - redaction: - allow_all_keys: true - blocked_values: - - "(?i)bearer\\s+[a-z0-9._~+/=-]+" - - "eyJ[a-zA-Z0-9_-]{5,}\\.[a-zA-Z0-9_-]{5,}\\.[a-zA-Z0-9_-]+" - - "-----BEGIN[\\s\\S]*?-----END[^-]*-----" - batch: {} - exporters: - otlp/tempo: - endpoint: tempo.monitoring.svc.cluster.local:4317 - tls: - insecure: true - service: - pipelines: - traces: - receivers: [otlp] - processors: [redaction, batch] - exporters: [otlp/tempo] - metrics: null - logs: null diff --git a/stacks/monitoring/modules/monitoring/tempo.tf b/stacks/monitoring/modules/monitoring/tempo.tf deleted file mode 100644 index 75c177b1..00000000 --- a/stacks/monitoring/modules/monitoring/tempo.tf +++ /dev/null @@ -1,49 +0,0 @@ -# Grafana Tempo — trace store for the TripIt observability stack (tripit ADR-0032, -# infra plan docs/plans/2026-06-21-tripit-observability-tempo-otel.md). Phase 2: -# the app already trace-correlates its logs on Loki (Phase 1); this adds the trace -# UI + logs<->traces correlation. Additive to the monitoring stack. -# -# atomic + cleanup_on_fail: a failed install auto-rolls-back instead of leaving a -# stuck `failed` release that terraform can't upgrade and a config-revert can't -# destroy (the 2026-06-21 first-attempt failure mode — see memory #6479). -resource "helm_release" "tempo" { - namespace = kubernetes_namespace.monitoring.metadata[0].name - create_namespace = true - name = "tempo" - - repository = "https://grafana.github.io/helm-charts" - chart = "tempo" # single-binary (filesystem) — the Loki-scale, single-writer twin - - values = [file("${path.module}/tempo.yaml")] - timeout = 600 - atomic = true - cleanup_on_fail = true -} - -# Grafana Tempo datasource. The reverse direction (Loki log -> Tempo trace) is the -# derivedField added to the Loki datasource in loki.tf. Discovered by the Grafana -# sidecar via the grafana_datasource label, same as the Loki datasource. -resource "kubernetes_config_map" "grafana_tempo_datasource" { - metadata { - name = "grafana-tempo-datasource" - namespace = kubernetes_namespace.monitoring.metadata[0].name - labels = { - grafana_datasource = "1" - } - } - data = { - "tempo-datasource.yaml" = yamlencode({ - apiVersion = 1 - datasources = [{ - name = "Tempo" - type = "tempo" - access = "proxy" - uid = "tempo" - url = "http://tempo.monitoring.svc.cluster.local:3100" - isDefault = false - }] - }) - } - - depends_on = [helm_release.tempo] -} diff --git a/stacks/monitoring/modules/monitoring/tempo.yaml b/stacks/monitoring/modules/monitoring/tempo.yaml deleted file mode 100644 index 49766422..00000000 --- a/stacks/monitoring/modules/monitoring/tempo.yaml +++ /dev/null @@ -1,38 +0,0 @@ -# Grafana Tempo — single-binary trace store for the TripIt observability stack -# (tripit ADR-0032). Mirrors Loki: filesystem storage on a proxmox-lvm PVC, -# SingleBinary, ingests OTLP from the OTel Collector. Additive — independent of -# Loki/Prometheus/Grafana. -tempo: - retention: 720h # 30d, matching Loki - storage: - trace: - backend: local - local: - path: /var/tempo/traces - wal: - path: /var/tempo/wal - # OTLP ingest (from the OTel Collector). gRPC 4317 / HTTP 4318. - receivers: - otlp: - protocols: - grpc: - endpoint: 0.0.0.0:4317 - http: - endpoint: 0.0.0.0:4318 - # Tempo query/HTTP API — the Grafana datasource URL targets this (3100). - server: - http_listen_port: 3100 - # Container resources — the single-binary chart key is tempo.resources (NOT a - # top-level `resources:`, which is silently ignored → the pod falls to the - # namespace LimitRange default ~256Mi and is OOMKilled, exit 137). - resources: - requests: - cpu: 100m - memory: 256Mi - limits: - memory: 2Gi - -persistence: - enabled: true - size: 20Gi - storageClassName: proxmox-lvm diff --git a/stacks/tripit/main.tf b/stacks/tripit/main.tf index 4822accc..4fc3f58d 100644 --- a/stacks/tripit/main.tf +++ b/stacks/tripit/main.tf @@ -131,12 +131,6 @@ locals { # not-yet-visited countries + real UK bank-holiday leave windows + fares). # `claude_agent` mode requires images >= the #23 slice (already deployed). RESEARCH_PROVIDER = "claude_agent" - # Observability (tripit ADR-0032, Phase 2): structured JSON logs (so Loki's - # trace_id derivedField links to Tempo) + OTLP trace export to the in-cluster - # OTel Collector -> Tempo. Both require images >= the observability slice - # (v0.81.0, already live), so they land after that rollout. - LOG_FORMAT = "json" - OTEL_EXPORTER_OTLP_ENDPOINT = "http://otel-collector-opentelemetry-collector.monitoring.svc.cluster.local:4318/v1/traces" # Stay cover photos (tripit issue #47, ADR-0017): auto-fetch each picked # city's Wikipedia lead image (keyless REST summary API, "City, Country" # first), downloaded into the app's STORAGE_DIR (never hotlinked) and