Revert "feat(monitoring): Tempo + OTel Collector for tripit tracing, hardened (ADR-0032 Phase 2)"
All checks were successful
ci/woodpecker/push/default Pipeline was successful

This reverts commit 1595bddfc2.
This commit is contained in:
Viktor Barzin 2026-06-22 08:31:17 +00:00
parent 1595bddfc2
commit aeed461591
7 changed files with 0 additions and 249 deletions

View file

@ -1,45 +0,0 @@
# Tracing capability: Grafana Tempo + OpenTelemetry Collector
**Status:** implemented (Phase 2) · 2026-06-22 · driver: TripIt observability
**Companion to:** `tripit` repo `docs/adr/0032-observability-otel-traces-and-content-logging.md`
**Extends:** [monitoring architecture](../architecture/monitoring.md)
## Why
The monitoring stack has metrics (Prometheus), logs (Loki, 30d) and alerting, but
had **no distributed tracing**. TripIt added end-to-end OpenTelemetry instrumentation
to reproduce failed user flows and measure performance; its spans need a home, and
logs↔traces need to correlate. This is a **new shared cluster capability** — TripIt
is just the first consumer.
## What landed (`stacks/monitoring/modules/monitoring/`)
1. **Grafana Tempo** (`tempo.tf` / `tempo.yaml`) — single-binary, `local` (filesystem)
storage backend on a `proxmox-lvm` PVC (20Gi), 30-day retention, OTLP receivers.
`tempo.resources` set explicitly (req 256Mi / limit 2Gi) — the single-binary
chart ignores a top-level `resources:` and the pod otherwise OOMs on the
namespace LimitRange default.
2. **OpenTelemetry Collector** (`otel-collector.tf` / `otel-collector.yaml`) —
contrib image (the `redaction` processor is contrib-only), one
`otlp -> redaction -> batch -> otlp/tempo` traces pipeline. The redaction
processor is the **deny-list backstop** (masks bearer/JWT/PEM-shaped attribute values).
3. **Grafana correlation** — a `tempo` datasource (`tempo.tf`), and a
`derivedFields` addition on the **Loki** datasource (`loki.tf`) pulling
`trace_id` out of tripit's JSON logs and deep-linking to Tempo. Additive (no
`uid` change) so existing dashboards are unaffected.
4. **App flip** (`stacks/tripit/main.tf`) — tripit gets `LOG_FORMAT=json` +
`OTEL_EXPORTER_OTLP_ENDPOINT` pointed at the Collector.
Both helm releases use **`atomic=true` + `cleanup_on_fail=true`**: a failed install
auto-rolls-back rather than leaving a stuck `failed` release (the first-attempt
failure mode — see infra memory #6479).
## Notes
- **Cardinality:** `trace_id` / `session.id` are span attributes / log fields, never
Prometheus or Loki labels.
- **Privacy:** ADR-0032 records the accepted trade-off that TripIt logs user content
to shared monitoring; the Collector redaction processor enforces the deny-list on
the trace path.
- **Apply:** Terraform-only, presence-claimed (`stack:monitoring`). Update
`docs/architecture/monitoring.md` (components table + diagram) once stable.

View file

@ -524,19 +524,6 @@ resource "kubernetes_config_map" "grafana_loki_datasource" {
access = "proxy" access = "proxy"
url = "http://loki.monitoring.svc.cluster.local:3100" url = "http://loki.monitoring.svc.cluster.local:3100"
isDefault = false isDefault = false
jsonData = {
  # Log -> trace (tripit ADR-0032): pull trace_id out of tripit's JSON log
  # lines and deep-link to the trace in the Tempo datasource (uid "tempo").
  # Additive: this does NOT set a uid on the Loki datasource, so existing
  # dashboards' references to it are unaffected.
  derivedFields = [{
    name            = "trace_id"
    matcherRegex    = "\"trace_id\":\\s*\"([a-f0-9]{32})\""
    datasourceUid   = "tempo"
    urlDisplayLabel = "View trace in Tempo"
    # Two layers of escaping are needed here:
    #   1. HCL turns `$${` into a literal `${`.
    #   2. Grafana expands `${...}` in provisioning files as an environment
    #      variable unless the `$` is doubled.
    # The rendered YAML must therefore read `$${__value.raw}`, which is
    # written in HCL as a literal `$` followed by the `$${` escape.
    url = "$$${__value.raw}"
  }]
}
}] }]
}) })
} }

View file

@ -1,19 +0,0 @@
# OpenTelemetry Collector — the OTLP ingress and redaction backstop in front of
# Tempo (tripit ADR-0032). Apps export OTLP here; the collector redacts
# deny-listed values, batches, and forwards to Tempo.
resource "helm_release" "otel_collector" {
  # Chart identity.
  name       = "otel-collector"
  repository = "https://open-telemetry.github.io/opentelemetry-helm-charts"
  chart      = "opentelemetry-collector"

  # Target namespace (the monitoring namespace resource of this module).
  namespace        = kubernetes_namespace.monitoring.metadata[0].name
  create_namespace = true

  # Chart values live next to this file.
  values = [file("${path.module}/otel-collector.yaml")]

  # Rollout safety: atomic + cleanup_on_fail so a failed install rolls back
  # automatically instead of leaving a stuck/orphaned release (memory #6479).
  timeout         = 600
  atomic          = true
  cleanup_on_fail = true

  # The collector exports to Tempo, so the Tempo release is applied first.
  depends_on = [helm_release.tempo]
}

View file

@ -1,79 +0,0 @@
# OpenTelemetry Collector — OTLP ingress + deny-list redaction backstop in front
# of Tempo (tripit ADR-0032). Uses the contrib image because the `redaction`
# processor is contrib-only. Traces-only: the chart's default
# jaeger/zipkin/prometheus receivers and the metrics/logs pipelines are nulled
# out, leaving a single pipeline:
#   otlp -> memory_limiter -> redaction -> batch -> tempo
mode: deployment
replicaCount: 1

image:
  repository: otel/opentelemetry-collector-contrib
command:
  name: otelcol-contrib

presets:
  kubernetesAttributes:
    enabled: false

# Only the OTLP ports; drop the chart's default jaeger/zipkin/metrics ports.
ports:
  otlp:
    enabled: true
    containerPort: 4317
    servicePort: 4317
    protocol: TCP
  otlp-http:
    enabled: true
    containerPort: 4318
    servicePort: 4318
    protocol: TCP
  jaeger-compact: { enabled: false }
  jaeger-thrift: { enabled: false }
  jaeger-grpc: { enabled: false }
  zipkin: { enabled: false }
  metrics: { enabled: false }

resources:
  requests:
    cpu: 50m
    memory: 128Mi
  limits:
    memory: 512Mi

config:
  receivers:
    # Null the chart-default receivers this deployment does not use so only
    # the OTLP receiver remains configured.
    jaeger: null
    zipkin: null
    prometheus: null
    otlp:
      protocols:
        grpc:
          endpoint: ${env:MY_POD_IP}:4317
        http:
          endpoint: ${env:MY_POD_IP}:4318

  processors:
    # Back-pressure guard. Overriding the pipeline's processor list below
    # replaces the chart default (which includes memory_limiter), so it is
    # declared and listed explicitly. It must run first so data is refused
    # before the container reaches its 512Mi limit and is OOM-killed.
    memory_limiter:
      check_interval: 5s
      limit_percentage: 80
      spike_limit_percentage: 25
    # Deny-list backstop (ADR-0032): mask credential-shaped attribute VALUES
    # even if an app accidentally puts one on a span. In-app hygiene is primary.
    redaction:
      allow_all_keys: true
      blocked_values:
        - "(?i)bearer\\s+[a-z0-9._~+/=-]+"
        - "eyJ[a-zA-Z0-9_-]{5,}\\.[a-zA-Z0-9_-]{5,}\\.[a-zA-Z0-9_-]+"
        - "-----BEGIN[\\s\\S]*?-----END[^-]*-----"
    batch: {}

  exporters:
    # In-cluster hop to Tempo's OTLP gRPC receiver; plaintext inside the
    # cluster network.
    otlp/tempo:
      endpoint: tempo.monitoring.svc.cluster.local:4317
      tls:
        insecure: true

  service:
    pipelines:
      traces:
        receivers: [otlp]
        processors: [memory_limiter, redaction, batch]
        exporters: [otlp/tempo]
      # Traces-only collector: remove the chart's default metrics/logs pipelines.
      metrics: null
      logs: null

View file

@ -1,49 +0,0 @@
# Grafana Tempo — trace store for the TripIt observability stack (tripit
# ADR-0032, infra plan docs/plans/2026-06-21-tripit-observability-tempo-otel.md).
# Phase 2: the app already trace-correlates its logs on Loki (Phase 1); this
# adds the trace UI + logs<->traces correlation. Additive to the monitoring stack.
resource "helm_release" "tempo" {
  # Chart identity. `tempo` is the single-binary (filesystem) chart — the
  # Loki-scale, single-writer twin.
  name       = "tempo"
  repository = "https://grafana.github.io/helm-charts"
  chart      = "tempo"

  # Target namespace (the monitoring namespace resource of this module).
  namespace        = kubernetes_namespace.monitoring.metadata[0].name
  create_namespace = true

  # Chart values live next to this file.
  values = [file("${path.module}/tempo.yaml")]

  # Rollout safety: with atomic + cleanup_on_fail a failed install rolls back
  # automatically instead of leaving a stuck `failed` release that terraform
  # can't upgrade and a config-revert can't destroy (the 2026-06-21
  # first-attempt failure mode — see memory #6479).
  timeout         = 600
  atomic          = true
  cleanup_on_fail = true
}
# Grafana datasource for Tempo, provisioned as a ConfigMap. The Grafana sidecar
# discovers it via the `grafana_datasource` label, same as the Loki datasource.
# Log -> trace linking is configured separately: the derivedFields entry on the
# Loki datasource (loki.tf) targets this datasource by its uid, "tempo".
resource "kubernetes_config_map" "grafana_tempo_datasource" {
  metadata {
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    name      = "grafana-tempo-datasource"
    labels = {
      grafana_datasource = "1"
    }
  }

  data = {
    "tempo-datasource.yaml" = yamlencode({
      apiVersion = 1
      datasources = [
        {
          # Fixed uid so other datasources can reference this one.
          uid       = "tempo"
          name      = "Tempo"
          type      = "tempo"
          access    = "proxy"
          isDefault = false
          # Tempo's HTTP API port (server.http_listen_port in tempo.yaml).
          url = "http://tempo.monitoring.svc.cluster.local:3100"
        }
      ]
    })
  }

  # Provision the datasource only after the Tempo release exists.
  depends_on = [helm_release.tempo]
}

View file

@ -1,38 +0,0 @@
# Grafana Tempo — single-binary trace store for the TripIt observability stack
# (tripit ADR-0032). Mirrors Loki: local filesystem storage on a proxmox-lvm
# PVC, single binary, OTLP ingest from the OTel Collector. Additive —
# independent of Loki/Prometheus/Grafana.

# Volume backing /var/tempo (trace blocks + WAL).
persistence:
  enabled: true
  storageClassName: proxmox-lvm
  size: 20Gi

tempo:
  # 30d, matching Loki.
  retention: 720h

  # Container resources. For the single-binary chart the key is
  # tempo.resources, NOT a top-level `resources:` — that one is silently
  # ignored, the pod falls back to the namespace LimitRange default (~256Mi)
  # and is OOMKilled (exit 137).
  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      memory: 2Gi

  # Tempo query/HTTP API — the Grafana datasource URL targets this port.
  server:
    http_listen_port: 3100

  # OTLP ingest from the OTel Collector: gRPC on 4317, HTTP on 4318.
  receivers:
    otlp:
      protocols:
        grpc:
          endpoint: 0.0.0.0:4317
        http:
          endpoint: 0.0.0.0:4318

  # Trace blocks and the write-ahead log both live on the persistent volume.
  storage:
    trace:
      backend: local
      local:
        path: /var/tempo/traces
      wal:
        path: /var/tempo/wal

View file

@ -131,12 +131,6 @@ locals {
# not-yet-visited countries + real UK bank-holiday leave windows + fares). # not-yet-visited countries + real UK bank-holiday leave windows + fares).
# `claude_agent` mode requires images >= the #23 slice (already deployed). # `claude_agent` mode requires images >= the #23 slice (already deployed).
RESEARCH_PROVIDER = "claude_agent" RESEARCH_PROVIDER = "claude_agent"
# Observability (tripit ADR-0032, Phase 2): structured JSON logs (so Loki's
# trace_id derivedField links to Tempo) + OTLP trace export to the in-cluster
# OTel Collector -> Tempo. Both require images >= the observability slice
# (v0.81.0, already live), so they land after that rollout.
LOG_FORMAT = "json"
# OTEL_EXPORTER_OTLP_ENDPOINT is the signal-agnostic *base* URL: per the OTLP
# exporter spec, SDKs append `/v1/traces` themselves for OTLP/HTTP. Including
# the path here would make them POST to `/v1/traces/v1/traces`.
# NOTE(review): if tripit passes this value verbatim to the exporter
# constructor instead of relying on the SDK's env handling, set
# OTEL_EXPORTER_OTLP_TRACES_ENDPOINT (used as-is, full path) instead — confirm
# against the app.
OTEL_EXPORTER_OTLP_ENDPOINT = "http://otel-collector-opentelemetry-collector.monitoring.svc.cluster.local:4318"
# Stay cover photos (tripit issue #47, ADR-0017): auto-fetch each picked # Stay cover photos (tripit issue #47, ADR-0017): auto-fetch each picked
# city's Wikipedia lead image (keyless REST summary API, "City, Country" # city's Wikipedia lead image (keyless REST summary API, "City, Country"
# first), downloaded into the app's STORAGE_DIR (never hotlinked) and # first), downloaded into the app's STORAGE_DIR (never hotlinked) and