feat(monitoring): Tempo + OTel Collector for tripit tracing, hardened (ADR-0032 Phase 2)
Some checks failed
ci/woodpecker/push/default Pipeline failed
Some checks failed
ci/woodpecker/push/default Pipeline failed
Re-land Phase 2 after the first attempt's two failure modes, both fixed:
- tempo.resources is now set under the correct single-binary chart key (the pod was OOMKilled on the namespace LimitRange default when it was mis-placed at top level).
- atomic=true + cleanup_on_fail=true on BOTH helm releases — a failed install auto-rolls-back instead of leaving a stuck/orphaned release (memory #6479).

Tempo (single-binary, proxmox-lvm 20Gi, 30d) + OTel Collector (contrib; otlp -> redaction -> batch -> tempo) + Tempo datasource + additive trace_id->Tempo derivedField on Loki + tripit LOG_FORMAT=json/OTEL_EXPORTER_OTLP_ENDPOINT.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
a0897de7c3
commit
1595bddfc2
7 changed files with 249 additions and 0 deletions
|
|
@ -524,6 +524,19 @@ resource "kubernetes_config_map" "grafana_loki_datasource" {
|
|||
access = "proxy"
|
||||
url = "http://loki.monitoring.svc.cluster.local:3100"
|
||||
isDefault = false
|
||||
jsonData = {
|
||||
# Log -> trace (tripit ADR-0032): pull trace_id out of tripit's JSON log
|
||||
# lines and deep-link to the trace in Tempo. Additive — does NOT set a
|
||||
# uid, so existing dashboards' references to this datasource are
|
||||
# unaffected.
|
||||
derivedFields = [{
|
||||
name = "trace_id"
|
||||
matcherRegex = "\"trace_id\":\\s*\"([a-f0-9]{32})\""
|
||||
url = "$${__value.raw}"
|
||||
datasourceUid = "tempo"
|
||||
urlDisplayLabel = "View trace in Tempo"
|
||||
}]
|
||||
}
|
||||
}]
|
||||
})
|
||||
}
|
||||
|
|
|
|||
19
stacks/monitoring/modules/monitoring/otel-collector.tf
Normal file
19
stacks/monitoring/modules/monitoring/otel-collector.tf
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
# OpenTelemetry Collector — the OTLP ingress + redaction backstop in front of
|
||||
# Tempo (tripit ADR-0032). Apps export OTLP here; it redacts deny-listed values,
|
||||
# buffers, and forwards to Tempo. atomic + cleanup_on_fail so a failed install
|
||||
# auto-rolls-back (no stuck/orphaned release — memory #6479).
|
||||
# Helm release of the OpenTelemetry Collector in the monitoring namespace.
# Chart values come from otel-collector.yaml, located next to this file.
resource "helm_release" "otel_collector" {
  # What to install.
  name       = "otel-collector"
  repository = "https://open-telemetry.github.io/opentelemetry-helm-charts"
  chart      = "opentelemetry-collector"

  # Where to install it.
  namespace        = kubernetes_namespace.monitoring.metadata[0].name
  create_namespace = true

  # Chart overrides.
  values = [file("${path.module}/otel-collector.yaml")]

  # Failure handling: atomic rolls a failed install back and cleanup_on_fail
  # removes what it created, so no stuck/orphaned release is left behind
  # (memory #6479).
  atomic          = true
  cleanup_on_fail = true
  timeout         = 600

  # The collector exports to Tempo, so the Tempo release is applied first.
  depends_on = [helm_release.tempo]
}
|
||||
79
stacks/monitoring/modules/monitoring/otel-collector.yaml
Normal file
79
stacks/monitoring/modules/monitoring/otel-collector.yaml
Normal file
|
|
@ -0,0 +1,79 @@
|
|||
# OpenTelemetry Collector — OTLP ingress + deny-list redaction backstop in front
|
||||
# of Tempo (tripit ADR-0032). Contrib image (the `redaction` processor is contrib-
|
||||
# only). Traces-only: the chart's default jaeger/zipkin/prometheus receivers and
|
||||
# metrics/logs pipelines are nulled out so the collector runs a single clean
|
||||
# otlp -> redaction -> batch -> tempo pipeline.
|
||||
# Run as a single-replica Deployment.
mode: deployment
replicaCount: 1

# Contrib distribution: the `redaction` processor used below is contrib-only.
image:
  repository: otel/opentelemetry-collector-contrib

command:
  name: otelcol-contrib

presets:
  kubernetesAttributes:
    enabled: false

# Only the OTLP ports; drop the chart's default jaeger/zipkin/metrics ports.
ports:
  otlp:
    enabled: true
    containerPort: 4317
    servicePort: 4317
    protocol: TCP
  otlp-http:
    enabled: true
    containerPort: 4318
    servicePort: 4318
    protocol: TCP
  jaeger-compact:
    enabled: false
  jaeger-thrift:
    enabled: false
  jaeger-grpc:
    enabled: false
  zipkin:
    enabled: false
  metrics:
    enabled: false

resources:
  requests:
    cpu: 50m
    memory: 128Mi
  limits:
    memory: 512Mi

config:
  # Null the chart defaults we don't use (configured-but-unused components fail
  # collector startup).
  receivers:
    jaeger: null
    zipkin: null
    prometheus: null
    otlp:
      protocols:
        grpc:
          endpoint: ${env:MY_POD_IP}:4317
        http:
          endpoint: ${env:MY_POD_IP}:4318
  processors:
    # Back-pressure guard: refuse new data when memory use nears the container
    # limit instead of letting a trace burst OOM-kill the pod. Defined
    # explicitly so the pipeline does not depend on the chart's default config,
    # and listed first in the pipeline as upstream recommends.
    memory_limiter:
      check_interval: 5s
      limit_percentage: 80
      spike_limit_percentage: 25
    # Deny-list backstop (ADR-0032): mask credential-shaped attribute VALUES
    # (bearer tokens, JWTs, PEM blocks) even if an app accidentally puts one on
    # a span. Keys are all allowed. In-app hygiene is primary.
    redaction:
      allow_all_keys: true
      blocked_values:
        - "(?i)bearer\\s+[a-z0-9._~+/=-]+"
        - "eyJ[a-zA-Z0-9_-]{5,}\\.[a-zA-Z0-9_-]{5,}\\.[a-zA-Z0-9_-]+"
        - "-----BEGIN[\\s\\S]*?-----END[^-]*-----"
    batch: {}
  exporters:
    # In-cluster OTLP/gRPC to Tempo; TLS verification is off (tls.insecure).
    otlp/tempo:
      endpoint: tempo.monitoring.svc.cluster.local:4317
      tls:
        insecure: true
  service:
    pipelines:
      # Traces only: otlp -> memory_limiter -> redaction -> batch -> tempo.
      traces:
        receivers: [otlp]
        processors: [memory_limiter, redaction, batch]
        exporters: [otlp/tempo]
      metrics: null
      logs: null
|
||||
49
stacks/monitoring/modules/monitoring/tempo.tf
Normal file
49
stacks/monitoring/modules/monitoring/tempo.tf
Normal file
|
|
@ -0,0 +1,49 @@
|
|||
# Grafana Tempo — trace store for the TripIt observability stack (tripit ADR-0032,
|
||||
# infra plan docs/plans/2026-06-21-tripit-observability-tempo-otel.md). Phase 2:
|
||||
# the app already trace-correlates its logs on Loki (Phase 1); this adds the trace
|
||||
# UI + logs<->traces correlation. Additive to the monitoring stack.
|
||||
#
|
||||
# atomic + cleanup_on_fail: a failed install auto-rolls-back instead of leaving a
|
||||
# stuck `failed` release that terraform can't upgrade and a config-revert can't
|
||||
# destroy (the 2026-06-21 first-attempt failure mode — see memory #6479).
|
||||
# Helm release of Grafana Tempo in the monitoring namespace. Chart values come
# from tempo.yaml, located next to this file.
resource "helm_release" "tempo" {
  # What to install: the single-binary (filesystem) chart — the Loki-scale,
  # single-writer twin.
  name       = "tempo"
  repository = "https://grafana.github.io/helm-charts"
  chart      = "tempo"

  # Where to install it.
  namespace        = kubernetes_namespace.monitoring.metadata[0].name
  create_namespace = true

  # Chart overrides.
  values = [file("${path.module}/tempo.yaml")]

  # Failure handling: a failed install rolls back and cleans up after itself
  # rather than leaving a stuck `failed` release (memory #6479).
  atomic          = true
  cleanup_on_fail = true
  timeout         = 600
}
|
||||
|
||||
# Grafana Tempo datasource. The reverse direction (Loki log -> Tempo trace) is the
|
||||
# derivedField added to the Loki datasource in loki.tf. Discovered by the Grafana
|
||||
# sidecar via the grafana_datasource label, same as the Loki datasource.
|
||||
# ConfigMap carrying a Grafana datasource provisioning file for Tempo. It is
# labelled grafana_datasource = "1" and holds a single non-default datasource
# with the fixed uid "tempo". No jsonData is set here, so this datasource
# defines no trace -> logs link of its own.
resource "kubernetes_config_map" "grafana_tempo_datasource" {
  metadata {
    name      = "grafana-tempo-datasource"
    namespace = kubernetes_namespace.monitoring.metadata[0].name

    labels = {
      grafana_datasource = "1"
    }
  }

  data = {
    "tempo-datasource.yaml" = yamlencode({
      apiVersion = 1
      datasources = [
        {
          uid       = "tempo"
          name      = "Tempo"
          type      = "tempo"
          access    = "proxy"
          isDefault = false
          # Tempo's HTTP API; tempo.yaml sets server.http_listen_port to 3100.
          url = "http://tempo.monitoring.svc.cluster.local:3100"
        },
      ]
    })
  }

  # Only provision the datasource once the Tempo release has been applied.
  depends_on = [helm_release.tempo]
}
|
||||
38
stacks/monitoring/modules/monitoring/tempo.yaml
Normal file
38
stacks/monitoring/modules/monitoring/tempo.yaml
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
# Grafana Tempo — single-binary trace store for the TripIt observability stack
|
||||
# (tripit ADR-0032). Mirrors Loki: filesystem storage on a proxmox-lvm PVC,
|
||||
# SingleBinary, ingests OTLP from the OTel Collector. Additive — independent of
|
||||
# Loki/Prometheus/Grafana.
|
||||
# PVC backing /var/tempo (traces + WAL) on the proxmox-lvm storage class.
persistence:
  enabled: true
  storageClassName: proxmox-lvm
  size: 20Gi

tempo:
  # Keep traces for 30 days (720h), matching Loki.
  retention: 720h

  # Container resources. They must live here under `tempo:` — a top-level
  # `resources:` is silently ignored by the single-binary chart, leaving the
  # pod on the namespace LimitRange default (~256Mi), where it is OOMKilled
  # (exit 137).
  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      memory: 2Gi

  # Query/HTTP API port; the Grafana datasource URL points at 3100.
  server:
    http_listen_port: 3100

  # OTLP ingest from the OTel Collector: gRPC on 4317, HTTP on 4318.
  receivers:
    otlp:
      protocols:
        grpc:
          endpoint: 0.0.0.0:4317
        http:
          endpoint: 0.0.0.0:4318

  # Local filesystem trace storage with its write-ahead log alongside.
  storage:
    trace:
      backend: local
      local:
        path: /var/tempo/traces
      wal:
        path: /var/tempo/wal
|
||||
Loading…
Add table
Add a link
Reference in a new issue