Monitoring overhaul: reduce noise, add coverage gaps, auto-load dashboards

Noise reduction (8 alerts tuned):
- PoisonFountainDown: 2m→5m, critical→warning (fail-open service)
- NodeExporterDown: 2m→5m (flaps during node restarts)
- PowerOutage: add for:1m (debounce transient voltage dips)
- New Tailscale client: add for:5m (debounce headscale reauths)
- NoNodeLoadData: use absent() instead of OR vector(0)==0
- NodeHighCPUUsage: 30%→60% (normal for 70+ services)
- HighMemoryUsage GPU: 12GB/5m→14GB/15m (T4=16GB, model loading)
- PrometheusStorageFull: 50GiB→150GiB (TSDB cap is 180GB)

Alert regrouping:
- Move MailServerDown, HackmdDown, PrivatebinDown → new "Application Health"
- Move New Tailscale client → "Infrastructure Health"

New alerts (14):
- Networking: Cloudflared (2), MetalLB (2), Technitium DNS
- Storage: NFS CSI, iSCSI CSI controllers
- Critical Services: PgBouncer, CNPG operator, MySQL operator
- Infra Health: CrowdSec, Kyverno, Sealed Secrets, Woodpecker

Inhibit rules:
- Consolidate 3 NodeDown rules into 1 comprehensive rule
- Extend NFS rule to suppress NFS-dependent services
- Add PowerOutage → downstream suppression

Dashboard loading:
- Add for_each ConfigMap in grafana.tf to auto-load all 18 dashboards
- Remove duplicate caretta dashboard ConfigMap from caretta.tf
This commit is contained in:
Viktor Barzin 2026-03-14 10:22:22 +00:00
parent a6f71fc6f0
commit 6377a8b85b
3 changed files with 161 additions and 73 deletions

View file

@ -59,15 +59,4 @@ resource "kubernetes_service" "caretta_metrics" {
}
}
resource "kubernetes_config_map" "caretta_grafana_dashboard" {
metadata {
name = "caretta-grafana-dashboard"
namespace = kubernetes_namespace.monitoring.metadata[0].name
labels = {
grafana_dashboard = "1"
}
}
data = {
"caretta-dashboard.json" = file("${path.module}/dashboards/caretta-dashboard.json")
}
}
# Caretta dashboard is now loaded via the grafana_dashboards for_each in grafana.tf

View file

@ -67,6 +67,21 @@ resource "kubernetes_persistent_volume" "alertmanager_pv" {
# }
# }
resource "kubernetes_config_map" "grafana_dashboards" {
for_each = fileset("${path.module}/dashboards", "*.json")
metadata {
name = "grafana-dashboard-${replace(trimsuffix(each.value, ".json"), "_", "-")}"
namespace = kubernetes_namespace.monitoring.metadata[0].name
labels = {
grafana_dashboard = "1"
}
}
data = {
(each.value) = file("${path.module}/dashboards/${each.value}")
}
}
resource "helm_release" "grafana" {
namespace = kubernetes_namespace.monitoring.metadata[0].name
create_namespace = true

View file

@ -68,26 +68,16 @@ alertmanager:
- severity = info
continue: false
inhibit_rules:
# Node down makes node-condition alerts redundant
# Node down suppresses workload and service alerts (cascade protection)
- source_matchers:
- alertname = NodeDown
target_matchers:
- alertname =~ "NodeNotReady|NodeConditionBad"
# Node down suppresses workload alerts (cascade protection)
- source_matchers:
- alertname = NodeDown
target_matchers:
- alertname =~ "PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|NodeLowFreeMemory"
# Node down suppresses service-specific alerts
- source_matchers:
- alertname = NodeDown
target_matchers:
- alertname =~ "PostgreSQLDown|MySQLDown|RedisDown|HeadscaleDown|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown"
# NFS down causes mass pod failures
- alertname =~ "NodeNotReady|NodeConditionBad|PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|NodeLowFreeMemory|PostgreSQLDown|MySQLDown|RedisDown|HeadscaleDown|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|NodeExporterDown|DockerRegistryDown|HomeAssistantDown|CloudflaredDown|TechnitiumDNSDown"
# NFS down causes mass pod failures and NFS-dependent service outages
- source_matchers:
- alertname = NFSServerUnresponsive
target_matchers:
- alertname =~ "PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown"
- alertname =~ "PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|PostgreSQLDown|MySQLDown|RedisDown|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|HomeAssistantDown"
# Traefik down makes service-level alerts noise
- source_matchers:
- alertname = TraefikDown
@ -103,6 +93,11 @@ alertmanager:
- alertname = PowerOutage
target_matchers:
- alertname = OnBattery
# Power outage suppresses everything downstream
- source_matchers:
- alertname = PowerOutage
target_matchers:
- alertname =~ "NodeDown|NFSServerUnresponsive|NodeExporterDown|CloudflaredDown|MetalLBSpeakerDown|MetalLBControllerDown"
receivers:
- name: slack-critical
slack_configs:
@ -292,12 +287,12 @@ serverFiles:
annotations:
summary: "GPU util: {{ $value | printf \"%.0f\" }}% (threshold: 50%)"
- alert: HighMemoryUsage
expr: nvidia_tesla_t4_DCGM_FI_DEV_FB_USED / 1024 > 12
for: 5m
expr: nvidia_tesla_t4_DCGM_FI_DEV_FB_USED / 1024 > 14
for: 15m
labels:
severity: info
annotations:
summary: "VRAM used: {{ $value | printf \"%.1f\" }} GB (threshold: 12 GB)"
summary: "VRAM used: {{ $value | printf \"%.1f\" }} GB (threshold: 14 GB)"
- name: Power
rules:
- alert: OnBattery
@ -316,6 +311,7 @@ serverFiles:
summary: "UPS battery low: {{ $value | printf \"%.0f\" }} min remaining (threshold: 25 min)"
- alert: PowerOutage
expr: ups_upsInputVoltage < 150
for: 1m
labels:
severity: critical
annotations:
@ -428,12 +424,12 @@ serverFiles:
annotations:
summary: "Scrape target down: {{ $labels.job }}/{{ $labels.instance }}"
- alert: PrometheusStorageFull
expr: (prometheus_tsdb_storage_blocks_bytes / (1024*1024*1024)) > 50
expr: (prometheus_tsdb_storage_blocks_bytes / (1024*1024*1024)) > 150
for: 30m
labels:
severity: warning
annotations:
summary: "Prometheus TSDB: {{ $value | printf \"%.0f\" }} GiB (threshold: 50 GiB)"
summary: "Prometheus TSDB: {{ $value | printf \"%.0f\" }} GiB (threshold: 150 GiB)"
- alert: PrometheusNotificationsFailing
expr: rate(prometheus_notifications_errors_total[5m]) > 0
for: 10m
@ -455,6 +451,41 @@ serverFiles:
severity: critical
annotations:
summary: "etcd backup CronJob has never completed successfully"
- alert: New Tailscale client
expr: irate(headscale_machine_registrations_total{action="reauth"}[5m]) > 0
for: 5m
labels:
severity: info
annotations:
summary: "New Tailscale client registered"
- alert: CrowdSecDown
expr: up{job="crowdsec"} == 0
for: 10m
labels:
severity: warning
annotations:
summary: "CrowdSec LAPI down — WAF/IDS degraded (Traefik plugin fails open)"
- alert: KyvernoDown
expr: (kube_deployment_status_replicas_available{namespace="kyverno", deployment="kyverno-admission-controller"} or on() vector(0)) < 1
for: 10m
labels:
severity: warning
annotations:
summary: "Kyverno admission controller down — policy enforcement disabled"
- alert: SealedSecretsDown
expr: (kube_deployment_status_replicas_available{namespace="sealed-secrets", deployment="sealed-secrets"} or on() vector(0)) < 1
for: 30m
labels:
severity: warning
annotations:
summary: "Sealed Secrets controller down — new secrets can't be decrypted"
- alert: WoodpeckerDown
expr: (kube_statefulset_status_replicas_ready{namespace="woodpecker", statefulset="woodpecker-server"} or on() vector(0)) < 1
for: 15m
labels:
severity: warning
annotations:
summary: "Woodpecker CI server down — CI/CD pipelines not running"
- name: Critical Services
rules:
- alert: PostgreSQLDown
@ -494,11 +525,32 @@ serverFiles:
summary: "Authentik auth server has no available replicas"
- alert: PoisonFountainDown
expr: (kube_deployment_status_replicas_available{namespace="poison-fountain", deployment="poison-fountain"} or on() vector(0)) < 1
for: 2m
for: 5m
labels:
severity: warning
annotations:
summary: "Poison Fountain is down - AI bot blocking degraded to fail-open"
- alert: PgBouncerDown
expr: (kube_deployment_status_replicas_available{namespace="authentik", deployment="pgbouncer"} or on() vector(0)) < 1
for: 5m
labels:
severity: critical
annotations:
summary: "Poison Fountain is down - AI bot blocking degraded to fail-open"
summary: "PgBouncer down — Authentik cannot reach PostgreSQL"
- alert: CNPGOperatorDown
expr: (kube_deployment_status_replicas_available{namespace="cnpg-system", deployment="cnpg-cloudnative-pg"} or on() vector(0)) < 1
for: 10m
labels:
severity: warning
annotations:
summary: "CNPG operator down — PostgreSQL failover/management degraded"
- alert: MySQLOperatorDown
expr: (kube_deployment_status_replicas_available{namespace="mysql-operator", deployment="mysql-operator"} or on() vector(0)) < 1
for: 10m
labels:
severity: warning
annotations:
summary: "MySQL operator down — InnoDB Cluster management degraded"
- name: Cluster
rules:
- alert: NodeDown
@ -523,12 +575,12 @@ serverFiles:
annotations:
summary: "Registry cache hit rate: {{ $value | printf \"%.0f\" }}% (threshold: 25%)"
- alert: NodeHighCPUUsage
expr: pve_cpu_usage_ratio * 100 > 30
expr: pve_cpu_usage_ratio * 100 > 60
for: 6h
labels:
severity: info
annotations:
summary: "CPU usage on {{ $labels.node }}: {{ $value | printf \"%.0f\" }}% (threshold: 30%)"
summary: "CPU usage on {{ $labels.node }}: {{ $value | printf \"%.0f\" }}% (threshold: 60%)"
- alert: NodeLowFreeMemory
expr: ((1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) or on() vector(1)) * 100 > 95
for: 10m
@ -587,7 +639,7 @@ serverFiles:
summary: "Memory usage on {{ $labels.instance }}: {{ $value | printf \"%.0f\" }}% (threshold: 85%)"
- alert: NodeExporterDown
expr: up{job="prometheus-prometheus-node-exporter"} == 0
for: 2m
for: 5m
labels:
severity: critical
annotations:
@ -600,7 +652,7 @@ serverFiles:
annotations:
summary: "IOWait on {{ $labels.instance }}: {{ $value | printf \"%.0f\" }}% (threshold: 30%)"
- alert: NoNodeLoadData
expr: (node_load1 OR on() vector(0)) == 0
expr: absent(node_load1)
for: 10m
labels:
severity: info
@ -681,6 +733,74 @@ serverFiles:
# severity: page
# annotations:
# summary: OpenWRT high memory usage. Can cause services getting stuck.
# MailServerDown, HackmdDown, PrivatebinDown moved to "Application Health" group
# New Tailscale client moved to "Infrastructure Health" group
- name: "Networking & Access"
rules:
- alert: CloudflaredDown
expr: (kube_deployment_status_replicas_available{namespace="cloudflared", deployment="cloudflared"} or on() vector(0)) < 1
for: 5m
labels:
severity: critical
annotations:
summary: "Cloudflared tunnel down — external access via Cloudflare broken"
- alert: CloudflaredDegraded
expr: |
(
kube_deployment_spec_replicas{namespace="cloudflared", deployment="cloudflared"}
- on(namespace, deployment) kube_deployment_status_replicas_available{namespace="cloudflared", deployment="cloudflared"}
) > 0
for: 10m
labels:
severity: warning
annotations:
summary: "Cloudflared: {{ $value | printf \"%.0f\" }} replica(s) unavailable"
- alert: MetalLBSpeakerDown
expr: |
(
kube_daemonset_status_desired_number_scheduled{namespace="metallb-system", daemonset="metallb-speaker"}
- on(namespace, daemonset) kube_daemonset_status_number_ready{namespace="metallb-system", daemonset="metallb-speaker"}
) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "MetalLB speaker: {{ $value | printf \"%.0f\" }} pod(s) missing"
- alert: MetalLBControllerDown
expr: (kube_deployment_status_replicas_available{namespace="metallb-system", deployment="controller"} or on() vector(0)) < 1
for: 5m
labels:
severity: critical
annotations:
summary: "MetalLB controller down — no LoadBalancer IP management"
- alert: TechnitiumDNSDown
expr: |
(kube_deployment_status_replicas_available{namespace="technitium", deployment="technitium"} or on() vector(0))
+ (kube_deployment_status_replicas_available{namespace="technitium", deployment="technitium-secondary"} or on() vector(0))
< 1
for: 5m
labels:
severity: critical
annotations:
summary: "Both Technitium DNS instances down — internal DNS broken"
- name: "Storage Drivers"
rules:
- alert: NFSCSIControllerDown
expr: (kube_deployment_status_replicas_available{namespace="nfs-csi", deployment="csi-nfs-controller"} or on() vector(0)) < 1
for: 5m
labels:
severity: critical
annotations:
summary: "NFS CSI controller down — new NFS volume provisioning broken"
- alert: ISCSICSIControllerDown
expr: (kube_deployment_status_replicas_available{namespace="iscsi-csi", deployment="democratic-csi-iscsi-controller"} or on() vector(0)) < 1
for: 5m
labels:
severity: critical
annotations:
summary: "iSCSI CSI controller down — new iSCSI volume provisioning broken"
- name: "Application Health"
rules:
- alert: MailServerDown
expr: (kube_deployment_status_replicas_available{namespace="mailserver", deployment="mailserver"} or on() vector(0)) < 1
for: 5m
@ -702,42 +822,6 @@ serverFiles:
severity: warning
annotations:
summary: "Privatebin has no available replicas"
# - name: London OpenWRT Down
# rules:
# - alert: OpenWRT client unreachable
# expr: (openwrt_node_openwrt_info or on() vector(0)) == 0
# for: 10m
# labels:
# severity: page
# annotations:
# summary: London OpenWRT router unreachable through VPN
# - alert: OpenWRT high system load
# expr: openwrt_node_load1 > 0.9
# for: 15m
# labels:
# severity: page
# annotations:
# summary: High system load on OpenWRT
# - alert: Finance app webhook exceptions
# expr: changes(webhook_failure_total[5m]) >= 1
# for: 1m
# labels:
# severity: page
# annotations:
# summary: Finance app webhook exceptions
# - alert: Finance app unhandled exceptions
# expr: changes(flask_http_request_exceptions_total[5m]) >= 1
# for: 1m
# labels:
# severity: page
# annotations:
# summary: Finance app unhandled exceptions
- alert: New Tailscale client
expr: irate(headscale_machine_registrations_total{action="reauth"}[5m]) > 0
labels:
severity: info
annotations:
summary: "New Tailscale client registered"
extraScrapeConfigs: |
- job_name: 'proxmox-host'