Noise reduction (8 alerts tuned):
- PoisonFountainDown: 2m→5m, critical→warning (fail-open service)
- NodeExporterDown: 2m→5m (flaps during node restarts)
- PowerOutage: add for:1m (debounce transient voltage dips)
- New Tailscale client: add for:5m (debounce headscale reauths)
- NoNodeLoadData: use absent() instead of OR vector(0)==0
- NodeHighCPUUsage: 30%→60% (normal for 70+ services)
- HighMemoryUsage GPU: 12GB/5m→14GB/15m (T4=16GB, model loading)
- PrometheusStorageFull: 50GiB→150GiB (TSDB cap is 180GB)

Alert regrouping:
- Move MailServerDown, HackmdDown, PrivatebinDown → new "Application Health"
- Move New Tailscale client → "Infrastructure Health"

New alerts (14):
- Networking: Cloudflared (2), MetalLB (2), Technitium DNS
- Storage: NFS CSI, iSCSI CSI controllers
- Critical Services: PgBouncer, CNPG operator, MySQL operator
- Infra Health: CrowdSec, Kyverno, Sealed Secrets, Woodpecker

Inhibit rules:
- Consolidate 3 NodeDown rules into 1 comprehensive rule
- Extend NFS rule to suppress NFS-dependent services
- Add PowerOutage → downstream suppression

Dashboard loading:
- Add for_each ConfigMap in grafana.tf to auto-load all 18 dashboards
- Remove duplicate caretta dashboard ConfigMap from caretta.tf
96 lines
2.4 KiB
HCL
96 lines
2.4 KiB
HCL
|
|
|
|
# resource "kubernetes_persistent_volume" "prometheus_grafana_pv" {
|
|
# metadata {
|
|
# name = "grafana-pv"
|
|
# }
|
|
# spec {
|
|
# capacity = {
|
|
# "storage" = "2Gi"
|
|
# }
|
|
# access_modes = ["ReadWriteOnce"]
|
|
# persistent_volume_source {
|
|
# nfs {
|
|
# path = "/mnt/main/grafana"
|
|
# server = var.nfs_server
|
|
# }
|
|
# # iscsi {
|
|
# # target_portal = "iscsi.viktorbarzin.lan:3260"
|
|
# # iqn = "iqn.2020-12.lan.viktorbarzin:storage:monitoring:grafana"
|
|
# # lun = 0
|
|
# # fs_type = "ext4"
|
|
# # }
|
|
# }
|
|
# }
|
|
# }
|
|
|
|
# PersistentVolume for Alertmanager, provisioned statically through the
# NFS CSI driver (nfs.csi.k8s.io) against the share exported by var.nfs_server.
resource "kubernetes_persistent_volume" "alertmanager_pv" {
  metadata {
    name = "alertmanager-pv"
  }

  spec {
    storage_class_name = "nfs-truenas"
    access_modes       = ["ReadWriteOnce"]

    capacity = {
      storage = "2Gi"
    }

    # Options applied when the share is mounted.
    # NOTE(review): presumably chosen so a stalled NFS server fails fast
    # instead of hanging the pod — confirm before changing.
    mount_options = [
      "soft",
      "timeo=30",
      "retrans=3",
      "actimeo=5",
    ]

    persistent_volume_source {
      csi {
        driver        = "nfs.csi.k8s.io"
        volume_handle = "alertmanager-pv"

        # Server and exported path handed to the CSI driver.
        volume_attributes = {
          server = var.nfs_server
          share  = "/mnt/main/alertmanager"
        }
      }
    }
  }
}
|
|
# resource "kubernetes_persistent_volume_claim" "grafana_pvc" {
|
|
# metadata {
|
|
# name = "grafana-pvc"
|
|
# namespace = kubernetes_namespace.monitoring.metadata[0].name
|
|
# }
|
|
# spec {
|
|
# access_modes = ["ReadWriteOnce"]
|
|
# resources {
|
|
# requests = {
|
|
# "storage" = "2Gi"
|
|
# }
|
|
# }
|
|
# }
|
|
# }
|
|
|
|
# One ConfigMap per dashboard JSON file found in ./dashboards.
# Each ConfigMap carries the label grafana_dashboard = "1" and holds a single
# key (the file name) whose value is the file's contents.
resource "kubernetes_config_map" "grafana_dashboards" {
  for_each = fileset("${path.module}/dashboards", "*.json")

  metadata {
    # Kubernetes object names must be lowercase and may only contain
    # alphanumerics, '-' and '.'. Lowercase the file stem and map every other
    # character (underscores, spaces, ...) to '-' one-for-one, so names that
    # were already valid are unchanged while mixed-case or spaced file names
    # no longer fail at apply time.
    name      = "grafana-dashboard-${replace(lower(trimsuffix(each.value, ".json")), "/[^a-z0-9.-]/", "-")}"
    namespace = kubernetes_namespace.monitoring.metadata[0].name

    labels = {
      grafana_dashboard = "1"
    }
  }

  data = {
    # Key is the original file name, value is the raw dashboard JSON.
    (each.value) = file("${path.module}/dashboards/${each.value}")
  }
}
|
|
|
|
# Grafana installed from the upstream Helm chart into the monitoring namespace.
# Chart values are rendered from grafana_chart_values.yaml with the database
# password, admin password and MySQL host injected as template variables.
resource "helm_release" "grafana" {
  name       = "grafana"
  repository = "https://grafana.github.io/helm-charts"
  chart      = "grafana"
  # NOTE(review): no chart `version` is set here, so the release is not pinned
  # to a specific chart version — consider pinning.

  namespace        = kubernetes_namespace.monitoring.metadata[0].name
  create_namespace = true

  atomic  = true
  timeout = 600

  values = [
    templatefile(
      "${path.module}/grafana_chart_values.yaml",
      {
        db_password            = var.grafana_db_password
        grafana_admin_password = var.grafana_admin_password
        mysql_host             = var.mysql_host
      }
    )
  ]
}
|