From 0901dd5f61c84b842de6d8bc13537bc0ca40577e Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Tue, 14 Apr 2026 17:52:13 +0000
Subject: [PATCH] state(monitoring): update encrypted state

---
Review notes (this area is not part of the commit message):
- Despite the subject, this patch adds the `alertmanager_pvc`
  PersistentVolumeClaim and the `NFSHighRPCRetransmissions` alert rule.
- `resize.topolvm.io/threshold` is a FREE-space threshold in
  pvc-autoresizer: the PVC grows when free space drops below it. It was
  "80%", which would grow the volume once it is only ~20% used; it is now
  "20%" so growth starts at ~80% usage. NOTE(review): presumably that was
  the intent - confirm against the other PVCs in this repo.
- Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy. Verify context-line whitespace and the YAML nesting depth against
  the target files (or apply with `git apply --ignore-whitespace`).

 .../modules/monitoring/prometheus.tf          | 20 +++++++++++++++++++
 .../monitoring/prometheus_chart_values.tpl    |  8 ++++++++
 2 files changed, 28 insertions(+)

diff --git a/stacks/monitoring/modules/monitoring/prometheus.tf b/stacks/monitoring/modules/monitoring/prometheus.tf
index 7fccfc92..f9cd3e13 100644
--- a/stacks/monitoring/modules/monitoring/prometheus.tf
+++ b/stacks/monitoring/modules/monitoring/prometheus.tf
@@ -30,6 +30,26 @@ module "nfs_prometheus_backup_host" {
   nfs_path = "/srv/nfs/prometheus-backup"
 }
 
+resource "kubernetes_persistent_volume_claim" "alertmanager_pvc" {
+  wait_until_bound = false
+  metadata {
+    name      = "alertmanager-pvc"
+    namespace = kubernetes_namespace.monitoring.metadata[0].name
+    annotations = {
+      "resize.topolvm.io/threshold"     = "20%"
+      "resize.topolvm.io/increase"      = "100%"
+      "resize.topolvm.io/storage_limit" = "10Gi"
+    }
+  }
+  spec {
+    access_modes       = ["ReadWriteOnce"]
+    storage_class_name = "proxmox-lvm-encrypted"
+    resources {
+      requests = { storage = "2Gi" }
+    }
+  }
+}
+
 resource "helm_release" "prometheus" {
   namespace        = kubernetes_namespace.monitoring.metadata[0].name
   create_namespace = true
diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
index bcc5f982..01a5ae14 100755
--- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
+++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
@@ -1717,6 +1717,14 @@ serverFiles:
               severity: critical
             annotations:
               summary: ">5 pods stuck in ContainerCreating with sudden increase — possible NFS or storage outage"
+          - alert: NFSHighRPCRetransmissions
+            expr: |
+              sum by (instance) (rate(node_nfs_rpc_retransmissions_total[5m])) > 5
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Node {{ $labels.instance }}: NFS RPC retransmission rate {{ $value | printf \"%.1f\" }}/s — NFS server (192.168.1.127) may be degraded or unreachable"
       - name: "Application Health"
         rules:
           - alert: MailServerDown