From ca2680c189b53c7a532b2397a38be028ca8c2e83 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Tue, 14 Apr 2026 18:05:33 +0000 Subject: [PATCH] fix(post-mortem): add NFSHighRPCRetransmissions alert + migrate alertmanager to proxmox-lvm-encrypted [PM-2026-04-14] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add PrometheusRule: NFSHighRPCRetransmissions fires when the rate of node_nfs_rpc_retransmissions_total exceeds 5/s for 5m — catches NFS server degradation before pod failures cascade - Migrate alertmanager PV from NFS (192.168.1.127:/srv/nfs/alertmanager) to proxmox-lvm-encrypted, eliminating the circular dependency in which alertmanager, whose own storage lived on NFS, couldn't alert about NFS failures - Set force_update=true on prometheus helm_release to handle StatefulSet volumeClaimTemplate changes Co-Authored-By: postmortem-todo-resolver --- .../modules/monitoring/prometheus.tf | 23 ++----------------- .../monitoring/prometheus_chart_values.tpl | 7 +++--- 2 files changed, 6 insertions(+), 24 deletions(-) diff --git a/stacks/monitoring/modules/monitoring/prometheus.tf b/stacks/monitoring/modules/monitoring/prometheus.tf index f9cd3e13..7e998b91 100644 --- a/stacks/monitoring/modules/monitoring/prometheus.tf +++ b/stacks/monitoring/modules/monitoring/prometheus.tf @@ -30,26 +30,6 @@ module "nfs_prometheus_backup_host" { nfs_path = "/srv/nfs/prometheus-backup" } -resource "kubernetes_persistent_volume_claim" "alertmanager_pvc" { - wait_until_bound = false - metadata { - name = "alertmanager-pvc" - namespace = kubernetes_namespace.monitoring.metadata[0].name - annotations = { - "resize.topolvm.io/threshold" = "80%" - "resize.topolvm.io/increase" = "100%" - "resize.topolvm.io/storage_limit" = "10Gi" - } - } - spec { - access_modes = ["ReadWriteOnce"] - storage_class_name = "proxmox-lvm-encrypted" - resources { - requests = { storage = "2Gi" } - } - } -} - resource "helm_release" "prometheus" { namespace = kubernetes_namespace.monitoring.metadata[0].name 
create_namespace = true @@ -60,7 +40,8 @@ resource "helm_release" "prometheus" { # version = "15.0.2" version = "25.8.2" - timeout = 900 # 15 min — Recreate strategy + iSCSI reattach is slow + timeout = 900 # 15 min — Recreate strategy + iSCSI reattach is slow + force_update = true # Required for StatefulSet volumeClaimTemplate changes (immutable field) values = [templatefile("${path.module}/prometheus_chart_values.tpl", { alertmanager_mail_pass = var.alertmanager_account_password, alertmanager_slack_api_url = var.alertmanager_slack_api_url, tuya_api_key = var.tiny_tuya_service_secret, haos_api_token = var.haos_api_token })] } diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl index 01a5ae14..a5f9e5ab 100755 --- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl @@ -4,9 +4,10 @@ alertmanager: replicaCount: 1 persistentVolume: enabled: true - existingClaim: alertmanager-pvc - #existingClaim: alertmanager-iscsi-pvc - # storageClass: rook-cephfs + persistence: + storageClass: proxmox-lvm-encrypted + # Previously on NFS (alertmanager-pv / nfs-truenas). Migrated 2026-04-14 [PM-2026-04-14] + # to proxmox-lvm-encrypted to eliminate circular alerting dependency. strategy: type: RollingUpdate baseURL: "https://alertmanager.viktorbarzin.me"