From ea18116da9d2c01a591c635e2a2333e084a90286 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Tue, 14 Apr 2026 10:28:27 +0000 Subject: [PATCH] =?UTF-8?q?fix:=20NFS=20outage=20recovery=20=E2=80=94=20mi?= =?UTF-8?q?grate=20to=20NFSv4,=20add=20alerting?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit NFS server restart broke NFSv3 (lockd kernel bug on PVE 6.14). All 52 NFS PVs patched to nfsvers=4, NFSv3 disabled on PVE. Changes: - nfs_volume module: add nfsvers=4 mount option - nfs-csi StorageClass: add nfsvers=4 mount option - dbaas: MySQL serverInstances 3→1, mysql-native-password=ON - monitoring: add NFSCSINodeDown and NFSMountFailures alerts [ci skip] Co-Authored-By: Claude Opus 4.6 (1M context) --- modules/kubernetes/nfs_volume/main.tf | 1 + stacks/dbaas/modules/dbaas/main.tf | 3 ++- .../monitoring/prometheus_chart_values.tpl | 17 +++++++++++++++++ stacks/nfs-csi/modules/nfs-csi/main.tf | 1 + 4 files changed, 21 insertions(+), 1 deletion(-) diff --git a/modules/kubernetes/nfs_volume/main.tf b/modules/kubernetes/nfs_volume/main.tf index 6d703998..a2a35d88 100644 --- a/modules/kubernetes/nfs_volume/main.tf +++ b/modules/kubernetes/nfs_volume/main.tf @@ -44,6 +44,7 @@ resource "kubernetes_persistent_volume" "this" { volume_mode = "Filesystem" mount_options = [ + "nfsvers=4", "soft", "timeo=30", "retrans=3", diff --git a/stacks/dbaas/modules/dbaas/main.tf b/stacks/dbaas/modules/dbaas/main.tf index d9722d4b..7b4035e6 100644 --- a/stacks/dbaas/modules/dbaas/main.tf +++ b/stacks/dbaas/modules/dbaas/main.tf @@ -180,7 +180,7 @@ resource "helm_release" "mysql_cluster" { version = "2.2.7" values = [yamlencode({ - serverInstances = 3 + serverInstances = 1 routerInstances = 1 serverVersion = "8.4.4" @@ -216,6 +216,7 @@ resource "helm_release" "mysql_cluster" { mycnf = <<-EOT [mysqld] skip-name-resolve + mysql-native-password=ON # Auto-recovery after crashes: rejoin group without manual intervention 
group_replication_autorejoin_tries=2016 group_replication_exit_state_action=OFFLINE_MODE diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl index 86920720..bcc5f982 100755 --- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl @@ -1700,6 +1700,23 @@ serverFiles: annotations: summary: "NFS CSI controller down — new NFS volume provisioning broken" # ISCSICSIControllerDown alert removed — democratic-csi replaced by proxmox-csi (2026-04-05) + - alert: NFSCSINodeDown + expr: kube_daemonset_status_number_unavailable{namespace="nfs-csi", daemonset="csi-nfs-node"} > 0 + for: 10m + labels: + severity: critical + annotations: + summary: "{{ $value }} NFS CSI node pod(s) unavailable — NFS mounts will fail on affected nodes" + - alert: NFSMountFailures + expr: | + count(kube_pod_container_status_waiting_reason{reason="ContainerCreating"} == 1) > 5 + and on() + count(kube_pod_container_status_waiting_reason{reason="ContainerCreating"} == 1) > 2 * count(kube_pod_container_status_waiting_reason{reason="ContainerCreating"} offset 30m == 1 or on() vector(0)) + for: 10m + labels: + severity: critical + annotations: + summary: ">5 pods stuck in ContainerCreating with sudden increase — possible NFS or storage outage" - name: "Application Health" rules: - alert: MailServerDown diff --git a/stacks/nfs-csi/modules/nfs-csi/main.tf b/stacks/nfs-csi/modules/nfs-csi/main.tf index c68c4875..ebe10dc7 100644 --- a/stacks/nfs-csi/modules/nfs-csi/main.tf +++ b/stacks/nfs-csi/modules/nfs-csi/main.tf @@ -80,6 +80,7 @@ resource "kubernetes_storage_class" "nfs_truenas" { volume_binding_mode = "Immediate" mount_options = [ + "nfsvers=4", "soft", "timeo=30", "retrans=3",