diff --git a/modules/kubernetes/nfs_volume/main.tf b/modules/kubernetes/nfs_volume/main.tf index 6d703998..a2a35d88 100644 --- a/modules/kubernetes/nfs_volume/main.tf +++ b/modules/kubernetes/nfs_volume/main.tf @@ -44,6 +44,7 @@ resource "kubernetes_persistent_volume" "this" { volume_mode = "Filesystem" mount_options = [ + "nfsvers=4", "soft", "timeo=30", "retrans=3", diff --git a/stacks/dbaas/modules/dbaas/main.tf b/stacks/dbaas/modules/dbaas/main.tf index d9722d4b..7b4035e6 100644 --- a/stacks/dbaas/modules/dbaas/main.tf +++ b/stacks/dbaas/modules/dbaas/main.tf @@ -180,7 +180,7 @@ resource "helm_release" "mysql_cluster" { version = "2.2.7" values = [yamlencode({ - serverInstances = 3 + serverInstances = 1 routerInstances = 1 serverVersion = "8.4.4" @@ -216,6 +216,7 @@ resource "helm_release" "mysql_cluster" { mycnf = <<-EOT [mysqld] skip-name-resolve + mysql-native-password=ON # Auto-recovery after crashes: rejoin group without manual intervention group_replication_autorejoin_tries=2016 group_replication_exit_state_action=OFFLINE_MODE diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl index 86920720..bcc5f982 100755 --- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl @@ -1700,6 +1700,23 @@ serverFiles: annotations: summary: "NFS CSI controller down — new NFS volume provisioning broken" # ISCSICSIControllerDown alert removed — democratic-csi replaced by proxmox-csi (2026-04-05) + - alert: NFSCSINodeDown + expr: kube_daemonset_status_number_unavailable{namespace="nfs-csi", daemonset="csi-nfs-node"} > 0 + for: 10m + labels: + severity: critical + annotations: + summary: "{{ $value }} NFS CSI node pod(s) unavailable — NFS mounts will fail on affected nodes" + - alert: NFSMountFailures + expr: | + count(kube_pod_container_status_waiting_reason{reason="ContainerCreating"} == 1) > 5 + and on() + count(kube_pod_container_status_waiting_reason{reason="ContainerCreating"} == 1) > 2 * count(kube_pod_container_status_waiting_reason{reason="ContainerCreating"} offset 10m == 1 or on() vector(0)) + for: 10m + labels: + severity: critical + annotations: + summary: ">5 pods stuck in ContainerCreating with sudden increase — possible NFS or storage outage" - name: "Application Health" rules: - alert: MailServerDown diff --git a/stacks/nfs-csi/modules/nfs-csi/main.tf b/stacks/nfs-csi/modules/nfs-csi/main.tf index c68c4875..ebe10dc7 100644 --- a/stacks/nfs-csi/modules/nfs-csi/main.tf +++ b/stacks/nfs-csi/modules/nfs-csi/main.tf @@ -80,6 +80,7 @@ resource "kubernetes_storage_class" "nfs_truenas" { volume_binding_mode = "Immediate" mount_options = [ + "nfsvers=4", "soft", "timeo=30", "retrans=3",