fix: NFS outage recovery — migrate to NFSv4, add alerting
NFS server restart broke NFSv3 (lockd kernel bug on PVE 6.14). All 52 NFS PVs
patched to nfsvers=4; NFSv3 disabled on PVE.

Changes:
- nfs_volume module: add nfsvers=4 mount option
- nfs-csi StorageClass: add nfsvers=4 mount option
- dbaas: MySQL serverInstances 3→1, mysql-native-password=ON
- monitoring: add NFSCSINodeDown and NFSMountFailures alerts

[ci skip]

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
92900b5e08
commit
ea18116da9
4 changed files with 21 additions and 1 deletion
|
|
@@ -44,6 +44,7 @@ resource "kubernetes_persistent_volume" "this" {
|
|||
volume_mode = "Filesystem"
|
||||
|
||||
mount_options = [
|
||||
"nfsvers=4",
|
||||
"soft",
|
||||
"timeo=30",
|
||||
"retrans=3",
|
||||
|
|
|
|||
|
|
@@ -180,7 +180,7 @@ resource "helm_release" "mysql_cluster" {
|
|||
version = "2.2.7"
|
||||
|
||||
values = [yamlencode({
|
||||
serverInstances = 3
|
||||
serverInstances = 1
|
||||
routerInstances = 1
|
||||
serverVersion = "8.4.4"
|
||||
|
||||
|
|
@@ -216,6 +216,7 @@ resource "helm_release" "mysql_cluster" {
|
|||
mycnf = <<-EOT
|
||||
[mysqld]
|
||||
skip-name-resolve
|
||||
mysql-native-password=ON
|
||||
# Auto-recovery after crashes: rejoin group without manual intervention
|
||||
group_replication_autorejoin_tries=2016
|
||||
group_replication_exit_state_action=OFFLINE_MODE
|
||||
|
|
|
|||
|
|
@@ -1700,6 +1700,23 @@ serverFiles:
|
|||
annotations:
|
||||
summary: "NFS CSI controller down — new NFS volume provisioning broken"
|
||||
# ISCSICSIControllerDown alert removed — democratic-csi replaced by proxmox-csi (2026-04-05)
|
||||
- alert: NFSCSINodeDown
|
||||
expr: kube_daemonset_status_number_unavailable{namespace="nfs-csi", daemonset="csi-nfs-node"} > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "{{ $value }} NFS CSI node pod(s) unavailable — NFS mounts will fail on affected nodes"
|
||||
- alert: NFSMountFailures
|
||||
expr: |
|
||||
count(kube_pod_container_status_waiting_reason{reason="ContainerCreating"} == 1) > 5
|
||||
and on()
|
||||
count(kube_pod_container_status_waiting_reason{reason="ContainerCreating"} == 1) > 2 * count(kube_pod_container_status_waiting_reason{reason="ContainerCreating"} offset 10m == 1 or on() vector(0))
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: ">5 pods stuck in ContainerCreating with sudden increase — possible NFS or storage outage"
|
||||
- name: "Application Health"
|
||||
rules:
|
||||
- alert: MailServerDown
|
||||
|
|
|
|||
|
|
@@ -80,6 +80,7 @@ resource "kubernetes_storage_class" "nfs_truenas" {
|
|||
volume_binding_mode = "Immediate"
|
||||
|
||||
mount_options = [
|
||||
"nfsvers=4",
|
||||
"soft",
|
||||
"timeo=30",
|
||||
"retrans=3",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue