From aded77d5ab8ac73646107f9833947d93d1838371 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Tue, 26 May 2026 02:45:13 +0000 Subject: [PATCH] monitoring: alerts for proxmox-csi LUN saturation per node MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Vaultwarden + 18 pods got stuck for 7h on 2026-05-26 when k8s-node4 went down: surviving workloads piled onto node1 and hit the csi.proxmox.sinextra.dev/max-volume-attachments=28 cap. The Proxmox VM also had 5 stale scsi entries (PVCs long-migrated to other nodes but never removed from VM config), which bypassed the K8s scheduler safety until the plugin returned 'no free lun found' at attach time. Three new alerts on the kube_volumeattachment_info count per node: - warning at 24/28 (>= 85%), 10m - critical at 27/28 (1 slot left), 3m - critical at 28/28 (cap reached), 1m Also whitelisted kube_volumeattachment_info — the metric was being dropped by the disk-write-reduction filter (id=559) and the alert queries returned zero series until it's kept. Co-Authored-By: Claude Opus 4.7 --- .../monitoring/prometheus_chart_values.tpl | 31 ++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl index 65cfd0a8..d27c17f5 100755 --- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl @@ -528,7 +528,7 @@ serverFiles: action: drop # Whitelist: only keep essential kube-state-metrics, node-exporter, and coredns metrics - source_labels: [__name__] - regex: 'kube_cronjob_status_last_successful_time|kube_deployment_spec_replicas|kube_deployment_status_replicas_available|kube_deployment_status_replicas_unavailable|kube_job_status_failed|kube_job_status_start_time|kube_node_info|kube_node_status_allocatable|kube_node_status_capacity|kube_node_status_condition|kube_persistentvolumeclaim_status_phase|kube_pod_container_resource_limits|kube_pod_container_resource_requests|kube_pod_container_status_restarts_total|kube_pod_container_status_running|kube_pod_container_status_waiting_reason|kube_pod_info|kube_pod_status_phase|kube_pod_status_ready|kube_pod_status_reason|kube_pod_status_conditions|kube_resourcequota|kube_statefulset_replicas|kube_statefulset_status_replicas_ready|kube_daemonset_status_desired_number_scheduled|kube_daemonset_status_number_ready|kube_node_spec_unschedulable|node_cpu_seconds_total|node_disk_io_time_seconds_total|node_disk_read_bytes_total|node_disk_written_bytes_total|node_disk_reads_completed_total|node_disk_writes_completed_total|node_filesystem_avail_bytes|node_filesystem_size_bytes|node_filesystem_device_error|node_filesystem_readonly|node_hwmon_chip_names|node_hwmon_temp_celsius|node_load1|node_load15|node_load5|node_memory_MemAvailable_bytes|node_memory_MemTotal_bytes|node_memory_Buffers_bytes|node_memory_Cached_bytes|node_memory_MemFree_bytes|node_memory_SwapTotal_bytes|node_memory_SwapFree_bytes|node_network_receive_bytes_total|node_network_transmit_bytes_total|node_nfs_requests_total|node_uname_info|node_vmstat_oom_kill|coredns_cache_entries|coredns_cache_hits_total|coredns_cache_misses_total|coredns_dns_requests_total|coredns_dns_responses_total|coredns_forward_requests_total|coredns_forward_responses_total|coredns_build_info|process_cpu_seconds_total|process_resident_memory_bytes|process_start_time_seconds|up|pve_.*' + regex: 'kube_cronjob_status_last_successful_time|kube_deployment_spec_replicas|kube_deployment_status_replicas_available|kube_deployment_status_replicas_unavailable|kube_job_status_failed|kube_job_status_start_time|kube_node_info|kube_node_status_allocatable|kube_node_status_capacity|kube_node_status_condition|kube_persistentvolumeclaim_status_phase|kube_volumeattachment_info|kube_pod_container_resource_limits|kube_pod_container_resource_requests|kube_pod_container_status_restarts_total|kube_pod_container_status_running|kube_pod_container_status_waiting_reason|kube_pod_info|kube_pod_status_phase|kube_pod_status_ready|kube_pod_status_reason|kube_pod_status_conditions|kube_resourcequota|kube_statefulset_replicas|kube_statefulset_status_replicas_ready|kube_daemonset_status_desired_number_scheduled|kube_daemonset_status_number_ready|kube_node_spec_unschedulable|node_cpu_seconds_total|node_disk_io_time_seconds_total|node_disk_read_bytes_total|node_disk_written_bytes_total|node_disk_reads_completed_total|node_disk_writes_completed_total|node_filesystem_avail_bytes|node_filesystem_size_bytes|node_filesystem_device_error|node_filesystem_readonly|node_hwmon_chip_names|node_hwmon_temp_celsius|node_load1|node_load15|node_load5|node_memory_MemAvailable_bytes|node_memory_MemTotal_bytes|node_memory_Buffers_bytes|node_memory_Cached_bytes|node_memory_MemFree_bytes|node_memory_SwapTotal_bytes|node_memory_SwapFree_bytes|node_network_receive_bytes_total|node_network_transmit_bytes_total|node_nfs_requests_total|node_uname_info|node_vmstat_oom_kill|coredns_cache_entries|coredns_cache_hits_total|coredns_cache_misses_total|coredns_dns_requests_total|coredns_dns_responses_total|coredns_forward_requests_total|coredns_forward_responses_total|coredns_build_info|process_cpu_seconds_total|process_resident_memory_bytes|process_start_time_seconds|up|pve_.*' action: keep - job_name: kubernetes-service-endpoints-slow honor_labels: true @@ -2372,6 +2372,35 @@ serverFiles: severity: warning annotations: summary: "Node {{ $labels.instance }}: NFS RPC retransmission rate {{ $value | printf \"%.1f\" }}/s — NFS server (192.168.1.127) may be degraded or unreachable" + # Proxmox CSI per-node LUN saturation. The plugin enforces + # csi.proxmox.sinextra.dev/max-volume-attachments=28 (set on every k8s-node* + # by stacks/proxmox-csi). QEMU's virtio-scsi-pci hard cap is 30 LUNs. + # When K8s-side VolumeAttachments approach the cap, new PVCs fail to + # attach with "no free lun found" — vaultwarden + 18 pods stuck 2026-05-26. + - alert: ProxmoxCSILunUsageHigh + expr: count by (node) (kube_volumeattachment_info{node=~"k8s-node.*"}) >= 24 + for: 10m + labels: + severity: warning + annotations: + summary: "{{ $labels.node }}: {{ $value }}/28 CSI volumes attached (>= 85% of cap)" + description: "Approaching the proxmox-csi-plugin per-node cap of 28 attachments. Workloads scheduled to this node with new PVCs may fail to attach. Consider rebalancing or migrating PVCs to other nodes." + - alert: ProxmoxCSILunUsageCritical + expr: count by (node) (kube_volumeattachment_info{node=~"k8s-node.*"}) >= 27 + for: 3m + labels: + severity: critical + annotations: + summary: "{{ $labels.node }}: {{ $value }}/28 CSI volumes attached — 1 slot left" + description: "Only 1 LUN slot remains before the proxmox-csi cap. Next PVC attach to this node will fail with 'no free lun found'." + - alert: ProxmoxCSILunCapReached + expr: count by (node) (kube_volumeattachment_info{node=~"k8s-node.*"}) >= 28 + for: 1m + labels: + severity: critical + annotations: + summary: "{{ $labels.node }}: at proxmox-csi LUN cap (28/28) — attaches WILL fail" + description: "Pods needing new PVC attachments on {{ $labels.node }} will fail with 'no free lun found'. Detach unused volumes from this node's Proxmox VM config, or migrate PVCs to a less-loaded node." - name: "Application Health" rules: - alert: MailServerDown