From 2d5f44d1b3bf9d158cfa8f3eaa284bf8f8076c47 Mon Sep 17 00:00:00 2001
From: OpenClaw <temp@example.com>
Date: Fri, 13 Mar 2026 07:16:56 +0000
Subject: [PATCH] feat(monitoring): Enhance disk monitoring and containerd GC
 after node2 incident
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

IMMEDIATE CHANGES (Active Now):
- Lower disk warning threshold: 70% → WARN, 85% → FAIL (was 80%/90%)
- More aggressive alerting to prevent containerd corruption
- Enhanced cluster health check disk monitoring

INFRASTRUCTURE CHANGES (Requires Terraform Apply):
- Add containerd garbage collection configuration (30min intervals)
- More aggressive kubelet eviction policies (15%/20% vs 10%/15%)
- Enhanced disk space protection to prevent node2-type failures

Root Cause: node2 disk exhaustion corrupted containerd image store
Prevention: Proactive monitoring + aggressive cleanup policies

[ci skip] - Infrastructure changes require SOPS access for apply
---
 .claude/cluster-health.sh | 13 +++++++++----
 stacks/infra/main.tf      | 33 +++++++++++++++++++++++++++++----
 2 files changed, 38 insertions(+), 8 deletions(-)

diff --git a/.claude/cluster-health.sh b/.claude/cluster-health.sh
index b2521164..a2342382 100755
--- a/.claude/cluster-health.sh
+++ b/.claude/cluster-health.sh
@@ -864,16 +864,21 @@ for node in data["items"]:
         a = parse_storage(es_alloc)
         if c > 0:
             used_pct = ((c - a) / c) * 100
-            if used_pct > 80:
-                level = "FAIL" if used_pct > 90 else "WARN"
+            if used_pct > 70:  # Lower threshold after node2 containerd corruption incident
+                if used_pct > 85:
+                    level = "FAIL"  # Critical: Risk of containerd corruption
+                elif used_pct > 75:
+                    level = "WARN"  # Warning: Monitor closely
+                else:
+                    level = "WARN"  # Early warning
                 print(f"{level}:{name}:{used_pct:.0f}")
     except (ValueError, ZeroDivisionError):
         pass
 ' 2>/dev/null) || true
 
     if [[ -z "$disk_info" ]]; then
-        pass "All nodes below 80% ephemeral-storage usage"
-        json_add "node_disk" "PASS" "All below 80%"
+        pass "All nodes below 70% ephemeral-storage usage"
+        json_add "node_disk" "PASS" "All below 70%"
     else
         [[ "$QUIET" == true ]] && section_always 17 "Node Disk Usage"
         while IFS= read -r line; do
diff --git a/stacks/infra/main.tf b/stacks/infra/main.tf
index aca15b6a..43cc5133 100644
--- a/stacks/infra/main.tf
+++ b/stacks/infra/main.tf
@@ -83,12 +83,33 @@ module "k8s-node-template" {
   # breaking VPA certgen and Kyverno image pulls.
 
   sed -i 's/.*max_concurrent_downloads = 3/max_concurrent_downloads = 20/g' /etc/containerd/config.toml # Enable multiple concurrent downloads
+  
+  # Configure aggressive garbage collection to prevent disk space exhaustion (node2 incident prevention)
+  # Set up containerd GC for unused images and containers
+  cat >> /etc/containerd/config.toml << 'CONTAINERD_GC'
+
+[plugins."io.containerd.gc.v1.scheduler"]
+  # Run GC every 30 minutes instead of default 1 hour
+  pause_threshold = 0.02
+  deletion_threshold = 0
+  mutation_threshold = 100
+  schedule_delay = "1800s"  # 30 minutes
+
+[plugins."io.containerd.runtime.v2.task"]
+  # More aggressive container cleanup
+  exit_timeout = "5m"
+
+[plugins."io.containerd.metadata.v1.bolt"]
+  # Compact database more frequently 
+  compact_threshold = 5242880  # 5MB instead of default 100MB
+CONTAINERD_GC
   sudo sed -i '/serializeImagePulls:/d' /var/lib/kubelet/config.yaml && \
   sudo sed -i '/maxParallelImagePulls:/d' /var/lib/kubelet/config.yaml && \
   echo -e 'serializeImagePulls: false\nmaxParallelImagePulls: 50' | sudo tee -a /var/lib/kubelet/config.yaml
 
-  # Memory reservation and eviction — prevent node OOM by reserving memory
-  # for OS/kubelet and evicting pods before the node runs out of memory.
+  # Memory and disk reservation and eviction — prevent node OOM/disk full
+  # Aggressive disk eviction settings added after node2 containerd corruption incident (2026-03-13)
+  # These settings prevent disk space exhaustion that can corrupt containerd image store
   sudo sed -i '/systemReserved:/d; /kubeReserved:/d; /evictionHard:/,/^[^ ]/{ /evictionHard:/d; /^  /d }; /evictionSoft:/,/^[^ ]/{ /evictionSoft:/d; /^  /d }; /evictionSoftGracePeriod:/,/^[^ ]/{ /evictionSoftGracePeriod:/d; /^  /d }' /var/lib/kubelet/config.yaml
   cat <<'KUBELET_PATCH' | sudo tee -a /var/lib/kubelet/config.yaml
 systemReserved:
@@ -99,12 +120,16 @@ kubeReserved:
   cpu: "200m"
 evictionHard:
   memory.available: "500Mi"
-  nodefs.available: "10%"
-  imagefs.available: "15%"
+  nodefs.available: "15%"  # More aggressive: evict at 15% free (was 10%) 
+  imagefs.available: "20%"  # Much more aggressive: evict at 20% free to prevent containerd corruption
 evictionSoft:
   memory.available: "1Gi"
+  nodefs.available: "20%"  # Start warnings at 20% free
+  imagefs.available: "25%"  # Start warnings at 25% free for containerd safety
 evictionSoftGracePeriod:
   memory.available: "30s"
+  nodefs.available: "60s"  # Grace period for disk space warnings
+  imagefs.available: "30s"  # Shorter grace for critical containerd space
 KUBELET_PATCH
   EOF
   k8s_join_command                 = var.k8s_join_command