feat(monitoring): Enhance disk monitoring and containerd GC after node2 incident
IMMEDIATE CHANGES (Active Now):
- Lower disk warning threshold: 70% → WARN, 85% → FAIL (was 80%/90%)
- More aggressive alerting to prevent containerd corruption
- Enhanced cluster health check disk monitoring

INFRASTRUCTURE CHANGES (Requires Terraform Apply):
- Add containerd garbage collection configuration (30min intervals)
- More aggressive kubelet eviction policies (15%/20% vs 10%/15%)
- Enhanced disk space protection to prevent node2-type failures

Root Cause: node2 disk exhaustion corrupted containerd image store
Prevention: Proactive monitoring + aggressive cleanup policies

[ci skip] - Infrastructure changes require SOPS access for apply
This commit is contained in:
parent
761dcb3a72
commit
2d5f44d1b3
2 changed files with 38 additions and 8 deletions
|
|
@ -864,16 +864,21 @@ for node in data["items"]:
|
||||||
a = parse_storage(es_alloc)
|
a = parse_storage(es_alloc)
|
||||||
if c > 0:
|
if c > 0:
|
||||||
used_pct = ((c - a) / c) * 100
|
used_pct = ((c - a) / c) * 100
|
||||||
if used_pct > 80:
|
if used_pct > 70: # Lower threshold after node2 containerd corruption incident
|
||||||
level = "FAIL" if used_pct > 90 else "WARN"
|
if used_pct > 85:
|
||||||
|
level = "FAIL" # Critical: Risk of containerd corruption
|
||||||
|
elif used_pct > 75:
|
||||||
|
level = "WARN" # Warning: Monitor closely
|
||||||
|
else:
|
||||||
|
level = "WARN" # Early warning
|
||||||
print(f"{level}:{name}:{used_pct:.0f}")
|
print(f"{level}:{name}:{used_pct:.0f}")
|
||||||
except (ValueError, ZeroDivisionError):
|
except (ValueError, ZeroDivisionError):
|
||||||
pass
|
pass
|
||||||
' 2>/dev/null) || true
|
' 2>/dev/null) || true
|
||||||
|
|
||||||
if [[ -z "$disk_info" ]]; then
|
if [[ -z "$disk_info" ]]; then
|
||||||
pass "All nodes below 80% ephemeral-storage usage"
|
pass "All nodes below 70% ephemeral-storage usage"
|
||||||
json_add "node_disk" "PASS" "All below 80%"
|
json_add "node_disk" "PASS" "All below 70%"
|
||||||
else
|
else
|
||||||
[[ "$QUIET" == true ]] && section_always 17 "Node Disk Usage"
|
[[ "$QUIET" == true ]] && section_always 17 "Node Disk Usage"
|
||||||
while IFS= read -r line; do
|
while IFS= read -r line; do
|
||||||
|
|
|
||||||
|
|
@ -83,12 +83,33 @@ module "k8s-node-template" {
|
||||||
# breaking VPA certgen and Kyverno image pulls.
|
# breaking VPA certgen and Kyverno image pulls.
|
||||||
|
|
||||||
sed -i 's/.*max_concurrent_downloads = 3/max_concurrent_downloads = 20/g' /etc/containerd/config.toml # Enable multiple concurrent downloads
|
sed -i 's/.*max_concurrent_downloads = 3/max_concurrent_downloads = 20/g' /etc/containerd/config.toml # Enable multiple concurrent downloads
|
||||||
|
|
||||||
|
# Configure aggressive garbage collection to prevent disk space exhaustion (node2 incident prevention)
|
||||||
|
# Set up containerd GC for unused images and containers
|
||||||
|
cat >> /etc/containerd/config.toml << 'CONTAINERD_GC'
|
||||||
|
|
||||||
|
[plugins."io.containerd.gc.v1.scheduler"]
|
||||||
|
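# NOTE(review): schedule_delay is the delay after a GC trigger event, not a periodic interval.
# containerd documents its default as 0ms, so "1800s" postpones GC by 30 minutes — confirm intent.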
|
||||||
|
pause_threshold = 0.02
|
||||||
|
deletion_threshold = 0
|
||||||
|
mutation_threshold = 100
|
||||||
|
schedule_delay = "1800s" # 30 minutes
|
||||||
|
|
||||||
|
[plugins."io.containerd.runtime.v2.task"]
|
||||||
|
# More aggressive container cleanup
|
||||||
|
exit_timeout = "5m"
|
||||||
|
|
||||||
|
[plugins."io.containerd.metadata.v1.bolt"]
|
||||||
|
# Compact database more frequently
|
||||||
|
compact_threshold = 5242880 # 5MB instead of default 100MB
|
||||||
|
CONTAINERD_GC
|
||||||
sudo sed -i '/serializeImagePulls:/d' /var/lib/kubelet/config.yaml && \
|
sudo sed -i '/serializeImagePulls:/d' /var/lib/kubelet/config.yaml && \
|
||||||
sudo sed -i '/maxParallelImagePulls:/d' /var/lib/kubelet/config.yaml && \
|
sudo sed -i '/maxParallelImagePulls:/d' /var/lib/kubelet/config.yaml && \
|
||||||
echo -e 'serializeImagePulls: false\nmaxParallelImagePulls: 50' | sudo tee -a /var/lib/kubelet/config.yaml
|
echo -e 'serializeImagePulls: false\nmaxParallelImagePulls: 50' | sudo tee -a /var/lib/kubelet/config.yaml
|
||||||
|
|
||||||
# Memory reservation and eviction — prevent node OOM by reserving memory
|
# Memory and disk reservation and eviction — prevent node OOM/disk full
|
||||||
# for OS/kubelet and evicting pods before the node runs out of memory.
|
# Aggressive disk eviction settings added after node2 containerd corruption incident (2026-03-13)
|
||||||
|
# These settings prevent disk space exhaustion that can corrupt containerd image store
|
||||||
sudo sed -i '/systemReserved:/d; /kubeReserved:/d; /evictionHard:/,/^[^ ]/{ /evictionHard:/d; /^ /d }; /evictionSoft:/,/^[^ ]/{ /evictionSoft:/d; /^ /d }; /evictionSoftGracePeriod:/,/^[^ ]/{ /evictionSoftGracePeriod:/d; /^ /d }' /var/lib/kubelet/config.yaml
|
sudo sed -i '/systemReserved:/d; /kubeReserved:/d; /evictionHard:/,/^[^ ]/{ /evictionHard:/d; /^ /d }; /evictionSoft:/,/^[^ ]/{ /evictionSoft:/d; /^ /d }; /evictionSoftGracePeriod:/,/^[^ ]/{ /evictionSoftGracePeriod:/d; /^ /d }' /var/lib/kubelet/config.yaml
|
||||||
cat <<'KUBELET_PATCH' | sudo tee -a /var/lib/kubelet/config.yaml
|
cat <<'KUBELET_PATCH' | sudo tee -a /var/lib/kubelet/config.yaml
|
||||||
systemReserved:
|
systemReserved:
|
||||||
|
|
@ -99,12 +120,16 @@ kubeReserved:
|
||||||
cpu: "200m"
|
cpu: "200m"
|
||||||
evictionHard:
|
evictionHard:
|
||||||
memory.available: "500Mi"
|
memory.available: "500Mi"
|
||||||
nodefs.available: "10%"
|
nodefs.available: "15%" # More aggressive: evict at 15% free (was 10%)
|
||||||
imagefs.available: "15%"
|
imagefs.available: "20%" # Much more aggressive: evict at 20% free to prevent containerd corruption
|
||||||
evictionSoft:
|
evictionSoft:
|
||||||
memory.available: "1Gi"
|
memory.available: "1Gi"
|
||||||
|
nodefs.available: "20%" # Soft eviction: evict pods if free space stays below 20% past the grace period
|
||||||
|
imagefs.available: "25%" # Soft eviction: evict pods if free space stays below 25% past the grace period (containerd safety)
|
||||||
evictionSoftGracePeriod:
|
evictionSoftGracePeriod:
|
||||||
memory.available: "30s"
|
memory.available: "30s"
|
||||||
|
nodefs.available: "60s" # Grace period before soft eviction starts on low node disk space
|
||||||
|
imagefs.available: "30s" # Shorter grace for critical containerd space
|
||||||
KUBELET_PATCH
|
KUBELET_PATCH
|
||||||
EOF
|
EOF
|
||||||
k8s_join_command = var.k8s_join_command
|
k8s_join_command = var.k8s_join_command
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue