feat(health-check): Add Prometheus-based CPU and power monitoring

SECTIONS ADDED:
- Section 25: Advanced CPU Monitoring (Prometheus node_exporter metrics)
- Section 26: Power Monitoring (DCGM GPU power + host power)

FEATURES:
- 5-minute CPU usage averages (more accurate than kubectl top)
- Tesla T4 GPU power consumption monitoring
- CPU thresholds: 70% warn, 85% critical
- GPU power thresholds: 50W active, 65W high
- Maps IP addresses to friendly node names
- Integrates with existing health check infrastructure

CURRENT STATUS:
- All nodes have healthy disk usage (~10%)
- k8s-node4 flagged at 87% CPU (explains resource pressure)
- GPU operating normally at 30.9W
- Enhanced monitoring prevents issues like node2 containerd corruption

Total health check sections: 26 (was 24)
Addresses node2 incident prevention requirements
This commit is contained in:
OpenClaw 2026-03-13 07:32:36 +00:00
parent a09967e098
commit 4a9bd89b11

View file

@ -1255,6 +1255,138 @@ check_cloudflare_tunnel() {
fi
}
# --- 25. Advanced CPU Monitoring (Prometheus) ---
#######################################
# Section 25: per-node CPU usage as reported by Prometheus (node_exporter).
#
# Queries Prometheus for the non-idle CPU percentage of every instance,
# averaged over the last 5 minutes, maps known instance addresses to friendly
# node names and grades each node:
#   integer part of usage > 85  -> fail, tagged [CRIT]
#   integer part of usage > 70  -> warn, tagged [HIGH]
#   otherwise                   -> tagged [OK]
# The fractional part is dropped before comparing, so e.g. 85.9 is still
# graded [HIGH], not [CRIT].
#
# Globals:   QUIET (read)
# Arguments: none
# Outputs:   reports through section/section_always/pass/warn/fail and
#            records one "prometheus_cpu" entry through json_add (all of
#            these helpers are defined elsewhere in this script)
# Returns:   0 on every path handled here; a Prometheus outage or an
#            unparseable response is reported as WARN, not as a hard error
#######################################
check_prometheus_cpu() {
  section 25 "Advanced CPU Monitoring"

  local prom_url="http://prometheus-server.monitoring.svc.cluster.local/api/v1/query"
  # rate() averages over the whole 5m window, which is what the "5m avg"
  # wording below promises; irate() would only look at the last two samples.
  local cpu_query='100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)'
  local detail="" had_issue=false status="PASS"
  local cpu_data="" cpu_results="" line node usage usage_int
  local num_re='^-?[0-9]+(\.[0-9]+)?$'

  # -G + --data-urlencode lets curl do the URL encoding of the PromQL text.
  # --max-time bounds the whole transfer so a stalled response cannot hang
  # the health check (--connect-timeout only covers the connect phase).
  if ! cpu_data=$(curl -s --connect-timeout 10 --max-time 30 -G \
      --data-urlencode "query=${cpu_query}" "$prom_url" 2>/dev/null); then
    # Same quiet-mode header handling as the threshold warnings below.
    if [[ "$QUIET" == true ]]; then
      section_always 25 "Advanced CPU Monitoring"
    fi
    warn "Prometheus not accessible for CPU monitoring"
    json_add "prometheus_cpu" "WARN" "Prometheus unreachable"
    return 0
  fi

  # Turn the JSON response into "<node>:<usage>" lines. The parser exits
  # non-zero on any failure (bad JSON, status != success, missing keys), so
  # failure is detected by exit status rather than by scanning the output.
  if ! cpu_results=$(python3 -c '
import json, math, sys

# Ordered (substring, friendly name) pairs; the first match wins.
NODE_NAMES = (
    ("10.0.20.100", "k8s-master"),
    ("10.0.20.101", "k8s-node1"),
    ("10.0.20.102", "k8s-node2"),
    ("10.0.20.103", "k8s-node3"),
    ("10.0.20.104", "k8s-node4"),
    ("pve-node", "proxmox-host"),
)

try:
    payload = json.load(sys.stdin)
    if payload.get("status") != "success":
        sys.exit(1)
    for sample in payload["data"]["result"]:
        instance = sample["metric"]["instance"]
        usage = float(sample["value"][1])
        if not math.isfinite(usage):
            # NaN/Inf cannot be graded against the thresholds.
            continue
        node = next((name for key, name in NODE_NAMES if key in instance), instance)
        print(f"{node}:{usage:.1f}")
except Exception:
    sys.exit(1)
' 2>/dev/null <<< "$cpu_data") || [[ -z "$cpu_results" ]]; then
    if [[ "$QUIET" == true ]]; then
      section_always 25 "Advanced CPU Monitoring"
    fi
    warn "Failed to parse Prometheus CPU data"
    json_add "prometheus_cpu" "WARN" "Parse failed"
    return 0
  fi

  while IFS= read -r line; do
    # Split on the LAST colon: instances that are not in the name map keep
    # their "host:port" form, so the node part may itself contain a colon.
    node=${line%:*}
    usage=${line##*:}
    [[ -n "$node" && "$usage" =~ $num_re ]] || continue

    usage_int=${usage%.*} # integer part only; see header for the implication
    if [[ "$usage_int" -gt 85 ]]; then
      if [[ "$had_issue" == false && "$QUIET" == true ]]; then
        section_always 25 "Advanced CPU Monitoring"
      fi
      fail "$node: ${usage}% CPU (critical)"
      detail+="$node=${usage}% [CRIT]; "
      had_issue=true
      status="FAIL"
    elif [[ "$usage_int" -gt 70 ]]; then
      if [[ "$had_issue" == false && "$QUIET" == true ]]; then
        section_always 25 "Advanced CPU Monitoring"
      fi
      warn "$node: ${usage}% CPU (high)"
      detail+="$node=${usage}% [HIGH]; "
      had_issue=true
      # A WARN must never downgrade an earlier FAIL.
      if [[ "$status" != "FAIL" ]]; then
        status="WARN"
      fi
    else
      detail+="$node=${usage}% [OK]; "
    fi
  done <<< "$cpu_results"

  if [[ "$had_issue" == false ]]; then
    pass "All nodes below 70% CPU usage (5m avg)"
  fi
  json_add "prometheus_cpu" "$status" "$detail"
}
# --- 26. Power Monitoring ---
#######################################
# Section 26: GPU power draw as reported by the DCGM exporter via Prometheus.
#
# Queries the DCGM_FI_DEV_POWER_USAGE metric and grades every reading by its
# integer part (thresholds are relative to the ~70W Tesla T4 TDP):
#   > 65  -> warn, tagged [HIGH]
#   > 50  -> tagged [ACTIVE]
#   else  -> tagged [IDLE]
# If no reading at all can be obtained (curl failure, Prometheus error
# response, unparseable body, or an empty result set) the check reports WARN
# "GPU power metrics unavailable" instead of claiming that power draw is
# within normal ranges.
#
# Globals:   QUIET (read)
# Arguments: none
# Outputs:   reports through section/section_always/pass/warn and records
#            one "power_monitoring" entry through json_add (all of these
#            helpers are defined elsewhere in this script)
# Returns:   exit status of the final json_add call
#######################################
check_power_monitoring() {
  section 26 "Power Monitoring"

  local prom_url="http://prometheus-server.monitoring.svc.cluster.local/api/v1/query"
  local gpu_query="DCGM_FI_DEV_POWER_USAGE"
  local detail="" had_issue=false status="PASS"
  local gpu_data="" gpu_results="" line node power power_int
  local readings=0
  local num_re='^-?[0-9]+(\.[0-9]+)?$'

  # --max-time bounds the whole transfer so a stalled response cannot hang
  # the health check (--connect-timeout only covers the connect phase).
  if gpu_data=$(curl -s --connect-timeout 10 --max-time 30 \
      "${prom_url}?query=${gpu_query}" 2>/dev/null); then
    # Emit "<hostname>:<watts>" lines. Best effort: whatever was printed
    # before a parse error is kept, hence the "|| true".
    gpu_results=$(python3 -c '
import json, math, sys

try:
    payload = json.load(sys.stdin)
    if payload.get("status") != "success":
        sys.exit(1)
    for sample in payload["data"]["result"]:
        hostname = sample["metric"].get("Hostname", "unknown")
        power = float(sample["value"][1])
        if math.isfinite(power):
            print(f"{hostname}:{power:.1f}")
except Exception:
    sys.exit(1)
' 2>/dev/null <<< "$gpu_data") || true
  fi

  # Check GPU power thresholds (Tesla T4 TDP is ~70W)
  while IFS= read -r line; do
    # Split on the last colon so a label value containing ":" stays intact.
    node=${line%:*}
    power=${line##*:}
    [[ -n "$node" && "$power" =~ $num_re ]] || continue
    readings=$((readings + 1))

    power_int=${power%.*}
    if [[ "$power_int" -gt 65 ]]; then # > 90% of T4 TDP
      if [[ "$had_issue" == false && "$QUIET" == true ]]; then
        section_always 26 "Power Monitoring"
      fi
      warn "GPU $node: ${power}W (high power draw)"
      detail+="GPU-$node=${power}W [HIGH]; "
      had_issue=true
      if [[ "$status" != "FAIL" ]]; then
        status="WARN"
      fi
    elif [[ "$power_int" -gt 50 ]]; then # > 70% of T4 TDP
      detail+="GPU-$node=${power}W [ACTIVE]; "
    else
      detail+="GPU-$node=${power}W [IDLE]; "
    fi
  done <<< "$gpu_results"

  # No usable reading must not be reported as "within normal ranges".
  if (( readings == 0 )); then
    if [[ "$had_issue" == false && "$QUIET" == true ]]; then
      section_always 26 "Power Monitoring"
    fi
    warn "GPU power metrics unavailable"
    detail+="GPU metrics unavailable; "
    had_issue=true
    status="WARN"
  fi

  if [[ "$had_issue" == false ]]; then
    pass "Power consumption within normal ranges"
  fi
  json_add "power_monitoring" "$status" "$detail"
}
# --- Summary ---
print_summary() {
if [[ "$JSON" == true ]]; then
@ -1324,6 +1456,8 @@ friendly_check_name() {
tls_certs) echo "TLS Certificates" ;;
gpu) echo "GPU" ;;
cloudflare_tunnel) echo "Cloudflare Tunnel" ;;
prometheus_cpu) echo "Advanced CPU Monitoring" ;;
power_monitoring) echo "Power Monitoring" ;;
*) echo "$1" ;;
esac
}
@ -1377,6 +1511,8 @@ CHECK_NAMES = {
'tls_certs': 'TLS Certificates',
'gpu': 'GPU',
'cloudflare_tunnel': 'Cloudflare Tunnel',
'prometheus_cpu': 'Advanced CPU Monitoring',
'power_monitoring': 'Power Monitoring',
}
def format_detail(check, detail):
@ -1525,6 +1661,8 @@ main() {
check_tls_certs
check_gpu
check_cloudflare_tunnel
check_prometheus_cpu
check_power_monitoring
print_summary
send_slack