feat(health-check): Add Prometheus-based CPU and power monitoring
Sections added:
- Section 25: Advanced CPU Monitoring (Prometheus node_exporter metrics)
- Section 26: Power Monitoring (DCGM GPU power + host power)

Features:
- 5-minute CPU usage averages (more accurate than kubectl top)
- Tesla T4 GPU power consumption monitoring
- CPU thresholds: 70% warn, 85% critical
- GPU power thresholds: 50W active, 65W high
- Maps IP addresses to friendly node names
- Integrates with the existing health check infrastructure

Current status:
- All nodes have healthy disk usage (~10%)
- k8s-node4 flagged at 87% CPU (explains resource pressure)
- GPU operating normally at 30.9W
- Enhanced monitoring helps prevent issues like the node2 containerd corruption

Total health check sections: 26 (was 24).
Addresses the node2 incident prevention requirements.
This commit is contained in:
parent
a09967e098
commit
4a9bd89b11
1 changed files with 138 additions and 0 deletions
|
|
@ -1255,6 +1255,138 @@ check_cloudflare_tunnel() {
|
|||
fi
|
||||
}
|
||||
|
||||
# --- 25. Advanced CPU Monitoring (Prometheus) ---
|
||||
#######################################
# Section 25: per-node CPU utilisation taken from Prometheus
# (node_exporter's node_cpu_seconds_total, idle mode, 5-minute rate).
#
# Globals:   QUIET (read)
# Calls:     section, section_always, pass, warn, fail, json_add
#            (reporting helpers defined elsewhere in this script),
#            curl, python3
# Outputs:   one fail/warn line per node above a threshold, or a single
#            pass line; always records a "prometheus_cpu" entry via json_add
# Returns:   0 in every handled case (an unreachable Prometheus or an
#            unparsable response is reported as WARN, not as an error)
#######################################
check_prometheus_cpu() {
  section 25 "Advanced CPU Monitoring"

  local prom_url="http://prometheus-server.monitoring.svc.cluster.local/api/v1/query"
  # URL-encoded PromQL:
  #   100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
  # rate() averages over the whole 5m window; irate() would only look at the
  # last two samples and would not be the "5m avg" this check reports.
  local cpu_query="100%20-%20(avg%20by%20(instance)%20(rate(node_cpu_seconds_total%7Bmode%3D%22idle%22%7D%5B5m%5D))%20*%20100)"
  local warn_pct=70 crit_pct=85
  local detail="" had_issue=false status="PASS"
  local cpu_data cpu_results level usage node

  # --max-time bounds the whole transfer so a stalled response cannot hang
  # the health check; --connect-timeout alone only covers the handshake.
  cpu_data=$(curl -s --connect-timeout 10 --max-time 30 \
    "${prom_url}?query=${cpu_query}" 2>/dev/null) || {
    warn "Prometheus not accessible for CPU monitoring"
    json_add "prometheus_cpu" "WARN" "Prometheus unreachable"
    return 0
  }

  # The Python helper emits one "LEVEL:usage:node" line per instance.
  # Classification happens there, on the rounded float, so fractional
  # values such as 85.5 are compared correctly. The node name comes LAST
  # because an unmapped instance keeps its "host:port" form and therefore
  # contains the field separator itself.
  # The Python source must stay at column 0 and must not contain single quotes.
  cpu_results=$(python3 -c '
import json, sys

warn_pct, crit_pct = float(sys.argv[1]), float(sys.argv[2])

# Substring of the Prometheus "instance" label -> friendly node name.
NODE_NAMES = (
    ("10.0.20.100", "k8s-master"),
    ("10.0.20.101", "k8s-node1"),
    ("10.0.20.102", "k8s-node2"),
    ("10.0.20.103", "k8s-node3"),
    ("10.0.20.104", "k8s-node4"),
    ("pve-node", "proxmox-host"),
)

try:
    data = json.load(sys.stdin)
    if data.get("status") == "success":
        for result in data["data"]["result"]:
            instance = result["metric"]["instance"]
            shown = "%.1f" % float(result["value"][1])
            usage = float(shown)
            node = next((name for key, name in NODE_NAMES if key in instance), instance)
            if usage > crit_pct:
                level = "CRIT"
            elif usage > warn_pct:
                level = "HIGH"
            else:
                level = "OK"
            print(f"{level}:{shown}:{node}")
except Exception as e:
    print(f"ERROR:{e}")
' "$warn_pct" "$crit_pct" <<< "$cpu_data" 2>/dev/null) || true

  # Empty output (no series / non-success status / python3 failure) or an
  # ERROR line from the helper both mean there is nothing usable to report.
  if [[ -z "$cpu_results" || "$cpu_results" == ERROR:* || "$cpu_results" == *$'\n'ERROR:* ]]; then
    warn "Failed to parse Prometheus CPU data"
    json_add "prometheus_cpu" "WARN" "Parse failed"
    return 0
  fi

  # With IFS=":" the last variable receives the rest of the line, embedded
  # colons included, so "host:port" node names survive intact.
  while IFS=':' read -r level usage node; do
    [[ -z "$level" || -z "$usage" || -z "$node" ]] && continue

    case "$level" in
      CRIT)
        # In quiet mode the section header is printed only once, on the first issue.
        if [[ "$had_issue" == false && "$QUIET" == true ]]; then
          section_always 25 "Advanced CPU Monitoring"
        fi
        fail "$node: ${usage}% CPU (critical)"
        detail+="$node=${usage}% [CRIT]; "
        had_issue=true
        status="FAIL"
        ;;
      HIGH)
        if [[ "$had_issue" == false && "$QUIET" == true ]]; then
          section_always 25 "Advanced CPU Monitoring"
        fi
        warn "$node: ${usage}% CPU (high)"
        detail+="$node=${usage}% [HIGH]; "
        had_issue=true
        # Never downgrade an earlier FAIL to WARN.
        if [[ "$status" != "FAIL" ]]; then
          status="WARN"
        fi
        ;;
      OK)
        detail+="$node=${usage}% [OK]; "
        ;;
    esac
  done <<< "$cpu_results"

  if [[ "$had_issue" == false ]]; then
    pass "All nodes below 70% CPU usage (5m avg)"
  fi
  json_add "prometheus_cpu" "$status" "$detail"
}
|
||||
|
||||
# --- 26. Power Monitoring ---
|
||||
#######################################
# Section 26: GPU power draw taken from Prometheus (DCGM exporter metric
# DCGM_FI_DEV_POWER_USAGE), classified against Tesla T4 thresholds.
#
# Globals:   QUIET (read)
# Calls:     section, section_always, pass, warn, json_add
#            (reporting helpers defined elsewhere in this script),
#            curl, python3
# Outputs:   a warn line per GPU above the high threshold, a warn line when
#            no GPU power data could be obtained, otherwise a single pass
#            line; always records a "power_monitoring" entry via json_add
# Returns:   0 in every handled case
#######################################
check_power_monitoring() {
  section 26 "Power Monitoring"

  local prom_url="http://prometheus-server.monitoring.svc.cluster.local/api/v1/query"
  local gpu_query="DCGM_FI_DEV_POWER_USAGE"
  # Tesla T4 TDP is ~70W: >50W counts as active, >65W as high draw.
  local active_w=50 high_w=65
  local detail="" had_issue=false status="PASS"
  local gpu_data="" gpu_results="" level power node

  # --max-time bounds the whole transfer so a stalled response cannot hang
  # the health check. On failure any partial body is discarded.
  gpu_data=$(curl -s --connect-timeout 10 --max-time 30 \
    "${prom_url}?query=${gpu_query}" 2>/dev/null) || gpu_data=""

  if [[ -n "$gpu_data" ]]; then
    # The Python helper emits one "LEVEL:watts:hostname" line per GPU
    # series. Classification happens on the rounded float so that e.g.
    # 65.5W is treated as above the 65W threshold. A non-success response
    # or malformed JSON simply produces no lines.
    # The Python source must stay at column 0 and must not contain single quotes.
    gpu_results=$(python3 -c '
import json, sys

active_w, high_w = float(sys.argv[1]), float(sys.argv[2])

try:
    data = json.load(sys.stdin)
    if data.get("status") == "success":
        for result in data["data"]["result"]:
            hostname = result["metric"].get("Hostname", "unknown")
            shown = "%.1f" % float(result["value"][1])
            power = float(shown)
            if power > high_w:
                level = "HIGH"
            elif power > active_w:
                level = "ACTIVE"
            else:
                level = "IDLE"
            print(f"{level}:{shown}:{hostname}")
except Exception:
    # Nothing (more) is printed; the caller reports missing data.
    pass
' "$active_w" "$high_w" <<< "$gpu_data" 2>/dev/null) || true
  fi

  # No usable series at all (curl failure, error response, empty result set
  # or unparsable body): say so instead of claiming power is in range.
  if [[ -z "$gpu_results" ]]; then
    if [[ "$had_issue" == false && "$QUIET" == true ]]; then
      section_always 26 "Power Monitoring"
    fi
    warn "GPU power metrics unavailable"
    detail+="GPU metrics unavailable; "
    had_issue=true
    status="WARN"
  fi

  # The hostname is the last field so it may safely contain ":".
  while IFS=':' read -r level power node; do
    [[ -z "$level" || -z "$power" || -z "$node" ]] && continue

    case "$level" in
      HIGH)
        # In quiet mode the section header is printed only once, on the first issue.
        if [[ "$had_issue" == false && "$QUIET" == true ]]; then
          section_always 26 "Power Monitoring"
        fi
        warn "GPU $node: ${power}W (high power draw)"
        detail+="GPU-$node=${power}W [HIGH]; "
        had_issue=true
        if [[ "$status" != "FAIL" ]]; then
          status="WARN"
        fi
        ;;
      ACTIVE)
        detail+="GPU-$node=${power}W [ACTIVE]; "
        ;;
      IDLE)
        detail+="GPU-$node=${power}W [IDLE]; "
        ;;
    esac
  done <<< "$gpu_results"

  if [[ "$had_issue" == false ]]; then
    pass "Power consumption within normal ranges"
  fi
  json_add "power_monitoring" "$status" "$detail"
}
|
||||
|
||||
# --- Summary ---
|
||||
print_summary() {
|
||||
if [[ "$JSON" == true ]]; then
|
||||
|
|
@ -1324,6 +1456,8 @@ friendly_check_name() {
|
|||
tls_certs) echo "TLS Certificates" ;;
|
||||
gpu) echo "GPU" ;;
|
||||
cloudflare_tunnel) echo "Cloudflare Tunnel" ;;
|
||||
prometheus_cpu) echo "Advanced CPU Monitoring" ;;
|
||||
power_monitoring) echo "Power Monitoring" ;;
|
||||
*) echo "$1" ;;
|
||||
esac
|
||||
}
|
||||
|
|
@ -1377,6 +1511,8 @@ CHECK_NAMES = {
|
|||
'tls_certs': 'TLS Certificates',
|
||||
'gpu': 'GPU',
|
||||
'cloudflare_tunnel': 'Cloudflare Tunnel',
|
||||
'prometheus_cpu': 'Advanced CPU Monitoring',
|
||||
'power_monitoring': 'Power Monitoring',
|
||||
}
|
||||
|
||||
def format_detail(check, detail):
|
||||
|
|
@ -1525,6 +1661,8 @@ main() {
|
|||
check_tls_certs
|
||||
check_gpu
|
||||
check_cloudflare_tunnel
|
||||
check_prometheus_cpu
|
||||
check_power_monitoring
|
||||
print_summary
|
||||
send_slack
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue