diff --git a/.claude/cluster-health.sh b/.claude/cluster-health.sh
index 303f708e..e2ba8ff5 100755
--- a/.claude/cluster-health.sh
+++ b/.claude/cluster-health.sh
@@ -1,44 +1,69 @@
 #!/usr/bin/env bash
-# Cluster health check script for OpenClaw pod.
-# Runs 8 health checks, auto-fixes safe issues, and posts results to Slack.
+
+# Cluster health check script (pod-compatible version).
+# Runs 24 diagnostic checks against the Kubernetes cluster and prints
+# a colour-coded report with PASS / WARN / FAIL for each section.
+# Optionally posts results to Slack.
 #
-# Usage: bash cluster-health.sh [--no-slack] [--no-fix]
+# Usage: ./cluster-health.sh [--fix] [--quiet|-q] [--json] [--kubeconfig <path>] [--no-slack]
 #
 # Environment:
-#   KUBECONFIG        — path to kubeconfig (set automatically in the pod)
-#   SLACK_WEBHOOK_URL — Slack incoming webhook URL (required unless --no-slack)
+#   KUBECONFIG           — path to kubeconfig (used in pod environment)
+#   SLACK_WEBHOOK_URL    — Slack incoming webhook URL (required unless --no-slack)
+#   UPTIME_KUMA_PASSWORD — Uptime Kuma admin password
 
 set -euo pipefail
 
-# --- Globals ---
-KUBECTL="kubectl"
-SEND_SLACK=true
-AUTO_FIX=true
-ISSUES=()
-FIXES=()
-WARNINGS=()
+# --- Colors ---
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[0;33m'
+BLUE='\033[0;34m'
+BOLD='\033[1m'
+NC='\033[0m'
 
-# --- Argument parsing ---
-while [[ $# -gt 0 ]]; do
-  case "$1" in
-    --no-slack) SEND_SLACK=false; shift ;;
-    --no-fix) AUTO_FIX=false; shift ;;
-    -h|--help)
-      echo "Usage: $0 [--no-slack] [--no-fix]"
-      echo ""
-      echo "Flags:"
-      echo "  --no-slack   Skip Slack notification"
-      echo "  --no-fix     Skip auto-fix actions (report only)"
-      exit 0
-      ;;
-    *)
-      echo "Unknown option: $1" >&2
-      exit 1
-      ;;
-  esac
-done
+# --- Globals ---
+PASS_COUNT=0
+WARN_COUNT=0
+FAIL_COUNT=0
+FIX=false
+QUIET=false
+JSON=false
+SEND_SLACK=true
+KUBECONFIG_PATH="${KUBECONFIG:-$(pwd)/config}"
+KUBECTL=""
+JSON_RESULTS=()
+TOTAL_CHECKS=24
 
 # --- Helpers ---
+info() { [[ "$JSON" == true ]] && return 0; echo -e "${BLUE}[INFO]${NC} $*"; }
+pass() { PASS_COUNT=$((PASS_COUNT + 1)); [[ "$JSON" == true ]] && return 0; [[ "$QUIET" == true ]] && return 0; echo -e "  ${GREEN}[PASS]${NC} $*"; }
+warn() { WARN_COUNT=$((WARN_COUNT + 1)); [[ "$JSON" == true ]] && return 0; echo -e "  ${YELLOW}[WARN]${NC} $*"; }
+fail() { FAIL_COUNT=$((FAIL_COUNT + 1)); [[ "$JSON" == true ]] && return 0; echo -e "  ${RED}[FAIL]${NC} $*"; }
+
+section() {
+  local num="$1" title="$2"
+  [[ "$JSON" == true ]] && return 0
+  [[ "$QUIET" == true ]] && return 0
+  echo ""
+  echo -e "${BOLD}[$num/$TOTAL_CHECKS] $title${NC}"
+}
+
+section_always() {
+  local num="$1" title="$2"
+  [[ "$JSON" == true ]] && return 0
+  echo ""
+  echo -e "${BOLD}[$num/$TOTAL_CHECKS] $title${NC}"
+}
+
+json_add() {
+  local name="$1" status="$2" detail="$3"
+  local escaped
+  escaped=$(echo "$detail" | python3 -c 'import json,sys; print(json.dumps(sys.stdin.read().strip()))')
+  JSON_RESULTS+=("{\"check\":\"$name\",\"status\":\"$status\",\"detail\":$escaped}")
+}
+
+# count lines in a variable, returning 0 for empty strings
 count_lines() {
   local input="$1"
   if [[ -z "$input" ]]; then
@@ -48,186 +73,373 @@ count_lines() {
   fi
 }
 
-# --- 1. 
Node Health ---
-echo "=== [1/8] Node Health ==="
+# --- Argument parsing ---
+parse_args() {
+  while [[ $# -gt 0 ]]; do
+    case "$1" in
+      --fix) FIX=true; shift ;;
+      --quiet|-q) QUIET=true; shift ;;
+      --json) JSON=true; shift ;;
+      --no-slack) SEND_SLACK=false; shift ;;
+      --kubeconfig) KUBECONFIG_PATH="$2"; shift 2 ;;
+      -h|--help)
+        echo "Usage: $0 [--fix] [--quiet|-q] [--json] [--kubeconfig <path>] [--no-slack]"
+        echo ""
+        echo "Flags:"
+        echo "  --fix              Auto-remediate safe issues (delete evicted/CrashLoopBackOff pods)"
+        echo "  --quiet, -q        Only show WARN and FAIL sections"
+        echo "  --json             Machine-readable JSON output"
+        echo "  --kubeconfig PATH  Override kubeconfig (default: \$KUBECONFIG or \$(pwd)/config)"
+        echo "  --no-slack         Skip Slack notification"
+        exit 0
+        ;;
+      *)
+        echo "Unknown option: $1" >&2
+        exit 1
+        ;;
+    esac
+  done
+  KUBECTL="kubectl --kubeconfig $KUBECONFIG_PATH"
+}
 
-node_issues=false
-not_ready=$($KUBECTL get nodes --no-headers 2>/dev/null | awk '$2 != "Ready" {print $1}' || true)
-if [[ -n "$not_ready" ]]; then
-  while IFS= read -r node; do
-    ISSUES+=("Node $node is NotReady")
-    echo "  ISSUE: Node $node is NotReady"
-  done <<< "$not_ready"
-  node_issues=true
-fi
 
+# --- 1. Node Status ---
+check_nodes() {
+  section 1 "Node Status"
+  local nodes not_ready versions unique_versions detail=""
 
-# Check node conditions (MemoryPressure, DiskPressure, PIDPressure)
-pressure_conditions=$($KUBECTL get nodes -o json 2>/dev/null | python3 -c '
+  nodes=$($KUBECTL get nodes --no-headers 2>&1) || { fail "Cannot reach cluster"; json_add "node_status" "FAIL" "Cannot reach cluster"; return 0; }
+  not_ready=$(echo "$nodes" | awk '$2 != "Ready" {print $1}' || true)
+  versions=$(echo "$nodes" | awk '{print $5}' | sort -u)
+  unique_versions=$(echo "$versions" | wc -l | tr -d ' ')
+
+  if [[ -n "$not_ready" ]]; then
+    [[ "$QUIET" == true ]] && section_always 1 "Node Status"
+    fail "NotReady nodes: $not_ready"
+    detail="NotReady: $not_ready"
+    json_add "node_status" "FAIL" "$detail"
+  elif [[ "$unique_versions" -gt 1 ]]; then
+    [[ "$QUIET" == true ]] && section_always 1 "Node Status"
+    warn "Version mismatch across nodes: $(echo "$versions" | tr '\n' ' ')"
+    detail="Version mismatch: $(echo "$versions" | tr '\n' ' ')"
+    json_add "node_status" "WARN" "$detail"
+  else
+    pass "All nodes Ready, version $(echo "$versions" | head -1)"
+    detail="All nodes Ready"
+    json_add "node_status" "PASS" "$detail"
+  fi
+}
+
+# --- 2. 
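Node Resources: worked example ---
+# Illustrative only: how check_resources below classifies hypothetical
+# `kubectl top nodes` rows (columns: NAME, CPU(cores), CPU%, MEMORY(bytes), MEMORY%).
+#   worker-1   1200m   30%   9Gi   85%   -> WARN (memory above the 80% threshold)
+#   worker-2   3800m   95%   4Gi   40%   -> FAIL (CPU above the 90% threshold)
+#   worker-3    400m   10%   2Gi   35%   -> OK
+# --- 2. 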
Node Resources --- +check_resources() { + section 2 "Node Resources" + local top detail="" had_issue=false status="PASS" + + top=$($KUBECTL top nodes --no-headers 2>&1) || { fail "metrics-server unavailable"; json_add "node_resources" "FAIL" "metrics-server unavailable"; return 0; } + + while IFS= read -r line; do + local node cpu_pct mem_pct + node=$(echo "$line" | awk '{print $1}') + cpu_pct=$(echo "$line" | awk '{print $3}' | tr -d '%') + mem_pct=$(echo "$line" | awk '{print $5}' | tr -d '%') + + # Skip nodes where metrics are not yet available + if [[ "$cpu_pct" == *"unknown"* ]] || [[ "$mem_pct" == *"unknown"* ]]; then + detail+="$node metrics unavailable; " + continue + fi + + if [[ "$cpu_pct" -gt 90 ]] || [[ "$mem_pct" -gt 90 ]]; then + [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 2 "Node Resources" + fail "$node: CPU ${cpu_pct}%, Mem ${mem_pct}%" + detail+="$node CPU=${cpu_pct}% Mem=${mem_pct}% [FAIL]; " + had_issue=true + status="FAIL" + elif [[ "$cpu_pct" -gt 80 ]] || [[ "$mem_pct" -gt 80 ]]; then + [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 2 "Node Resources" + warn "$node: CPU ${cpu_pct}%, Mem ${mem_pct}%" + detail+="$node CPU=${cpu_pct}% Mem=${mem_pct}% [WARN]; " + had_issue=true + [[ "$status" != "FAIL" ]] && status="WARN" + else + detail+="$node CPU=${cpu_pct}% Mem=${mem_pct}% [OK]; " + fi + done <<< "$top" + + [[ "$had_issue" == false ]] && pass "All nodes below 80% CPU and memory" + json_add "node_resources" "$status" "$detail" +} + +# --- 3. Node Conditions --- +check_conditions() { + section 3 "Node Conditions" + local conditions detail="" + + conditions=$($KUBECTL get nodes -o json | python3 -c ' import json, sys data = json.load(sys.stdin) for node in data["items"]: name = node["metadata"]["name"] for c in node["status"]["conditions"]: - if c["type"] in ("MemoryPressure", "DiskPressure", "PIDPressure") and c["status"] == "True": - print(f"{name}: {c[\"type\"]}") -' 2>/dev/null) || true + if c["type"] in ("MemoryPressure","DiskPressure","PIDPressure") and c["status"] == "True": + print(name + ": " + c["type"]) +' 2>&1) || true -if [[ -n "$pressure_conditions" ]]; then - while IFS= read -r line; do - ISSUES+=("$line") - echo " ISSUE: $line" - done <<< "$pressure_conditions" - node_issues=true -fi - -if [[ "$node_issues" == false ]]; then - echo " OK" -fi - -# --- 2. 
Pod Health --- -echo "=== [2/8] Pod Health ===" - -bad_pods=$($KUBECTL get pods -A --no-headers 2>/dev/null \ - | grep -E 'CrashLoopBackOff|ImagePullBackOff|ErrImagePull|Error' || true) - -if [[ -n "$bad_pods" ]]; then - while IFS= read -r line; do - ns=$(echo "$line" | awk '{print $1}') - pod=$(echo "$line" | awk '{print $2}') - status=$(echo "$line" | awk '{print $4}') - restarts=$(echo "$line" | awk '{print $5}') - - # Clean up restart count (remove trailing characters like "d", "h" from age-based formats) - restarts_clean=$(echo "$restarts" | grep -oE '^[0-9]+' || echo "0") - - if [[ "$status" == "CrashLoopBackOff" ]] && [[ "$restarts_clean" -gt 10 ]]; then - if [[ "$AUTO_FIX" == true ]]; then - echo " FIX: Deleting CrashLoopBackOff pod $ns/$pod (restarts: $restarts_clean)" - $KUBECTL delete pod -n "$ns" "$pod" --grace-period=0 2>/dev/null || true - FIXES+=("Deleted CrashLoopBackOff pod $ns/$pod ($restarts_clean restarts)") - else - ISSUES+=("CrashLoopBackOff pod $ns/$pod with $restarts_clean restarts (would auto-fix)") - echo " ISSUE: CrashLoopBackOff pod $ns/$pod with $restarts_clean restarts" - fi - else - ISSUES+=("Pod $ns/$pod in $status state") - echo " ISSUE: Pod $ns/$pod in $status state" - fi - done <<< "$bad_pods" -else - echo " OK" -fi - -# --- 3. Evicted/Failed Pods --- -echo "=== [3/8] Evicted/Failed Pods ===" - -failed_pods=$($KUBECTL get pods -A --no-headers --field-selector=status.phase=Failed 2>/dev/null || true) -failed_count=$(count_lines "$failed_pods") - -if [[ "$failed_count" -gt 0 ]]; then - if [[ "$AUTO_FIX" == true ]]; then - echo " FIX: Deleting $failed_count evicted/failed pods" - $KUBECTL delete pods -A --field-selector=status.phase=Failed 2>/dev/null || true - FIXES+=("Deleted $failed_count evicted/failed pods") + if [[ -n "$conditions" ]]; then + [[ "$QUIET" == true ]] && section_always 3 "Node Conditions" + while IFS= read -r line; do + fail "$line" + done <<< "$conditions" + detail="$conditions" + json_add "node_conditions" "FAIL" "$detail" else - ISSUES+=("$failed_count evicted/failed pods (would auto-fix)") - echo " ISSUE: $failed_count evicted/failed pods" + pass "No pressure conditions on any node" + json_add "node_conditions" "PASS" "No pressure conditions" fi -else - echo " OK" -fi +} -# --- 4. Failed Deployments --- -echo "=== [4/8] Failed Deployments ===" +# --- 4. 
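Problematic Pods: worked example ---
+# Illustrative only: the two `kubectl get pods` listings merged in check_pods can
+# overlap (a CrashLoopBackOff pod matches both queries), so `awk '!seen[$1,$2]++'`
+# keeps the first row per (namespace, pod) key. Hypothetical merged input:
+#   default  api-7f9c  0/1  CrashLoopBackOff  12  3h   <- kept
+#   default  api-7f9c  0/1  CrashLoopBackOff  12  3h   <- dropped (duplicate key)
+#   infra    db-init   0/1  Pending           0   5m   <- kept
+# With --fix, CrashLoopBackOff pods with more than 10 restarts are also deleted.
+# --- 4. 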
Problematic Pods --- +check_pods() { + section 4 "Problematic Pods" + local bad count detail="" status="PASS" -deploy_issues=false -deployments=$($KUBECTL get deployments -A --no-headers 2>/dev/null || true) + bad=$( { + $KUBECTL get pods -A --no-headers --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null \ + | grep -E 'CrashLoopBackOff|Error|Pending|Init:|ImagePullBackOff|ErrImagePull' || true + $KUBECTL get pods -A --no-headers 2>/dev/null \ + | grep -E 'CrashLoopBackOff|ImagePullBackOff|ErrImagePull' || true + } | awk '!seen[$1,$2]++' | sed '/^$/d') || true + + count=$(count_lines "$bad") + + # Auto-fix CrashLoopBackOff pods with >10 restarts when --fix is enabled + if [[ "$FIX" == true && "$count" -gt 0 ]]; then + local fixed_count=0 + while IFS= read -r line; do + [[ -z "$line" ]] && continue + local ns pod pod_status restarts restarts_clean + ns=$(echo "$line" | awk '{print $1}') + pod=$(echo "$line" | awk '{print $2}') + pod_status=$(echo "$line" | awk '{print $4}') + restarts=$(echo "$line" | awk '{print $5}') + restarts_clean=$(echo "$restarts" | grep -oE '^[0-9]+' || echo "0") + + if [[ "$pod_status" == "CrashLoopBackOff" ]] && [[ "$restarts_clean" -gt 10 ]]; then + info "Deleting CrashLoopBackOff pod $ns/$pod (restarts: $restarts_clean)" + $KUBECTL delete pod -n "$ns" "$pod" --grace-period=0 2>/dev/null || true + fixed_count=$((fixed_count + 1)) + fi + done <<< "$bad" + if [[ "$fixed_count" -gt 0 ]]; then + info "Deleted $fixed_count CrashLoopBackOff pod(s) with >10 restarts" + fi + fi + + if [[ "$count" -eq 0 ]]; then + pass "No problematic pods" + detail="None" + elif [[ "$count" -le 10 ]]; then + [[ "$QUIET" == true ]] && section_always 4 "Problematic Pods" + warn "$count problematic pod(s):" + [[ "$JSON" != true ]] && echo "$bad" | while IFS= read -r line; do echo " $line"; done + detail="$count pods" + status="WARN" + else + [[ "$QUIET" == true ]] && section_always 4 "Problematic Pods" + fail "$count problematic pods (showing first 10):" + [[ "$JSON" != true ]] && echo "$bad" | head -10 | while IFS= read -r line; do echo " $line"; done + detail="$count pods" + status="FAIL" + fi + json_add "problematic_pods" "$status" "$detail" +} + +# --- 5. Evicted/Failed Pods --- +check_evicted() { + section 5 "Evicted/Failed Pods" + local evicted count detail="" status="PASS" + + evicted=$($KUBECTL get pods -A --no-headers --field-selector=status.phase=Failed 2>/dev/null || true) + count=$(count_lines "$evicted") + + if [[ "$count" -eq 0 ]]; then + pass "No evicted or failed pods" + detail="0" + elif [[ "$count" -le 50 ]]; then + [[ "$QUIET" == true ]] && section_always 5 "Evicted/Failed Pods" + warn "$count evicted/failed pod(s)" + detail="$count pods" + status="WARN" + else + [[ "$QUIET" == true ]] && section_always 5 "Evicted/Failed Pods" + fail "$count evicted/failed pods" + detail="$count pods" + status="FAIL" + fi + + if [[ "$FIX" == true && "$count" -gt 0 ]]; then + info "Deleting $count evicted/failed pods..." + $KUBECTL delete pods -A --field-selector=status.phase=Failed 2>/dev/null || true + info "Deleted evicted/failed pods" + fi + json_add "evicted_pods" "$status" "$detail" +} + +# --- 6. 
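DaemonSets: worked example ---
+# Illustrative only: check_daemonsets compares column 3 (DESIRED) against column 5
+# (READY) of `kubectl get daemonsets -A --no-headers`. A hypothetical row
+#   kube-system  kube-proxy  5  5  4  5  4  kubernetes.io/os=linux  200d
+# yields desired=5, ready=4 -> FAIL "kube-system/kube-proxy: desired=5 ready=4".
+# --- 6. 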
DaemonSets --- +check_daemonsets() { + section 6 "DaemonSets" + local ds detail="" had_issue=false + + ds=$($KUBECTL get daemonsets -A --no-headers 2>&1) || { fail "Cannot list DaemonSets"; json_add "daemonsets" "FAIL" "Cannot list"; return 0; } -if [[ -n "$deployments" ]]; then while IFS= read -r line; do + local ns name desired ready ns=$(echo "$line" | awk '{print $1}') name=$(echo "$line" | awk '{print $2}') - ready_col=$(echo "$line" | awk '{print $3}') - current=$(echo "$ready_col" | cut -d/ -f1) - desired=$(echo "$ready_col" | cut -d/ -f2) + desired=$(echo "$line" | awk '{print $3}') + ready=$(echo "$line" | awk '{print $5}') + + if [[ "$desired" != "$ready" ]]; then + [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 6 "DaemonSets" + fail "$ns/$name: desired=$desired ready=$ready" + detail+="$ns/$name desired=$desired ready=$ready; " + had_issue=true + fi + done <<< "$ds" + + if [[ "$had_issue" == false ]]; then + pass "All DaemonSets healthy (desired == ready)" + json_add "daemonsets" "PASS" "All healthy" + else + json_add "daemonsets" "FAIL" "$detail" + fi +} + +# --- 7. Deployments --- +check_deployments() { + section 7 "Deployments" + local deps detail="" had_issue=false + + deps=$($KUBECTL get deployments -A --no-headers 2>&1) || { fail "Cannot list Deployments"; json_add "deployments" "FAIL" "Cannot list"; return 0; } + + while IFS= read -r line; do + local ns name ready current desired + ns=$(echo "$line" | awk '{print $1}') + name=$(echo "$line" | awk '{print $2}') + ready=$(echo "$line" | awk '{print $3}') + current=$(echo "$ready" | cut -d/ -f1) + desired=$(echo "$ready" | cut -d/ -f2) if [[ "$current" != "$desired" ]]; then - ISSUES+=("Deployment $ns/$name: $current/$desired replicas ready") - echo " ISSUE: Deployment $ns/$name: $current/$desired replicas ready" - deploy_issues=true + [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 7 "Deployments" + fail "$ns/$name: $current/$desired ready" + detail+="$ns/$name $current/$desired; " + had_issue=true fi - done <<< "$deployments" -fi + done <<< "$deps" -if [[ "$deploy_issues" == false ]]; then - echo " OK" -fi + if [[ "$had_issue" == false ]]; then + pass "All deployments fully available" + json_add "deployments" "PASS" "All available" + else + json_add "deployments" "FAIL" "$detail" + fi +} -# --- 5. Pending PVCs --- -echo "=== [5/8] Pending PVCs ===" +# --- 8. 
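PVC Status: worked example ---
+# Illustrative only: any phase other than Bound in column 3 of
+# `kubectl get pvc -A --no-headers` is reported. Hypothetical rows:
+#   media   downloads-pvc  Bound    pvc-3fa1  50Gi   -> OK
+#   backup  restore-pvc    Pending                   -> FAIL "backup/restore-pvc: Pending"
+# --- 8. 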
PVC Status ---
+check_pvcs() {
+  section 8 "PVC Status"
+  local pvcs detail="" had_issue=false
 
-pvc_issues=false
-pvcs=$($KUBECTL get pvc -A --no-headers 2>/dev/null || true)
+  pvcs=$($KUBECTL get pvc -A --no-headers 2>&1) || true
+  if [[ -z "$pvcs" || "$pvcs" == *"No resources found"* ]]; then
+    pass "No PVCs in cluster"
+    json_add "pvcs" "PASS" "No PVCs"
+    return 0
+  fi
 
-if [[ -n "$pvcs" && "$pvcs" != *"No resources found"* ]]; then
   while IFS= read -r line; do
+    local ns name status
     ns=$(echo "$line" | awk '{print $1}')
     name=$(echo "$line" | awk '{print $2}')
     status=$(echo "$line" | awk '{print $3}')
 
     if [[ "$status" != "Bound" ]]; then
-      ISSUES+=("PVC $ns/$name in $status state")
-      echo "  ISSUE: PVC $ns/$name in $status state"
-      pvc_issues=true
+      [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 8 "PVC Status"
+      fail "$ns/$name: $status"
+      detail+="$ns/$name=$status; "
+      had_issue=true
     fi
   done <<< "$pvcs"
-fi
 
-if [[ "$pvc_issues" == false ]]; then
-  echo "  OK"
-fi
+  if [[ "$had_issue" == false ]]; then
+    pass "All PVCs Bound"
+    json_add "pvcs" "PASS" "All Bound"
+  else
+    json_add "pvcs" "FAIL" "$detail"
+  fi
+}
 
-# --- 6. Resource Pressure ---
-echo "=== [6/8] Resource Pressure ==="
+# --- 9. HPA Health ---
+check_hpa() {
+  section 9 "HPA Health"
+  local hpas detail="" had_issue=false status="PASS"
 
-resource_issues=false
-top_output=$($KUBECTL top nodes --no-headers 2>/dev/null || true)
+  hpas=$($KUBECTL get hpa -A --no-headers 2>&1) || true
+  if [[ -z "$hpas" || "$hpas" == *"No resources found"* ]]; then
+    pass "No HPAs configured"
+    json_add "hpa" "PASS" "No HPAs"
+    return 0
+  fi
 
-if [[ -n "$top_output" ]]; then
   while IFS= read -r line; do
-    node=$(echo "$line" | awk '{print $1}')
-    cpu_pct=$(echo "$line" | awk '{print $3}' | tr -d '%')
-    mem_pct=$(echo "$line" | awk '{print $5}' | tr -d '%')
+    local ns name targets
+    ns=$(echo "$line" | awk '{print $1}')
+    name=$(echo "$line" | awk '{print $2}')
+    targets=$(echo "$line" | awk '{print $3}')
 
-    # Skip unknown metrics
-    if [[ "$cpu_pct" == *"unknown"* ]] || [[ "$mem_pct" == *"unknown"* ]]; then
-      continue
+    if echo "$targets" | grep -q '<unknown>'; then
+      [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 9 "HPA Health"
+      fail "$ns/$name: targets=$targets (unknown metrics)"
+      detail+="$ns/$name=unknown; "
+      had_issue=true
+      status="FAIL"
+    else
+      # Parse percentage values from targets like "45%/80%, 30%/50%"
+      local pcts
+      pcts=$(echo "$targets" | grep -oE '[0-9]+%/' | tr -d '%/' || true)
+      if [[ -n "$pcts" ]]; then
+        while IFS= read -r pct; do
+          [[ -z "$pct" ]] && continue
+          if [[ "$pct" -gt 150 ]]; then
+            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 9 "HPA Health"
+            fail "$ns/$name: utilization at ${pct}%"
+            detail+="$ns/$name=${pct}%; "
+            had_issue=true
+            status="FAIL"
+            break
+          elif [[ "$pct" -gt 100 ]]; then
+            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 9 "HPA Health"
+            warn "$ns/$name: utilization at ${pct}%"
+            detail+="$ns/$name=${pct}%; "
+            had_issue=true
+            [[ "$status" != "FAIL" ]] && status="WARN"
+            break
+          fi
+        done <<< "$pcts"
+      fi
     fi
+  done <<< "$hpas"
 
-    if [[ "$cpu_pct" -gt 90 ]] || [[ "$mem_pct" -gt 90 ]]; then
-      ISSUES+=("Node $node under pressure: CPU ${cpu_pct}%, Mem ${mem_pct}%")
-      echo "  ISSUE: Node $node under pressure: CPU ${cpu_pct}%, Mem ${mem_pct}%"
-      resource_issues=true
-    elif [[ "$cpu_pct" -gt 80 ]] || [[ "$mem_pct" -gt 80 ]]; then
-      WARNINGS+=("Node $node elevated usage: CPU ${cpu_pct}%, Mem ${mem_pct}%")
-      echo "  WARN: Node $node elevated usage: 
CPU ${cpu_pct}%, Mem ${mem_pct}%" - resource_issues=true - fi - done <<< "$top_output" -else - WARNINGS+=("metrics-server unavailable, cannot check resource pressure") - echo " WARN: metrics-server unavailable" - resource_issues=true -fi + [[ "$had_issue" == false ]] && pass "All HPAs healthy" + json_add "hpa" "$status" "${detail:-All healthy}" +} -if [[ "$resource_issues" == false ]]; then - echo " OK" -fi +# --- 10. CronJob Failures --- +check_cronjobs() { + section 10 "CronJob Failures" + local failures detail="" -# --- 7. CronJob Failures --- -echo "=== [7/8] CronJob Failures ===" - -cronjob_failures=$($KUBECTL get jobs -A -o json 2>/dev/null | python3 -c ' + failures=$($KUBECTL get jobs -A -o json 2>/dev/null | python3 -c ' import json, sys from datetime import datetime, timezone, timedelta @@ -253,126 +465,871 @@ for job in data.get("items", []): t = datetime.fromisoformat(ts.replace("Z", "+00:00")) if t > cutoff: print(f"{ns}/{name}: {c.get(\"reason\", \"Unknown\")}") - except Exception: + except: print(f"{ns}/{name}: {c.get(\"reason\", \"Unknown\")}") ' 2>/dev/null) || true -if [[ -n "$cronjob_failures" ]]; then - while IFS= read -r line; do - ISSUES+=("CronJob failure: $line") - echo " ISSUE: CronJob failure: $line" - done <<< "$cronjob_failures" -else - echo " OK" -fi - -# --- 8. DaemonSet Health --- -echo "=== [8/8] DaemonSet Health ===" - -ds_issues=false -daemonsets=$($KUBECTL get daemonsets -A --no-headers 2>/dev/null || true) - -if [[ -n "$daemonsets" ]]; then + if [[ -z "$failures" ]]; then + pass "No CronJob failures in last 24h" + json_add "cronjob_failures" "PASS" "None" + else + [[ "$QUIET" == true ]] && section_always 10 "CronJob Failures" + local count + count=$(count_lines "$failures") + fail "$count CronJob failure(s) in last 24h:" + [[ "$JSON" != true ]] && echo "$failures" | while IFS= read -r line; do echo " $line"; done + json_add "cronjob_failures" "FAIL" "$count failures" + fi +} + +# --- 11. CrowdSec --- +check_crowdsec() { + section 11 "CrowdSec Agents" + local cs_pods not_running + + cs_pods=$($KUBECTL get pods -n crowdsec --no-headers 2>/dev/null || true) + if [[ -z "$cs_pods" ]]; then + [[ "$QUIET" == true ]] && section_always 11 "CrowdSec Agents" + warn "CrowdSec namespace not found or empty" + json_add "crowdsec" "WARN" "No CrowdSec pods found" + return 0 + fi + + not_running=$(echo "$cs_pods" | awk '$3 != "Running" && $3 != "Completed" {print $1 ": " $3}' || true) + if [[ -n "$not_running" ]]; then + [[ "$QUIET" == true ]] && section_always 11 "CrowdSec Agents" + while IFS= read -r line; do + fail "CrowdSec pod not running: $line" + done <<< "$not_running" + json_add "crowdsec" "FAIL" "$not_running" + else + local total + total=$(count_lines "$cs_pods") + pass "All $total CrowdSec pods running" + json_add "crowdsec" "PASS" "$total pods running" + fi +} + +# --- 12. 
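Ingress Routes: worked example ---
+# Illustrative only: column 5 of `kubectl get ingress -A --no-headers` is ADDRESS;
+# an empty value (or a literal placeholder such as <pending>) means no load
+# balancer is assigned yet. The Traefik probe reads the LB IP with
+#   kubectl get svc -n traefik traefik -o jsonpath='{.status.loadBalancer.ingress[0].ip}'
+# which prints e.g. "10.0.20.200" (hypothetical) when healthy, and nothing when missing.
+# --- 12. 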
Ingress ---
+check_ingresses() {
+  section 12 "Ingress Routes"
+  local ingresses no_lb detail="" had_issue=false
+
+  ingresses=$($KUBECTL get ingress -A --no-headers 2>/dev/null || true)
+  if [[ -n "$ingresses" ]]; then
+    no_lb=$(echo "$ingresses" | awk '{if ($5 == "" || $5 == "<pending>") print $1"/"$2}' || true)
+    if [[ -n "$no_lb" ]]; then
+      [[ "$QUIET" == true ]] && section_always 12 "Ingress Routes"
+      while IFS= read -r line; do
+        fail "Ingress missing LB IP: $line"
+      done <<< "$no_lb"
+      detail="Missing LB: $no_lb"
+      had_issue=true
+    fi
+  fi
+
+  # Check Traefik LB service
+  local traefik_svc_ip
+  traefik_svc_ip=$($KUBECTL get svc -n traefik traefik -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true)
+  if [[ -z "$traefik_svc_ip" ]]; then
+    [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 12 "Ingress Routes"
+    fail "Traefik LoadBalancer has no external IP"
+    detail+="Traefik LB missing IP; "
+    had_issue=true
+  else
+    detail+="Traefik LB=$traefik_svc_ip; "
+  fi
+
+  if [[ "$had_issue" == false ]]; then
+    pass "All ingresses have LB assignment (Traefik LB=$traefik_svc_ip)"
+    json_add "ingresses" "PASS" "$detail"
+  else
+    json_add "ingresses" "FAIL" "$detail"
+  fi
+}
+
+# --- 13. Prometheus Alerts ---
+check_alerts() {
+  section 13 "Prometheus Alerts"
+  local alerts firing_count
+
+  # Try alertmanager first, then prometheus server
+  alerts=$($KUBECTL exec -n monitoring deploy/prometheus-alertmanager -- \
+    wget -q -O- http://localhost:9093/api/v2/alerts 2>/dev/null || true)
+
+  if [[ -z "$alerts" ]]; then
+    alerts=$($KUBECTL exec -n monitoring deploy/prometheus-server -- \
+      wget -q -O- http://localhost:9090/api/v1/alerts 2>/dev/null || true)
+  fi
+
+  if [[ -z "$alerts" ]]; then
+    [[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts"
+    warn "Could not query Prometheus/Alertmanager"
+    json_add "prometheus_alerts" "WARN" "Cannot query"
+    return 0
+  fi
+
+  firing_count=$(echo "$alerts" | python3 -c '
+import json, sys
+try:
+    data = json.load(sys.stdin)
+    if isinstance(data, list):
+        active = [a for a in data if a.get("status", {}).get("state") == "active"]
+        count = len(active)
+        names = [a.get("labels", {}).get("alertname", "?") for a in active]
+        print(f"{count}:" + ",".join(names) if count > 0 else "0:")
+    elif isinstance(data, dict) and "data" in data:
+        alerts_list = data["data"].get("alerts", [])
+        firing = [a for a in alerts_list if a.get("state") == "firing"]
+        count = len(firing)
+        names = [a.get("labels", {}).get("alertname", "?") for a in firing]
+        print(f"{count}:" + ",".join(names) if count > 0 else "0:")
+    else:
+        print("0:")
+except:
+    print("-1:")
+' 2>/dev/null || echo "-1:")
+
+  local count names
+  count=$(echo "$firing_count" | cut -d: -f1)
+  names=$(echo "$firing_count" | cut -d: -f2-)
+
+  if [[ "$count" == "-1" ]]; then
+    [[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts"
+    warn "Failed to parse alert data"
+    json_add "prometheus_alerts" "WARN" "Parse error"
+  elif [[ "$count" -eq 0 ]]; then
+    pass "No firing alerts"
+    json_add "prometheus_alerts" "PASS" "0 firing"
+  elif [[ "$count" -le 3 ]]; then
+    [[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts"
+    warn "$count firing alert(s): $names"
+    json_add "prometheus_alerts" "WARN" "$count firing: $names"
+  else
+    [[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts"
+    fail "$count firing alerts: $names"
+    json_add "prometheus_alerts" "FAIL" "$count firing: $names"
+  fi
+}
+
+# --- 14. 
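Uptime Kuma: worked example ---
+# Illustrative only: the embedded Python below prints a single colon-separated
+# record that the shell splits with `cut`:
+#   down_count:up_count:paused_count:total_active:down_names
+# e.g. "2:40:3:42:blog, gitlab" (hypothetical) -> WARN, since 2 of 42 active
+# monitors are down and down_count <= 3. Beat status codes: 0=DOWN 1=UP 2=PENDING 3=MAINTENANCE.
+# --- 14. 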
Uptime Kuma --- +check_uptime_kuma() { + section 14 "Uptime Kuma Monitors" + local result + + result=$(python3 -c ' +import sys, os +try: + from uptime_kuma_api import UptimeKumaApi +except ImportError: + print("ERROR:uptime-kuma-api not installed") + sys.exit(0) + +try: + api = UptimeKumaApi("https://uptime.viktorbarzin.me") + password = os.environ.get("UPTIME_KUMA_PASSWORD", "") + if not password: + print("ERROR:UPTIME_KUMA_PASSWORD not set") + sys.exit(0) + api.login("admin", password) + + monitors = api.get_monitors() + down = [] + up_count = 0 + paused_count = 0 + + for m in monitors: + name = m.get("name", "unknown") + active = m.get("active", True) + if not active: + paused_count += 1 + continue + # Check heartbeat list for latest status + try: + hb = api.get_monitor_beats(m["id"], 1) + if hb and len(hb) > 0: + status = hb[-1].get("status", 0) + else: + status = m.get("status", 0) + except Exception: + status = m.get("status", 0) + # status: 0=DOWN, 1=UP, 2=PENDING, 3=MAINTENANCE + if status == 1: + up_count += 1 + elif status == 3: + paused_count += 1 + else: + down.append(name) + + api.disconnect() + + down_count = len(down) + total_active = up_count + down_count + down_names = ", ".join(down) if down else "" + print(f"{down_count}:{up_count}:{paused_count}:{total_active}:{down_names}") +except Exception as e: + print(f"CONN_ERROR:{e}") +' 2>/dev/null) || result="CONN_ERROR:python execution failed" + + if [[ "$result" == "ERROR:"* ]]; then + [[ "$QUIET" == true ]] && section_always 14 "Uptime Kuma Monitors" + warn "Uptime Kuma: ${result#ERROR:}" + json_add "uptime_kuma" "WARN" "${result#ERROR:}" + elif [[ "$result" == "CONN_ERROR:"* ]]; then + [[ "$QUIET" == true ]] && section_always 14 "Uptime Kuma Monitors" + warn "Cannot connect to Uptime Kuma: ${result#CONN_ERROR:}" + json_add "uptime_kuma" "WARN" "Connection failed" + else + local down_count up_count paused_count total_active down_names + down_count=$(echo "$result" | cut -d: -f1) + up_count=$(echo "$result" | cut -d: -f2) + paused_count=$(echo "$result" | cut -d: -f3) + total_active=$(echo "$result" | cut -d: -f4) + down_names=$(echo "$result" | cut -d: -f5-) + + if [[ "$down_count" -eq 0 ]]; then + pass "All $total_active active monitors up ($paused_count paused)" + json_add "uptime_kuma" "PASS" "$total_active up, $paused_count paused" + elif [[ "$down_count" -le 3 ]]; then + [[ "$QUIET" == true ]] && section_always 14 "Uptime Kuma Monitors" + warn "$down_count/$total_active monitor(s) down: $down_names" + json_add "uptime_kuma" "WARN" "$down_count down: $down_names" + else + [[ "$QUIET" == true ]] && section_always 14 "Uptime Kuma Monitors" + fail "$down_count/$total_active monitors down: $down_names" + json_add "uptime_kuma" "FAIL" "$down_count down: $down_names" + fi + fi +} + +# --- 15. 
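ResourceQuota Pressure: worked example ---
+# Illustrative only: hard/used pairs are normalised before dividing (CPU to
+# millicores, memory and storage to bytes). Hypothetical quota status:
+#   requests.cpu  used=1900m  hard=2     -> 1900/2000 = 95%  -> WARN (80 < pct <= 95)
+#   memory        used=9.8Gi  hard=10Gi  -> 98%              -> FAIL (pct > 95)
+# --- 15. 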
ResourceQuota Pressure --- +check_resourcequota() { + section 15 "ResourceQuota Pressure" + local quotas detail="" had_issue=false status="PASS" + + quotas=$($KUBECTL get resourcequota -A -o json 2>/dev/null) || { pass "No ResourceQuotas configured"; json_add "resourcequota" "PASS" "No quotas"; return 0; } + + local pressure + pressure=$(echo "$quotas" | python3 -c ' +import json, sys, re + +def parse_cpu(val): + """Convert CPU value to millicores.""" + val = str(val) + if val.endswith("m"): + return float(val[:-1]) + return float(val) * 1000 + +def parse_mem(val): + """Convert memory value to bytes.""" + val = str(val) + units = {"Ki": 1024, "Mi": 1024**2, "Gi": 1024**3, "Ti": 1024**4} + for suffix, mult in units.items(): + if val.endswith(suffix): + return float(val[:-len(suffix)]) * mult + # Plain bytes or numeric + return float(val) + +data = json.load(sys.stdin) +for item in data.get("items", []): + ns = item["metadata"]["namespace"] + name = item["metadata"]["name"] + status = item.get("status", {}) + hard = status.get("hard", {}) + used = status.get("used", {}) + + for resource, hard_val in hard.items(): + used_val = used.get(resource, "0") + try: + if "cpu" in resource: + h = parse_cpu(hard_val) + u = parse_cpu(used_val) + elif "memory" in resource or "storage" in resource: + h = parse_mem(hard_val) + u = parse_mem(used_val) + elif resource == "pods": + h = float(hard_val) + u = float(used_val) + else: + continue + if h <= 0: + continue + pct = (u / h) * 100 + if pct > 80: + level = "FAIL" if pct > 95 else "WARN" + print(f"{level}:{ns}/{name}:{resource}:{pct:.0f}%") + except (ValueError, ZeroDivisionError): + pass +' 2>/dev/null) || true + + if [[ -z "$pressure" ]]; then + pass "All ResourceQuotas below 80% usage" + json_add "resourcequota" "PASS" "All below 80%" + else + [[ "$QUIET" == true ]] && section_always 15 "ResourceQuota Pressure" + while IFS= read -r line; do + local level ns_res resource pct + level=$(echo "$line" | cut -d: -f1) + ns_res=$(echo "$line" | cut -d: -f2) + resource=$(echo "$line" | cut -d: -f3) + pct=$(echo "$line" | cut -d: -f4) + if [[ "$level" == "FAIL" ]]; then + fail "$ns_res: $resource at $pct" + status="FAIL" + else + warn "$ns_res: $resource at $pct" + [[ "$status" != "FAIL" ]] && status="WARN" + fi + detail+="$ns_res $resource=$pct; " + had_issue=true + done <<< "$pressure" + json_add "resourcequota" "$status" "$detail" + fi +} + +# --- 16. 
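StatefulSets: worked example ---
+# Illustrative only: column 3 of `kubectl get statefulsets -A --no-headers` is
+# READY as "current/desired", split with `cut -d/`. Hypothetical rows:
+#   db     postgres  3/3  40d  -> OK
+#   queue  rabbitmq  1/3  12d  -> FAIL "queue/rabbitmq: 1/3 ready"
+# --- 16. 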
StatefulSets --- +check_statefulsets() { + section 16 "StatefulSets" + local sts detail="" had_issue=false + + sts=$($KUBECTL get statefulsets -A --no-headers 2>&1) || true + if [[ -z "$sts" || "$sts" == *"No resources found"* ]]; then + pass "No StatefulSets in cluster" + json_add "statefulsets" "PASS" "No StatefulSets" + return 0 + fi + while IFS= read -r line; do + local ns name ready current desired ns=$(echo "$line" | awk '{print $1}') name=$(echo "$line" | awk '{print $2}') - desired=$(echo "$line" | awk '{print $3}') - ready=$(echo "$line" | awk '{print $5}') + ready=$(echo "$line" | awk '{print $3}') + current=$(echo "$ready" | cut -d/ -f1) + desired=$(echo "$ready" | cut -d/ -f2) - if [[ "$desired" != "$ready" ]]; then - ISSUES+=("DaemonSet $ns/$name: desired=$desired ready=$ready") - echo " ISSUE: DaemonSet $ns/$name: desired=$desired ready=$ready" - ds_issues=true + if [[ "$current" != "$desired" ]]; then + [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 16 "StatefulSets" + fail "$ns/$name: $current/$desired ready" + detail+="$ns/$name $current/$desired; " + had_issue=true fi - done <<< "$daemonsets" -fi + done <<< "$sts" -if [[ "$ds_issues" == false ]]; then - echo " OK" -fi + if [[ "$had_issue" == false ]]; then + pass "All StatefulSets fully available" + json_add "statefulsets" "PASS" "All available" + else + json_add "statefulsets" "FAIL" "$detail" + fi +} + +# --- 17. Node Disk Usage --- +check_node_disk() { + section 17 "Node Disk Usage" + local node_json detail="" had_issue=false status="PASS" + + node_json=$($KUBECTL get nodes -o json 2>/dev/null) || { fail "Cannot get node info"; json_add "node_disk" "FAIL" "Cannot get nodes"; return 0; } + + local disk_info + disk_info=$(echo "$node_json" | python3 -c ' +import json, sys + +def parse_storage(val): + """Convert storage value to bytes.""" + val = str(val) + units = {"Ki": 1024, "Mi": 1024**2, "Gi": 1024**3, "Ti": 1024**4} + for suffix, mult in units.items(): + if val.endswith(suffix): + return float(val[:-len(suffix)]) * mult + return float(val) + +data = json.load(sys.stdin) +for node in data["items"]: + name = node["metadata"]["name"] + cap = node["status"].get("capacity", {}) + alloc = node["status"].get("allocatable", {}) + es_cap = cap.get("ephemeral-storage", "0") + es_alloc = alloc.get("ephemeral-storage", "0") + try: + c = parse_storage(es_cap) + a = parse_storage(es_alloc) + if c > 0: + used_pct = ((c - a) / c) * 100 + if used_pct > 80: + level = "FAIL" if used_pct > 90 else "WARN" + print(f"{level}:{name}:{used_pct:.0f}") + except (ValueError, ZeroDivisionError): + pass +' 2>/dev/null) || true + + if [[ -z "$disk_info" ]]; then + pass "All nodes below 80% ephemeral-storage usage" + json_add "node_disk" "PASS" "All below 80%" + else + [[ "$QUIET" == true ]] && section_always 17 "Node Disk Usage" + while IFS= read -r line; do + local level node pct + level=$(echo "$line" | cut -d: -f1) + node=$(echo "$line" | cut -d: -f2) + pct=$(echo "$line" | cut -d: -f3) + if [[ "$level" == "FAIL" ]]; then + fail "$node: ephemeral-storage at ${pct}%" + status="FAIL" + else + warn "$node: ephemeral-storage at ${pct}%" + [[ "$status" != "FAIL" ]] && status="WARN" + fi + detail+="$node=${pct}%; " + had_issue=true + done <<< "$disk_info" + json_add "node_disk" "$status" "$detail" + fi +} + +# --- 18. Helm Release Health --- +check_helm_releases() { + section 18 "Helm Release Health" + + # Helm may not be available in the pod environment + if ! 
command -v helm &>/dev/null; then + pass "Helm not available (skipped)" + json_add "helm_releases" "PASS" "Helm not available" + return 0 + fi + + local releases detail="" had_issue=false status="PASS" + + releases=$(helm list -A --kubeconfig "$KUBECONFIG_PATH" --all -o json 2>/dev/null) || { + [[ "$QUIET" == true ]] && section_always 18 "Helm Release Health" + warn "Cannot list Helm releases" + json_add "helm_releases" "WARN" "Cannot list" + return 0 + } + + local bad_releases + bad_releases=$(echo "$releases" | python3 -c ' +import json, sys +data = json.load(sys.stdin) +for r in data: + name = r.get("name", "?") + ns = r.get("namespace", "?") + st = r.get("status", "unknown") + if st != "deployed": + level = "FAIL" if st.startswith("pending") else "WARN" + print(f"{level}:{ns}/{name}:{st}") +' 2>/dev/null) || true + + if [[ -z "$bad_releases" ]]; then + pass "All Helm releases in deployed state" + json_add "helm_releases" "PASS" "All deployed" + else + [[ "$QUIET" == true ]] && section_always 18 "Helm Release Health" + while IFS= read -r line; do + local level release_name release_status + level=$(echo "$line" | cut -d: -f1) + release_name=$(echo "$line" | cut -d: -f2) + release_status=$(echo "$line" | cut -d: -f3) + if [[ "$level" == "FAIL" ]]; then + fail "Helm release $release_name: $release_status (blocks terraform)" + status="FAIL" + else + warn "Helm release $release_name: $release_status" + [[ "$status" != "FAIL" ]] && status="WARN" + fi + detail+="$release_name=$release_status; " + had_issue=true + done <<< "$bad_releases" + json_add "helm_releases" "$status" "$detail" + fi +} + +# --- 19. Kyverno Policy Engine --- +check_kyverno() { + section 19 "Kyverno Policy Engine" + local kv_pods not_running + + kv_pods=$($KUBECTL get pods -n kyverno --no-headers 2>/dev/null || true) + if [[ -z "$kv_pods" ]]; then + [[ "$QUIET" == true ]] && section_always 19 "Kyverno Policy Engine" + fail "Kyverno namespace not found or empty — policy engine down, cascading cluster impact" + json_add "kyverno" "FAIL" "No Kyverno pods found" + return 0 + fi + + not_running=$(echo "$kv_pods" | awk '$3 != "Running" && $3 != "Completed" {print $1 ": " $3}' || true) + if [[ -n "$not_running" ]]; then + [[ "$QUIET" == true ]] && section_always 19 "Kyverno Policy Engine" + while IFS= read -r line; do + fail "Kyverno pod not running: $line" + done <<< "$not_running" + json_add "kyverno" "FAIL" "$not_running" + else + local total + total=$(count_lines "$kv_pods") + pass "All $total Kyverno pods running" + json_add "kyverno" "PASS" "$total pods running" + fi +} + +# --- 20. 
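NFS Connectivity: worked example ---
+# Illustrative only: the check degrades gracefully, trying showmount (full export
+# list), then a raw TCP probe of port 2049, then an indirect signal (Running pods
+# that mount NFS volumes). A hypothetical successful local run:
+#   $ showmount -e 10.0.10.15
+#   Export list for 10.0.10.15:
+#   /mnt/pool 10.0.0.0/16
+# --- 20. 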
NFS Connectivity --- +check_nfs() { + section 20 "NFS Connectivity" + + # Try native tools first (available locally), fall back to kubectl-based check (pod environment) + if command -v showmount &>/dev/null; then + if showmount -e 10.0.10.15 &>/dev/null; then + pass "NFS server 10.0.10.15 reachable (exports listed)" + json_add "nfs" "PASS" "NFS reachable" + return 0 + fi + fi + + if command -v nc &>/dev/null; then + if nc -z -G 3 10.0.10.15 2049 &>/dev/null; then + pass "NFS server 10.0.10.15 port 2049 open" + json_add "nfs" "PASS" "NFS port open" + return 0 + fi + fi + + # Fallback: check if NFS-backed pods are running (works in pod environment) + local nfs_pods + nfs_pods=$($KUBECTL get pods -A -o json 2>/dev/null | python3 -c ' +import json, sys +data = json.load(sys.stdin) +count = 0 +for pod in data.get("items", []): + for vol in pod.get("spec", {}).get("volumes", []): + if "nfs" in vol: + if pod.get("status", {}).get("phase") == "Running": + count += 1 + break +print(count) +' 2>/dev/null) || nfs_pods="0" + + if [[ "$nfs_pods" -gt 0 ]]; then + pass "NFS healthy ($nfs_pods pods using NFS volumes are running)" + json_add "nfs" "PASS" "$nfs_pods NFS pods running" + else + [[ "$QUIET" == true ]] && section_always 20 "NFS Connectivity" + warn "Cannot verify NFS (showmount not available, no NFS pods found)" + json_add "nfs" "WARN" "Cannot verify" + fi +} + +# --- 21. DNS Resolution --- +check_dns() { + section 21 "DNS Resolution" + local internal_ok=false external_ok=false detail="" + + # Try dig first (available locally), fall back to python3 (pod environment) + if command -v dig &>/dev/null; then + if dig @10.0.20.101 viktorbarzin.me +short +time=3 +tries=1 &>/dev/null; then + internal_ok=true + fi + if dig @10.0.20.101 google.com +short +time=3 +tries=1 &>/dev/null; then + external_ok=true + fi + else + # Fallback: use python3 for DNS resolution (works in pod environment) + local result + result=$(python3 -c " +import socket +try: + socket.getaddrinfo('viktorbarzin.me', 443) + print('INTERNAL_OK') +except Exception: + print('INTERNAL_FAIL') +try: + socket.getaddrinfo('google.com', 443) + print('EXTERNAL_OK') +except Exception: + print('EXTERNAL_FAIL') +" 2>/dev/null) || result="" + + if echo "$result" | grep -q "INTERNAL_OK"; then + internal_ok=true + fi + if echo "$result" | grep -q "EXTERNAL_OK"; then + external_ok=true + fi + fi + + if [[ "$internal_ok" == true && "$external_ok" == true ]]; then + pass "DNS resolves both internal (viktorbarzin.me) and external (google.com)" + json_add "dns" "PASS" "Both resolve" + elif [[ "$internal_ok" == true || "$external_ok" == true ]]; then + [[ "$QUIET" == true ]] && section_always 21 "DNS Resolution" + if [[ "$internal_ok" == false ]]; then + warn "DNS: internal (viktorbarzin.me) failed, external (google.com) OK" + detail="Internal failed" + else + warn "DNS: internal (viktorbarzin.me) OK, external (google.com) failed" + detail="External failed" + fi + json_add "dns" "WARN" "$detail" + else + [[ "$QUIET" == true ]] && section_always 21 "DNS Resolution" + fail "DNS not resolving — both internal and external failed" + json_add "dns" "FAIL" "Both failed" + fi +} + +# --- 22. 
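TLS Certificate Expiry: worked example ---
+# Illustrative only: each unique tls.crt (deduplicated by SHA-256 fingerprint) is
+# piped to `openssl x509 -noout -enddate -subject`, which emits a line such as
+#   notAfter=Mar 14 09:26:53 2025 GMT        (hypothetical certificate)
+# parsed with strptime("%b %d %H:%M:%S %Y %Z"); 7 days or fewer left is FAIL,
+# 30 or fewer is WARN.
+# --- 22. 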
TLS Certificate Expiry --- +check_tls_certs() { + section 22 "TLS Certificate Expiry" + local secrets detail="" had_issue=false status="PASS" + + secrets=$($KUBECTL get secrets -A -o json 2>/dev/null) || { + [[ "$QUIET" == true ]] && section_always 22 "TLS Certificate Expiry" + warn "Cannot list secrets" + json_add "tls_certs" "WARN" "Cannot list secrets" + return 0 + } + + local cert_issues + cert_issues=$(echo "$secrets" | python3 -c ' +import json, sys, base64, subprocess, hashlib +from datetime import datetime, timezone + +data = json.load(sys.stdin) +seen_fingerprints = set() +results = [] + +for item in data.get("items", []): + if item.get("type") != "kubernetes.io/tls": + continue + ns = item["metadata"]["namespace"] + name = item["metadata"]["name"] + cert_data = item.get("data", {}).get("tls.crt", "") + if not cert_data: + continue + + # Deduplicate by cert fingerprint + raw = base64.b64decode(cert_data) + fp = hashlib.sha256(raw).hexdigest()[:16] + if fp in seen_fingerprints: + continue + seen_fingerprints.add(fp) + + # Parse certificate expiry with openssl + try: + result = subprocess.run( + ["openssl", "x509", "-noout", "-enddate", "-subject"], + input=raw, capture_output=True, timeout=5 + ) + output = result.stdout.decode() + for line in output.splitlines(): + if line.startswith("notAfter="): + date_str = line.split("=", 1)[1] + # Parse openssl date format: "Mon DD HH:MM:SS YYYY GMT" + try: + expiry = datetime.strptime(date_str.strip(), "%b %d %H:%M:%S %Y %Z") + expiry = expiry.replace(tzinfo=timezone.utc) + days_left = (expiry - datetime.now(timezone.utc)).days + if days_left <= 7: + print(f"FAIL:{ns}/{name}:{days_left}d") + elif days_left <= 30: + print(f"WARN:{ns}/{name}:{days_left}d") + except ValueError: + pass + except (subprocess.TimeoutExpired, Exception): + pass +' 2>/dev/null) || true + + if [[ -z "$cert_issues" ]]; then + pass "All TLS certificates valid for >30 days" + json_add "tls_certs" "PASS" "All valid >30d" + else + [[ "$QUIET" == true ]] && section_always 22 "TLS Certificate Expiry" + while IFS= read -r line; do + local level cert_name days + level=$(echo "$line" | cut -d: -f1) + cert_name=$(echo "$line" | cut -d: -f2) + days=$(echo "$line" | cut -d: -f3) + if [[ "$level" == "FAIL" ]]; then + fail "TLS cert $cert_name expires in $days" + status="FAIL" + else + warn "TLS cert $cert_name expires in $days" + [[ "$status" != "FAIL" ]] && status="WARN" + fi + detail+="$cert_name=$days; " + had_issue=true + done <<< "$cert_issues" + json_add "tls_certs" "$status" "$detail" + fi +} + +# --- 23. 
GPU Health --- +check_gpu() { + section 23 "GPU Health" + local gpu_pods not_running + + gpu_pods=$($KUBECTL get pods -n nvidia --no-headers 2>/dev/null || true) + if [[ -z "$gpu_pods" ]]; then + [[ "$QUIET" == true ]] && section_always 23 "GPU Health" + warn "NVIDIA namespace not found or empty" + json_add "gpu" "WARN" "No GPU pods found" + return 0 + fi + + # Check specifically for device-plugin (critical for GPU scheduling) + local device_plugin_down=false + local other_down=false + local detail="" + + while IFS= read -r line; do + local pod_name pod_status + pod_name=$(echo "$line" | awk '{print $1}') + pod_status=$(echo "$line" | awk '{print $3}') + if [[ "$pod_status" != "Running" && "$pod_status" != "Completed" ]]; then + if echo "$pod_name" | grep -q "device-plugin"; then + device_plugin_down=true + detail+="device-plugin $pod_name: $pod_status; " + else + other_down=true + detail+="$pod_name: $pod_status; " + fi + fi + done <<< "$gpu_pods" + + if [[ "$device_plugin_down" == true ]]; then + [[ "$QUIET" == true ]] && section_always 23 "GPU Health" + fail "GPU device-plugin is down — GPU workloads cannot schedule" + json_add "gpu" "FAIL" "$detail" + elif [[ "$other_down" == true ]]; then + [[ "$QUIET" == true ]] && section_always 23 "GPU Health" + warn "Some GPU pods not running: $detail" + json_add "gpu" "WARN" "$detail" + else + local total + total=$(count_lines "$gpu_pods") + pass "All $total GPU pods running" + json_add "gpu" "PASS" "$total pods running" + fi +} + +# --- 24. Cloudflare Tunnel --- +check_cloudflare_tunnel() { + section 24 "Cloudflare Tunnel" + local cf_pods running_count total_count + + cf_pods=$($KUBECTL get pods -n cloudflared --no-headers 2>/dev/null || true) + if [[ -z "$cf_pods" ]]; then + [[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel" + fail "Cloudflare tunnel namespace not found or empty — external access broken" + json_add "cloudflare_tunnel" "FAIL" "No pods found" + return 0 + fi + + total_count=$(count_lines "$cf_pods") + running_count=$(echo "$cf_pods" | awk '$3 == "Running"' | wc -l | tr -d ' ') + + if [[ "$running_count" -eq 0 ]]; then + [[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel" + fail "Cloudflare tunnel: 0/$total_count pods running — external access broken" + json_add "cloudflare_tunnel" "FAIL" "0/$total_count running" + elif [[ "$running_count" -lt "$total_count" ]]; then + [[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel" + warn "Cloudflare tunnel: $running_count/$total_count pods running (degraded)" + json_add "cloudflare_tunnel" "WARN" "$running_count/$total_count running" + else + pass "Cloudflare tunnel: all $total_count pods running" + json_add "cloudflare_tunnel" "PASS" "$total_count pods running" + fi +} # --- Summary --- -echo "" -echo "===============================" -echo " Summary" -echo "===============================" +print_summary() { + if [[ "$JSON" == true ]]; then + echo "{" + echo " \"timestamp\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"," + echo " \"pass\": $PASS_COUNT," + echo " \"warn\": $WARN_COUNT," + echo " \"fail\": $FAIL_COUNT," + echo " \"checks\": [" + local first=true + for r in "${JSON_RESULTS[@]}"; do + if [[ "$first" == true ]]; then + echo " $r" + first=false + else + echo " ,$r" + fi + done + echo " ]" + echo "}" + return 0 + fi -# Gather stats for the summary line -node_count=$($KUBECTL get nodes --no-headers 2>/dev/null | wc -l | tr -d ' ') -pod_count=$($KUBECTL get pods -A --no-headers --field-selector=status.phase=Running 2>/dev/null | wc -l | tr -d ' ') 
-issue_count=${#ISSUES[@]} -fix_count=${#FIXES[@]} -warning_count=${#WARNINGS[@]} - -stats="${node_count} nodes | ${pod_count} pods running | ${issue_count} issue(s)" - -echo " $stats" - -if [[ "$fix_count" -gt 0 ]]; then echo "" - echo " Auto-fixed:" - for fix in "${FIXES[@]}"; do - echo " - $fix" - done -fi - -if [[ "$issue_count" -gt 0 ]]; then + echo -e "${BOLD}═══════════════════════════════════════${NC}" + echo -e "${BOLD} Cluster Health Summary${NC}" + echo -e "${BOLD}═══════════════════════════════════════${NC}" + echo -e " ${GREEN}PASS${NC}: $PASS_COUNT ${YELLOW}WARN${NC}: $WARN_COUNT ${RED}FAIL${NC}: $FAIL_COUNT" echo "" - echo " Needs attention:" - for issue in "${ISSUES[@]}"; do - echo " - $issue" - done -fi -if [[ "$warning_count" -gt 0 ]]; then - echo "" - echo " Warnings:" - for w in "${WARNINGS[@]}"; do - echo " - $w" - done -fi - -# --- Slack notification --- -if [[ "$SEND_SLACK" == true ]]; then - if [[ -z "${SLACK_WEBHOOK_URL:-}" ]]; then - echo "" - echo "WARNING: SLACK_WEBHOOK_URL not set, skipping Slack notification" + if [[ "$FAIL_COUNT" -gt 0 ]]; then + echo -e " Overall: ${RED}UNHEALTHY${NC}" + elif [[ "$WARN_COUNT" -gt 0 ]]; then + echo -e " Overall: ${YELLOW}DEGRADED${NC}" else - # Build Slack message - if [[ "$issue_count" -eq 0 && "$warning_count" -eq 0 ]]; then - slack_text=":white_check_mark: *Cluster Health Check — All Clear*\n${stats}" - elif [[ "$issue_count" -eq 0 && "$warning_count" -gt 0 ]]; then - slack_text=":warning: *Cluster Health Check — ${warning_count} Warning(s)*\n${stats}" - for w in "${WARNINGS[@]}"; do - slack_text+="\n• ${w}" - done - else - slack_text=":rotating_light: *Cluster Health Check — ${issue_count} Issue(s) Found*\n${stats}" + echo -e " Overall: ${GREEN}HEALTHY${NC}" + fi + echo "" +} - if [[ "$fix_count" -gt 0 ]]; then - slack_text+="\n\n*Auto-fixed:*" - for fix in "${FIXES[@]}"; do - slack_text+="\n• ${fix}" - done - fi +# --- Slack Notification --- +send_slack() { + if [[ "$SEND_SLACK" != true ]]; then + return 0 + fi + if [[ -z "${SLACK_WEBHOOK_URL:-}" ]]; then + [[ "$JSON" != true ]] && echo "WARNING: SLACK_WEBHOOK_URL not set, skipping Slack notification" + return 0 + fi - if [[ "$issue_count" -gt 0 ]]; then - slack_text+="\n\n*Needs attention:*" - for issue in "${ISSUES[@]}"; do - slack_text+="\n• ${issue}" - done - fi + # Gather stats for summary line + local node_count pod_count + node_count=$($KUBECTL get nodes --no-headers 2>/dev/null | wc -l | tr -d ' ') + pod_count=$($KUBECTL get pods -A --no-headers --field-selector=status.phase=Running 2>/dev/null | wc -l | tr -d ' ') - if [[ "$warning_count" -gt 0 ]]; then - slack_text+="\n\n*Warnings:*" - for w in "${WARNINGS[@]}"; do - slack_text+="\n• ${w}" - done - fi + # Collect FAIL and WARN items from JSON_RESULTS + local fail_items="" warn_items="" + for r in "${JSON_RESULTS[@]}"; do + local r_status r_check r_detail + r_status=$(echo "$r" | python3 -c 'import json,sys; d=json.loads(sys.stdin.read()); print(d["status"])' 2>/dev/null || true) + r_check=$(echo "$r" | python3 -c 'import json,sys; d=json.loads(sys.stdin.read()); print(d["check"])' 2>/dev/null || true) + r_detail=$(echo "$r" | python3 -c 'import json,sys; d=json.loads(sys.stdin.read()); print(d["detail"])' 2>/dev/null || true) + + if [[ "$r_status" == "FAIL" ]]; then + fail_items+="\\n• [${r_check}] ${r_detail}" + elif [[ "$r_status" == "WARN" ]]; then + warn_items+="\\n• [${r_check}] ${r_detail}" fi + done - # Use python3 to build Slack Block Kit payload with proper mrkdwn - json_payload=$(printf '%b' 
"$slack_text" | python3 -c " + local slack_text + local total_checks=$((PASS_COUNT + WARN_COUNT + FAIL_COUNT)) + + if [[ "$FAIL_COUNT" -eq 0 && "$WARN_COUNT" -eq 0 ]]; then + slack_text=":white_check_mark: *Cluster Health Check — All Clear*\\n${total_checks}/${total_checks} checks passed | ${node_count} nodes | ${pod_count} pods running" + else + local issue_count=$((FAIL_COUNT + WARN_COUNT)) + slack_text=":rotating_light: *Cluster Health Check — ${issue_count} Issue(s)*\\nPASS: ${PASS_COUNT} | WARN: ${WARN_COUNT} | FAIL: ${FAIL_COUNT}" + + if [[ -n "$fail_items" ]]; then + slack_text+="\\n\\n*Failed:*${fail_items}" + fi + if [[ -n "$warn_items" ]]; then + slack_text+="\\n\\n*Warnings:*${warn_items}" + fi + fi + + # Build Slack Block Kit payload with proper mrkdwn + local json_payload + json_payload=$(printf '%b' "$slack_text" | python3 -c " import json, sys text = sys.stdin.read().strip() payload = { @@ -389,17 +1346,61 @@ payload = { print(json.dumps(payload)) ") - curl -s -X POST "$SLACK_WEBHOOK_URL" \ - -H 'Content-Type: application/json' \ - -d "$json_payload" >/dev/null 2>&1 || echo "WARNING: Failed to send Slack notification" + curl -s -X POST "$SLACK_WEBHOOK_URL" \ + -H 'Content-Type: application/json' \ + -d "$json_payload" >/dev/null 2>&1 || { + [[ "$JSON" != true ]] && echo "WARNING: Failed to send Slack notification" + } - echo "" - echo "Slack notification sent." + [[ "$JSON" != true ]] && echo "Slack notification sent." +} + +# --- Main --- +main() { + parse_args "$@" + + if [[ "$JSON" != true ]]; then + echo -e "${BOLD}Cluster Health Check${NC} — $(date '+%Y-%m-%d %H:%M:%S')" + echo -e "Kubeconfig: $KUBECONFIG_PATH" + if [[ "$FIX" == true ]]; then + echo -e "${YELLOW}Auto-fix mode enabled${NC}" + fi fi -fi -# --- Exit code --- -if [[ "$issue_count" -gt 0 ]]; then - exit 1 -fi -exit 0 + check_nodes + check_resources + check_conditions + check_pods + check_evicted + check_daemonsets + check_deployments + check_pvcs + check_hpa + check_cronjobs + check_crowdsec + check_ingresses + check_alerts + check_uptime_kuma + check_resourcequota + check_statefulsets + check_node_disk + check_helm_releases + check_kyverno + check_nfs + check_dns + check_tls_certs + check_gpu + check_cloudflare_tunnel + print_summary + send_slack + + # Exit code: 2 for failures, 1 for warnings, 0 for clean + if [[ "$FAIL_COUNT" -gt 0 ]]; then + exit 2 + elif [[ "$WARN_COUNT" -gt 0 ]]; then + exit 1 + fi + exit 0 +} + +main "$@"