From a0d770d9a720b23fe9db138abd8443ac046a19d0 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sun, 19 Apr 2026 15:13:03 +0000 Subject: [PATCH] [cluster-health] Expand to 42 checks, remove pod CronJob path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - scripts/cluster_healthcheck.sh: add 12 new checks (cert-manager readiness/expiry/requests, backup freshness per-DB/offsite/LVM, monitoring prom+AM/vault-sealed/CSS, external reachability cloudflared +authentik/ExternalAccessDivergence/traefik-5xx). Bump TOTAL_CHECKS to 42, add --no-fix flag. - Remove the duplicate pod-version .claude/cluster-health.sh (1728 lines) and the openclaw cluster_healthcheck CronJob (local CLI is now the single authoritative runner). Keep the healthcheck SA + Role + RoleBinding — still reused by task_processor CronJob. - Remove SLACK_WEBHOOK_URL env from openclaw deployment and delete the unused setup-monitoring.sh. - Rewrite .claude/skills/cluster-health/SKILL.md: mandates running the script first, refreshes the 42-check table, drops stale CronJob/Slack/post-mortem sections, documents the monorepo-canonical + hardlink layout. File is hardlinked to /home/wizard/code/.claude/skills/cluster-health/SKILL.md for dual discovery. - AGENTS.md + k8s-portal agent page: 25-check → 42-check. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .claude/cluster-health.sh | 1728 ----------------- .claude/skills/cluster-health/SKILL.md | 505 +++-- AGENTS.md | 2 +- scripts/cluster_healthcheck.sh | 607 +++++- setup-monitoring.sh | 29 - .../files/src/routes/agent/+server.ts | 2 +- stacks/openclaw/main.tf | 85 +- 7 files changed, 853 insertions(+), 2105 deletions(-) delete mode 100755 .claude/cluster-health.sh delete mode 100755 setup-monitoring.sh diff --git a/.claude/cluster-health.sh b/.claude/cluster-health.sh deleted file mode 100755 index 001d8ebb..00000000 --- a/.claude/cluster-health.sh +++ /dev/null @@ -1,1728 +0,0 @@ -#!/usr/bin/env bash - -# Cluster health check script (pod-compatible version). -# Runs 24 diagnostic checks against the Kubernetes cluster and prints -# a colour-coded report with PASS / WARN / FAIL for each section. -# Optionally posts results to Slack. -# -# Usage: ./cluster-health.sh [--fix] [--quiet|-q] [--json] [--kubeconfig ] [--no-slack] -# -# Environment: -# KUBECONFIG — path to kubeconfig (used in pod environment) -# SLACK_WEBHOOK_URL — Slack incoming webhook URL (required unless --no-slack) -# UPTIME_KUMA_PASSWORD — Uptime Kuma admin password - -set -euo pipefail - -# --- Colors --- -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[0;33m' -BLUE='\033[0;34m' -BOLD='\033[1m' -NC='\033[0m' - -# --- Globals --- -PASS_COUNT=0 -WARN_COUNT=0 -FAIL_COUNT=0 -FIX=false -QUIET=false -JSON=false -SEND_SLACK=true -KUBECONFIG_PATH="${KUBECONFIG:-$(pwd)/config}" -KUBECTL="" -JSON_RESULTS=() -TOTAL_CHECKS=24 - -# --- Helpers --- -info() { [[ "$JSON" == true ]] && return 0; echo -e "${BLUE}[INFO]${NC} $*"; } -pass() { PASS_COUNT=$((PASS_COUNT + 1)); [[ "$JSON" == true ]] && return 0; [[ "$QUIET" == true ]] && return 0; echo -e " ${GREEN}[PASS]${NC} $*"; } -warn() { WARN_COUNT=$((WARN_COUNT + 1)); [[ "$JSON" == true ]] && return 0; echo -e " ${YELLOW}[WARN]${NC} $*"; } -fail() { FAIL_COUNT=$((FAIL_COUNT + 1)); [[ "$JSON" == true ]] && return 0; 
echo -e " ${RED}[FAIL]${NC} $*"; } - -section() { - local num="$1" title="$2" - [[ "$JSON" == true ]] && return 0 - [[ "$QUIET" == true ]] && return 0 - echo "" - echo -e "${BOLD}[$num/$TOTAL_CHECKS] $title${NC}" -} - -section_always() { - local num="$1" title="$2" - [[ "$JSON" == true ]] && return 0 - echo "" - echo -e "${BOLD}[$num/$TOTAL_CHECKS] $title${NC}" -} - -json_add() { - local name="$1" status="$2" detail="$3" - local escaped - escaped=$(echo "$detail" | python3 -c 'import json,sys; print(json.dumps(sys.stdin.read().strip()))') - JSON_RESULTS+=("{\"check\":\"$name\",\"status\":\"$status\",\"detail\":$escaped}") -} - -# count lines in a variable, returning 0 for empty strings -count_lines() { - local input="$1" - if [[ -z "$input" ]]; then - echo 0 - else - echo "$input" | wc -l | tr -d ' ' - fi -} - -# --- Argument parsing --- -parse_args() { - while [[ $# -gt 0 ]]; do - case "$1" in - --fix) FIX=true; shift ;; - --quiet|-q) QUIET=true; shift ;; - --json) JSON=true; shift ;; - --no-slack) SEND_SLACK=false; shift ;; - --kubeconfig) KUBECONFIG_PATH="$2"; shift 2 ;; - -h|--help) - echo "Usage: $0 [--fix] [--quiet|-q] [--json] [--kubeconfig ] [--no-slack]" - echo "" - echo "Flags:" - echo " --fix Auto-remediate safe issues (delete evicted/CrashLoopBackOff pods)" - echo " --quiet, -q Only show WARN and FAIL sections" - echo " --json Machine-readable JSON output" - echo " --kubeconfig PATH Override kubeconfig (default: \$KUBECONFIG or \$(pwd)/config)" - echo " --no-slack Skip Slack notification" - exit 0 - ;; - *) - echo "Unknown option: $1" >&2 - exit 1 - ;; - esac - done - KUBECTL="kubectl --kubeconfig $KUBECONFIG_PATH" - - # Auto-source UPTIME_KUMA_PASSWORD from terraform.tfvars if not set - if [[ -z "${UPTIME_KUMA_PASSWORD:-}" ]]; then - local script_dir tfvars_file - script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - tfvars_file="${script_dir}/../terraform.tfvars" - if [[ -f "$tfvars_file" ]]; then - UPTIME_KUMA_PASSWORD=$(grep 
'uptime_kuma_password' "$tfvars_file" | head -1 | sed 's/.*= *"\(.*\)"/\1/') - export UPTIME_KUMA_PASSWORD - fi - fi -} - -# --- 1. Node Status --- -check_nodes() { - section 1 "Node Status" - local nodes not_ready versions unique_versions detail="" - - nodes=$($KUBECTL get nodes --no-headers 2>&1) || { fail "Cannot reach cluster"; json_add "node_status" "FAIL" "Cannot reach cluster"; return 0; } - not_ready=$(echo "$nodes" | awk '$2 != "Ready" {print $1}' || true) - versions=$(echo "$nodes" | awk '{print $5}' | sort -u) - unique_versions=$(echo "$versions" | wc -l | tr -d ' ') - - if [[ -n "$not_ready" ]]; then - [[ "$QUIET" == true ]] && section_always 1 "Node Status" - fail "NotReady nodes: $not_ready" - detail="NotReady: $not_ready" - json_add "node_status" "FAIL" "$detail" - elif [[ "$unique_versions" -gt 1 ]]; then - [[ "$QUIET" == true ]] && section_always 1 "Node Status" - warn "Version mismatch across nodes: $(echo "$versions" | tr '\n' ' ')" - detail="Version mismatch: $(echo "$versions" | tr '\n' ' ')" - json_add "node_status" "WARN" "$detail" - else - pass "All nodes Ready, version $(echo "$versions" | head -1)" - detail="All nodes Ready" - json_add "node_status" "PASS" "$detail" - fi -} - -# --- 2. 
Node Resources --- -check_resources() { - section 2 "Node Resources" - local top detail="" had_issue=false status="PASS" - - top=$($KUBECTL top nodes --no-headers 2>&1) || { fail "metrics-server unavailable"; json_add "node_resources" "FAIL" "metrics-server unavailable"; return 0; } - - while IFS= read -r line; do - local node cpu_pct mem_pct - node=$(echo "$line" | awk '{print $1}') - cpu_pct=$(echo "$line" | awk '{print $3}' | tr -d '%') - mem_pct=$(echo "$line" | awk '{print $5}' | tr -d '%') - - # Skip nodes where metrics are not yet available - if [[ "$cpu_pct" == *"unknown"* ]] || [[ "$mem_pct" == *"unknown"* ]]; then - detail+="$node metrics unavailable; " - continue - fi - - if [[ "$cpu_pct" -gt 90 ]] || [[ "$mem_pct" -gt 90 ]]; then - [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 2 "Node Resources" - fail "$node: CPU ${cpu_pct}%, Mem ${mem_pct}%" - detail+="$node CPU=${cpu_pct}% Mem=${mem_pct}% [FAIL]; " - had_issue=true - status="FAIL" - elif [[ "$cpu_pct" -gt 80 ]] || [[ "$mem_pct" -gt 80 ]]; then - [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 2 "Node Resources" - warn "$node: CPU ${cpu_pct}%, Mem ${mem_pct}%" - detail+="$node CPU=${cpu_pct}% Mem=${mem_pct}% [WARN]; " - had_issue=true - [[ "$status" != "FAIL" ]] && status="WARN" - else - detail+="$node CPU=${cpu_pct}% Mem=${mem_pct}% [OK]; " - fi - done <<< "$top" - - [[ "$had_issue" == false ]] && pass "All nodes below 80% CPU and memory" - json_add "node_resources" "$status" "$detail" -} - -# --- 3. 
Node Conditions --- -check_conditions() { - section 3 "Node Conditions" - local conditions detail="" - - conditions=$($KUBECTL get nodes -o json | python3 -c ' -import json, sys -data = json.load(sys.stdin) -for node in data["items"]: - name = node["metadata"]["name"] - for c in node["status"]["conditions"]: - if c["type"] in ("MemoryPressure","DiskPressure","PIDPressure") and c["status"] == "True": - print(name + ": " + c["type"]) -' 2>&1) || true - - if [[ -n "$conditions" ]]; then - [[ "$QUIET" == true ]] && section_always 3 "Node Conditions" - while IFS= read -r line; do - fail "$line" - done <<< "$conditions" - detail="$conditions" - json_add "node_conditions" "FAIL" "$detail" - else - pass "No pressure conditions on any node" - json_add "node_conditions" "PASS" "No pressure conditions" - fi -} - -# --- 4. Problematic Pods --- -check_pods() { - section 4 "Problematic Pods" - local bad count detail="" status="PASS" - - bad=$( { - $KUBECTL get pods -A --no-headers --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null \ - | grep -E 'CrashLoopBackOff|Error|Pending|Init:|ImagePullBackOff|ErrImagePull' || true - $KUBECTL get pods -A --no-headers 2>/dev/null \ - | grep -E 'CrashLoopBackOff|ImagePullBackOff|ErrImagePull' || true - } | awk '!seen[$1,$2]++' | sed '/^$/d') || true - - count=$(count_lines "$bad") - - # Auto-fix CrashLoopBackOff pods with >10 restarts when --fix is enabled - if [[ "$FIX" == true && "$count" -gt 0 ]]; then - local fixed_count=0 - while IFS= read -r line; do - [[ -z "$line" ]] && continue - local ns pod pod_status restarts restarts_clean - ns=$(echo "$line" | awk '{print $1}') - pod=$(echo "$line" | awk '{print $2}') - pod_status=$(echo "$line" | awk '{print $4}') - restarts=$(echo "$line" | awk '{print $5}') - restarts_clean=$(echo "$restarts" | grep -oE '^[0-9]+' || echo "0") - - if [[ "$pod_status" == "CrashLoopBackOff" ]] && [[ "$restarts_clean" -gt 10 ]]; then - info "Deleting CrashLoopBackOff pod $ns/$pod 
(restarts: $restarts_clean)" - $KUBECTL delete pod -n "$ns" "$pod" --grace-period=0 2>/dev/null || true - fixed_count=$((fixed_count + 1)) - fi - done <<< "$bad" - if [[ "$fixed_count" -gt 0 ]]; then - info "Deleted $fixed_count CrashLoopBackOff pod(s) with >10 restarts" - fi - fi - - if [[ "$count" -eq 0 ]]; then - pass "No problematic pods" - detail="None" - elif [[ "$count" -le 10 ]]; then - [[ "$QUIET" == true ]] && section_always 4 "Problematic Pods" - warn "$count problematic pod(s):" - [[ "$JSON" != true ]] && echo "$bad" | while IFS= read -r line; do echo " $line"; done - detail="$count pods" - status="WARN" - else - [[ "$QUIET" == true ]] && section_always 4 "Problematic Pods" - fail "$count problematic pods (showing first 10):" - [[ "$JSON" != true ]] && echo "$bad" | head -10 | while IFS= read -r line; do echo " $line"; done - detail="$count pods" - status="FAIL" - fi - json_add "problematic_pods" "$status" "$detail" -} - -# --- 5. Evicted/Failed Pods --- -check_evicted() { - section 5 "Evicted/Failed Pods" - local evicted count detail="" status="PASS" - - evicted=$($KUBECTL get pods -A --no-headers --field-selector=status.phase=Failed 2>/dev/null || true) - count=$(count_lines "$evicted") - - if [[ "$count" -eq 0 ]]; then - pass "No evicted or failed pods" - detail="0" - elif [[ "$count" -le 50 ]]; then - [[ "$QUIET" == true ]] && section_always 5 "Evicted/Failed Pods" - warn "$count evicted/failed pod(s)" - detail="$count pods" - status="WARN" - else - [[ "$QUIET" == true ]] && section_always 5 "Evicted/Failed Pods" - fail "$count evicted/failed pods" - detail="$count pods" - status="FAIL" - fi - - if [[ "$FIX" == true && "$count" -gt 0 ]]; then - info "Deleting $count evicted/failed pods..." - $KUBECTL delete pods -A --field-selector=status.phase=Failed 2>/dev/null || true - info "Deleted evicted/failed pods" - fi - json_add "evicted_pods" "$status" "$detail" -} - -# --- 6. 
DaemonSets --- -check_daemonsets() { - section 6 "DaemonSets" - local ds detail="" had_issue=false - - ds=$($KUBECTL get daemonsets -A --no-headers 2>&1) || { fail "Cannot list DaemonSets"; json_add "daemonsets" "FAIL" "Cannot list"; return 0; } - - while IFS= read -r line; do - local ns name desired ready - ns=$(echo "$line" | awk '{print $1}') - name=$(echo "$line" | awk '{print $2}') - desired=$(echo "$line" | awk '{print $3}') - ready=$(echo "$line" | awk '{print $5}') - - if [[ "$desired" != "$ready" ]]; then - [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 6 "DaemonSets" - fail "$ns/$name: desired=$desired ready=$ready" - detail+="$ns/$name desired=$desired ready=$ready; " - had_issue=true - fi - done <<< "$ds" - - if [[ "$had_issue" == false ]]; then - pass "All DaemonSets healthy (desired == ready)" - json_add "daemonsets" "PASS" "All healthy" - else - json_add "daemonsets" "FAIL" "$detail" - fi -} - -# --- 7. Deployments --- -check_deployments() { - section 7 "Deployments" - local deps detail="" had_issue=false - - deps=$($KUBECTL get deployments -A --no-headers 2>&1) || { fail "Cannot list Deployments"; json_add "deployments" "FAIL" "Cannot list"; return 0; } - - while IFS= read -r line; do - local ns name ready current desired - ns=$(echo "$line" | awk '{print $1}') - name=$(echo "$line" | awk '{print $2}') - ready=$(echo "$line" | awk '{print $3}') - current=$(echo "$ready" | cut -d/ -f1) - desired=$(echo "$ready" | cut -d/ -f2) - - if [[ "$current" != "$desired" ]]; then - [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 7 "Deployments" - fail "$ns/$name: $current/$desired ready" - detail+="$ns/$name $current/$desired; " - had_issue=true - fi - done <<< "$deps" - - if [[ "$had_issue" == false ]]; then - pass "All deployments fully available" - json_add "deployments" "PASS" "All available" - else - json_add "deployments" "FAIL" "$detail" - fi -} - -# --- 8. 
PVC Status --- -check_pvcs() { - section 8 "PVC Status" - local pvcs detail="" had_issue=false - - pvcs=$($KUBECTL get pvc -A --no-headers 2>&1) || true - if [[ -z "$pvcs" || "$pvcs" == *"No resources found"* ]]; then - pass "No PVCs in cluster" - json_add "pvcs" "PASS" "No PVCs" - return 0 - fi - - while IFS= read -r line; do - local ns name status - ns=$(echo "$line" | awk '{print $1}') - name=$(echo "$line" | awk '{print $2}') - status=$(echo "$line" | awk '{print $3}') - - if [[ "$status" != "Bound" ]]; then - [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 8 "PVC Status" - fail "$ns/$name: $status" - detail+="$ns/$name=$status; " - had_issue=true - fi - done <<< "$pvcs" - - if [[ "$had_issue" == false ]]; then - pass "All PVCs Bound" - json_add "pvcs" "PASS" "All Bound" - else - json_add "pvcs" "FAIL" "$detail" - fi -} - -# --- 9. HPA Health --- -check_hpa() { - section 9 "HPA Health" - local hpas detail="" had_issue=false status="PASS" - - hpas=$($KUBECTL get hpa -A --no-headers 2>&1) || true - if [[ -z "$hpas" || "$hpas" == *"No resources found"* ]]; then - pass "No HPAs configured" - json_add "hpa" "PASS" "No HPAs" - return 0 - fi - - while IFS= read -r line; do - local ns name targets - ns=$(echo "$line" | awk '{print $1}') - name=$(echo "$line" | awk '{print $2}') - targets=$(echo "$line" | awk '{print $3}') - - if echo "$targets" | grep -q ''; then - [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 9 "HPA Health" - fail "$ns/$name: targets=$targets (unknown metrics)" - detail+="$ns/$name=unknown; " - had_issue=true - status="FAIL" - else - # Parse percentage values from targets like "45%/80%, 30%/50%" - local pcts - pcts=$(echo "$targets" | grep -oE '[0-9]+%/' | tr -d '%/' || true) - if [[ -n "$pcts" ]]; then - while IFS= read -r pct; do - [[ -z "$pct" ]] && continue - if [[ "$pct" -gt 150 ]]; then - [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 9 "HPA Health" - fail "$ns/$name: utilization at 
${pct}%" - detail+="$ns/$name=${pct}%; " - had_issue=true - status="FAIL" - break - elif [[ "$pct" -gt 100 ]]; then - [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 9 "HPA Health" - warn "$ns/$name: utilization at ${pct}%" - detail+="$ns/$name=${pct}%; " - had_issue=true - [[ "$status" != "FAIL" ]] && status="WARN" - break - fi - done <<< "$pcts" - fi - fi - done <<< "$hpas" - - [[ "$had_issue" == false ]] && pass "All HPAs healthy" - json_add "hpa" "$status" "${detail:-All healthy}" -} - -# --- 10. CronJob Failures --- -check_cronjobs() { - section 10 "CronJob Failures" - local failures detail="" - - failures=$($KUBECTL get jobs -A -o json 2>/dev/null | python3 -c ' -import json, sys -from datetime import datetime, timezone, timedelta - -data = json.load(sys.stdin) -cutoff = datetime.now(timezone.utc) - timedelta(hours=24) - -for job in data.get("items", []): - meta = job.get("metadata", {}) - ns = meta.get("namespace", "") - name = meta.get("name", "") - - owners = meta.get("ownerReferences", []) - is_cronjob = any(o.get("kind") == "CronJob" for o in owners) - if not is_cronjob: - continue - - conditions = job.get("status", {}).get("conditions", []) - for c in conditions: - if c.get("type") == "Failed" and c.get("status") == "True": - ts = c.get("lastTransitionTime", "") - if ts: - try: - t = datetime.fromisoformat(ts.replace("Z", "+00:00")) - if t > cutoff: - print(f"{ns}/{name}: {c.get(\"reason\", \"Unknown\")}") - except: - print(f"{ns}/{name}: {c.get(\"reason\", \"Unknown\")}") -' 2>/dev/null) || true - - if [[ -z "$failures" ]]; then - pass "No CronJob failures in last 24h" - json_add "cronjob_failures" "PASS" "None" - else - [[ "$QUIET" == true ]] && section_always 10 "CronJob Failures" - local count - count=$(count_lines "$failures") - fail "$count CronJob failure(s) in last 24h:" - [[ "$JSON" != true ]] && echo "$failures" | while IFS= read -r line; do echo " $line"; done - json_add "cronjob_failures" "FAIL" "$count failures" - fi -} - 
-# --- 11. CrowdSec --- -check_crowdsec() { - section 11 "CrowdSec Agents" - local cs_pods not_running - - cs_pods=$($KUBECTL get pods -n crowdsec --no-headers 2>/dev/null || true) - if [[ -z "$cs_pods" ]]; then - [[ "$QUIET" == true ]] && section_always 11 "CrowdSec Agents" - warn "CrowdSec namespace not found or empty" - json_add "crowdsec" "WARN" "No CrowdSec pods found" - return 0 - fi - - not_running=$(echo "$cs_pods" | awk '$3 != "Running" && $3 != "Completed" {print $1 ": " $3}' || true) - if [[ -n "$not_running" ]]; then - [[ "$QUIET" == true ]] && section_always 11 "CrowdSec Agents" - while IFS= read -r line; do - fail "CrowdSec pod not running: $line" - done <<< "$not_running" - json_add "crowdsec" "FAIL" "$not_running" - else - local total - total=$(count_lines "$cs_pods") - pass "All $total CrowdSec pods running" - json_add "crowdsec" "PASS" "$total pods running" - fi -} - -# --- 12. Ingress --- -check_ingresses() { - section 12 "Ingress Routes" - local ingresses no_lb detail="" had_issue=false - - ingresses=$($KUBECTL get ingress -A --no-headers 2>/dev/null || true) - if [[ -n "$ingresses" ]]; then - no_lb=$(echo "$ingresses" | awk '{if ($5 == "" || $5 == "") print $1"/"$2}' || true) - if [[ -n "$no_lb" ]]; then - [[ "$QUIET" == true ]] && section_always 12 "Ingress Routes" - while IFS= read -r line; do - fail "Ingress missing LB IP: $line" - done <<< "$no_lb" - detail="Missing LB: $no_lb" - had_issue=true - fi - fi - - # Check Traefik LB service - local traefik_svc_ip - traefik_svc_ip=$($KUBECTL get svc -n traefik traefik -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true) - if [[ -z "$traefik_svc_ip" ]]; then - [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 12 "Ingress Routes" - fail "Traefik LoadBalancer has no external IP" - detail+="Traefik LB missing IP; " - had_issue=true - else - detail+="Traefik LB=$traefik_svc_ip; " - fi - - if [[ "$had_issue" == false ]]; then - pass "All ingresses have LB assignment 
(Traefik LB=$traefik_svc_ip)" - json_add "ingresses" "PASS" "$detail" - else - json_add "ingresses" "FAIL" "$detail" - fi -} - -# --- 13. Prometheus Alerts --- -check_alerts() { - section 13 "Prometheus Alerts" - local alerts firing_count - - # Try alertmanager first, then prometheus server - alerts=$($KUBECTL exec -n monitoring deploy/prometheus-alertmanager -- \ - wget -q -O- http://localhost:9093/api/v2/alerts 2>/dev/null || true) - - if [[ -z "$alerts" ]]; then - alerts=$($KUBECTL exec -n monitoring deploy/prometheus-server -- \ - wget -q -O- http://localhost:9090/api/v1/alerts 2>/dev/null || true) - fi - - if [[ -z "$alerts" ]]; then - [[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts" - warn "Could not query Prometheus/Alertmanager" - json_add "prometheus_alerts" "WARN" "Cannot query" - return 0 - fi - - firing_count=$(echo "$alerts" | python3 -c ' -import json, sys -try: - data = json.load(sys.stdin) - if isinstance(data, list): - active = [a for a in data if a.get("status", {}).get("state") == "active"] - count = len(active) - names = [a.get("labels", {}).get("alertname", "?") for a in active] - print(f"{count}:" + ",".join(names) if count > 0 else "0:") - elif isinstance(data, dict) and "data" in data: - alerts_list = data["data"].get("alerts", []) - firing = [a for a in alerts_list if a.get("state") == "firing"] - count = len(firing) - names = [a.get("labels", {}).get("alertname", "?") for a in firing] - print(f"{count}:" + ",".join(names) if count > 0 else "0:") - else: - print("0:") -except: - print("-1:") -' 2>/dev/null || echo "-1:") - - local count names - count=$(echo "$firing_count" | cut -d: -f1) - names=$(echo "$firing_count" | cut -d: -f2-) - - if [[ "$count" == "-1" ]]; then - [[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts" - warn "Failed to parse alert data" - json_add "prometheus_alerts" "WARN" "Parse error" - elif [[ "$count" -eq 0 ]]; then - pass "No firing alerts" - json_add "prometheus_alerts" "PASS" "0 
firing" - elif [[ "$count" -le 3 ]]; then - [[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts" - warn "$count firing alert(s): $names" - json_add "prometheus_alerts" "WARN" "$count firing: $names" - else - [[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts" - fail "$count firing alerts: $names" - json_add "prometheus_alerts" "FAIL" "$count firing: $names" - fi -} - -# --- 14. Uptime Kuma --- -check_uptime_kuma() { - section 14 "Uptime Kuma Monitors" - local result - - result=$(python3 -c ' -import sys, os -try: - from uptime_kuma_api import UptimeKumaApi -except ImportError: - print("ERROR:uptime-kuma-api not installed") - sys.exit(0) - -try: - password = os.environ.get("UPTIME_KUMA_PASSWORD", "") - if not password: - print("ERROR:UPTIME_KUMA_PASSWORD not set") - sys.exit(0) - api = UptimeKumaApi("https://uptime.viktorbarzin.me", timeout=120, wait_events=0.2) - api.login("admin", password) - - monitors = api.get_monitors() - heartbeats = api.get_heartbeats() - - internal_up = 0 - internal_down = [] - external_up = 0 - external_down = [] - paused_count = 0 - - for m in monitors: - mid = m.get("id") - name = m.get("name", "unknown") - active = m.get("active", True) - is_external = name.startswith("[External] ") - - if not active: - paused_count += 1 - continue - - beats = heartbeats.get(mid, []) - if beats: - last_beat = beats[-1] - if isinstance(last_beat, list): - last_beat = last_beat[-1] if last_beat else {} - status = last_beat.get("status", 0) if isinstance(last_beat, dict) else 0 - if hasattr(status, "value"): - status = status.value - is_up = (status == 1) - else: - is_up = False - - if is_external: - if is_up: - external_up += 1 - else: - external_down.append(name.replace("[External] ", "")) - else: - if is_up: - internal_up += 1 - else: - internal_down.append(name) - - api.disconnect() - - int_down_names = ", ".join(internal_down) if internal_down else "" - ext_down_names = ", ".join(external_down) if external_down else "" - 
print(f"{len(internal_down)}:{internal_up}:{len(external_down)}:{external_up}:{paused_count}:{int_down_names}|{ext_down_names}") -except Exception as e: - print(f"CONN_ERROR:{e}") -' 2>/dev/null) || result="CONN_ERROR:python execution failed" - - if [[ "$result" == "ERROR:"* ]]; then - [[ "$QUIET" == true ]] && section_always 14 "Uptime Kuma Monitors" - warn "Uptime Kuma: ${result#ERROR:}" - json_add "uptime_kuma" "WARN" "${result#ERROR:}" - elif [[ "$result" == "CONN_ERROR:"* ]]; then - [[ "$QUIET" == true ]] && section_always 14 "Uptime Kuma Monitors" - warn "Cannot connect to Uptime Kuma: ${result#CONN_ERROR:}" - json_add "uptime_kuma" "WARN" "Connection failed" - else - local int_down int_up ext_down ext_up paused_count down_details - int_down=$(echo "$result" | cut -d: -f1) - int_up=$(echo "$result" | cut -d: -f2) - ext_down=$(echo "$result" | cut -d: -f3) - ext_up=$(echo "$result" | cut -d: -f4) - paused_count=$(echo "$result" | cut -d: -f5) - down_details=$(echo "$result" | cut -d: -f6-) - local int_down_names="${down_details%%|*}" - local ext_down_names="${down_details#*|}" - - local total_down=$((int_down + ext_down)) - local total_up=$((int_up + ext_up)) - local total_active=$((total_up + total_down)) - - if [[ "$total_down" -eq 0 ]]; then - pass "All monitors up — internal: ${int_up}, external: ${ext_up} ($paused_count paused)" - json_add "uptime_kuma" "PASS" "internal: $int_up up, external: $ext_up up, $paused_count paused" - else - [[ "$QUIET" == true ]] && section_always 14 "Uptime Kuma Monitors" - local details="" - [[ "$int_down" -gt 0 ]] && details="internal down($int_down): $int_down_names" - [[ "$ext_down" -gt 0 ]] && { [[ -n "$details" ]] && details="$details; "; details="${details}external down($ext_down): $ext_down_names"; } - if [[ "$total_down" -le 3 ]]; then - warn "$total_down/$total_active down: $details" - json_add "uptime_kuma" "WARN" "$details" - else - fail "$total_down/$total_active down: $details" - json_add "uptime_kuma" "FAIL" 
"$details" - fi - fi - fi -} - -# --- 15. ResourceQuota Pressure --- -check_resourcequota() { - section 15 "ResourceQuota Pressure" - local quotas detail="" had_issue=false status="PASS" - - quotas=$($KUBECTL get resourcequota -A -o json 2>/dev/null) || { pass "No ResourceQuotas configured"; json_add "resourcequota" "PASS" "No quotas"; return 0; } - - local pressure - pressure=$(echo "$quotas" | python3 -c ' -import json, sys, re - -def parse_cpu(val): - """Convert CPU value to millicores.""" - val = str(val) - if val.endswith("m"): - return float(val[:-1]) - return float(val) * 1000 - -def parse_mem(val): - """Convert memory value to bytes.""" - val = str(val) - units = {"Ki": 1024, "Mi": 1024**2, "Gi": 1024**3, "Ti": 1024**4} - for suffix, mult in units.items(): - if val.endswith(suffix): - return float(val[:-len(suffix)]) * mult - # Plain bytes or numeric - return float(val) - -data = json.load(sys.stdin) -for item in data.get("items", []): - ns = item["metadata"]["namespace"] - name = item["metadata"]["name"] - status = item.get("status", {}) - hard = status.get("hard", {}) - used = status.get("used", {}) - - for resource, hard_val in hard.items(): - used_val = used.get(resource, "0") - try: - if "cpu" in resource: - h = parse_cpu(hard_val) - u = parse_cpu(used_val) - elif "memory" in resource or "storage" in resource: - h = parse_mem(hard_val) - u = parse_mem(used_val) - elif resource == "pods": - h = float(hard_val) - u = float(used_val) - else: - continue - if h <= 0: - continue - pct = (u / h) * 100 - if pct > 80: - level = "FAIL" if pct > 95 else "WARN" - print(f"{level}:{ns}/{name}:{resource}:{pct:.0f}%") - except (ValueError, ZeroDivisionError): - pass -' 2>/dev/null) || true - - if [[ -z "$pressure" ]]; then - pass "All ResourceQuotas below 80% usage" - json_add "resourcequota" "PASS" "All below 80%" - else - [[ "$QUIET" == true ]] && section_always 15 "ResourceQuota Pressure" - while IFS= read -r line; do - local level ns_res resource pct - 
level=$(echo "$line" | cut -d: -f1) - ns_res=$(echo "$line" | cut -d: -f2) - resource=$(echo "$line" | cut -d: -f3) - pct=$(echo "$line" | cut -d: -f4) - if [[ "$level" == "FAIL" ]]; then - fail "$ns_res: $resource at $pct" - status="FAIL" - else - warn "$ns_res: $resource at $pct" - [[ "$status" != "FAIL" ]] && status="WARN" - fi - detail+="$ns_res $resource=$pct; " - had_issue=true - done <<< "$pressure" - json_add "resourcequota" "$status" "$detail" - fi -} - -# --- 16. StatefulSets --- -check_statefulsets() { - section 16 "StatefulSets" - local sts detail="" had_issue=false - - sts=$($KUBECTL get statefulsets -A --no-headers 2>&1) || true - if [[ -z "$sts" || "$sts" == *"No resources found"* ]]; then - pass "No StatefulSets in cluster" - json_add "statefulsets" "PASS" "No StatefulSets" - return 0 - fi - - while IFS= read -r line; do - local ns name ready current desired - ns=$(echo "$line" | awk '{print $1}') - name=$(echo "$line" | awk '{print $2}') - ready=$(echo "$line" | awk '{print $3}') - current=$(echo "$ready" | cut -d/ -f1) - desired=$(echo "$ready" | cut -d/ -f2) - - if [[ "$current" != "$desired" ]]; then - [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 16 "StatefulSets" - fail "$ns/$name: $current/$desired ready" - detail+="$ns/$name $current/$desired; " - had_issue=true - fi - done <<< "$sts" - - if [[ "$had_issue" == false ]]; then - pass "All StatefulSets fully available" - json_add "statefulsets" "PASS" "All available" - else - json_add "statefulsets" "FAIL" "$detail" - fi -} - -# --- 17. 
Node Disk Usage --- -check_node_disk() { - section 17 "Node Disk Usage" - local node_json detail="" had_issue=false status="PASS" - - node_json=$($KUBECTL get nodes -o json 2>/dev/null) || { fail "Cannot get node info"; json_add "node_disk" "FAIL" "Cannot get nodes"; return 0; } - - local disk_info - disk_info=$(echo "$node_json" | python3 -c ' -import json, sys - -def parse_storage(val): - """Convert storage value to bytes.""" - val = str(val) - units = {"Ki": 1024, "Mi": 1024**2, "Gi": 1024**3, "Ti": 1024**4} - for suffix, mult in units.items(): - if val.endswith(suffix): - return float(val[:-len(suffix)]) * mult - return float(val) - -data = json.load(sys.stdin) -for node in data["items"]: - name = node["metadata"]["name"] - cap = node["status"].get("capacity", {}) - alloc = node["status"].get("allocatable", {}) - es_cap = cap.get("ephemeral-storage", "0") - es_alloc = alloc.get("ephemeral-storage", "0") - try: - c = parse_storage(es_cap) - a = parse_storage(es_alloc) - if c > 0: - used_pct = ((c - a) / c) * 100 - if used_pct > 70: # Lower threshold after node2 containerd corruption incident - if used_pct > 85: - level = "FAIL" # Critical: Risk of containerd corruption - elif used_pct > 75: - level = "WARN" # Warning: Monitor closely - else: - level = "WARN" # Early warning - print(f"{level}:{name}:{used_pct:.0f}") - except (ValueError, ZeroDivisionError): - pass -' 2>/dev/null) || true - - if [[ -z "$disk_info" ]]; then - pass "All nodes below 70% ephemeral-storage usage" - json_add "node_disk" "PASS" "All below 70%" - else - [[ "$QUIET" == true ]] && section_always 17 "Node Disk Usage" - while IFS= read -r line; do - local level node pct - level=$(echo "$line" | cut -d: -f1) - node=$(echo "$line" | cut -d: -f2) - pct=$(echo "$line" | cut -d: -f3) - if [[ "$level" == "FAIL" ]]; then - fail "$node: ephemeral-storage at ${pct}%" - status="FAIL" - else - warn "$node: ephemeral-storage at ${pct}%" - [[ "$status" != "FAIL" ]] && status="WARN" - fi - 
detail+="$node=${pct}%; " - had_issue=true - done <<< "$disk_info" - json_add "node_disk" "$status" "$detail" - fi -} - -# --- 18. Helm Release Health --- -check_helm_releases() { - section 18 "Helm Release Health" - - # Helm may not be available in the pod environment - if ! command -v helm &>/dev/null; then - pass "Helm not available (skipped)" - json_add "helm_releases" "PASS" "Helm not available" - return 0 - fi - - local releases detail="" had_issue=false status="PASS" - - releases=$(helm list --all-namespaces --kubeconfig "$KUBECONFIG_PATH" -o json 2>/dev/null) || { - [[ "$QUIET" == true ]] && section_always 18 "Helm Release Health" - warn "Cannot list Helm releases" - json_add "helm_releases" "WARN" "Cannot list" - return 0 - } - - local bad_releases - bad_releases=$(echo "$releases" | python3 -c ' -import json, sys -data = json.load(sys.stdin) -for r in data: - name = r.get("name", "?") - ns = r.get("namespace", "?") - st = r.get("status", "unknown") - if st != "deployed": - level = "FAIL" if st.startswith("pending") else "WARN" - print(f"{level}:{ns}/{name}:{st}") -' 2>/dev/null) || true - - if [[ -z "$bad_releases" ]]; then - pass "All Helm releases in deployed state" - json_add "helm_releases" "PASS" "All deployed" - else - [[ "$QUIET" == true ]] && section_always 18 "Helm Release Health" - while IFS= read -r line; do - local level release_name release_status - level=$(echo "$line" | cut -d: -f1) - release_name=$(echo "$line" | cut -d: -f2) - release_status=$(echo "$line" | cut -d: -f3) - if [[ "$level" == "FAIL" ]]; then - fail "Helm release $release_name: $release_status (blocks terraform)" - status="FAIL" - else - warn "Helm release $release_name: $release_status" - [[ "$status" != "FAIL" ]] && status="WARN" - fi - detail+="$release_name=$release_status; " - had_issue=true - done <<< "$bad_releases" - json_add "helm_releases" "$status" "$detail" - fi -} - -# --- 19. 
Kyverno Policy Engine --- -check_kyverno() { - section 19 "Kyverno Policy Engine" - local kv_pods not_running - - kv_pods=$($KUBECTL get pods -n kyverno --no-headers 2>/dev/null || true) - if [[ -z "$kv_pods" ]]; then - [[ "$QUIET" == true ]] && section_always 19 "Kyverno Policy Engine" - fail "Kyverno namespace not found or empty — policy engine down, cascading cluster impact" - json_add "kyverno" "FAIL" "No Kyverno pods found" - return 0 - fi - - not_running=$(echo "$kv_pods" | awk '$3 != "Running" && $3 != "Completed" {print $1 ": " $3}' || true) - if [[ -n "$not_running" ]]; then - [[ "$QUIET" == true ]] && section_always 19 "Kyverno Policy Engine" - while IFS= read -r line; do - fail "Kyverno pod not running: $line" - done <<< "$not_running" - json_add "kyverno" "FAIL" "$not_running" - else - local total - total=$(count_lines "$kv_pods") - pass "All $total Kyverno pods running" - json_add "kyverno" "PASS" "$total pods running" - fi -} - -# --- 20. NFS Connectivity --- -check_nfs() { - section 20 "NFS Connectivity" - - # Try native tools first (available locally), fall back to kubectl-based check (pod environment) - if command -v showmount &>/dev/null; then - if showmount -e 192.168.1.127 &>/dev/null; then - pass "NFS server 192.168.1.127 reachable (exports listed)" - json_add "nfs" "PASS" "NFS reachable" - return 0 - fi - fi - - if command -v nc &>/dev/null; then - if nc -z -G 3 192.168.1.127 2049 &>/dev/null; then - pass "NFS server 192.168.1.127 port 2049 open" - json_add "nfs" "PASS" "NFS port open" - return 0 - fi - fi - - # Fallback: check if NFS-backed pods are running (works in pod environment) - local nfs_pods - nfs_pods=$($KUBECTL get pods -A -o json 2>/dev/null | python3 -c ' -import json, sys -data = json.load(sys.stdin) -count = 0 -for pod in data.get("items", []): - for vol in pod.get("spec", {}).get("volumes", []): - if "nfs" in vol: - if pod.get("status", {}).get("phase") == "Running": - count += 1 - break -print(count) -' 2>/dev/null) || 
nfs_pods="0" - - if [[ "$nfs_pods" -gt 0 ]]; then - pass "NFS healthy ($nfs_pods pods using NFS volumes are running)" - json_add "nfs" "PASS" "$nfs_pods NFS pods running" - else - [[ "$QUIET" == true ]] && section_always 20 "NFS Connectivity" - warn "Cannot verify NFS (showmount not available, no NFS pods found)" - json_add "nfs" "WARN" "Cannot verify" - fi -} - -# --- 21. DNS Resolution --- -check_dns() { - section 21 "DNS Resolution" - local internal_ok=false external_ok=false detail="" - - # Try dig first (available locally), fall back to python3 (pod environment) - # Use system resolver (no @server) so it works from any host or pod - if command -v dig &>/dev/null; then - if dig viktorbarzin.me +short +time=3 +tries=1 2>/dev/null | grep -q .; then - internal_ok=true - fi - if dig google.com +short +time=3 +tries=1 2>/dev/null | grep -q .; then - external_ok=true - fi - else - # Fallback: use python3 for DNS resolution (works in pod environment) - local result - result=$(python3 -c " -import socket -try: - socket.getaddrinfo('viktorbarzin.me', 443) - print('INTERNAL_OK') -except Exception: - print('INTERNAL_FAIL') -try: - socket.getaddrinfo('google.com', 443) - print('EXTERNAL_OK') -except Exception: - print('EXTERNAL_FAIL') -" 2>/dev/null) || result="" - - if echo "$result" | grep -q "INTERNAL_OK"; then - internal_ok=true - fi - if echo "$result" | grep -q "EXTERNAL_OK"; then - external_ok=true - fi - fi - - if [[ "$internal_ok" == true && "$external_ok" == true ]]; then - pass "DNS resolves both internal (viktorbarzin.me) and external (google.com)" - json_add "dns" "PASS" "Both resolve" - elif [[ "$internal_ok" == true || "$external_ok" == true ]]; then - [[ "$QUIET" == true ]] && section_always 21 "DNS Resolution" - if [[ "$internal_ok" == false ]]; then - warn "DNS: internal (viktorbarzin.me) failed, external (google.com) OK" - detail="Internal failed" - else - warn "DNS: internal (viktorbarzin.me) OK, external (google.com) failed" - detail="External failed" 
- fi - json_add "dns" "WARN" "$detail" - else - [[ "$QUIET" == true ]] && section_always 21 "DNS Resolution" - fail "DNS not resolving — both internal and external failed" - json_add "dns" "FAIL" "Both failed" - fi -} - -# --- 22. TLS Certificate Expiry --- -check_tls_certs() { - section 22 "TLS Certificate Expiry" - local secrets detail="" had_issue=false status="PASS" - - secrets=$($KUBECTL get secrets -A -o json 2>/dev/null) || { - [[ "$QUIET" == true ]] && section_always 22 "TLS Certificate Expiry" - warn "Cannot list secrets" - json_add "tls_certs" "WARN" "Cannot list secrets" - return 0 - } - - local cert_issues - cert_issues=$(echo "$secrets" | python3 -c ' -import json, sys, base64, subprocess, hashlib -from datetime import datetime, timezone - -data = json.load(sys.stdin) -seen_fingerprints = set() -results = [] - -for item in data.get("items", []): - if item.get("type") != "kubernetes.io/tls": - continue - ns = item["metadata"]["namespace"] - name = item["metadata"]["name"] - cert_data = item.get("data", {}).get("tls.crt", "") - if not cert_data: - continue - - # Deduplicate by cert fingerprint - raw = base64.b64decode(cert_data) - fp = hashlib.sha256(raw).hexdigest()[:16] - if fp in seen_fingerprints: - continue - seen_fingerprints.add(fp) - - # Parse certificate expiry with openssl - try: - result = subprocess.run( - ["openssl", "x509", "-noout", "-enddate", "-subject"], - input=raw, capture_output=True, timeout=5 - ) - output = result.stdout.decode() - for line in output.splitlines(): - if line.startswith("notAfter="): - date_str = line.split("=", 1)[1] - # Parse openssl date format: "Mon DD HH:MM:SS YYYY GMT" - try: - expiry = datetime.strptime(date_str.strip(), "%b %d %H:%M:%S %Y %Z") - expiry = expiry.replace(tzinfo=timezone.utc) - days_left = (expiry - datetime.now(timezone.utc)).days - if days_left <= 7: - print(f"FAIL:{ns}/{name}:{days_left}d") - elif days_left <= 30: - print(f"WARN:{ns}/{name}:{days_left}d") - except ValueError: - pass - except 
(subprocess.TimeoutExpired, Exception): - pass -' 2>/dev/null) || true - - if [[ -z "$cert_issues" ]]; then - pass "All TLS certificates valid for >30 days" - json_add "tls_certs" "PASS" "All valid >30d" - else - [[ "$QUIET" == true ]] && section_always 22 "TLS Certificate Expiry" - while IFS= read -r line; do - local level cert_name days - level=$(echo "$line" | cut -d: -f1) - cert_name=$(echo "$line" | cut -d: -f2) - days=$(echo "$line" | cut -d: -f3) - if [[ "$level" == "FAIL" ]]; then - fail "TLS cert $cert_name expires in $days" - status="FAIL" - else - warn "TLS cert $cert_name expires in $days" - [[ "$status" != "FAIL" ]] && status="WARN" - fi - detail+="$cert_name=$days; " - had_issue=true - done <<< "$cert_issues" - json_add "tls_certs" "$status" "$detail" - fi -} - -# --- 23. GPU Health --- -check_gpu() { - section 23 "GPU Health" - local gpu_pods not_running - - gpu_pods=$($KUBECTL get pods -n nvidia --no-headers 2>/dev/null || true) - if [[ -z "$gpu_pods" ]]; then - [[ "$QUIET" == true ]] && section_always 23 "GPU Health" - warn "NVIDIA namespace not found or empty" - json_add "gpu" "WARN" "No GPU pods found" - return 0 - fi - - # Check specifically for device-plugin (critical for GPU scheduling) - local device_plugin_down=false - local other_down=false - local detail="" - - while IFS= read -r line; do - local pod_name pod_status - pod_name=$(echo "$line" | awk '{print $1}') - pod_status=$(echo "$line" | awk '{print $3}') - if [[ "$pod_status" != "Running" && "$pod_status" != "Completed" ]]; then - if echo "$pod_name" | grep -q "device-plugin"; then - device_plugin_down=true - detail+="device-plugin $pod_name: $pod_status; " - else - other_down=true - detail+="$pod_name: $pod_status; " - fi - fi - done <<< "$gpu_pods" - - if [[ "$device_plugin_down" == true ]]; then - [[ "$QUIET" == true ]] && section_always 23 "GPU Health" - fail "GPU device-plugin is down — GPU workloads cannot schedule" - json_add "gpu" "FAIL" "$detail" - elif [[ "$other_down" == 
true ]]; then - [[ "$QUIET" == true ]] && section_always 23 "GPU Health" - warn "Some GPU pods not running: $detail" - json_add "gpu" "WARN" "$detail" - else - local total - total=$(count_lines "$gpu_pods") - pass "All $total GPU pods running" - json_add "gpu" "PASS" "$total pods running" - fi -} - -# --- 24. Cloudflare Tunnel --- -check_cloudflare_tunnel() { - section 24 "Cloudflare Tunnel" - local cf_pods running_count total_count - - cf_pods=$($KUBECTL get pods -n cloudflared --no-headers 2>/dev/null || true) - if [[ -z "$cf_pods" ]]; then - [[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel" - fail "Cloudflare tunnel namespace not found or empty — external access broken" - json_add "cloudflare_tunnel" "FAIL" "No pods found" - return 0 - fi - - total_count=$(count_lines "$cf_pods") - running_count=$(echo "$cf_pods" | awk '$3 == "Running"' | wc -l | tr -d ' ') - - if [[ "$running_count" -eq 0 ]]; then - [[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel" - fail "Cloudflare tunnel: 0/$total_count pods running — external access broken" - json_add "cloudflare_tunnel" "FAIL" "0/$total_count running" - elif [[ "$running_count" -lt "$total_count" ]]; then - [[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel" - warn "Cloudflare tunnel: $running_count/$total_count pods running (degraded)" - json_add "cloudflare_tunnel" "WARN" "$running_count/$total_count running" - else - pass "Cloudflare tunnel: all $total_count pods running" - json_add "cloudflare_tunnel" "PASS" "$total_count pods running" - fi -} - -# --- 25. 
Advanced CPU Monitoring (Prometheus) --- -check_prometheus_cpu() { - section 25 "Advanced CPU Monitoring" - local cpu_query="100%20-%20(avg%20by%20(instance)%20(irate(node_cpu_seconds_total%7Bmode%3D%22idle%22%7D%5B5m%5D))%20*%20100)" - local detail="" had_issue=false status="PASS" - - # Start port-forward to Prometheus if not using in-cluster DNS - local prom_url pf_pid="" - if curl -s --connect-timeout 2 "http://prometheus-server.monitoring.svc.cluster.local/api/v1/query?query=up" &>/dev/null; then - prom_url="http://prometheus-server.monitoring.svc.cluster.local/api/v1/query" - else - local pf_port - pf_port=$(python3 -c 'import socket; s=socket.socket(); s.bind(("",0)); print(s.getsockname()[1]); s.close()') - $KUBECTL port-forward -n monitoring svc/prometheus-server "$pf_port:80" &>/dev/null & - pf_pid=$! - sleep 2 - prom_url="http://127.0.0.1:${pf_port}/api/v1/query" - fi - # Cleanup port-forward on exit from this function - trap '[[ -n "$pf_pid" ]] && kill $pf_pid 2>/dev/null || true' RETURN - - # Try to query Prometheus for CPU metrics - local cpu_data - cpu_data=$(curl -s --connect-timeout 10 "${prom_url}?query=${cpu_query}" 2>/dev/null) || { - warn "Prometheus not accessible for CPU monitoring" - json_add "prometheus_cpu" "WARN" "Prometheus unreachable" - return 0 - } - - # Parse JSON and check CPU usage - local cpu_results - cpu_results=$(echo "$cpu_data" | python3 -c " -import json, sys -try: - data = json.load(sys.stdin) - if data.get('status') == 'success': - for result in data['data']['result']: - instance = result['metric']['instance'] - usage = float(result['value'][1]) - # Map IP to node name - if '10.0.20.100' in instance: - node = 'k8s-master' - elif '10.0.20.101' in instance: - node = 'k8s-node1' - elif '10.0.20.102' in instance: - node = 'k8s-node2' - elif '10.0.20.103' in instance: - node = 'k8s-node3' - elif '10.0.20.104' in instance: - node = 'k8s-node4' - elif 'pve-node' in instance: - node = 'proxmox-host' - else: - node = instance - 
print(f'{node}:{usage:.1f}') -except Exception as e: - print(f'ERROR:{e}') -" 2>/dev/null) || true - - if [[ "$cpu_results" == *"ERROR"* || -z "$cpu_results" ]]; then - warn "Failed to parse Prometheus CPU data" - json_add "prometheus_cpu" "WARN" "Parse failed" - return 0 - fi - - # Check CPU thresholds - while IFS=':' read -r node usage; do - [[ -z "$node" || -z "$usage" ]] && continue - usage_int=${usage%.*} # Remove decimal - - if [[ "$usage_int" -gt 85 ]]; then - [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 25 "Advanced CPU Monitoring" - fail "$node: ${usage}% CPU (critical)" - detail+="$node=${usage}% [CRIT]; " - had_issue=true - status="FAIL" - elif [[ "$usage_int" -gt 70 ]]; then - [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 25 "Advanced CPU Monitoring" - warn "$node: ${usage}% CPU (high)" - detail+="$node=${usage}% [HIGH]; " - had_issue=true - [[ "$status" != "FAIL" ]] && status="WARN" - else - detail+="$node=${usage}% [OK]; " - fi - done <<< "$cpu_results" - - [[ "$had_issue" == false ]] && pass "All nodes below 70% CPU usage (5m avg)" - json_add "prometheus_cpu" "$status" "$detail" -} - -# --- 26. Power Monitoring --- -check_power_monitoring() { - section 26 "Power Monitoring" - local detail="" had_issue=false status="PASS" - - # Start port-forward to Prometheus if not using in-cluster DNS - local prom_url pf_pid="" - if curl -s --connect-timeout 2 "http://prometheus-server.monitoring.svc.cluster.local/api/v1/query?query=up" &>/dev/null; then - prom_url="http://prometheus-server.monitoring.svc.cluster.local/api/v1/query" - else - local pf_port - pf_port=$(python3 -c 'import socket; s=socket.socket(); s.bind(("",0)); print(s.getsockname()[1]); s.close()') - $KUBECTL port-forward -n monitoring svc/prometheus-server "$pf_port:80" &>/dev/null & - pf_pid=$! 
- sleep 2 - prom_url="http://127.0.0.1:${pf_port}/api/v1/query" - fi - trap '[[ -n "$pf_pid" ]] && kill $pf_pid 2>/dev/null || true' RETURN - - # GPU Power monitoring - local gpu_query="DCGM_FI_DEV_POWER_USAGE" - local gpu_data - gpu_data=$(curl -s --connect-timeout 10 "${prom_url}?query=${gpu_query}" 2>/dev/null) || { - [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 26 "Power Monitoring" - warn "GPU power metrics unavailable" - detail+="GPU metrics unavailable; " - had_issue=true - status="WARN" - } - - if [[ -n "$gpu_data" && "$gpu_data" != *"error"* ]]; then - local gpu_results - gpu_results=$(echo "$gpu_data" | python3 -c " -import json, sys -try: - data = json.load(sys.stdin) - if data.get('status') == 'success': - for result in data['data']['result']: - hostname = result['metric'].get('Hostname', 'unknown') - power = float(result['value'][1]) - print(f'{hostname}:{power:.1f}') -except Exception: - pass -" 2>/dev/null) || true - - # Check GPU power thresholds (Tesla T4 TDP is ~70W) - while IFS=':' read -r node power; do - [[ -z "$node" || -z "$power" ]] && continue - power_int=${power%.*} - - if [[ "$power_int" -gt 65 ]]; then # > 90% of T4 TDP - [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 26 "Power Monitoring" - warn "GPU $node: ${power}W (high power draw)" - detail+="GPU-$node=${power}W [HIGH]; " - had_issue=true - [[ "$status" != "FAIL" ]] && status="WARN" - elif [[ "$power_int" -gt 50 ]]; then # > 70% of T4 TDP - detail+="GPU-$node=${power}W [ACTIVE]; " - else - detail+="GPU-$node=${power}W [IDLE]; " - fi - done <<< "$gpu_results" - fi - - [[ "$had_issue" == false ]] && pass "Power consumption within normal ranges" - json_add "power_monitoring" "$status" "$detail" -} - -# --- Summary --- -print_summary() { - if [[ "$JSON" == true ]]; then - echo "{" - echo " \"timestamp\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"," - echo " \"pass\": $PASS_COUNT," - echo " \"warn\": $WARN_COUNT," - echo " \"fail\": $FAIL_COUNT," - echo " 
\"checks\": [" - local first=true - for r in "${JSON_RESULTS[@]}"; do - if [[ "$first" == true ]]; then - echo " $r" - first=false - else - echo " ,$r" - fi - done - echo " ]" - echo "}" - return 0 - fi - - echo "" - echo -e "${BOLD}═══════════════════════════════════════${NC}" - echo -e "${BOLD} Cluster Health Summary${NC}" - echo -e "${BOLD}═══════════════════════════════════════${NC}" - echo -e " ${GREEN}PASS${NC}: $PASS_COUNT ${YELLOW}WARN${NC}: $WARN_COUNT ${RED}FAIL${NC}: $FAIL_COUNT" - echo "" - - if [[ "$FAIL_COUNT" -gt 0 ]]; then - echo -e " Overall: ${RED}UNHEALTHY${NC}" - elif [[ "$WARN_COUNT" -gt 0 ]]; then - echo -e " Overall: ${YELLOW}DEGRADED${NC}" - else - echo -e " Overall: ${GREEN}HEALTHY${NC}" - fi - echo "" -} - -# --- Slack Notification --- - -# Human-readable check name mapping -friendly_check_name() { - case "$1" in - node_status) echo "Node Status" ;; - node_resources) echo "Node Resources" ;; - node_conditions) echo "Node Conditions" ;; - problematic_pods) echo "Problematic Pods" ;; - evicted_pods) echo "Evicted Pods" ;; - daemonsets) echo "DaemonSets" ;; - deployments) echo "Deployments" ;; - pvcs) echo "PVCs" ;; - hpa) echo "HPAs" ;; - cronjob_failures) echo "CronJob Failures" ;; - crowdsec) echo "CrowdSec" ;; - ingresses) echo "Ingresses" ;; - prometheus_alerts) echo "Prometheus Alerts" ;; - uptime_kuma) echo "Uptime Kuma" ;; - resourcequota) echo "Resource Quotas" ;; - statefulsets) echo "StatefulSets" ;; - node_disk) echo "Node Disk" ;; - helm_releases) echo "Helm Releases" ;; - kyverno) echo "Kyverno" ;; - nfs) echo "NFS Storage" ;; - dns) echo "DNS Resolution" ;; - tls_certs) echo "TLS Certificates" ;; - gpu) echo "GPU" ;; - cloudflare_tunnel) echo "Cloudflare Tunnel" ;; - prometheus_cpu) echo "Advanced CPU Monitoring" ;; - power_monitoring) echo "Power Monitoring" ;; - *) echo "$1" ;; - esac -} - -send_slack() { - if [[ "$SEND_SLACK" != true ]]; then - return 0 - fi - if [[ -z "${SLACK_WEBHOOK_URL:-}" ]]; then - [[ "$JSON" != true 
]] && echo "WARNING: SLACK_WEBHOOK_URL not set, skipping Slack notification" - return 0 - fi - - # Gather stats for summary line - local node_count pod_count - node_count=$($KUBECTL get nodes --no-headers 2>/dev/null | wc -l | tr -d ' ') - pod_count=$($KUBECTL get pods -A --no-headers --field-selector=status.phase=Running 2>/dev/null | wc -l | tr -d ' ') - - local total_checks=$((PASS_COUNT + WARN_COUNT + FAIL_COUNT)) - - # Use python3 to build the entire Slack payload from JSON_RESULTS - local json_results_str - json_results_str=$(printf '%s\n' "${JSON_RESULTS[@]}") - - local json_payload - json_payload=$(echo "$json_results_str" | python3 -c " -import json, sys - -CHECK_NAMES = { - 'node_status': 'Node Status', - 'node_resources': 'Node Resources', - 'node_conditions': 'Node Conditions', - 'problematic_pods': 'Problematic Pods', - 'evicted_pods': 'Evicted Pods', - 'daemonsets': 'DaemonSets', - 'deployments': 'Deployments', - 'pvcs': 'PVCs', - 'hpa': 'HPAs', - 'cronjob_failures': 'CronJob Failures', - 'crowdsec': 'CrowdSec', - 'ingresses': 'Ingresses', - 'prometheus_alerts': 'Prometheus Alerts', - 'uptime_kuma': 'Uptime Kuma', - 'resourcequota': 'Resource Quotas', - 'statefulsets': 'StatefulSets', - 'node_disk': 'Node Disk', - 'helm_releases': 'Helm Releases', - 'kyverno': 'Kyverno', - 'nfs': 'NFS Storage', - 'dns': 'DNS Resolution', - 'tls_certs': 'TLS Certificates', - 'gpu': 'GPU', - 'cloudflare_tunnel': 'Cloudflare Tunnel', - 'prometheus_cpu': 'Advanced CPU Monitoring', - 'power_monitoring': 'Power Monitoring', -} - -def format_detail(check, detail): - \"\"\"Format detail text for readability. Truncate long lists, split semicolons.\"\"\" - detail = detail.rstrip('; ').strip() - - # For checks with long comma-separated lists (e.g. 
Uptime Kuma down monitors), - # truncate to first 5 items with a count - if check == 'uptime_kuma' and ': ' in detail: - prefix, names_str = detail.split(': ', 1) - names = [n.strip() for n in names_str.split(',') if n.strip()] - if len(names) > 5: - shown = ', '.join(names[:5]) - detail = f'{prefix}: {shown} (+{len(names) - 5} more)' - elif names: - detail = prefix + ': ' + ', '.join(names) - - # For resource quotas and similar semicolon-separated items, - # split into separate lines - if '; ' in detail: - parts = [p.strip() for p in detail.split(';') if p.strip()] - if len(parts) > 1: - lines = '\\n'.join(f' \u2022 {p}' for p in parts) - return lines - - return detail - -# Parse results -fails = [] -warns = [] -for line in sys.stdin: - line = line.strip() - if not line: - continue - try: - d = json.loads(line) - except json.JSONDecodeError: - continue - status = d.get('status', '') - check = d.get('check', '') - detail = d.get('detail', '') - name = CHECK_NAMES.get(check, check) - formatted = format_detail(check, detail) - - if status == 'FAIL': - fails.append((name, formatted)) - elif status == 'WARN': - warns.append((name, formatted)) - -pass_count = ${PASS_COUNT} -warn_count = ${WARN_COUNT} -fail_count = ${FAIL_COUNT} -total = ${total_checks} -nodes = '${node_count}' -pods = '${pod_count}' - -blocks = [] - -# Header block -if fail_count == 0 and warn_count == 0: - header = f':white_check_mark: *Cluster Health Check \u2014 All Clear*' - summary = f'{total}/{total} checks passed \u2022 {nodes} nodes \u2022 {pods} pods' - blocks.append({'type': 'section', 'text': {'type': 'mrkdwn', 'text': f'{header}\n{summary}'}}) -else: - issue_count = fail_count + warn_count - emoji = ':rotating_light:' if fail_count > 0 else ':warning:' - header = f'{emoji} *Cluster Health Check \u2014 {issue_count} Issue(s)*' - summary = f':white_check_mark: {pass_count} passed \u2022 :warning: {warn_count} warnings \u2022 :x: {fail_count} failed' - blocks.append({'type': 'section', 'text': 
{'type': 'mrkdwn', 'text': f'{header}\n{summary}'}}) - -# Failed section -if fails: - blocks.append({'type': 'divider'}) - lines = [':x: *Failed*'] - for name, detail in fails: - if '\\n' in detail: - lines.append(f'\u2022 *{name}*:') - lines.append(detail) - else: - lines.append(f'\u2022 *{name}*: {detail}') - blocks.append({'type': 'section', 'text': {'type': 'mrkdwn', 'text': '\\n'.join(lines)}}) - -# Warnings section -if warns: - blocks.append({'type': 'divider'}) - lines = [':warning: *Warnings*'] - for name, detail in warns: - if '\\n' in detail: - lines.append(f'\u2022 *{name}*:') - lines.append(detail) - else: - lines.append(f'\u2022 *{name}*: {detail}') - blocks.append({'type': 'section', 'text': {'type': 'mrkdwn', 'text': '\\n'.join(lines)}}) - -# Footer with timestamp -from datetime import datetime, timezone -ts = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC') -blocks.append({'type': 'context', 'elements': [{'type': 'mrkdwn', 'text': f'{nodes} nodes \u2022 {pods} pods \u2022 {ts}'}]}) - -payload = {'blocks': blocks} -print(json.dumps(payload)) -") - - curl -s -X POST "$SLACK_WEBHOOK_URL" \ - -H 'Content-Type: application/json' \ - -d "$json_payload" >/dev/null 2>&1 || { - [[ "$JSON" != true ]] && echo "WARNING: Failed to send Slack notification" - } - - [[ "$JSON" != true ]] && echo "Slack notification sent." 
-} - -# --- Main --- -main() { - parse_args "$@" - - if [[ "$JSON" != true ]]; then - echo -e "${BOLD}Cluster Health Check${NC} — $(date '+%Y-%m-%d %H:%M:%S')" - echo -e "Kubeconfig: $KUBECONFIG_PATH" - if [[ "$FIX" == true ]]; then - echo -e "${YELLOW}Auto-fix mode enabled${NC}" - fi - fi - - check_nodes - check_resources - check_conditions - check_pods - check_evicted - check_daemonsets - check_deployments - check_pvcs - check_hpa - check_cronjobs - check_crowdsec - check_ingresses - check_alerts - check_uptime_kuma - check_resourcequota - check_statefulsets - check_node_disk - check_helm_releases - check_kyverno - check_nfs - check_dns - check_tls_certs - check_gpu - check_cloudflare_tunnel - check_prometheus_cpu - check_power_monitoring - print_summary - send_slack - - # Always exit 0 — reporting is done via Slack notification. - # Non-zero exits mark the CronJob as Failed, which triggers Prometheus - # JobFailed alerts, creating a circular alert loop. - exit 0 -} - -main "$@" diff --git a/.claude/skills/cluster-health/SKILL.md b/.claude/skills/cluster-health/SKILL.md index be18fc9f..ef3ae25f 100644 --- a/.claude/skills/cluster-health/SKILL.md +++ b/.claude/skills/cluster-health/SKILL.md @@ -7,339 +7,314 @@ description: | (3) User asks to fix stuck pods, evicted pods, or CrashLoopBackOff, (4) User mentions "health check", "cluster status", "cluster health", (5) User asks "is everything running" or "any problems". - Runs 8 standard K8s health checks with safe auto-fix for evicted pods - and stuck CrashLoopBackOff pods. + Runs 42 cluster-wide checks (nodes, workloads, monitoring, certs, + backups, external reachability) with safe auto-fix for evicted pods. 
author: Claude Code -version: 1.0.0 -date: 2026-02-21 +version: 2.0.0 +date: 2026-04-19 --- # Cluster Health Check -## Overview +## MANDATORY: Run the script first -- **Script**: `/workspace/infra/.claude/cluster-health.sh` -- **Schedule**: CronJob runs every 30 minutes in the `openclaw` namespace -- **Slack notifications**: Posts results to the webhook URL in `$SLACK_WEBHOOK_URL` -- **Auto-fix**: Automatically deletes evicted/failed pods and CrashLoopBackOff pods with >10 restarts -- **Exit code**: 0 = healthy, 1 = issues found - -## Quick Check - -Run the health check interactively: +When this skill is invoked, your **first action** must be to run the +cluster health check script and reason over its output before doing +anything else. Do not improvise individual `kubectl` calls — the +script is the authoritative surface. ```bash -# Report only, no Slack notification -bash /workspace/infra/.claude/cluster-health.sh --no-slack - -# Full run with Slack notification -bash /workspace/infra/.claude/cluster-health.sh - -# Report only, no auto-fix and no Slack -bash /workspace/infra/.claude/cluster-health.sh --no-fix --no-slack +cd /home/wizard/code +bash infra/scripts/cluster_healthcheck.sh --json | tee /tmp/cluster-health.json ``` -## What It Checks +If the session is rooted elsewhere, fall back to the absolute path: -| # | Check | Auto-Fix | Alerts | -|---|-------|----------|--------| -| 1 | **Node Health** — NotReady nodes, MemoryPressure, DiskPressure, PIDPressure | No | Yes | -| 2 | **Pod Health** — CrashLoopBackOff, ImagePullBackOff, ErrImagePull, Error | Yes (CrashLoop >10 restarts) | Yes | -| 3 | **Evicted/Failed Pods** — Pods in `Failed` phase | Yes (deletes all) | Yes | -| 4 | **Failed Deployments** — Deployments with ready != desired replicas | No | Yes | -| 5 | **Pending PVCs** — PersistentVolumeClaims not in `Bound` state | No | Yes | -| 6 | **Resource Pressure** — Node CPU or memory >80% (warn) or >90% (issue) | No | Yes | -| 7 | **CronJob Failures** — 
Failed CronJob-owned Jobs in the last 24h | No | Yes | -| 8 | **DaemonSet Health** — DaemonSets with desired != ready | No | Yes | +```bash +bash /home/wizard/code/infra/scripts/cluster_healthcheck.sh --json +``` + +Then: + +1. Parse the JSON. Report the PASS/WARN/FAIL counts + overall verdict. +2. Iterate every FAIL and WARN check, describe what tripped, and propose + the remediation path (use the recipes below). +3. Only reach for ad-hoc `kubectl` commands when investigating a + specific failure beyond what the script reported. + +Exit codes: `0` = healthy, `1` = warnings only, `2` = failures. + +## Quick flags + +```bash +# Human-readable report (default), no auto-fix +bash infra/scripts/cluster_healthcheck.sh + +# Machine-readable JSON summary +bash infra/scripts/cluster_healthcheck.sh --json + +# Only show WARN + FAIL (suppress PASS noise) +bash infra/scripts/cluster_healthcheck.sh --quiet + +# Enable auto-fix (delete evicted pods, kick stuck CrashLoop pods) +bash infra/scripts/cluster_healthcheck.sh --fix + +# Combined: quiet JSON without auto-fix +bash infra/scripts/cluster_healthcheck.sh --no-fix --quiet --json + +# Custom kubeconfig +bash infra/scripts/cluster_healthcheck.sh --kubeconfig /path/to/config +``` + +## What It Checks (42 checks) + +| # | Check | Notes | +|---|-------|-------| +| 1 | Node Status | NotReady nodes, version drift | +| 2 | Node Resources | CPU/mem >80% (warn) / >90% (fail) | +| 3 | Node Conditions | MemoryPressure / DiskPressure / PIDPressure | +| 4 | Problematic Pods | CrashLoopBackOff / Error / ImagePullBackOff | +| 5 | Evicted/Failed Pods | `status.phase=Failed` | +| 6 | DaemonSets | desired == ready | +| 7 | Deployments | ready == desired replicas | +| 8 | PVC Status | all Bound | +| 9 | HPA Health | targets not ``, utilization <100% | +| 10 | CronJob Failures | job conditions `Failed=True` in last 24h | +| 11 | CrowdSec Agents | all pods Running | +| 12 | Ingress Routes | every ingress has an LB IP + Traefik LB | +| 13 | 
Prometheus Alerts | count of firing alerts | +| 14 | Uptime Kuma Monitors | internal + external monitors up | +| 15 | ResourceQuota Pressure | any quota >80% used | +| 16 | StatefulSets | ready == desired | +| 17 | Node Disk Usage | ephemeral-storage <80% | +| 18 | Helm Release Health | all `deployed` (no `pending-*`) | +| 19 | Kyverno Policy Engine | all pods Running | +| 20 | NFS Connectivity | 192.168.1.127 showmount / port 2049 | +| 21 | DNS Resolution | Technitium resolves internal + external | +| 22 | TLS Certificate Expiry | TLS `Secret` certs >30d valid | +| 23 | GPU Health | nvidia namespace + device-plugin Running | +| 24 | Cloudflare Tunnel | pods Running | +| 25 | Resource Usage | node CPU/mem headroom | +| 26 | HA Sofia — Entity Availability | Home Assistant unavailable/unknown count | +| 27 | HA Sofia — Integration Health | config entries setup_error / not_loaded | +| 28 | HA Sofia — Automation Status | disabled / stale (>30d) automations | +| 29 | HA Sofia — System Resources | HA CPU / mem / disk | +| 30 | Hardware Exporters | snmp / idrac-redfish / proxmox / tuya pods + scrapes | +| 31 | cert-manager — Certificate Readiness | Certificate CRs with `Ready!=True` | +| 32 | cert-manager — Certificate Expiry (<14d) | notAfter within 14d | +| 33 | cert-manager — Failed CertificateRequests | `Ready=False, reason=Failed` | +| 34 | Backup Freshness — Per-DB Dumps | MySQL + PG dumps within 25h | +| 35 | Backup Freshness — Offsite Sync | Pushgateway `backup_last_success_timestamp` <27h | +| 36 | Backup Freshness — LVM PVC Snapshots | newest thin snapshot <25h (SSH PVE) | +| 37 | Monitoring — Prometheus + Alertmanager | `/-/ready` + AM pods Running | +| 38 | Monitoring — Vault Sealed Status | `vault status` reports `Sealed: false` | +| 39 | Monitoring — ClusterSecretStore Ready | `vault-kv` + `vault-database` Ready | +| 40 | External — Cloudflared + Authentik Replicas | deployments fully ready | +| 41 | External — ExternalAccessDivergence Alert | alert not 
firing | +| 42 | External — Traefik 5xx Rate (15m) | top-10 services emitting 5xx | ## Safe Auto-Fix Rules -### Safe to auto-fix (the script does these automatically) +`--fix` only performs operations that are genuinely reversible and +observable. Nothing here rewrites Terraform state or mutates the cluster +beyond "delete pod". -1. **Evicted/Failed pods** — These are already terminated and just cluttering the namespace: - ```bash - kubectl delete pods -A --field-selector=status.phase=Failed - ``` +### Done automatically by `--fix` -2. **CrashLoopBackOff pods with >10 restarts** — The pod is stuck in a crash loop; deleting lets the controller recreate it with a fresh backoff timer: - ```bash - kubectl delete pod -n --grace-period=0 - ``` +- **Evicted / Failed pods** — delete them; the controller recreates. + ```bash + kubectl delete pods -A --field-selector=status.phase=Failed + ``` +- **CrashLoopBackOff pods with >10 restarts** — delete once to reset + backoff timer. ### NEVER auto-fix (requires human investigation) -- **NotReady nodes** — Could be network, kubelet, or hardware issue; needs SSH investigation -- **DiskPressure / MemoryPressure / PIDPressure** — Root cause must be identified -- **ImagePullBackOff** — Usually a wrong image tag or registry issue; needs config fix -- **Failed deployments** — Could be resource limits, bad config, missing secrets -- **Pending PVCs** — Usually NFS export missing or storage class issue -- **Resource pressure >90%** — Need to identify which pods are consuming resources -- **CronJob failures** — Need to check job logs to understand why it failed -- **DaemonSet issues** — Could be node taints, resource limits, or image issues +- NotReady nodes +- MemoryPressure / DiskPressure / PIDPressure +- ImagePullBackOff (usually a bad tag / registry credential) +- Deployment ready-replica mismatch +- Pending PVCs +- Node CPU/memory >90% +- CronJob failures +- DaemonSet desired != ready +- Vault sealed +- ClusterSecretStore not Ready +- 
cert-manager Certificate failures +- Backup freshness regressions +- Any external-reachability failure -## Deep Investigation +## Deep-investigation recipes per failure mode -When the health check reports issues, use these commands to investigate further. - -### Node Issues +### Node Issues (checks 1, 3, 17, 25) ```bash -# Describe the problematic node (events, conditions, capacity) -kubectl describe node - -# Check resource usage across all nodes +kubectl describe node kubectl top nodes - -# Check recent events on a specific node -kubectl get events --field-selector involvedObject.name= --sort-by='.lastTimestamp' - -# SSH to the node for direct inspection -ssh root@ +kubectl get events --field-selector involvedObject.name= --sort-by='.lastTimestamp' +# SSH to the node +ssh root@10.0.20.10X systemctl status kubelet journalctl -u kubelet --since "30 minutes ago" | tail -100 -df -h -free -h +df -h ; free -h ``` -### Pod Issues +Node IPs: `10.0.20.100` master, `.101` node1 (GPU), `.102` node2, +`.103` node3, `.104` node4. + +### Pod Issues (checks 4, 5, 11, 19) ```bash -# Describe the pod (events, conditions, container statuses) -kubectl describe pod -n - -# Check current logs -kubectl logs -n --tail=100 - -# Check logs from the previous crashed container -kubectl logs -n --previous --tail=100 - -# Check events in the namespace -kubectl get events -n --sort-by='.lastTimestamp' | tail -20 - -# Check all pods in a namespace -kubectl get pods -n -o wide +kubectl describe pod -n +kubectl logs -n --tail=200 +kubectl logs -n --previous --tail=200 +kubectl get events -n --sort-by='.lastTimestamp' | tail -20 ``` -### Deployment Issues +Common failure causes: OOMKilled (raise mem limit in Terraform), bad +config / missing env var, DB connection failure (check `dbaas` pods), +NFS mount failure (`showmount -e 192.168.1.127`), stale +imagePullSecret. 
+ +### Deployment / StatefulSet / DaemonSet (checks 6, 7, 16) ```bash -# Describe the deployment (strategy, conditions, events) -kubectl describe deployment -n - -# Check rollout status -kubectl rollout status deployment -n - -# Check rollout history -kubectl rollout history deployment -n - -# Check the replicaset -kubectl get rs -n -l app= +kubectl describe deployment -n +kubectl rollout status deployment -n +kubectl rollout history deployment -n +kubectl get rs -n -l app= ``` -### PVC Issues +### PVC (check 8) ```bash -# Describe the PVC (events, status, storage class) -kubectl describe pvc -n - -# Check PVs -kubectl get pv - -# Check events related to PVCs -kubectl get events -n --field-selector reason=FailedMount --sort-by='.lastTimestamp' - -# Verify NFS export exists -showmount -e 10.0.10.15 | grep +kubectl describe pvc -n +kubectl get events -n --field-selector reason=FailedMount --sort-by='.lastTimestamp' +kubectl get pv | grep +showmount -e 192.168.1.127 ``` -### Resource Pressure +### cert-manager (checks 31, 32, 33) ```bash -# Top nodes (CPU and memory usage) -kubectl top nodes - -# Top pods sorted by memory (cluster-wide) -kubectl top pods -A --sort-by=memory | head -20 - -# Top pods sorted by CPU (cluster-wide) -kubectl top pods -A --sort-by=cpu | head -20 - -# Check resource requests/limits in a namespace -kubectl describe resourcequota -n -kubectl describe limitrange -n +kubectl get certificate -A +kubectl describe certificate -n +kubectl get certificaterequest -A +kubectl describe certificaterequest -n +kubectl logs -n cert-manager deploy/cert-manager | tail -50 ``` -## Common Remediation +Common causes: ACME HTTP-01 challenge blocked, ClusterIssuer missing +DNS provider secret, rate-limit from Let's Encrypt. -### Persistent CrashLoopBackOff +### Backups (checks 34, 35, 36) -A pod keeps crashing even after the auto-fix deletes it. 
+```bash +# Per-DB dumps (inside the DB pod) +kubectl exec -n dbaas mysql-standalone-0 -- ls -lah /backup/per-db/ +kubectl exec -n dbaas pg-cluster-0 -- ls -lah /backup/per-db/ -1. **Check logs from the crashed container**: - ```bash - kubectl logs -n --previous --tail=200 - ``` +# Pushgateway metrics +kubectl exec -n monitoring deploy/prometheus-server -- \ + wget -qO- http://prometheus-prometheus-pushgateway:9091/metrics | \ + grep backup_last_success_timestamp -2. **Check the pod description for clues**: - ```bash - kubectl describe pod -n - ``` - Look for: - - `OOMKilled` in Last State — the container ran out of memory - - `Error` with exit code 1 — application error (bad config, missing env var, DB connection failure) - - `Error` with exit code 137 — killed by OOM killer or liveness probe - - `Error` with exit code 143 — SIGTERM (graceful shutdown failure) +# LVM snapshots on PVE host +ssh -o BatchMode=yes root@192.168.1.127 \ + 'lvs -o lv_name,lv_time,lv_size --noheadings | grep snap' +``` -3. **Common causes**: - - **OOMKilled**: Increase memory limits in Terraform (see below) - - **Bad config**: Check environment variables, secrets, config maps - - **DB connection failure**: Verify the database pod is running (`kubectl get pods -n dbaas`) - - **NFS mount failure**: Verify NFS export exists (`showmount -e 10.0.10.15`) - - **Missing secret**: Check if TLS secret or other secrets exist in the namespace +If offsite sync is stale, the common cause is the +`offsite-sync-backup.service` systemd unit on the PVE host failing. +`ssh root@192.168.1.127 'systemctl status offsite-sync-backup'`. -### OOMKilled +### Monitoring stack (checks 37, 38, 39) -The container was killed because it exceeded its memory limit. +```bash +# Prometheus +kubectl exec -n monitoring deploy/prometheus-server -- wget -qO- http://localhost:9090/-/ready +kubectl logs -n monitoring deploy/prometheus-server --tail=100 -1. 
**Check current limits**: - ```bash - kubectl describe pod -n | grep -A 5 "Limits" - ``` +# Alertmanager +kubectl get pods -n monitoring | grep alertmanager +kubectl logs -n monitoring -l app=prometheus-alertmanager --tail=100 -2. **Fix in Terraform** — Edit `modules/kubernetes//main.tf` and increase the memory limit: - ```hcl - resources { - limits = { - memory = "2Gi" # Increase from current value - } - } - ``` +# Vault +kubectl exec -n vault vault-0 -- sh -c 'VAULT_ADDR=http://127.0.0.1:8200 vault status' +# If sealed: check raft peers with `vault operator raft list-peers` and unseal. -3. **Apply the change**: - ```bash - cd /workspace/infra - terraform apply -target=module.kubernetes_cluster.module. -auto-approve - ``` +# ClusterSecretStore +kubectl get clustersecretstore +kubectl describe clustersecretstore vault-kv vault-database +kubectl logs -n external-secrets deploy/external-secrets --tail=100 +``` -### ImagePullBackOff +### External reachability (checks 40, 41, 42) -The container image cannot be pulled. +```bash +# Cloudflared +kubectl get pods -n cloudflared +kubectl logs -n cloudflared -l app=cloudflared --tail=100 -1. **Check the exact error**: - ```bash - kubectl describe pod -n | grep -A 5 "Events" - ``` +# Authentik +kubectl get pods -n authentik -l app=authentik-server +kubectl logs -n authentik -l app=authentik-server --tail=100 -2. **Common causes**: - - **Wrong image tag**: Verify the tag exists on the registry (Docker Hub, ghcr.io, etc.) 
- - **Private registry without credentials**: Check if imagePullSecrets are configured - - **Pull-through cache issue**: The registry cache at `10.0.20.10` may have a stale entry - ```bash - # Check pull-through cache ports: - # 5000 = docker.io, 5010 = ghcr.io, 5020 = quay.io, 5030 = registry.k8s.io - curl -s http://10.0.20.10:5000/v2/_catalog | python3 -m json.tool - ``` - - **Registry rate limit**: Docker Hub free tier has pull limits; pull-through cache helps avoid this +# ExternalAccessDivergence alert +kubectl exec -n monitoring deploy/prometheus-server -- \ + wget -qO- 'http://localhost:9090/api/v1/alerts' | \ + python3 -m json.tool | grep -A 5 ExternalAccessDivergence -3. **Fix**: Update the image tag in the service's Terraform module and re-apply. +# Traefik 5xx — find the hot service +kubectl exec -n monitoring deploy/prometheus-server -- \ + wget -qO- 'http://localhost:9090/api/v1/query?query=topk(10,rate(traefik_service_requests_total{code=~%225..%22}%5B15m%5D))' \ + | python3 -m json.tool +``` -### Node NotReady +### OOMKilled remediation -A node has gone NotReady. +1. `kubectl describe pod -n | grep -A 5 Limits` +2. Edit `infra/modules/kubernetes//main.tf` and raise + `resources.limits.memory`. +3. `cd /home/wizard/code/infra && scripts/tg apply` (Tier 1) or + `terraform apply -target=module.` as appropriate. -1. **Check node conditions**: - ```bash - kubectl describe node | grep -A 20 "Conditions" - ``` +### ImagePullBackOff remediation -2. **SSH to the node and check kubelet**: - ```bash - ssh root@ - systemctl status kubelet - journalctl -u kubelet --since "10 minutes ago" | tail -50 - ``` +1. `kubectl describe pod -n | grep -A 5 Events` +2. Verify tag exists on the source registry. +3. Check pull-through cache at `10.0.20.10:{5000,5010,5020,5030}`. +4. Update the image tag in Terraform + re-apply. -3. 
**Check resources**: - ```bash - # On the node - df -h # Disk space - free -h # Memory - top -bn1 # CPU/processes - ``` +### Persistent CrashLoopBackOff after auto-fix -4. **Node IPs** (for SSH): - - `10.0.20.100` — k8s-master - - `10.0.20.101` — k8s-node1 (GPU) - - `10.0.20.102` — k8s-node2 - - `10.0.20.103` — k8s-node3 - - `10.0.20.104` — k8s-node4 +1. `kubectl logs -n --previous --tail=200` +2. `kubectl describe pod -n ` and check Last State: + - `OOMKilled` → raise memory limit + - Exit code 137 → OOM or probe killed + - Exit code 143 → SIGTERM / graceful shutdown failed +3. Cross-check dbaas + NFS + secrets are healthy. -## Slack Webhook +## Notes on the canonical / hardlink setup -The script posts results to the Slack incoming webhook URL in `$SLACK_WEBHOOK_URL`. The message format uses Slack mrkdwn: -- All clear: green checkmark with node/pod count -- Warnings only: warning icon with details -- Issues found: red alert icon with auto-fixes applied and remaining issues +The authoritative copy of this SKILL.md lives at +`/home/wizard/code/.claude/skills/cluster-health/SKILL.md`. A hardlink +at `/home/wizard/code/infra/.claude/skills/cluster-health/SKILL.md` +points to the same inode so infra-rooted sessions also discover the +skill. -The webhook URL is passed as an environment variable from `openclaw_skill_secrets` in `terraform.tfvars`. 
+To verify the hardlink is intact: -## Infrastructure +```bash +stat -c '%i %n' \ + /home/wizard/code/.claude/skills/cluster-health/SKILL.md \ + /home/wizard/code/infra/.claude/skills/cluster-health/SKILL.md +``` -| Component | Path / Location | -|-----------|----------------| -| Health check script | `/workspace/infra/.claude/cluster-health.sh` (in-pod) or `.claude/cluster-health.sh` (repo) | -| Terraform module | `modules/kubernetes/openclaw/main.tf` | -| CronJob definition | Defined in the OpenClaw Terraform module | -| Existing full healthcheck | `scripts/cluster_healthcheck.sh` (local-only, 24 checks with color output) | -| Infra repo (in pod) | `/workspace/infra` | -| kubectl (in pod) | `/tools/kubectl` | -| terraform (in pod) | `/tools/terraform` | +Both should print the same inode number. If they diverge (e.g. `git +checkout` replaced the file rather than updating it), re-link: -## Auto-File Incidents for SEV1/SEV2 - -After running health checks, if **SEV1 or SEV2 issues** are found (node down, multiple services affected, core service outage, or single important service down), auto-file a GitHub Issue: - -### Severity Classification -- **SEV1**: Node NotReady, multiple services down, data at risk, core service outage (DNS, auth, ingress, databases) -- **SEV2**: Single non-core service down, degraded performance, persistent CrashLoopBackOff -- **SEV3**: Warnings only, resource pressure <90%, cosmetic — do NOT auto-file - -### Workflow -1. **Dedup check**: Before filing, query open incidents: - ```bash - GITHUB_TOKEN=$(vault kv get -field=github_pat secret/viktor) - curl -s -H "Authorization: token $GITHUB_TOKEN" \ - "https://api.github.com/repos/ViktorBarzin/infra/issues?labels=incident&state=open&per_page=50" - ``` - If an open issue already covers the same service/namespace, **skip filing**. - -2. 
**File the issue** with labels `incident`, `sev1` or `sev2`, `postmortem-required`: - - Title: `[AUTO] ` - - Body: full diagnostic dump (pod status, events, alerts, node state) - - The issue-automation GHA workflow will trigger the post-mortem pipeline automatically - -3. **Auto-close recovered services**: If a service that previously had an auto-filed incident is now healthy: - ```bash - # Comment and close - curl -s -X POST -H "Authorization: token $GITHUB_TOKEN" \ - "https://api.github.com/repos/ViktorBarzin/infra/issues//comments" \ - -d '{"body": "**Resolved** — Service recovered. Auto-closed by cluster health check."}' - curl -s -X PATCH -H "Authorization: token $GITHUB_TOKEN" \ - "https://api.github.com/repos/ViktorBarzin/infra/issues/" \ - -d '{"state": "closed"}' - ``` - -## Post-Mortem Auto-Suggest - -After running a healthcheck, if the cluster has **recovered from an unhealthy state** (previous run showed FAIL items that are now resolved), suggest writing a post-mortem: - -> The cluster has recovered from the previous unhealthy state. Would you like me to write a post-mortem? Run `/post-mortem` to generate one. - -This ensures incidents are documented while context is fresh. - -## Notes - -1. This script is designed to run inside the OpenClaw pod where kubectl is pre-configured via the ServiceAccount -2. The full `scripts/cluster_healthcheck.sh` script runs 24 checks and is meant for local interactive use; this skill's script runs 8 core checks optimized for automated CronJob execution -3. When investigating issues interactively, prefer running commands directly rather than re-running the script -4. 
All Terraform changes must go through the `.tf` files — never use `kubectl apply/edit/patch` for persistent changes +```bash +ln -f /home/wizard/code/.claude/skills/cluster-health/SKILL.md \ + /home/wizard/code/infra/.claude/skills/cluster-health/SKILL.md +``` diff --git a/AGENTS.md b/AGENTS.md index 2a885021..98e0bd89 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -99,7 +99,7 @@ Terragrunt-based homelab managing a Kubernetes cluster (5 nodes, v1.34.2) on Pro - `config.tfvars` — non-secret configuration (plaintext) - `secrets.sops.json` — all secrets (SOPS-encrypted JSON) - `terraform.tfvars` — legacy secrets file (git-crypt, kept for reference) -- `scripts/cluster_healthcheck.sh` — 25-check cluster health script +- `scripts/cluster_healthcheck.sh` — 42-check cluster health script (nodes, workloads, monitoring, certs, backups, external reachability) ## Storage - **NFS** (`nfs-proxmox` StorageClass): For app data. Use the `nfs_volume` module, never inline `nfs {}` blocks. diff --git a/scripts/cluster_healthcheck.sh b/scripts/cluster_healthcheck.sh index 8a65839a..997c0b7d 100755 --- a/scripts/cluster_healthcheck.sh +++ b/scripts/cluster_healthcheck.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash # Cluster health check script. -# Runs 24 diagnostic checks against the Kubernetes cluster and prints +# Runs 42 diagnostic checks against the Kubernetes cluster and prints # a colour-coded report with PASS / WARN / FAIL for each section. 
# # Usage: ./scripts/cluster_healthcheck.sh [--fix] [--quiet|-q] [--json] [--kubeconfig ] @@ -26,7 +26,7 @@ JSON=false KUBECONFIG_PATH="$(pwd)/config" KUBECTL="" JSON_RESULTS=() -TOTAL_CHECKS=30 +TOTAL_CHECKS=42 # --- Helpers --- info() { [[ "$JSON" == true ]] && return 0; echo -e "${BLUE}[INFO]${NC} $*"; } @@ -71,14 +71,16 @@ parse_args() { while [[ $# -gt 0 ]]; do case "$1" in --fix) FIX=true; shift ;; + --no-fix) FIX=false; shift ;; --quiet|-q) QUIET=true; shift ;; --json) JSON=true; shift ;; --kubeconfig) KUBECONFIG_PATH="$2"; shift 2 ;; -h|--help) - echo "Usage: $0 [--fix] [--quiet|-q] [--json] [--kubeconfig ]" + echo "Usage: $0 [--fix|--no-fix] [--quiet|-q] [--json] [--kubeconfig ]" echo "" echo "Flags:" echo " --fix Auto-remediate safe issues (delete evicted pods)" + echo " --no-fix Disable auto-remediation (default)" echo " --quiet, -q Only show WARN and FAIL sections" echo " --json Machine-readable JSON output" echo " --kubeconfig PATH Override kubeconfig (default: \$(pwd)/config)" @@ -1750,6 +1752,593 @@ else: json_add "hardware_exporters" "$status" "${detail:-All healthy}" } +# --- 31. 
# cert-manager: Certificate Readiness ---
#
# Check 31. Lists every cert-manager Certificate across all namespaces and
# reports a FAIL for each one whose Ready condition is absent or not "True".
# Globals:  KUBECTL, QUIET (read)
# Outputs:  lines via pass/warn/fail; one json_add record keyed
#           "certmanager_certificates"
check_cert_manager_certificates() {
    local title="cert-manager — Certificate Readiness"
    section 31 "$title"
    local certs not_ready count line detail=""

    if ! certs=$($KUBECTL get certificates.cert-manager.io -A -o json 2>/dev/null); then
        # Emit the header in --quiet mode too, as the other checks do before
        # a WARN/FAIL, so the message is not printed without its section.
        [[ "$QUIET" == true ]] && section_always 31 "$title"
        warn "cert-manager CRDs not installed or inaccessible"
        json_add "certmanager_certificates" "WARN" "CRDs unavailable"
        return 0
    fi

    # Each output line is "<namespace>/<name>:<reason>".
    if ! not_ready=$(printf '%s\n' "$certs" | python3 -c '
import json, sys
data = json.load(sys.stdin)
for item in data.get("items", []):
    ns = item["metadata"]["namespace"]
    name = item["metadata"]["name"]
    conds = item.get("status", {}).get("conditions") or []
    ready = next((c for c in conds if c.get("type") == "Ready"), None)
    if not ready or ready.get("status") != "True":
        reason = ready.get("reason", "NoCondition") if ready else "NoCondition"
        print(f"{ns}/{name}:{reason}")
' 2>/dev/null); then
        # A crashed parser must not be mistaken for "nothing to report".
        [[ "$QUIET" == true ]] && section_always 31 "$title"
        warn "Failed to parse Certificate list"
        json_add "certmanager_certificates" "WARN" "Parse error"
        return 0
    fi

    if [[ -z "$not_ready" ]]; then
        pass "All Certificate CRs Ready"
        json_add "certmanager_certificates" "PASS" "All Ready"
        return 0
    fi

    [[ "$QUIET" == true ]] && section_always 31 "$title"
    count=$(count_lines "$not_ready")
    while IFS= read -r line; do
        fail "Certificate not Ready: $line"
        detail+="$line; "
    done <<< "$not_ready"
    json_add "certmanager_certificates" "FAIL" "$count not Ready: $detail"
}

# --- 32.
# cert-manager: Certificate Expiry (<14d) ---
#
# Check 32. Reads status.notAfter of every cert-manager Certificate and
# reports those expiring within 14 days: FAIL when 3 or fewer whole days
# remain, WARN otherwise. Certificates without notAfter are skipped.
# Globals:  KUBECTL, QUIET (read)
# Outputs:  lines via pass/warn/fail; one json_add record keyed
#           "certmanager_expiry"
check_cert_manager_expiry() {
    local title="cert-manager — Certificate Expiry (<14d)"
    section 32 "$title"
    local certs expiring level cert_name days detail="" status="PASS"

    if ! certs=$($KUBECTL get certificates.cert-manager.io -A -o json 2>/dev/null); then
        # Keep the WARN attached to its section header in --quiet mode.
        [[ "$QUIET" == true ]] && section_always 32 "$title"
        warn "cert-manager CRDs not installed or inaccessible"
        json_add "certmanager_expiry" "WARN" "CRDs unavailable"
        return 0
    fi

    # Each output line is "<LEVEL>:<namespace>/<name>:<whole days left>".
    if ! expiring=$(printf '%s\n' "$certs" | python3 -c '
import json, sys
from datetime import datetime, timezone, timedelta
data = json.load(sys.stdin)
now = datetime.now(timezone.utc)
cutoff = now + timedelta(days=14)
for item in data.get("items", []):
    ns = item["metadata"]["namespace"]
    name = item["metadata"]["name"]
    not_after = item.get("status", {}).get("notAfter")
    if not not_after:
        continue
    try:
        expiry = datetime.fromisoformat(not_after.replace("Z", "+00:00"))
    except ValueError:
        continue
    if expiry.tzinfo is None:
        # Treat a timezone-less value as UTC so the comparison cannot raise.
        expiry = expiry.replace(tzinfo=timezone.utc)
    if expiry < cutoff:
        days = (expiry - now).days
        level = "FAIL" if days <= 3 else "WARN"
        print(f"{level}:{ns}/{name}:{days}")
' 2>/dev/null); then
        # A crashed parser must not be reported as "nothing expiring".
        [[ "$QUIET" == true ]] && section_always 32 "$title"
        warn "Failed to parse Certificate list"
        json_add "certmanager_expiry" "WARN" "Parse error"
        return 0
    fi

    if [[ -z "$expiring" ]]; then
        pass "No Certificate CRs expiring within 14 days"
        json_add "certmanager_expiry" "PASS" "None expiring <14d"
        return 0
    fi

    [[ "$QUIET" == true ]] && section_always 32 "$title"
    while IFS=: read -r level cert_name days; do
        if [[ "$level" == "FAIL" ]]; then
            fail "Certificate $cert_name expires in ${days}d"
            status="FAIL"
        else
            warn "Certificate $cert_name expires in ${days}d"
            # Never downgrade a FAIL already recorded for another certificate.
            if [[ "$status" != "FAIL" ]]; then
                status="WARN"
            fi
        fi
        detail+="$cert_name=${days}d; "
    done <<< "$expiring"
    json_add "certmanager_expiry" "$status" "$detail"
}

# --- 33.
# cert-manager: Failed CertificateRequests ---
#
# Check 33. Reports every cert-manager CertificateRequest whose Ready
# condition has status "False" and reason "Failed", with the first 80
# characters of the condition message.
# Globals:  KUBECTL, QUIET (read)
# Outputs:  lines via pass/warn/fail; one json_add record keyed
#           "certmanager_requests"
check_cert_manager_requests() {
    local title="cert-manager — Failed CertificateRequests"
    section 33 "$title"
    local requests failed count line detail=""

    if ! requests=$($KUBECTL get certificaterequests.cert-manager.io -A -o json 2>/dev/null); then
        [[ "$QUIET" == true ]] && section_always 33 "$title"
        warn "cert-manager CRDs not installed or inaccessible"
        json_add "certmanager_requests" "WARN" "CRDs unavailable"
        return 0
    fi

    # The message is extracted into a variable before formatting: writing
    # \"...\" inside an f-string replacement field (as this snippet used to)
    # is a SyntaxError, which made the check silently report PASS.
    if ! failed=$(printf '%s\n' "$requests" | python3 -c '
import json, sys
data = json.load(sys.stdin)
for item in data.get("items", []):
    ns = item["metadata"]["namespace"]
    name = item["metadata"]["name"]
    conds = item.get("status", {}).get("conditions") or []
    for c in conds:
        if c.get("type") == "Ready" and c.get("status") == "False" and c.get("reason") == "Failed":
            message = " ".join((c.get("message") or "").split())[:80]
            print(f"{ns}/{name}:{message}")
            break
' 2>/dev/null); then
        # A crashed parser must not be reported as "no failures".
        [[ "$QUIET" == true ]] && section_always 33 "$title"
        warn "Failed to parse CertificateRequest list"
        json_add "certmanager_requests" "WARN" "Parse error"
        return 0
    fi

    if [[ -z "$failed" ]]; then
        pass "No failed CertificateRequests"
        json_add "certmanager_requests" "PASS" "None failed"
        return 0
    fi

    [[ "$QUIET" == true ]] && section_always 33 "$title"
    count=$(count_lines "$failed")
    while IFS= read -r line; do
        fail "CertificateRequest failed: $line"
        detail+="$line; "
    done <<< "$failed"
    json_add "certmanager_requests" "FAIL" "$count failed: $detail"
}

# --- 34.
# Backup Freshness: Per-DB Dumps ---
#
# Check 34. Reads .status.lastSuccessfulTime of the mysql-backup-per-db and
# postgresql-backup-per-db CronJobs in the dbaas namespace and FAILs when a
# job has never succeeded or its last success is older than 25 hours.
# Globals:  KUBECTL, QUIET (read)
# Outputs:  lines via pass/warn/fail; one json_add record keyed
#           "backup_per_db"
check_backup_per_db() {
    section 34 "Backup Freshness — Per-DB Dumps"
    local detail="" had_issue=false status="PASS"

    # Freshness threshold: 25 hours
    local now_epoch max_age_sec
    now_epoch=$(date -u +%s)
    max_age_sec=$((25 * 3600))

    # Evaluate one CronJob. Reads now_epoch/max_age_sec and updates
    # detail/had_issue/status of the enclosing function (bash dynamic scope).
    # Arguments: $1 namespace, $2 CronJob name, $3 short label for messages
    _check_cronjob_fresh() {
        local ns="$1" cj="$2" label="$3"
        local ts ts_epoch age_sec age_h
        ts=$($KUBECTL get cronjob -n "$ns" "$cj" -o jsonpath='{.status.lastSuccessfulTime}' 2>/dev/null || true)
        if [[ -z "$ts" ]]; then
            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 34 "Backup Freshness — Per-DB Dumps"
            fail "$label: CronJob $ns/$cj has no lastSuccessfulTime"
            detail+="${label}=no-success; "
            had_issue=true
            status="FAIL"
            return 0
        fi

        # `date -d` is a GNU extension; fall back to Python so the timestamp
        # still parses where it is unavailable.
        if ! ts_epoch=$(date -u -d "$ts" +%s 2>/dev/null); then
            ts_epoch=$(python3 -c '
import sys
from datetime import datetime, timezone
dt = datetime.fromisoformat(sys.argv[1].replace("Z", "+00:00"))
if dt.tzinfo is None:
    dt = dt.replace(tzinfo=timezone.utc)
print(int(dt.timestamp()))
' "$ts" 2>/dev/null) || ts_epoch=""
        fi
        if [[ -z "$ts_epoch" ]]; then
            # An unparsable timestamp says nothing about backup age: WARN
            # rather than treating it as epoch 0 and reporting a bogus age.
            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 34 "Backup Freshness — Per-DB Dumps"
            warn "$label: cannot parse lastSuccessfulTime '$ts' of CronJob $ns/$cj"
            detail+="${label}=unparsable; "
            had_issue=true
            if [[ "$status" != "FAIL" ]]; then
                status="WARN"
            fi
            return 0
        fi

        age_sec=$((now_epoch - ts_epoch))
        age_h=$((age_sec / 3600))
        if [[ "$age_sec" -gt "$max_age_sec" ]]; then
            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 34 "Backup Freshness — Per-DB Dumps"
            fail "$label: last success ${age_h}h ago (>25h)"
            had_issue=true
            status="FAIL"
        fi
        detail+="${label}=${age_h}h; "
    }

    _check_cronjob_fresh dbaas mysql-backup-per-db mysql
    _check_cronjob_fresh dbaas postgresql-backup-per-db pg

    if [[ "$had_issue" == false ]]; then
        pass "Per-DB dumps fresh — $detail"
    fi
    json_add "backup_per_db" "$status" "$detail"
}

# --- 35.
# Backup Freshness: Offsite Sync ---
#
# Check 35. Fetches the Pushgateway metrics page through the
# prometheus-server pod, finds the backup_last_success_timestamp sample
# whose line mentions offsite-backup-sync, and FAILs when that timestamp is
# more than 27 hours old or the sample is missing.
# Globals:  KUBECTL, QUIET (read)
# Outputs:  lines via pass/warn/fail; one json_add record keyed
#           "backup_offsite_sync"
check_backup_offsite_sync() {
    local title="Backup Freshness — Offsite Sync"
    local max_age_hours=27
    section 35 "$title"
    local metrics verdict age_hours

    metrics=$($KUBECTL exec -n monitoring deploy/prometheus-server -- \
        wget -qO- "http://prometheus-prometheus-pushgateway:9091/metrics" 2>/dev/null || true)

    if [[ -z "$metrics" ]]; then
        [[ "$QUIET" == true ]] && section_always 35 "$title"
        warn "Cannot query Pushgateway"
        json_add "backup_offsite_sync" "WARN" "Pushgateway unreachable"
        return 0
    fi

    # Prints "missing", or "stale:<hours>" / "fresh:<hours>". The threshold
    # is compared on the unrounded age; rounding first let 27.1-27.4h pass.
    verdict=$(printf '%s\n' "$metrics" | python3 -c '
import sys, re, time
limit = float(sys.argv[1])
ts = None
for line in sys.stdin:
    if line.startswith("#"):
        continue
    if "backup_last_success_timestamp" in line and "offsite-backup-sync" in line:
        m = re.search(r"\s([0-9.eE+]+)\s*$", line.strip())
        if m:
            try:
                ts = float(m.group(1))
                break
            except ValueError:
                pass
if ts is None:
    print("missing")
else:
    age = (time.time() - ts) / 3600
    print(("stale" if age > limit else "fresh") + f":{age:.1f}")
' "$max_age_hours" 2>/dev/null) || verdict="error"

    case "$verdict" in
        missing)
            [[ "$QUIET" == true ]] && section_always 35 "$title"
            fail "backup_last_success_timestamp metric missing for offsite-backup-sync"
            json_add "backup_offsite_sync" "FAIL" "Metric missing"
            ;;
        stale:*)
            age_hours=${verdict#stale:}
            [[ "$QUIET" == true ]] && section_always 35 "$title"
            fail "Offsite sync last success ${age_hours}h ago (>${max_age_hours}h)"
            json_add "backup_offsite_sync" "FAIL" "age=${age_hours}h"
            ;;
        fresh:*)
            age_hours=${verdict#fresh:}
            pass "Offsite sync last success ${age_hours}h ago"
            json_add "backup_offsite_sync" "PASS" "age=${age_hours}h"
            ;;
        *)
            [[ "$QUIET" == true ]] && section_always 35 "$title"
            warn "Failed to parse Pushgateway metric"
            json_add "backup_offsite_sync" "WARN" "Parse error"
            ;;
    esac
}

# --- 36.
# Backup Freshness: LVM PVC Snapshots ---
#
# Check 36. Runs `lvs` over SSH on 192.168.1.127, keeps the volumes whose
# line contains "-snap", and FAILs when the newest lv_time is more than
# 25 hours old. No output at all (SSH failure or no snapshots) is a WARN.
# Globals:  QUIET (read)
# Outputs:  lines via pass/warn/fail; one json_add record keyed
#           "backup_lvm_snapshots"
check_backup_lvm_snapshots() {
    local title="Backup Freshness — LVM PVC Snapshots"
    local max_age_hours=25
    section 36 "$title"
    local snap_output verdict newest_age_hours count

    # NOTE(review): StrictHostKeyChecking=no skips host-key verification;
    # consider accept-new if the host key is expected to be stable.
    snap_output=$(ssh -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no \
        root@192.168.1.127 "lvs -o lv_name,lv_time --noheadings 2>/dev/null | grep -- -snap" 2>/dev/null || true)

    if [[ -z "$snap_output" ]]; then
        [[ "$QUIET" == true ]] && section_always 36 "$title"
        warn "No LVM PVC snapshots found or SSH to 192.168.1.127 failed (BatchMode)"
        json_add "backup_lvm_snapshots" "WARN" "SSH failed or no snapshots"
        return 0
    fi

    # Prints "parse_error", or "stale:<hours>" / "fresh:<hours>" for the
    # newest snapshot. The threshold is compared on the unrounded age;
    # rounding first let 25.1-25.4h pass.
    verdict=$(printf '%s\n' "$snap_output" | python3 -c '
import sys, time
from datetime import datetime
limit = float(sys.argv[1])
newest = None
for line in sys.stdin:
    line = line.strip()
    if not line:
        continue
    parts = line.split(None, 1)
    if len(parts) < 2:
        continue
    date_str = parts[1].strip()
    # lv_time format: "2026-04-19 03:00:01 +0000" or similar
    for fmt in ("%Y-%m-%d %H:%M:%S %z", "%Y-%m-%d %H:%M:%S"):
        try:
            ts = datetime.strptime(date_str, fmt).timestamp()
        except ValueError:
            continue
        if newest is None or ts > newest:
            newest = ts
        break
if newest is None:
    print("parse_error")
else:
    age = (time.time() - newest) / 3600
    print(("stale" if age > limit else "fresh") + f":{age:.1f}")
' "$max_age_hours" 2>/dev/null) || verdict="error"

    case "$verdict" in
        stale:*)
            newest_age_hours=${verdict#stale:}
            count=$(count_lines "$snap_output")
            [[ "$QUIET" == true ]] && section_always 36 "$title"
            fail "Newest LVM snapshot ${newest_age_hours}h old (>${max_age_hours}h); $count total"
            json_add "backup_lvm_snapshots" "FAIL" "count=$count newest=${newest_age_hours}h"
            ;;
        fresh:*)
            newest_age_hours=${verdict#fresh:}
            count=$(count_lines "$snap_output")
            pass "LVM snapshots fresh — $count total, newest ${newest_age_hours}h old"
            json_add "backup_lvm_snapshots" "PASS" "count=$count newest=${newest_age_hours}h"
            ;;
        *)
            [[ "$QUIET" == true ]] && section_always 36 "$title"
            warn "Could not parse LVM snapshot timestamps"
            json_add "backup_lvm_snapshots" "WARN" "Parse error"
            ;;
    esac
}

# --- 37. Monitoring: Prometheus + Alertmanager ---
#
# Check 37. FAILs when Prometheus' /-/ready endpoint (queried from inside
# the prometheus-server pod) does not answer with text containing "ready",
# or when no pod in the monitoring namespace whose listing line contains
# "alertmanager" is in the Running state.
# Globals:  KUBECTL, QUIET (read)
# Outputs:  lines via pass/fail; one json_add record keyed
#           "monitoring_prom_am"
check_monitoring_prom_am() {
    local title="Monitoring — Prometheus + Alertmanager"
    section 37 "$title"
    local detail="" had_issue=false status="PASS"

    # Prometheus /-/ready
    local prom_ready
    prom_ready=$($KUBECTL exec -n monitoring deploy/prometheus-server -- \
        wget -qO- "http://localhost:9090/-/ready" 2>/dev/null || true)
    if grep -qi "ready" <<< "$prom_ready"; then
        detail+="prometheus=ready; "
    else
        [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 37 "$title"
        fail "Prometheus /-/ready returned no Ready response"
        detail+="prometheus=not-ready; "
        had_issue=true
        status="FAIL"
    fi

    # Alertmanager running pod count. One awk does the filtering and the
    # counting: a `grep | wc -l` pipeline exits non-zero when nothing
    # matches, which would abort a caller running with `set -eo pipefail`
    # instead of reporting the FAIL below.
    local am_running
    am_running=$({ $KUBECTL get pods -n monitoring --no-headers 2>/dev/null || true; } |
        awk '/alertmanager/ && $3 == "Running" { n++ } END { print n + 0 }')
    if [[ "$am_running" -gt 0 ]]; then
        detail+="alertmanager=${am_running} running; "
    else
        [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 37 "$title"
        fail "Alertmanager: 0 Running pods"
        detail+="alertmanager=none-running; "
        had_issue=true
        status="FAIL"
    fi

    if [[ "$had_issue" == false ]]; then
        pass "Prometheus Ready, $am_running Alertmanager pod(s) Running"
    fi
    json_add "monitoring_prom_am" "$status" "$detail"
}

# --- 38.
# Monitoring: Vault Sealed Status ---
#
# Check 38. Runs `vault status` inside pod vault-0 (namespace vault) and
# reads the "Sealed" row: PASS when false, FAIL when true, WARN when the
# row is absent. No stdout at all is a FAIL ("Cannot exec").
# Globals:  KUBECTL, QUIET (read)
# Outputs:  lines via pass/warn/fail; one json_add record keyed
#           "monitoring_vault"
check_monitoring_vault() {
    local title="Monitoring — Vault Sealed Status"
    section 38 "$title"
    local output

    # Only stdout is captured. Folding stderr in (2>&1) made kubectl's own
    # error text count as output, so the "Cannot exec" branch below never
    # ran and exec failures were reported as a parse WARN. `|| true` is
    # kept because `vault status` exits non-zero when the server is sealed.
    output=$($KUBECTL exec -n vault vault-0 -- \
        sh -c 'VAULT_ADDR=http://127.0.0.1:8200 vault status' 2>/dev/null || true)

    if [[ -z "$output" ]]; then
        [[ "$QUIET" == true ]] && section_always 38 "$title"
        fail "Cannot exec vault status on vault-0"
        json_add "monitoring_vault" "FAIL" "Exec failed"
        return 0
    fi

    if grep -qi "^Sealed[[:space:]]*false" <<< "$output"; then
        pass "Vault unsealed"
        json_add "monitoring_vault" "PASS" "sealed=false"
    elif grep -qi "^Sealed[[:space:]]*true" <<< "$output"; then
        [[ "$QUIET" == true ]] && section_always 38 "$title"
        fail "Vault is SEALED — secrets unavailable"
        json_add "monitoring_vault" "FAIL" "sealed=true"
    else
        [[ "$QUIET" == true ]] && section_always 38 "$title"
        warn "Cannot parse vault status output"
        json_add "monitoring_vault" "WARN" "Parse error"
    fi
}

# --- 39.
# Monitoring: ClusterSecretStore Ready ---
#
# Check 39. Lists all ClusterSecretStore objects and FAILs for each one
# whose Ready condition is absent or not "True".
# Globals:  KUBECTL, QUIET (read)
# Outputs:  lines via pass/warn/fail; one json_add record keyed
#           "monitoring_css"
check_monitoring_css() {
    local title="Monitoring — ClusterSecretStore Ready"
    section 39 "$title"
    local css not_ready total line detail=""

    if ! css=$($KUBECTL get clustersecretstore -o json 2>/dev/null); then
        [[ "$QUIET" == true ]] && section_always 39 "$title"
        warn "ClusterSecretStore CRD not installed"
        json_add "monitoring_css" "WARN" "CRD missing"
        return 0
    fi

    # The reason is computed into a variable before formatting: writing
    # \"...\" inside an f-string replacement field (as this snippet used to)
    # is a SyntaxError, which made the check silently report PASS.
    if ! not_ready=$(printf '%s\n' "$css" | python3 -c '
import json, sys
data = json.load(sys.stdin)
for item in data.get("items", []):
    name = item["metadata"]["name"]
    conds = item.get("status", {}).get("conditions") or []
    ready = next((c for c in conds if c.get("type") == "Ready"), None)
    if not ready or ready.get("status") != "True":
        reason = ready.get("reason", "NoCondition") if ready else "NoCondition"
        print(f"{name}:{reason}")
' 2>/dev/null); then
        # A crashed parser must not be reported as "all Ready".
        [[ "$QUIET" == true ]] && section_always 39 "$title"
        warn "Failed to parse ClusterSecretStore list"
        json_add "monitoring_css" "WARN" "Parse error"
        return 0
    fi

    if [[ -z "$not_ready" ]]; then
        total=$(printf '%s\n' "$css" | python3 -c 'import json,sys; print(len(json.load(sys.stdin).get("items",[])))' 2>/dev/null || echo "?")
        pass "All $total ClusterSecretStores Ready"
        json_add "monitoring_css" "PASS" "$total Ready"
        return 0
    fi

    [[ "$QUIET" == true ]] && section_always 39 "$title"
    while IFS= read -r line; do
        fail "ClusterSecretStore not Ready: $line"
        detail+="$line; "
    done <<< "$not_ready"
    json_add "monitoring_css" "FAIL" "$detail"
}

# --- 40.
# External Reachability: Cloudflared + Authentik Replicas ---
#######################################
# Print "<ready> <desired>" for a Deployment JSON document read from stdin.
# Missing or null fields count as 0. Exits non-zero if the JSON is invalid.
#######################################
_external_replica_counts() {
  python3 -c '
import json, sys
d = json.load(sys.stdin)
ready = (d.get("status") or {}).get("readyReplicas", 0) or 0
desired = (d.get("spec") or {}).get("replicas", 0) or 0
print(ready, desired)
'
}

#######################################
# Check 40: the cloudflared Deployment (namespace cloudflared) and the
# goauthentik-server Deployment (namespace authentik) must have every
# desired replica ready, and must want at least one replica.
# Globals:   KUBECTL (read, expanded unquoted as the command prefix), QUIET
# Outputs:   reports through section/section_always, pass/warn/fail, json_add
# Returns:   0 on every path
#######################################
check_external_replicas() {
  local title="External — Cloudflared + Authentik Replicas"
  section 40 "$title"
  local detail="" had_issue=false status="PASS"
  local cf_json cf_counts cf_ready cf_desired
  local auth_json auth_counts auth_ready auth_desired

  # Cloudflared
  cf_json=$($KUBECTL get deployment cloudflared -n cloudflared -o json 2>/dev/null || true)
  if [[ -z "$cf_json" ]]; then
    if [[ "$had_issue" == false && "$QUIET" == true ]]; then
      section_always 40 "$title"
    fi
    fail "Cloudflared deployment not found"
    detail+="cloudflared=missing; "
    had_issue=true
    status="FAIL"
  else
    # A parse failure degrades to "0 0", which is reported as a failure below.
    cf_counts=$(printf '%s' "$cf_json" | _external_replica_counts 2>/dev/null || echo "0 0")
    cf_ready=${cf_counts% *}
    cf_desired=${cf_counts#* }
    # 0 desired replicas used to compare equal to 0 ready and pass as
    # "full replicas" although nothing is serving; treat it as a failure.
    if [[ "$cf_ready" != "$cf_desired" || "$cf_desired" == 0 ]]; then
      if [[ "$had_issue" == false && "$QUIET" == true ]]; then
        section_always 40 "$title"
      fi
      fail "Cloudflared: $cf_ready/$cf_desired ready (external access degraded)"
      detail+="cloudflared=${cf_ready}/${cf_desired}; "
      had_issue=true
      status="FAIL"
    else
      detail+="cloudflared=${cf_ready}/${cf_desired}; "
    fi
  fi

  # Authentik server (Helm chart names the deployment goauthentik-server)
  auth_json=$($KUBECTL get deployment goauthentik-server -n authentik -o json 2>/dev/null || true)
  if [[ -z "$auth_json" ]]; then
    if [[ "$had_issue" == false && "$QUIET" == true ]]; then
      section_always 40 "$title"
    fi
    warn "goauthentik-server deployment not found in authentik namespace"
    detail+="authentik=missing; "
    had_issue=true
    # Never downgrade an earlier FAIL to WARN.
    if [[ "$status" != "FAIL" ]]; then
      status="WARN"
    fi
  else
    auth_counts=$(printf '%s' "$auth_json" | _external_replica_counts 2>/dev/null || echo "0 0")
    auth_ready=${auth_counts% *}
    auth_desired=${auth_counts#* }
    if [[ "$auth_ready" != "$auth_desired" || "$auth_desired" == 0 ]]; then
      if [[ "$had_issue" == false && "$QUIET" == true ]]; then
        section_always 40 "$title"
      fi
      fail "goauthentik-server: $auth_ready/$auth_desired ready (auth degraded)"
      detail+="authentik=${auth_ready}/${auth_desired}; "
      had_issue=true
      status="FAIL"
    else
      detail+="authentik=${auth_ready}/${auth_desired}; "
    fi
  fi

  if [[ "$had_issue" == false ]]; then
    pass "Cloudflared + authentik-server at full replicas ($detail)"
  fi
  json_add "external_replicas" "$status" "$detail"
}

# --- 41. External Reachability: ExternalAccessDivergence Alert ---
#######################################
# Check 41: fail when Prometheus reports an ExternalAccessDivergence alert in
# state "firing". Alerts are fetched with wget inside the prometheus-server
# Deployment of the monitoring namespace.
# Globals:   KUBECTL (read, expanded unquoted as the command prefix), QUIET
# Outputs:   reports through section/section_always, pass/warn/fail, json_add
# Returns:   0 on every path
#######################################
check_external_divergence() {
  local title="External — ExternalAccessDivergence Alert"
  section 41 "$title"
  local alerts result count names detail="" status="PASS"

  alerts=$($KUBECTL exec -n monitoring deploy/prometheus-server -- \
    wget -qO- "http://localhost:9090/api/v1/alerts" 2>/dev/null || true)

  if [[ -z "$alerts" ]]; then
    [[ "$QUIET" == true ]] && section_always 41 "$title"
    warn "Cannot query Prometheus alerts"
    json_add "external_divergence" "WARN" "Cannot query"
    return 0
  fi

  # Python prints "<count>:<comma-separated targets>" or "error:<message>".
  result=$(printf '%s\n' "$alerts" | python3 -c '
import json, sys
try:
    data = json.load(sys.stdin)
    alerts = data.get("data", {}).get("alerts", []) if isinstance(data, dict) else data
    firing = [a for a in alerts
              if a.get("labels", {}).get("alertname") == "ExternalAccessDivergence"
              and a.get("state") == "firing"]
    if firing:
        hosts = [a.get("labels", {}).get("host") or a.get("labels", {}).get("service") or "?"
                 for a in firing]
        print(f"{len(firing)}:" + ",".join(hosts))
    else:
        print("0:")
except Exception as e:
    print(f"error:{e}")
' 2>/dev/null) || result="error:parse"

  if [[ "$result" == error:* ]]; then
    [[ "$QUIET" == true ]] && section_always 41 "$title"
    warn "Failed to parse alerts JSON: ${result#error:}"
    json_add "external_divergence" "WARN" "Parse error"
    return 0
  fi

  # Split on the first ":" only, so target names may themselves contain ":".
  count=${result%%:*}
  names=${result#*:}

  if [[ "$count" -eq 0 ]]; then
    pass "ExternalAccessDivergence not firing"
    json_add "external_divergence" "PASS" "Not firing"
  else
    [[ "$QUIET" == true ]] && section_always 41 "$title"
    fail "ExternalAccessDivergence firing for $count target(s): $names"
    status="FAIL"
    detail="$count firing: $names"
    json_add "external_divergence" "$status" "$detail"
  fi
}

# --- 42. External Reachability: Traefik 5xx Rate ---
#######################################
# Check 42: query Prometheus for the ten highest per-series 5xx rates of
# traefik_service_requests_total over 15 minutes. Series at or below
# 0.01 req/s are ignored; anything above is a WARN, and a top rate above
# 1 req/s is a FAIL.
# Globals:   KUBECTL (read, expanded unquoted as the command prefix), QUIET
# Outputs:   reports through section/section_always, pass/warn/fail, json_add
# Returns:   0 on every path
#######################################
check_external_traefik_5xx() {
  local title="External — Traefik 5xx Rate (15m)"
  section 42 "$title"
  local query_result parsed count rest top_rate top detail="" status="PASS"

  query_result=$($KUBECTL exec -n monitoring deploy/prometheus-server -- \
    wget -qO- 'http://localhost:9090/api/v1/query?query=topk(10,rate(traefik_service_requests_total{code=~%225..%22}%5B15m%5D))' 2>/dev/null || true)

  if [[ -z "$query_result" ]]; then
    [[ "$QUIET" == true ]] && section_always 42 "$title"
    warn "Cannot query Prometheus for traefik 5xx rate"
    json_add "external_traefik_5xx" "WARN" "Query failed"
    return 0
  fi

  # Python prints "<count>:<top rate>:<summary of up to five entries>" or
  # "error:<message>". The top rate is its own field so the shell does not
  # have to scrape it back out of the human-readable summary.
  parsed=$(printf '%s\n' "$query_result" | python3 -c '
import json, sys
try:
    data = json.load(sys.stdin)
    results = data.get("data", {}).get("result", [])
    hot = [(r.get("metric", {}).get("service", "?"), float(r.get("value", [0, "0"])[1]))
           for r in results]
    # Ignore series at or below 0.01 req/s.
    hot = [(s, v) for s, v in hot if v > 0.01]
    hot.sort(key=lambda x: -x[1])
    if not hot:
        print("0:0:")
    else:
        top = [f"{s}={v:.2f}/s" for s, v in hot[:5]]
        print(f"{len(hot)}:{hot[0][1]:.6f}:" + "; ".join(top))
except Exception as e:
    print(f"error:{e}")
' 2>/dev/null) || parsed="error:parse"

  if [[ "$parsed" == error:* ]]; then
    [[ "$QUIET" == true ]] && section_always 42 "$title"
    warn "Parse failed: ${parsed#error:}"
    json_add "external_traefik_5xx" "WARN" "Parse error"
    return 0
  fi

  count=${parsed%%:*}
  rest=${parsed#*:}
  top_rate=${rest%%:*}
  top=${rest#*:}

  if [[ "$count" -eq 0 ]]; then
    pass "No Traefik services with 5xx rate >0.01 req/s (last 15m)"
    json_add "external_traefik_5xx" "PASS" "None above threshold"
  else
    [[ "$QUIET" == true ]] && section_always 42 "$title"
    # WARN at any 5xx above the floor; FAIL if the top entry exceeds 1 req/s.
    # The rate is passed with -v instead of being pasted into the awk program.
    if awk -v r="$top_rate" 'BEGIN { exit !(r > 1.0) }' 2>/dev/null; then
      fail "$count Traefik service(s) with elevated 5xx: $top"
      status="FAIL"
    else
      warn "$count Traefik service(s) emitting 5xx: $top"
      status="WARN"
    fi
    detail="$count services: $top"
    json_add "external_traefik_5xx" "$status" "$detail"
  fi
}

# Remaining patch hunks from this span follow on one line, kept verbatim:
# --- Summary --- print_summary() { if [[ "$JSON" == true ]]; then @@ -1832,6 +2421,18 @@ main() { check_ha_automations check_ha_system check_hardware_exporters + check_cert_manager_certificates + check_cert_manager_expiry + check_cert_manager_requests + check_backup_per_db + check_backup_offsite_sync + check_backup_lvm_snapshots + check_monitoring_prom_am + check_monitoring_vault + check_monitoring_css + check_external_replicas + check_external_divergence + check_external_traefik_5xx print_summary # Exit code: 2 for failures, 1 for warnings, 0 for clean diff --git a/setup-monitoring.sh b/setup-monitoring.sh deleted file mode 100755 index a7e3caf7..00000000 --- a/setup-monitoring.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash -# Setup script for automated monitoring
environment -# Ensures health check scripts have access to kubeconfig - -echo "=== Setting up automated monitoring environment ===" - -# Copy kubeconfig to location expected by health check scripts -if [ -f /home/node/.openclaw/kubeconfig ]; then - cp /home/node/.openclaw/kubeconfig /workspace/infra/config - echo "✅ Kubeconfig copied to /workspace/infra/config" -else - echo "❌ Source kubeconfig not found at /home/node/.openclaw/kubeconfig" - exit 1 -fi - -# Test health check access -echo "" -echo "Testing health check script access..." -cd /workspace/infra -if KUBECONFIG="" timeout 30 bash .claude/cluster-health.sh --quiet > /dev/null 2>&1; then - echo "✅ Health check script can access cluster" -else - echo "❌ Health check script cannot access cluster" - exit 1 -fi - -echo "" -echo "✅ Automated monitoring environment setup complete" -echo "📊 Cron health checks will now work properly" \ No newline at end of file diff --git a/stacks/k8s-portal/modules/k8s-portal/files/src/routes/agent/+server.ts b/stacks/k8s-portal/modules/k8s-portal/files/src/routes/agent/+server.ts index 3d0fa891..f96f4d56 100644 --- a/stacks/k8s-portal/modules/k8s-portal/files/src/routes/agent/+server.ts +++ b/stacks/k8s-portal/modules/k8s-portal/files/src/routes/agent/+server.ts @@ -83,7 +83,7 @@ For secrets requiring admin access (shared infra passwords, API keys): | \`modules/kubernetes/nfs_volume/\` | NFS volume module (CSI-backed, soft mount) | | \`config.tfvars\` | Non-secret configuration (plaintext) | | \`secrets.sops.json\` | All secrets (SOPS-encrypted JSON) | -| \`scripts/cluster_healthcheck.sh\` | 25-check cluster health script | +| \`scripts/cluster_healthcheck.sh\` | 42-check cluster health script | | \`AGENTS.md\` | Full AI agent instructions (auto-loaded by most agents) | ### Tier System diff --git a/stacks/openclaw/main.tf b/stacks/openclaw/main.tf index 2a611be9..5d5f9a1d 100644 --- a/stacks/openclaw/main.tf +++ b/stacks/openclaw/main.tf @@ -441,11 +441,6 @@ resource 
"kubernetes_deployment" "openclaw" { name = "UPTIME_KUMA_PASSWORD" value = local.skill_secrets["uptime_kuma_password"] } - # Skill secrets - Slack - env { - name = "SLACK_WEBHOOK_URL" - value = local.skill_secrets["slack_webhook"] - } # Memory API env { name = "MEMORY_API_URL" @@ -846,7 +841,10 @@ module "task_webhook_ingress" { external_monitor = false } -# --- CronJob: Scheduled cluster health check --- +# --- Shared ServiceAccount: grants pod-exec into the openclaw pod --- +# Used by the task_processor CronJob (below). Previously also used by the +# cluster_healthcheck CronJob, which has been decommissioned — the local +# `scripts/cluster_healthcheck.sh` is now the single authoritative runner. resource "kubernetes_service_account" "healthcheck" { metadata { @@ -889,76 +887,6 @@ resource "kubernetes_role_binding" "healthcheck_exec" { } } -resource "kubernetes_cron_job_v1" "cluster_healthcheck" { - metadata { - name = "cluster-healthcheck" - namespace = kubernetes_namespace.openclaw.metadata[0].name - labels = { - app = "cluster-healthcheck" - tier = local.tiers.aux - } - } - spec { - schedule = "0 */8 * * *" - concurrency_policy = "Forbid" - failed_jobs_history_limit = 3 - successful_jobs_history_limit = 3 - - job_template { - metadata { - labels = { - app = "cluster-healthcheck" - } - } - spec { - active_deadline_seconds = 300 - backoff_limit = 0 - template { - metadata { - labels = { - app = "cluster-healthcheck" - } - } - spec { - service_account_name = kubernetes_service_account.healthcheck.metadata[0].name - restart_policy = "Never" - - container { - name = "healthcheck" - image = "bitnami/kubectl:latest" - command = ["bash", "-c", <<-EOF - # Find the openclaw pod - POD=$(kubectl get pods -n openclaw -l app=openclaw -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) - if [ -z "$POD" ]; then - echo "ERROR: OpenClaw pod not found" - exit 1 - fi - echo "Executing health check in pod $POD..." 
- kubectl exec -n openclaw "$POD" -c openclaw -- bash /workspace/infra/.claude/cluster-health.sh - EOF - ] - - resources { - requests = { - cpu = "50m" - memory = "64Mi" - } - limits = { - memory = "64Mi" - } - } - } - } - } - } - } - } - lifecycle { - # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2 - ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config] - } -} - # --- CronJob: Task processor — polls Forgejo issues and triggers OpenClaw --- resource "kubernetes_cron_job_v1" "task_processor" { @@ -983,8 +911,9 @@ resource "kubernetes_cron_job_v1" "task_processor" { } } spec { - active_deadline_seconds = 600 - backoff_limit = 0 + active_deadline_seconds = 600 + backoff_limit = 0 + ttl_seconds_after_finished = 86400 template { metadata { labels = {