infra/.claude/scripts/platform-status.sh
Viktor Barzin ff83ec3325 add infrastructure agent team: 8 specialized agents + 14 diagnostic scripts
Agents: devops-engineer, dba, security-engineer, sre, network-engineer,
platform-engineer, observability-engineer, home-automation-engineer.
Scripts: deploy-status, db-health, backup-verify, tls-check, crowdsec-status,
authentik-audit, oom-investigator, resource-report, dns-check, network-health,
nfs-health, truenas-status, platform-status, monitoring-health.
Also: known-issues.md suppression list, cluster-health-checker port-forward fix.
2026-03-15 02:01:07 +00:00

260 lines
8 KiB
Bash
Executable file

#!/usr/bin/env bash
set -euo pipefail
AGENT="platform-status"
KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config"
PROXMOX_HOST="root@192.168.1.127"
REGISTRY_HOST="10.0.20.10"
DRY_RUN=false
for arg in "$@"; do
case "$arg" in
--dry-run) DRY_RUN=true ;;
esac
done
checks=()
add_check() {
local name="$1" status="$2" message="$3"
checks+=("{\"name\": \"$name\", \"status\": \"$status\", \"message\": \"$message\"}")
}
check_traefik() {
if $DRY_RUN; then
add_check "traefik" "ok" "dry-run: would check Traefik status"
return
fi
# Discover Traefik pods via labels
local traefik_pod
traefik_pod=$($KUBECTL get pods -n traefik -l app.kubernetes.io/name=traefik -o name 2>/dev/null | head -1)
if [ -z "$traefik_pod" ]; then
traefik_pod=$($KUBECTL get pods -n traefik -l app=traefik -o name 2>/dev/null | head -1)
fi
if [ -z "$traefik_pod" ]; then
add_check "traefik" "fail" "No Traefik pods found in traefik namespace"
return
fi
local phase
phase=$($KUBECTL get "$traefik_pod" -n traefik -o jsonpath='{.status.phase}' 2>/dev/null)
if [ "$phase" = "Running" ]; then
# Check IngressRoute count
local ir_count
ir_count=$($KUBECTL get ingressroute --all-namespaces --no-headers 2>/dev/null | wc -l | tr -d ' ')
add_check "traefik" "ok" "Traefik running, $ir_count IngressRoutes configured"
else
add_check "traefik" "fail" "Traefik pod phase: $phase"
fi
# Check for IngressRoutes with errors (TLS or service issues)
local ir_errors
ir_errors=$($KUBECTL get events --all-namespaces --field-selector reason=IngressRouteError --no-headers 2>/dev/null | wc -l | tr -d ' ')
if [ "$ir_errors" -gt 0 ]; then
add_check "traefik-ingressroutes" "warn" "$ir_errors IngressRoute error events found"
fi
}
check_kyverno() {
if $DRY_RUN; then
add_check "kyverno" "ok" "dry-run: would check Kyverno status"
return
fi
# Discover Kyverno pods via labels
local kyverno_pods
kyverno_pods=$($KUBECTL get pods -n kyverno -l app.kubernetes.io/name=kyverno -o name 2>/dev/null)
if [ -z "$kyverno_pods" ]; then
kyverno_pods=$($KUBECTL get pods -n kyverno -l app=kyverno -o name 2>/dev/null)
fi
if [ -z "$kyverno_pods" ]; then
add_check "kyverno" "warn" "No Kyverno pods found"
return
fi
local total=0 ready=0
while IFS= read -r pod; do
[ -z "$pod" ] && continue
total=$((total + 1))
local phase
phase=$($KUBECTL get "$pod" -n kyverno -o jsonpath='{.status.phase}' 2>/dev/null)
[ "$phase" = "Running" ] && ready=$((ready + 1))
done <<< "$kyverno_pods"
if [ "$ready" -eq "$total" ]; then
# Check policy count
local policy_count
policy_count=$($KUBECTL get clusterpolicy --no-headers 2>/dev/null | wc -l | tr -d ' ')
add_check "kyverno" "ok" "$ready/$total Kyverno pods running, $policy_count ClusterPolicies"
else
add_check "kyverno" "warn" "$ready/$total Kyverno pods running"
fi
# Check for policy violations
local violations
violations=$($KUBECTL get policyreport --all-namespaces -o json 2>/dev/null | \
python3 -c "
import sys, json
try:
data = json.load(sys.stdin)
fail_count = sum(r.get('summary',{}).get('fail',0) for r in data.get('items',[]))
print(fail_count)
except: print('0')
" 2>/dev/null || echo "0")
if [ "$violations" -gt 0 ]; then
add_check "kyverno-violations" "warn" "$violations policy violations across namespaces"
fi
}
check_vpa_goldilocks() {
if $DRY_RUN; then
add_check "vpa-goldilocks" "ok" "dry-run: would check VPA/Goldilocks status"
return
fi
# Check VPA admission controller
local vpa_pods
vpa_pods=$($KUBECTL get pods -n goldilocks -l app.kubernetes.io/name=goldilocks -o name 2>/dev/null)
if [ -z "$vpa_pods" ]; then
vpa_pods=$($KUBECTL get pods -n goldilocks -o name 2>/dev/null)
fi
if [ -z "$vpa_pods" ]; then
add_check "vpa-goldilocks" "warn" "No Goldilocks pods found"
return
fi
local total=0 ready=0
while IFS= read -r pod; do
[ -z "$pod" ] && continue
total=$((total + 1))
local phase
phase=$($KUBECTL get "$pod" -n goldilocks -o jsonpath='{.status.phase}' 2>/dev/null)
[ "$phase" = "Running" ] && ready=$((ready + 1))
done <<< "$vpa_pods"
if [ "$ready" -eq "$total" ]; then
local vpa_count
vpa_count=$($KUBECTL get vpa --all-namespaces --no-headers 2>/dev/null | wc -l | tr -d ' ')
add_check "vpa-goldilocks" "ok" "$ready/$total Goldilocks pods running, $vpa_count VPAs configured"
else
add_check "vpa-goldilocks" "warn" "$ready/$total Goldilocks pods running"
fi
# Check for VPAs with unexpected updateMode
local auto_vpas
auto_vpas=$($KUBECTL get vpa --all-namespaces -o json 2>/dev/null | \
python3 -c "
import sys, json
try:
data = json.load(sys.stdin)
auto = [i['metadata']['name'] for i in data.get('items',[]) if i.get('spec',{}).get('updatePolicy',{}).get('updateMode','') == 'Auto']
print(len(auto))
except: print('0')
" 2>/dev/null || echo "0")
if [ "$auto_vpas" -gt 0 ]; then
add_check "vpa-auto-mode" "warn" "$auto_vpas VPAs set to Auto updateMode (may cause unexpected restarts)"
fi
}
check_pull_through_cache() {
if $DRY_RUN; then
add_check "pull-through-cache" "ok" "dry-run: would check pull-through cache at $REGISTRY_HOST"
return
fi
if timeout 5 curl -sf "http://${REGISTRY_HOST}:5000/v2/" &>/dev/null; then
add_check "pull-through-cache" "ok" "Pull-through cache registry at $REGISTRY_HOST:5000 is healthy"
elif timeout 5 curl -sf "https://${REGISTRY_HOST}/v2/" &>/dev/null; then
add_check "pull-through-cache" "ok" "Pull-through cache registry at $REGISTRY_HOST is healthy (HTTPS)"
else
add_check "pull-through-cache" "fail" "Pull-through cache registry at $REGISTRY_HOST is unreachable"
fi
}
check_proxmox() {
if $DRY_RUN; then
add_check "proxmox" "ok" "dry-run: would check Proxmox host resources"
return
fi
local cpu_load
if cpu_load=$(timeout 10 ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$PROXMOX_HOST" \
"uptime | awk -F'load average:' '{print \$2}' | awk -F, '{print \$1}' | tr -d ' '" 2>/dev/null); then
local cpu_count
cpu_count=$(timeout 10 ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$PROXMOX_HOST" \
"nproc" 2>/dev/null || echo "1")
# Check memory
local mem_info
mem_info=$(timeout 10 ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$PROXMOX_HOST" \
"free -m | awk '/Mem:/{printf \"%d/%dMB (%.0f%%)\", \$3, \$2, \$3/\$2*100}'" 2>/dev/null || echo "unknown")
add_check "proxmox" "ok" "Proxmox host: load=$cpu_load (${cpu_count}cores), mem=$mem_info"
else
add_check "proxmox" "fail" "Could not reach Proxmox host via SSH"
fi
}
check_metallb() {
if $DRY_RUN; then
add_check "metallb" "ok" "dry-run: would check MetalLB status"
return
fi
local metallb_pods
metallb_pods=$($KUBECTL get pods -n metallb-system -l app.kubernetes.io/name=metallb -o name 2>/dev/null)
if [ -z "$metallb_pods" ]; then
metallb_pods=$($KUBECTL get pods -n metallb-system -o name 2>/dev/null)
fi
if [ -z "$metallb_pods" ]; then
add_check "metallb" "warn" "No MetalLB pods found"
return
fi
local total=0 ready=0
while IFS= read -r pod; do
[ -z "$pod" ] && continue
total=$((total + 1))
local phase
phase=$($KUBECTL get "$pod" -n metallb-system -o jsonpath='{.status.phase}' 2>/dev/null)
[ "$phase" = "Running" ] && ready=$((ready + 1))
done <<< "$metallb_pods"
if [ "$ready" -eq "$total" ]; then
add_check "metallb" "ok" "$ready/$total MetalLB pods running"
else
add_check "metallb" "warn" "$ready/$total MetalLB pods running"
fi
}
# Run checks
check_traefik
check_kyverno
check_vpa_goldilocks
check_pull_through_cache
check_proxmox
check_metallb
# Determine overall status
overall="ok"
for c in "${checks[@]}"; do
if echo "$c" | grep -q '"status": "fail"'; then
overall="fail"
break
elif echo "$c" | grep -q '"status": "warn"'; then
overall="warn"
fi
done
# Output JSON
checks_json=$(IFS=,; echo "${checks[*]}")
cat <<EOF
{"status": "$overall", "agent": "$AGENT", "checks": [$checks_json]}
EOF