Agents: devops-engineer, dba, security-engineer, sre, network-engineer, platform-engineer, observability-engineer, home-automation-engineer. Scripts: deploy-status, db-health, backup-verify, tls-check, crowdsec-status, authentik-audit, oom-investigator, resource-report, dns-check, network-health, nfs-health, truenas-status, platform-status, monitoring-health. Also: known-issues.md suppression list, cluster-health-checker port-forward fix.
166 lines
4.9 KiB
Bash
Executable file
166 lines
4.9 KiB
Bash
Executable file
#!/usr/bin/env bash
|
|
set -euo pipefail
|
|
|
|
KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config"
|
|
PFSENSE="python3 /Users/viktorbarzin/code/infra/.claude/pfsense.py"
|
|
AGENT="network-health"
|
|
DRY_RUN=false
|
|
|
|
for arg in "$@"; do
|
|
case "$arg" in
|
|
--dry-run) DRY_RUN=true ;;
|
|
esac
|
|
done
|
|
|
|
checks=()
|
|
|
|
add_check() {
|
|
local name="$1" status="$2" message="$3"
|
|
checks+=("{\"name\": \"$name\", \"status\": \"$status\", \"message\": \"$message\"}")
|
|
}
|
|
|
|
check_pfsense_status() {
|
|
if $DRY_RUN; then
|
|
add_check "pfsense" "ok" "dry-run: would check pfSense system status via pfsense.py"
|
|
return
|
|
fi
|
|
|
|
local pf_output
|
|
pf_output=$($PFSENSE status 2>/dev/null) || {
|
|
add_check "pfsense" "fail" "Failed to connect to pfSense via pfsense.py"
|
|
return
|
|
}
|
|
|
|
if echo "$pf_output" | grep -qi "error\|fail\|down"; then
|
|
add_check "pfsense" "warn" "pfSense reported issues: $(echo "$pf_output" | head -3 | tr '\n' ' ')"
|
|
else
|
|
add_check "pfsense" "ok" "pfSense system healthy"
|
|
fi
|
|
}
|
|
|
|
check_vpn_status() {
|
|
if $DRY_RUN; then
|
|
add_check "vpn" "ok" "dry-run: would check VPN tunnel status via pfsense.py"
|
|
return
|
|
fi
|
|
|
|
local vpn_output
|
|
vpn_output=$($PFSENSE wireguard 2>/dev/null) || {
|
|
add_check "vpn" "warn" "Failed to query VPN status via pfsense.py"
|
|
return
|
|
}
|
|
|
|
if echo "$vpn_output" | grep -qi "error\|fail\|down"; then
|
|
add_check "vpn" "warn" "VPN issues detected: $(echo "$vpn_output" | head -3 | tr '\n' ' ')"
|
|
else
|
|
add_check "vpn" "ok" "VPN tunnels healthy"
|
|
fi
|
|
}
|
|
|
|
check_metallb_speakers() {
|
|
if $DRY_RUN; then
|
|
add_check "metallb-speakers" "ok" "dry-run: would check MetalLB speaker pod health"
|
|
return
|
|
fi
|
|
|
|
local ns="metallb-system"
|
|
|
|
# Find MetalLB speaker pods via labels first
|
|
local speaker_pods
|
|
speaker_pods=$($KUBECTL get pods -n "$ns" -l app.kubernetes.io/component=speaker --no-headers 2>/dev/null) || \
|
|
speaker_pods=$($KUBECTL get pods -n "$ns" -l component=speaker --no-headers 2>/dev/null) || \
|
|
speaker_pods=$($KUBECTL get pods -n "$ns" --no-headers 2>/dev/null | grep -i speaker || true)
|
|
|
|
if [ -z "$speaker_pods" ]; then
|
|
add_check "metallb-speakers" "warn" "No MetalLB speaker pods found in ${ns}"
|
|
return
|
|
fi
|
|
|
|
local total not_running
|
|
total=$(echo "$speaker_pods" | grep -c "." 2>/dev/null || echo "0")
|
|
not_running=$(echo "$speaker_pods" | grep -v "Running" | grep -c "." 2>/dev/null || echo "0")
|
|
|
|
if [ "$not_running" -gt 0 ]; then
|
|
add_check "metallb-speakers" "fail" "${not_running}/${total} MetalLB speaker pod(s) not running"
|
|
else
|
|
add_check "metallb-speakers" "ok" "All ${total} MetalLB speaker pod(s) running"
|
|
fi
|
|
}
|
|
|
|
check_metallb_l2() {
|
|
if $DRY_RUN; then
|
|
add_check "metallb-l2" "ok" "dry-run: would check MetalLB L2 advertisements"
|
|
return
|
|
fi
|
|
|
|
local ns="metallb-system"
|
|
|
|
# Check L2Advertisement CRDs
|
|
local l2_ads
|
|
l2_ads=$($KUBECTL get l2advertisements -n "$ns" -o json 2>/dev/null) || {
|
|
add_check "metallb-l2" "warn" "Could not query L2Advertisement CRDs"
|
|
return
|
|
}
|
|
|
|
local count
|
|
count=$(echo "$l2_ads" | jq '.items | length' 2>/dev/null || echo "0")
|
|
|
|
if [ "$count" -eq 0 ]; then
|
|
add_check "metallb-l2" "warn" "No L2Advertisement resources found"
|
|
else
|
|
# Check MetalLB controller
|
|
local controller
|
|
controller=$($KUBECTL get pods -n "$ns" -l app.kubernetes.io/component=controller --no-headers 2>/dev/null) || \
|
|
controller=$($KUBECTL get pods -n "$ns" --no-headers 2>/dev/null | grep -i controller || true)
|
|
|
|
if [ -z "$controller" ]; then
|
|
add_check "metallb-l2" "warn" "${count} L2Advertisement(s) found but no controller pod"
|
|
elif echo "$controller" | grep -q "Running"; then
|
|
add_check "metallb-l2" "ok" "${count} L2Advertisement(s) configured, controller running"
|
|
else
|
|
add_check "metallb-l2" "warn" "${count} L2Advertisement(s) found but controller not running"
|
|
fi
|
|
fi
|
|
}
|
|
|
|
check_node_connectivity() {
|
|
if $DRY_RUN; then
|
|
add_check "node-connectivity" "ok" "dry-run: would ping k8s nodes"
|
|
return
|
|
fi
|
|
|
|
local nodes=("10.0.20.100" "10.0.20.101" "10.0.20.102" "10.0.20.103" "10.0.20.104")
|
|
local names=("k8s-master" "k8s-node1" "k8s-node2" "k8s-node3" "k8s-node4")
|
|
local failures=0
|
|
local failure_details=""
|
|
|
|
for i in "${!nodes[@]}"; do
|
|
if ! ping -c 1 -W 2 "${nodes[$i]}" >/dev/null 2>&1; then
|
|
failures=$((failures + 1))
|
|
failure_details="${failure_details}${names[$i]}(${nodes[$i]}) "
|
|
fi
|
|
done
|
|
|
|
if [ "$failures" -gt 0 ]; then
|
|
add_check "node-connectivity" "fail" "${failures} node(s) unreachable: ${failure_details}"
|
|
else
|
|
add_check "node-connectivity" "ok" "All ${#nodes[@]} nodes reachable"
|
|
fi
|
|
}
|
|
|
|
check_pfsense_status
|
|
check_vpn_status
|
|
check_metallb_speakers
|
|
check_metallb_l2
|
|
check_node_connectivity
|
|
|
|
# Output JSON
|
|
overall="ok"
|
|
for c in "${checks[@]}"; do
|
|
s=$(echo "$c" | jq -r '.status')
|
|
if [ "$s" = "fail" ]; then overall="fail"; break; fi
|
|
if [ "$s" = "warn" ]; then overall="warn"; fi
|
|
done
|
|
|
|
printf '{"status": "%s", "agent": "%s", "checks": [%s]}\n' \
|
|
"$overall" "$AGENT" "$(IFS=,; echo "${checks[*]}")"
|