infra/.claude/scripts/network-health.sh
Viktor Barzin ff83ec3325 add infrastructure agent team: 8 specialized agents + 14 diagnostic scripts
Agents: devops-engineer, dba, security-engineer, sre, network-engineer,
platform-engineer, observability-engineer, home-automation-engineer.
Scripts: deploy-status, db-health, backup-verify, tls-check, crowdsec-status,
authentik-audit, oom-investigator, resource-report, dns-check, network-health,
nfs-health, truenas-status, platform-status, monitoring-health.
Also: known-issues.md suppression list, cluster-health-checker port-forward fix.
2026-03-15 02:01:07 +00:00

166 lines
4.9 KiB
Bash
Executable file

#!/usr/bin/env bash
set -euo pipefail
KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config"
PFSENSE="python3 /Users/viktorbarzin/code/infra/.claude/pfsense.py"
AGENT="network-health"
DRY_RUN=false
for arg in "$@"; do
case "$arg" in
--dry-run) DRY_RUN=true ;;
esac
done
checks=()
add_check() {
local name="$1" status="$2" message="$3"
checks+=("{\"name\": \"$name\", \"status\": \"$status\", \"message\": \"$message\"}")
}
check_pfsense_status() {
if $DRY_RUN; then
add_check "pfsense" "ok" "dry-run: would check pfSense system status via pfsense.py"
return
fi
local pf_output
pf_output=$($PFSENSE status 2>/dev/null) || {
add_check "pfsense" "fail" "Failed to connect to pfSense via pfsense.py"
return
}
if echo "$pf_output" | grep -qi "error\|fail\|down"; then
add_check "pfsense" "warn" "pfSense reported issues: $(echo "$pf_output" | head -3 | tr '\n' ' ')"
else
add_check "pfsense" "ok" "pfSense system healthy"
fi
}
check_vpn_status() {
if $DRY_RUN; then
add_check "vpn" "ok" "dry-run: would check VPN tunnel status via pfsense.py"
return
fi
local vpn_output
vpn_output=$($PFSENSE wireguard 2>/dev/null) || {
add_check "vpn" "warn" "Failed to query VPN status via pfsense.py"
return
}
if echo "$vpn_output" | grep -qi "error\|fail\|down"; then
add_check "vpn" "warn" "VPN issues detected: $(echo "$vpn_output" | head -3 | tr '\n' ' ')"
else
add_check "vpn" "ok" "VPN tunnels healthy"
fi
}
check_metallb_speakers() {
if $DRY_RUN; then
add_check "metallb-speakers" "ok" "dry-run: would check MetalLB speaker pod health"
return
fi
local ns="metallb-system"
# Find MetalLB speaker pods via labels first
local speaker_pods
speaker_pods=$($KUBECTL get pods -n "$ns" -l app.kubernetes.io/component=speaker --no-headers 2>/dev/null) || \
speaker_pods=$($KUBECTL get pods -n "$ns" -l component=speaker --no-headers 2>/dev/null) || \
speaker_pods=$($KUBECTL get pods -n "$ns" --no-headers 2>/dev/null | grep -i speaker || true)
if [ -z "$speaker_pods" ]; then
add_check "metallb-speakers" "warn" "No MetalLB speaker pods found in ${ns}"
return
fi
local total not_running
total=$(echo "$speaker_pods" | grep -c "." 2>/dev/null || echo "0")
not_running=$(echo "$speaker_pods" | grep -v "Running" | grep -c "." 2>/dev/null || echo "0")
if [ "$not_running" -gt 0 ]; then
add_check "metallb-speakers" "fail" "${not_running}/${total} MetalLB speaker pod(s) not running"
else
add_check "metallb-speakers" "ok" "All ${total} MetalLB speaker pod(s) running"
fi
}
check_metallb_l2() {
if $DRY_RUN; then
add_check "metallb-l2" "ok" "dry-run: would check MetalLB L2 advertisements"
return
fi
local ns="metallb-system"
# Check L2Advertisement CRDs
local l2_ads
l2_ads=$($KUBECTL get l2advertisements -n "$ns" -o json 2>/dev/null) || {
add_check "metallb-l2" "warn" "Could not query L2Advertisement CRDs"
return
}
local count
count=$(echo "$l2_ads" | jq '.items | length' 2>/dev/null || echo "0")
if [ "$count" -eq 0 ]; then
add_check "metallb-l2" "warn" "No L2Advertisement resources found"
else
# Check MetalLB controller
local controller
controller=$($KUBECTL get pods -n "$ns" -l app.kubernetes.io/component=controller --no-headers 2>/dev/null) || \
controller=$($KUBECTL get pods -n "$ns" --no-headers 2>/dev/null | grep -i controller || true)
if [ -z "$controller" ]; then
add_check "metallb-l2" "warn" "${count} L2Advertisement(s) found but no controller pod"
elif echo "$controller" | grep -q "Running"; then
add_check "metallb-l2" "ok" "${count} L2Advertisement(s) configured, controller running"
else
add_check "metallb-l2" "warn" "${count} L2Advertisement(s) found but controller not running"
fi
fi
}
check_node_connectivity() {
if $DRY_RUN; then
add_check "node-connectivity" "ok" "dry-run: would ping k8s nodes"
return
fi
local nodes=("10.0.20.100" "10.0.20.101" "10.0.20.102" "10.0.20.103" "10.0.20.104")
local names=("k8s-master" "k8s-node1" "k8s-node2" "k8s-node3" "k8s-node4")
local failures=0
local failure_details=""
for i in "${!nodes[@]}"; do
if ! ping -c 1 -W 2 "${nodes[$i]}" >/dev/null 2>&1; then
failures=$((failures + 1))
failure_details="${failure_details}${names[$i]}(${nodes[$i]}) "
fi
done
if [ "$failures" -gt 0 ]; then
add_check "node-connectivity" "fail" "${failures} node(s) unreachable: ${failure_details}"
else
add_check "node-connectivity" "ok" "All ${#nodes[@]} nodes reachable"
fi
}
check_pfsense_status
check_vpn_status
check_metallb_speakers
check_metallb_l2
check_node_connectivity
# Output JSON
overall="ok"
for c in "${checks[@]}"; do
s=$(echo "$c" | jq -r '.status')
if [ "$s" = "fail" ]; then overall="fail"; break; fi
if [ "$s" = "warn" ]; then overall="warn"; fi
done
printf '{"status": "%s", "agent": "%s", "checks": [%s]}\n' \
"$overall" "$AGENT" "$(IFS=,; echo "${checks[*]}")"