#!/usr/bin/env bash
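#
# Health checks for the in-cluster monitoring stack: Prometheus, Alertmanager,
# Grafana, the metrics exporters (SNMP, iDRAC Redfish, Proxmox), and Prometheus
# storage. Each check result is collected and the script prints a single JSON
# summary on stdout.
#
# Usage (illustrative; the script filename here is an assumption):
#   ./monitoring-health.sh             # run all checks against the cluster
#   ./monitoring-health.sh --dry-run   # emit placeholder results, no cluster calls
#   ./monitoring-health.sh | jq -r '.status'   # overall status: ok | warn | fail
#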
set -euo pipefail

AGENT="monitoring-health"
KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config"
MONITORING_NS="monitoring"
DRY_RUN=false

for arg in "$@"; do
    case "$arg" in
        --dry-run) DRY_RUN=true ;;
    esac
done

checks=()

add_check() {
    local name="$1" status="$2" message="$3"
    checks+=("{\"name\": \"$name\", \"status\": \"$status\", \"message\": \"$message\"}")
}
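
# Each entry in the checks array is a small JSON fragment, e.g. (illustrative values):
#   {"name": "prometheus", "status": "ok", "message": "Prometheus server healthy, targets: 42/43 up"}
# The JSON is assembled by plain string interpolation, so messages passed to
# add_check must not contain unescaped double quotes or backslashes.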

check_prometheus() {
    if $DRY_RUN; then
        add_check "prometheus" "ok" "dry-run: would check Prometheus server health"
        return
    fi

    # Discover Prometheus server pod via labels
    local prom_pod
    prom_pod=$($KUBECTL get pods -n "$MONITORING_NS" -l app.kubernetes.io/name=prometheus,app.kubernetes.io/component=server -o name 2>/dev/null | head -1)
    if [ -z "$prom_pod" ]; then
        prom_pod=$($KUBECTL get pods -n "$MONITORING_NS" -l app=prometheus,component=server -o name 2>/dev/null | head -1)
    fi
    if [ -z "$prom_pod" ]; then
        # grep may match nothing; || true keeps set -e/pipefail from aborting here
        prom_pod=$($KUBECTL get pods -n "$MONITORING_NS" -o name 2>/dev/null | grep prometheus-server | head -1 || true)
    fi

    if [ -z "$prom_pod" ]; then
        add_check "prometheus" "fail" "No Prometheus server pod found in $MONITORING_NS"
        return
    fi

    local phase
    phase=$($KUBECTL get "$prom_pod" -n "$MONITORING_NS" -o jsonpath='{.status.phase}' 2>/dev/null)
    if [ "$phase" != "Running" ]; then
        add_check "prometheus" "fail" "Prometheus server pod phase: $phase"
        return
    fi

    # Check Prometheus is responding
    local prom_healthy
    prom_healthy=$($KUBECTL exec "$prom_pod" -n "$MONITORING_NS" -c prometheus-server -- \
        wget -q -O- "http://localhost:9090/-/healthy" 2>/dev/null || echo "unhealthy")

    if echo "$prom_healthy" | grep -qi "ok\|healthy"; then
        # Check target scraping; the inner || true stops a failed in-pod wget from
        # tripping pipefail (the Python fallback already handles empty input)
        local targets_up
        targets_up=$({ $KUBECTL exec "$prom_pod" -n "$MONITORING_NS" -c prometheus-server -- \
            wget -q -O- "http://localhost:9090/api/v1/targets" 2>/dev/null || true; } | \
            python3 -c "
import sys, json
try:
    data = json.load(sys.stdin)
    active = data.get('data',{}).get('activeTargets',[])
    up = sum(1 for t in active if t.get('health') == 'up')
    total = len(active)
    print(f'{up}/{total}')
except: print('unknown')
" 2>/dev/null || echo "unknown")
        add_check "prometheus" "ok" "Prometheus server healthy, targets: $targets_up up"
    else
        add_check "prometheus" "warn" "Prometheus server running but health check unclear"
    fi
}
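
# Manual spot-check (illustrative): the same endpoints the probe above hits can
# be reached from a workstation with a port-forward to the server pod:
#   kubectl --kubeconfig <config> -n monitoring port-forward <prometheus-server-pod> 9090:9090
#   curl -s http://localhost:9090/-/healthy
#   curl -s http://localhost:9090/api/v1/targets | jq '.data.activeTargets[].health'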

check_alertmanager() {
    if $DRY_RUN; then
        add_check "alertmanager" "ok" "dry-run: would check Alertmanager health"
        return
    fi

    # Discover Alertmanager pod
    local am_pod
    am_pod=$($KUBECTL get pods -n "$MONITORING_NS" -l app.kubernetes.io/name=alertmanager -o name 2>/dev/null | head -1)
    if [ -z "$am_pod" ]; then
        am_pod=$($KUBECTL get pods -n "$MONITORING_NS" -o name 2>/dev/null | grep alertmanager | head -1 || true)
    fi

    if [ -z "$am_pod" ]; then
        add_check "alertmanager" "fail" "No Alertmanager pod found in $MONITORING_NS"
        return
    fi

    local phase
    phase=$($KUBECTL get "$am_pod" -n "$MONITORING_NS" -o jsonpath='{.status.phase}' 2>/dev/null)
    if [ "$phase" != "Running" ]; then
        add_check "alertmanager" "fail" "Alertmanager pod phase: $phase"
        return
    fi

    # Check firing alerts
    local alert_info
    alert_info=$({ $KUBECTL exec "$am_pod" -n "$MONITORING_NS" -- \
        wget -q -O- "http://localhost:9093/api/v2/alerts?active=true" 2>/dev/null || true; } | \
        python3 -c "
import sys, json
try:
    alerts = json.load(sys.stdin)
    firing = [a for a in alerts if a.get('status',{}).get('state') == 'active']
    print(len(firing))
except: print('unknown')
" 2>/dev/null || echo "unknown")

    # Check silences
    local silence_count
    silence_count=$({ $KUBECTL exec "$am_pod" -n "$MONITORING_NS" -- \
        wget -q -O- "http://localhost:9093/api/v2/silences" 2>/dev/null || true; } | \
        python3 -c "
import sys, json
try:
    silences = json.load(sys.stdin)
    active = [s for s in silences if s.get('status',{}).get('state') == 'active']
    print(len(active))
except: print('0')
" 2>/dev/null || echo "0")

    if [ "$alert_info" = "unknown" ]; then
        add_check "alertmanager" "warn" "Alertmanager running but could not query alerts"
    else
        local status="ok"
        [ "$alert_info" -gt 0 ] 2>/dev/null && status="warn"
        add_check "alertmanager" "$status" "Alertmanager healthy: $alert_info firing alerts, $silence_count active silences"
    fi
}
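
# The two parsers above only rely on the v2 API returning JSON arrays whose
# elements carry .status.state; a minimal illustration of the shape they expect
# (values are made up):
#   /api/v2/alerts    -> [{"labels": {"alertname": "HostDown"}, "status": {"state": "active"}}]
#   /api/v2/silences  -> [{"id": "...", "status": {"state": "expired"}}]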

check_grafana() {
    if $DRY_RUN; then
        add_check "grafana" "ok" "dry-run: would check Grafana health"
        return
    fi

    # Discover Grafana pod
    local grafana_pod
    grafana_pod=$($KUBECTL get pods -n "$MONITORING_NS" -l app.kubernetes.io/name=grafana -o name 2>/dev/null | head -1)
    if [ -z "$grafana_pod" ]; then
        grafana_pod=$($KUBECTL get pods -n "$MONITORING_NS" -o name 2>/dev/null | grep grafana | grep -v test | head -1 || true)
    fi

    if [ -z "$grafana_pod" ]; then
        add_check "grafana" "fail" "No Grafana pod found in $MONITORING_NS"
        return
    fi

    local phase
    phase=$($KUBECTL get "$grafana_pod" -n "$MONITORING_NS" -o jsonpath='{.status.phase}' 2>/dev/null)
    if [ "$phase" != "Running" ]; then
        add_check "grafana" "fail" "Grafana pod phase: $phase"
        return
    fi

    # Check datasource connectivity
    local ds_info
    ds_info=$({ $KUBECTL exec "$grafana_pod" -n "$MONITORING_NS" -- \
        curl -sf "http://localhost:3000/api/datasources" 2>/dev/null || true; } | \
        python3 -c "
import sys, json
try:
    ds = json.load(sys.stdin)
    names = [d.get('name','?') for d in ds]
    print(f'{len(ds)} datasources: {\", \".join(names)}')
except: print('unknown')
" 2>/dev/null || echo "unknown")

    if [ "$ds_info" = "unknown" ]; then
        add_check "grafana" "warn" "Grafana running but could not query datasources (may need auth)"
    else
        add_check "grafana" "ok" "Grafana healthy, $ds_info"
    fi
}
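
# Grafana's /api/datasources endpoint normally requires an authenticated admin
# session, so the "may need auth" warning is the expected outcome unless
# anonymous admin access is enabled. An authenticated variant for manual use
# (credentials are an assumption, not something this script manages):
#   curl -sf -u "admin:<password>" http://localhost:3000/api/datasources | jq '.[].name'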

check_snmp_exporters() {
    if $DRY_RUN; then
        add_check "snmp-exporters" "ok" "dry-run: would check SNMP exporter pods"
        return
    fi

    local exporters=("snmp-exporter" "idrac-redfish-exporter" "proxmox-exporter")
    local running=0 total=0

    for exporter in "${exporters[@]}"; do
        total=$((total + 1))
        local pod
        pod=$($KUBECTL get pods -n "$MONITORING_NS" -o name 2>/dev/null | grep "$exporter" | head -1 || true)

        if [ -z "$pod" ]; then
            # Try all namespaces
            pod=$($KUBECTL get pods --all-namespaces -o custom-columns=NS:.metadata.namespace,NAME:.metadata.name --no-headers 2>/dev/null | \
                grep "$exporter" | head -1 || true)
            if [ -z "$pod" ]; then
                add_check "exporter-$exporter" "warn" "$exporter pod not found"
                continue
            fi
            local ns
            ns=$(echo "$pod" | awk '{print $1}')
            local name
            name=$(echo "$pod" | awk '{print $2}')
            local phase
            phase=$($KUBECTL get pod "$name" -n "$ns" -o jsonpath='{.status.phase}' 2>/dev/null)
            if [ "$phase" = "Running" ]; then
                running=$((running + 1))
                add_check "exporter-$exporter" "ok" "$exporter running in $ns"
            else
                add_check "exporter-$exporter" "warn" "$exporter phase: $phase in $ns"
            fi
        else
            local phase
            phase=$($KUBECTL get "$pod" -n "$MONITORING_NS" -o jsonpath='{.status.phase}' 2>/dev/null)
            if [ "$phase" = "Running" ]; then
                running=$((running + 1))
                add_check "exporter-$exporter" "ok" "$exporter running"
            else
                add_check "exporter-$exporter" "warn" "$exporter phase: $phase"
            fi
        fi
    done

    # Summary across all exporters (mirrors the dry-run "snmp-exporters" check)
    local summary="ok"
    [ "$running" -lt "$total" ] && summary="warn"
    add_check "snmp-exporters" "$summary" "$running/$total exporter pods running"
}
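
# The exporter checks match pods by a simple name grep, so a renamed deployment
# shows up as "pod not found". A quick manual cross-check (illustrative):
#   kubectl --kubeconfig <config> get pods -A | grep -E 'snmp-exporter|idrac-redfish-exporter|proxmox-exporter'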

check_prometheus_storage() {
    if $DRY_RUN; then
        add_check "prometheus-storage" "ok" "dry-run: would check Prometheus storage usage"
        return
    fi

    local prom_pvc
    prom_pvc=$($KUBECTL get pvc -n "$MONITORING_NS" -o name 2>/dev/null | grep prometheus-server | head -1 || true)

    if [ -z "$prom_pvc" ]; then
        add_check "prometheus-storage" "warn" "No Prometheus server PVC found"
        return
    fi

    # Report filesystem usage from inside the Prometheus server pod
    local prom_pod
    prom_pod=$($KUBECTL get pods -n "$MONITORING_NS" -l app.kubernetes.io/name=prometheus,app.kubernetes.io/component=server -o name 2>/dev/null | head -1)
    if [ -z "$prom_pod" ]; then
        prom_pod=$($KUBECTL get pods -n "$MONITORING_NS" -o name 2>/dev/null | grep prometheus-server | head -1 || true)
    fi

    if [ -n "$prom_pod" ]; then
        local storage_info
        storage_info=$($KUBECTL exec "$prom_pod" -n "$MONITORING_NS" -c prometheus-server -- \
            df -h /data 2>/dev/null | tail -1 | awk '{printf "%s used of %s (%s)", $3, $2, $5}' || echo "unknown")
        add_check "prometheus-storage" "ok" "Prometheus storage: $storage_info"
    else
        add_check "prometheus-storage" "warn" "Could not check Prometheus storage"
    fi
}
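
# The df probe assumes the TSDB volume is mounted at /data inside the
# prometheus-server container (the usual prometheus-server chart layout); adjust
# the path if the volume is mounted elsewhere. The requested PVC size can also be
# read directly (illustrative):
#   kubectl --kubeconfig <config> -n monitoring get pvc <prometheus-server-pvc> -o jsonpath='{.spec.resources.requests.storage}'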

# Run checks
check_prometheus
check_alertmanager
check_grafana
check_snmp_exporters
check_prometheus_storage

# Determine overall status
overall="ok"
for c in "${checks[@]}"; do
    if echo "$c" | grep -q '"status": "fail"'; then
        overall="fail"
        break
    elif echo "$c" | grep -q '"status": "warn"'; then
        overall="warn"
    fi
done

# Output JSON
checks_json=$(IFS=,; echo "${checks[*]}")
cat <<EOF
{"status": "$overall", "agent": "$AGENT", "checks": [$checks_json]}
EOF