#!/usr/bin/env bash
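#
# Health checks for the in-cluster monitoring stack: Prometheus, Alertmanager,
# Grafana, the metrics exporters (SNMP, iDRAC Redfish, Proxmox), and Prometheus
# storage. Each check result is collected and the script prints a single JSON
# summary on stdout.
#
# Usage (illustrative; the script filename here is an assumption):
#   ./monitoring-health.sh             # run all checks against the cluster
#   ./monitoring-health.sh --dry-run   # emit placeholder results, no cluster calls
#   ./monitoring-health.sh | jq -r '.status'   # overall status: ok | warn | fail
#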
set -euo pipefail

AGENT="monitoring-health"
KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config"
MONITORING_NS="monitoring"
DRY_RUN=false

for arg in "$@"; do
    case "$arg" in
        --dry-run) DRY_RUN=true ;;
    esac
done

checks=()

add_check() {
    local name="$1" status="$2" message="$3"
    checks+=("{\"name\": \"$name\", \"status\": \"$status\", \"message\": \"$message\"}")
}
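
# Each entry in the checks array is a small JSON fragment, e.g. (illustrative values):
#   {"name": "prometheus", "status": "ok", "message": "Prometheus server healthy, targets: 42/43 up"}
# The JSON is assembled by plain string interpolation, so messages passed to
# add_check must not contain unescaped double quotes or backslashes.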

check_prometheus() {
    if $DRY_RUN; then
        add_check "prometheus" "ok" "dry-run: would check Prometheus server health"
        return
    fi

    # Discover Prometheus server pod via labels
    local prom_pod
    prom_pod=$($KUBECTL get pods -n "$MONITORING_NS" -l app.kubernetes.io/name=prometheus,app.kubernetes.io/component=server -o name 2>/dev/null | head -1)
    if [ -z "$prom_pod" ]; then
        prom_pod=$($KUBECTL get pods -n "$MONITORING_NS" -l app=prometheus,component=server -o name 2>/dev/null | head -1)
    fi
    if [ -z "$prom_pod" ]; then
        # grep may match nothing; || true keeps set -e/pipefail from aborting here
        prom_pod=$($KUBECTL get pods -n "$MONITORING_NS" -o name 2>/dev/null | grep prometheus-server | head -1 || true)
    fi

    if [ -z "$prom_pod" ]; then
        add_check "prometheus" "fail" "No Prometheus server pod found in $MONITORING_NS"
        return
    fi

    local phase
    phase=$($KUBECTL get "$prom_pod" -n "$MONITORING_NS" -o jsonpath='{.status.phase}' 2>/dev/null)
    if [ "$phase" != "Running" ]; then
        add_check "prometheus" "fail" "Prometheus server pod phase: $phase"
        return
    fi

    # Check Prometheus is responding
    local prom_healthy
    prom_healthy=$($KUBECTL exec "$prom_pod" -n "$MONITORING_NS" -c prometheus-server -- \
        wget -q -O- "http://localhost:9090/-/healthy" 2>/dev/null || echo "unhealthy")

    if echo "$prom_healthy" | grep -qi "ok\|healthy"; then
        # Check target scraping; the inner || true stops a failed in-pod wget from
        # tripping pipefail (the Python fallback already handles empty input)
        local targets_up
        targets_up=$({ $KUBECTL exec "$prom_pod" -n "$MONITORING_NS" -c prometheus-server -- \
            wget -q -O- "http://localhost:9090/api/v1/targets" 2>/dev/null || true; } | \
            python3 -c "
import sys, json
try:
    data = json.load(sys.stdin)
    active = data.get('data',{}).get('activeTargets',[])
    up = sum(1 for t in active if t.get('health') == 'up')
    total = len(active)
    print(f'{up}/{total}')
except: print('unknown')
" 2>/dev/null || echo "unknown")
        add_check "prometheus" "ok" "Prometheus server healthy, targets: $targets_up up"
    else
        add_check "prometheus" "warn" "Prometheus server running but health check unclear"
    fi
}
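
# Manual spot-check (illustrative): the same endpoints the probe above hits can
# be reached from a workstation with a port-forward to the server pod:
#   kubectl --kubeconfig <config> -n monitoring port-forward <prometheus-server-pod> 9090:9090
#   curl -s http://localhost:9090/-/healthy
#   curl -s http://localhost:9090/api/v1/targets | jq '.data.activeTargets[].health'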

check_alertmanager() {
    if $DRY_RUN; then
        add_check "alertmanager" "ok" "dry-run: would check Alertmanager health"
        return
    fi

    # Discover Alertmanager pod
    local am_pod
    am_pod=$($KUBECTL get pods -n "$MONITORING_NS" -l app.kubernetes.io/name=alertmanager -o name 2>/dev/null | head -1)
    if [ -z "$am_pod" ]; then
        am_pod=$($KUBECTL get pods -n "$MONITORING_NS" -o name 2>/dev/null | grep alertmanager | head -1 || true)
    fi

    if [ -z "$am_pod" ]; then
        add_check "alertmanager" "fail" "No Alertmanager pod found in $MONITORING_NS"
        return
    fi

    local phase
    phase=$($KUBECTL get "$am_pod" -n "$MONITORING_NS" -o jsonpath='{.status.phase}' 2>/dev/null)
    if [ "$phase" != "Running" ]; then
        add_check "alertmanager" "fail" "Alertmanager pod phase: $phase"
        return
    fi

    # Check firing alerts
    local alert_info
    alert_info=$({ $KUBECTL exec "$am_pod" -n "$MONITORING_NS" -- \
        wget -q -O- "http://localhost:9093/api/v2/alerts?active=true" 2>/dev/null || true; } | \
        python3 -c "
import sys, json
try:
    alerts = json.load(sys.stdin)
    firing = [a for a in alerts if a.get('status',{}).get('state') == 'active']
    print(len(firing))
except: print('unknown')
" 2>/dev/null || echo "unknown")

    # Check silences
    local silence_count
    silence_count=$({ $KUBECTL exec "$am_pod" -n "$MONITORING_NS" -- \
        wget -q -O- "http://localhost:9093/api/v2/silences" 2>/dev/null || true; } | \
        python3 -c "
import sys, json
try:
    silences = json.load(sys.stdin)
    active = [s for s in silences if s.get('status',{}).get('state') == 'active']
    print(len(active))
except: print('0')
" 2>/dev/null || echo "0")

    if [ "$alert_info" = "unknown" ]; then
        add_check "alertmanager" "warn" "Alertmanager running but could not query alerts"
    else
        local status="ok"
        [ "$alert_info" -gt 0 ] 2>/dev/null && status="warn"
        add_check "alertmanager" "$status" "Alertmanager healthy: $alert_info firing alerts, $silence_count active silences"
    fi
}
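
# The two parsers above only rely on the v2 API returning JSON arrays whose
# elements carry .status.state; a minimal illustration of the shape they expect
# (values are made up):
#   /api/v2/alerts    -> [{"labels": {"alertname": "HostDown"}, "status": {"state": "active"}}]
#   /api/v2/silences  -> [{"id": "...", "status": {"state": "expired"}}]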

check_grafana() {
    if $DRY_RUN; then
        add_check "grafana" "ok" "dry-run: would check Grafana health"
        return
    fi

    # Discover Grafana pod
    local grafana_pod
    grafana_pod=$($KUBECTL get pods -n "$MONITORING_NS" -l app.kubernetes.io/name=grafana -o name 2>/dev/null | head -1)
    if [ -z "$grafana_pod" ]; then
        grafana_pod=$($KUBECTL get pods -n "$MONITORING_NS" -o name 2>/dev/null | grep grafana | grep -v test | head -1 || true)
    fi

    if [ -z "$grafana_pod" ]; then
        add_check "grafana" "fail" "No Grafana pod found in $MONITORING_NS"
        return
    fi

    local phase
    phase=$($KUBECTL get "$grafana_pod" -n "$MONITORING_NS" -o jsonpath='{.status.phase}' 2>/dev/null)
    if [ "$phase" != "Running" ]; then
        add_check "grafana" "fail" "Grafana pod phase: $phase"
        return
    fi

    # Check datasource connectivity
    local ds_info
    ds_info=$({ $KUBECTL exec "$grafana_pod" -n "$MONITORING_NS" -- \
        curl -sf "http://localhost:3000/api/datasources" 2>/dev/null || true; } | \
        python3 -c "
import sys, json
try:
    ds = json.load(sys.stdin)
    names = [d.get('name','?') for d in ds]
    print(f'{len(ds)} datasources: {\", \".join(names)}')
except: print('unknown')
" 2>/dev/null || echo "unknown")

    if [ "$ds_info" = "unknown" ]; then
        add_check "grafana" "warn" "Grafana running but could not query datasources (may need auth)"
    else
        add_check "grafana" "ok" "Grafana healthy, $ds_info"
    fi
}
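
# Grafana's /api/datasources endpoint normally requires an authenticated admin
# session, so the "may need auth" warning is the expected outcome unless
# anonymous admin access is enabled. An authenticated variant for manual use
# (credentials are an assumption, not something this script manages):
#   curl -sf -u "admin:<password>" http://localhost:3000/api/datasources | jq '.[].name'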

check_snmp_exporters() {
    if $DRY_RUN; then
        add_check "snmp-exporters" "ok" "dry-run: would check SNMP exporter pods"
        return
    fi

    local exporters=("snmp-exporter" "idrac-redfish-exporter" "proxmox-exporter")
    local running=0 total=0

    for exporter in "${exporters[@]}"; do
        total=$((total + 1))
        local pod
        pod=$($KUBECTL get pods -n "$MONITORING_NS" -o name 2>/dev/null | grep "$exporter" | head -1 || true)

        if [ -z "$pod" ]; then
            # Try all namespaces
            pod=$($KUBECTL get pods --all-namespaces -o custom-columns=NS:.metadata.namespace,NAME:.metadata.name --no-headers 2>/dev/null | \
                grep "$exporter" | head -1 || true)
            if [ -z "$pod" ]; then
                add_check "exporter-$exporter" "warn" "$exporter pod not found"
                continue
            fi
            local ns
            ns=$(echo "$pod" | awk '{print $1}')
            local name
            name=$(echo "$pod" | awk '{print $2}')
            local phase
            phase=$($KUBECTL get pod "$name" -n "$ns" -o jsonpath='{.status.phase}' 2>/dev/null)
            if [ "$phase" = "Running" ]; then
                running=$((running + 1))
                add_check "exporter-$exporter" "ok" "$exporter running in $ns"
            else
                add_check "exporter-$exporter" "warn" "$exporter phase: $phase in $ns"
            fi
        else
            local phase
            phase=$($KUBECTL get "$pod" -n "$MONITORING_NS" -o jsonpath='{.status.phase}' 2>/dev/null)
            if [ "$phase" = "Running" ]; then
                running=$((running + 1))
                add_check "exporter-$exporter" "ok" "$exporter running"
            else
                add_check "exporter-$exporter" "warn" "$exporter phase: $phase"
            fi
        fi
    done

    # Summary across all exporters (mirrors the dry-run "snmp-exporters" check)
    local summary="ok"
    [ "$running" -lt "$total" ] && summary="warn"
    add_check "snmp-exporters" "$summary" "$running/$total exporter pods running"
}
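
# The exporter checks match pods by a simple name grep, so a renamed deployment
# shows up as "pod not found". A quick manual cross-check (illustrative):
#   kubectl --kubeconfig <config> get pods -A | grep -E 'snmp-exporter|idrac-redfish-exporter|proxmox-exporter'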

check_prometheus_storage() {
    if $DRY_RUN; then
        add_check "prometheus-storage" "ok" "dry-run: would check Prometheus storage usage"
        return
    fi

    local prom_pvc
    prom_pvc=$($KUBECTL get pvc -n "$MONITORING_NS" -o name 2>/dev/null | grep prometheus-server | head -1 || true)

    if [ -z "$prom_pvc" ]; then
        add_check "prometheus-storage" "warn" "No Prometheus server PVC found"
        return
    fi

    # Report filesystem usage from inside the Prometheus server pod
    local prom_pod
    prom_pod=$($KUBECTL get pods -n "$MONITORING_NS" -l app.kubernetes.io/name=prometheus,app.kubernetes.io/component=server -o name 2>/dev/null | head -1)
    if [ -z "$prom_pod" ]; then
        prom_pod=$($KUBECTL get pods -n "$MONITORING_NS" -o name 2>/dev/null | grep prometheus-server | head -1 || true)
    fi

    if [ -n "$prom_pod" ]; then
        local storage_info
        storage_info=$($KUBECTL exec "$prom_pod" -n "$MONITORING_NS" -c prometheus-server -- \
            df -h /data 2>/dev/null | tail -1 | awk '{printf "%s used of %s (%s)", $3, $2, $5}' || echo "unknown")
        add_check "prometheus-storage" "ok" "Prometheus storage: $storage_info"
    else
        add_check "prometheus-storage" "warn" "Could not check Prometheus storage"
    fi
}
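
# The df probe assumes the TSDB volume is mounted at /data inside the
# prometheus-server container (the usual prometheus-server chart layout); adjust
# the path if the volume is mounted elsewhere. The requested PVC size can also be
# read directly (illustrative):
#   kubectl --kubeconfig <config> -n monitoring get pvc <prometheus-server-pvc> -o jsonpath='{.spec.resources.requests.storage}'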

# Run checks
check_prometheus
check_alertmanager
check_grafana
check_snmp_exporters
check_prometheus_storage

# Determine overall status
overall="ok"
for c in "${checks[@]}"; do
    if echo "$c" | grep -q '"status": "fail"'; then
        overall="fail"
        break
    elif echo "$c" | grep -q '"status": "warn"'; then
        overall="warn"
    fi
done

# Output JSON
checks_json=$(IFS=,; echo "${checks[*]}")
cat <<EOF
{"status": "$overall", "agent": "$AGENT", "checks": [$checks_json]}
EOF