infra/.claude/scripts/db-health.sh
Viktor Barzin ff83ec3325 add infrastructure agent team: 8 specialized agents + 14 diagnostic scripts
Agents: devops-engineer, dba, security-engineer, sre, network-engineer,
platform-engineer, observability-engineer, home-automation-engineer.
Scripts: deploy-status, db-health, backup-verify, tls-check, crowdsec-status,
authentik-audit, oom-investigator, resource-report, dns-check, network-health,
nfs-health, truenas-status, platform-status, monitoring-health.
Also: known-issues.md suppression list, cluster-health-checker port-forward fix.
2026-03-15 02:01:07 +00:00

194 lines
6.7 KiB
Bash
Executable file

#!/usr/bin/env bash
# Database health probe. Each check appends a {name, status, message}
# object to the CHECKS JSON array.
set -euo pipefail

# kubectl command line pinned to the infra kubeconfig. It is stored as a
# single string and is meant to be expanded unquoted so it word-splits.
KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config"
AGENT="db-health"

# JSON array of check results; starts empty.
CHECKS="[]"

# --dry-run switches every check to a no-op placeholder result.
# Any other argument is ignored.
DRY_RUN=false
for arg; do
  if [[ "$arg" == "--dry-run" ]]; then
    DRY_RUN=true
  fi
done
#######################################
# Append one check result to the global CHECKS JSON array.
# Globals:   CHECKS (read, then replaced with the extended array)
# Arguments: $1 - check name
#            $2 - status string (callers pass ok / warn / fail)
#            $3 - human-readable message
# Requires:  python3 on PATH
#######################################
add_check() {
  local name="$1" status="$2" message="$3"
  # The values travel as argv instead of being pasted into the Python
  # source, so quotes, backslashes and newlines inside a message can
  # neither break the program nor be reinterpreted as escape sequences.
  CHECKS=$(printf '%s' "$CHECKS" | python3 -c '
import json
import sys

checks = json.load(sys.stdin)
name, status, message = sys.argv[1:4]
checks.append({"name": name, "status": status, "message": message})
json.dump(checks, sys.stdout)
' "$name" "$status" "$message")
}
# MySQL InnoDB Cluster - Group Replication status
#######################################
# Query performance_schema.replication_group_members through
# `kubectl exec` in the dbaas namespace and record one
# "mysql-group-replication" check:
#   ok   - every member is ONLINE and there are at least 3 members
#   fail - some member is not ONLINE, or the query could not be run
#   warn - all members are ONLINE but there are fewer than 3
# Globals:   KUBECTL, DRY_RUN (read); CHECKS (written via add_check)
# Arguments: none
#######################################
check_mysql_gr() {
  if [[ "$DRY_RUN" == "true" ]]; then
    add_check "mysql-group-replication" "ok" "DRY RUN: would check MySQL Group Replication status"
    return
  fi

  # Discover MySQL pod via labels first, fall back to known name.
  # $KUBECTL is intentionally unquoted: it holds a command plus flags.
  local mysql_pod
  mysql_pod=$($KUBECTL get pods -n dbaas -l app=mysql-cluster -o name 2>/dev/null | head -1) || true
  if [[ -z "$mysql_pod" ]]; then
    mysql_pod=$($KUBECTL get pods -n dbaas -l app.kubernetes.io/name=mysql -o name 2>/dev/null | head -1) || true
  fi
  if [[ -z "$mysql_pod" ]]; then
    mysql_pod="sts/mysql-cluster"
  fi

  local gr_status
  gr_status=$($KUBECTL exec "$mysql_pod" -n dbaas -- mysql -N -e \
    "SELECT MEMBER_HOST, MEMBER_STATE, MEMBER_ROLE FROM performance_schema.replication_group_members" 2>/dev/null) || {
    add_check "mysql-group-replication" "fail" "Cannot connect to MySQL cluster to check GR status"
    return
  }

  # grep -c exits 1 when it counts zero lines; "|| true" keeps the "0".
  local member_count online_count
  member_count=$(printf '%s\n' "$gr_status" | grep -c . || true)
  online_count=$(printf '%s\n' "$gr_status" | grep -c "ONLINE" || true)

  # One-line summary: tabs become spaces and rows are joined with "; ".
  # (tr cannot do this: it maps a newline to a single character only.)
  local members
  members=$(printf '%s\n' "$gr_status" \
    | awk 'NF { gsub(/\t/, " "); printf "%s%s", sep, $0; sep = "; " } END { print "" }')

  if [[ "$online_count" -eq "$member_count" && "$member_count" -ge 3 ]]; then
    add_check "mysql-group-replication" "ok" "All $member_count members ONLINE: $members"
  elif [[ "$online_count" -lt "$member_count" ]]; then
    add_check "mysql-group-replication" "fail" "Only $online_count/$member_count members ONLINE: $members"
  else
    add_check "mysql-group-replication" "warn" "Cluster has $member_count members (expected 3): $members"
  fi
}
# MySQL pod health
#######################################
# List MySQL pods in the dbaas namespace and record one "mysql-pods"
# check:
#   ok   - at least one pod is Running and every listed pod is Running
#          or Completed
#   fail - some pod is in another state, or no pod is Running
#   warn - no MySQL pods could be found at all
# Globals:   KUBECTL, DRY_RUN (read); CHECKS (written via add_check)
# Arguments: none
#######################################
check_mysql_pods() {
  if [[ "$DRY_RUN" == "true" ]]; then
    add_check "mysql-pods" "ok" "DRY RUN: would check MySQL pod status"
    return
  fi

  # A label selector that matches nothing still exits 0 with empty
  # output, so the fallback is keyed on emptiness, not on exit status.
  # $KUBECTL is intentionally unquoted: it holds a command plus flags.
  local pod_status
  pod_status=$($KUBECTL get pods -n dbaas -l app=mysql-cluster -o wide --no-headers 2>/dev/null) || pod_status=""
  if [[ -z "$pod_status" ]]; then
    pod_status=$($KUBECTL get pods -n dbaas --no-headers 2>/dev/null | grep -i mysql) || pod_status=""
  fi
  if [[ -z "$pod_status" ]]; then
    add_check "mysql-pods" "warn" "Cannot find MySQL pods in dbaas namespace"
    return
  fi

  local not_running
  not_running=$(printf '%s\n' "$pod_status" | grep -v "Running" | grep -v "Completed" || true)
  if [[ -n "$not_running" ]]; then
    # "name: STATUS" pairs joined with "; " (column 3 is the pod STATUS).
    local unhealthy
    unhealthy=$(printf '%s\n' "$not_running" \
      | awk '{ printf "%s%s: %s", sep, $1, $3; sep = "; " } END { print "" }')
    add_check "mysql-pods" "fail" "Unhealthy MySQL pods: $unhealthy"
    return
  fi

  # grep -c exits 1 when it counts zero lines; "|| true" keeps the "0".
  local count
  count=$(printf '%s\n' "$pod_status" | grep -c "Running" || true)
  if [[ "$count" -eq 0 ]]; then
    # Only Completed pods were listed: nothing is actually serving.
    add_check "mysql-pods" "fail" "No MySQL pods in Running state in dbaas namespace"
  else
    add_check "mysql-pods" "ok" "$count MySQL pod(s) running in dbaas namespace"
  fi
}
# CNPG PostgreSQL cluster health
#######################################
# Fetch every cluster.postgresql.cnpg.io object as JSON and record one
# "cnpg-clusters" check:
#   ok   - every cluster has a healthy phase and readyInstances is not
#          below spec.instances
#   fail - some cluster is not healthy, or the JSON could not be parsed
#   warn - the kubectl query failed or returned no clusters
# Globals:   KUBECTL, DRY_RUN (read); CHECKS (written via add_check)
# Arguments: none
# Requires:  python3 on PATH
#######################################
check_cnpg() {
  if [[ "$DRY_RUN" == "true" ]]; then
    add_check "cnpg-clusters" "ok" "DRY RUN: would check CNPG PostgreSQL cluster health"
    return
  fi

  # Check if CNPG CRDs exist.
  # $KUBECTL is intentionally unquoted: it holds a command plus flags.
  local cnpg_clusters
  cnpg_clusters=$($KUBECTL get cluster.postgresql.cnpg.io --all-namespaces -o json 2>/dev/null) || {
    add_check "cnpg-clusters" "warn" "CNPG CRD not found or no clusters deployed"
    return
  }

  # The Python helper prints a verdict on line 1 (HEALTHY, UNHEALTHY or
  # EMPTY) and, unless EMPTY, a "; "-joined per-cluster summary on line 2.
  local report
  if ! report=$(printf '%s' "$cnpg_clusters" | python3 -c '
import json
import sys

items = json.load(sys.stdin).get("items", [])
if not items:
    print("EMPTY")
    sys.exit(0)

healthy_phases = ("Cluster in healthy state", "Healthy")
all_healthy = True
results = []
for cluster in items:
    ns = cluster["metadata"]["namespace"]
    name = cluster["metadata"]["name"]
    status = cluster.get("status", {})
    phase = status.get("phase", "unknown")
    # "or 0" also covers fields that are present but null.
    ready = status.get("readyInstances") or 0
    instances = cluster.get("spec", {}).get("instances") or 0
    primary = status.get("currentPrimary", "unknown")
    if phase not in healthy_phases or ready < instances:
        all_healthy = False
    results.append(f"{ns}/{name}: phase={phase} ready={ready}/{instances} primary={primary}")

print("HEALTHY" if all_healthy else "UNHEALTHY")
print("; ".join(results))
' 2>/dev/null); then
    add_check "cnpg-clusters" "fail" "Failed to parse CNPG status"
    return
  fi

  local verdict detail
  verdict=${report%%$'\n'*}
  detail=${report#*$'\n'}
  case "$verdict" in
    HEALTHY)
      add_check "cnpg-clusters" "ok" "$detail"
      ;;
    EMPTY)
      # The query worked but listed nothing; report it the same way as a
      # missing CRD instead of a bare "HEALTHY".
      add_check "cnpg-clusters" "warn" "CNPG CRD not found or no clusters deployed"
      ;;
    *)
      add_check "cnpg-clusters" "fail" "$detail"
      ;;
  esac
}
# Database connection counts (MySQL)
#######################################
# Read Threads_connected and max_connections through `kubectl exec` in
# the dbaas namespace and record one "mysql-connections" check:
#   fail - usage above 80%
#   warn - usage above 60%, the query failed, or a value is missing,
#          non-numeric or (for max_connections) zero
#   ok   - otherwise
# Globals:   KUBECTL, DRY_RUN (read); CHECKS (written via add_check)
# Arguments: none
#######################################
check_mysql_connections() {
  if [[ "$DRY_RUN" == "true" ]]; then
    add_check "mysql-connections" "ok" "DRY RUN: would check MySQL connection counts"
    return
  fi

  # Same discovery order as check_mysql_gr: two label selectors, then
  # the known StatefulSet name.
  # $KUBECTL is intentionally unquoted: it holds a command plus flags.
  local mysql_pod
  mysql_pod=$($KUBECTL get pods -n dbaas -l app=mysql-cluster -o name 2>/dev/null | head -1) || true
  if [[ -z "$mysql_pod" ]]; then
    mysql_pod=$($KUBECTL get pods -n dbaas -l app.kubernetes.io/name=mysql -o name 2>/dev/null | head -1) || true
  fi
  if [[ -z "$mysql_pod" ]]; then
    mysql_pod="sts/mysql-cluster"
  fi

  local conn_info
  conn_info=$($KUBECTL exec "$mysql_pod" -n dbaas -- mysql -N -e \
    "SELECT 'threads_connected', VARIABLE_VALUE FROM performance_schema.global_status WHERE VARIABLE_NAME='Threads_connected' UNION ALL SELECT 'max_connections', VARIABLE_VALUE FROM performance_schema.global_variables WHERE VARIABLE_NAME='max_connections'" 2>/dev/null) || {
    add_check "mysql-connections" "warn" "Cannot query MySQL connection info"
    return
  }

  # Each row is "<label> <value>"; take the value of the first matching row.
  local threads_connected max_connections
  threads_connected=$(printf '%s\n' "$conn_info" \
    | awk '$1 == "threads_connected" && !seen++ { print $2 }')
  max_connections=$(printf '%s\n' "$conn_info" \
    | awk '$1 == "max_connections" && !seen++ { print $2 }')

  # Anything that is not a plain non-negative integer must never reach
  # the arithmetic below: an arithmetic error would abort the script.
  [[ "$threads_connected" =~ ^[0-9]+$ ]] || threads_connected="unknown"
  [[ "$max_connections" =~ ^[0-9]+$ ]] || max_connections="unknown"

  if [[ "$threads_connected" == "unknown" || "$max_connections" == "unknown" ]] \
    || ((10#$max_connections == 0)); then
    add_check "mysql-connections" "warn" "MySQL connections: threads=$threads_connected max=$max_connections"
    return
  fi

  # 10# forces base 10 so a value with leading zeros is not read as octal.
  local pct
  pct=$((10#$threads_connected * 100 / 10#$max_connections))
  if ((pct > 80)); then
    add_check "mysql-connections" "fail" "MySQL connections at ${pct}%: $threads_connected/$max_connections"
  elif ((pct > 60)); then
    add_check "mysql-connections" "warn" "MySQL connections at ${pct}%: $threads_connected/$max_connections"
  else
    add_check "mysql-connections" "ok" "MySQL connections: $threads_connected/$max_connections (${pct}%)"
  fi
}
# Execute every check in order; each one appends its result to CHECKS.
check_mysql_gr
check_mysql_pods
check_cnpg
check_mysql_connections

# The overall status is the worst one recorded: fail beats warn beats ok.
OVERALL=$(printf '%s\n' "$CHECKS" | python3 -c '
import json
import sys

seen = {check["status"] for check in json.load(sys.stdin)}
print(next((level for level in ("fail", "warn") if level in seen), "ok"))
')

# Assemble the final report and pretty-print it with json.tool.
printf '{"status": "%s", "agent": "%s", "checks": %s}\n' "$OVERALL" "$AGENT" "$CHECKS" \
  | python3 -m json.tool