fix: restore tree dropped by 6d224861; land stem95su gdrive-sync (10m) [ci skip]
6d224861 came from a --no-checkout worktree whose empty index made the
commit drop every file except two. This restores 05b50d2b's full tree and
correctly adds stacks/stem95su/gdrive-sync.tf + the service-catalog stem95su
entry. Forward-only (parent=6d224861, no force-push); [ci skip] since the
live infra was never applied from the broken commit.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
6d224861c4
commit
fd0f4a0365
1166 changed files with 358546 additions and 0 deletions
194
.claude/scripts/db-health.sh
Executable file
194
.claude/scripts/db-health.sh
Executable file
|
|
@ -0,0 +1,194 @@
|
|||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config"
|
||||
DRY_RUN=false
|
||||
AGENT="db-health"
|
||||
|
||||
for arg in "$@"; do
|
||||
case "$arg" in
|
||||
--dry-run) DRY_RUN=true ;;
|
||||
esac
|
||||
done
|
||||
|
||||
CHECKS="[]"
|
||||
|
||||
add_check() {
|
||||
local name="$1" status="$2" message="$3"
|
||||
CHECKS=$(echo "$CHECKS" | python3 -c "
|
||||
import sys, json
|
||||
checks = json.load(sys.stdin)
|
||||
checks.append({'name': '''$name''', 'status': '''$status''', 'message': '''$message'''})
|
||||
json.dump(checks, sys.stdout)
|
||||
")
|
||||
}
|
||||
|
||||
# MySQL InnoDB Cluster - Group Replication status
|
||||
check_mysql_gr() {
|
||||
if $DRY_RUN; then
|
||||
add_check "mysql-group-replication" "ok" "DRY RUN: would check MySQL Group Replication status"
|
||||
return
|
||||
fi
|
||||
|
||||
# Discover MySQL pod via labels first, fall back to known name
|
||||
local mysql_pod
|
||||
mysql_pod=$($KUBECTL get pods -n dbaas -l app=mysql-cluster -o name 2>/dev/null | head -1) || true
|
||||
if [ -z "$mysql_pod" ]; then
|
||||
mysql_pod=$($KUBECTL get pods -n dbaas -l app.kubernetes.io/name=mysql -o name 2>/dev/null | head -1) || true
|
||||
fi
|
||||
if [ -z "$mysql_pod" ]; then
|
||||
mysql_pod="sts/mysql-cluster"
|
||||
fi
|
||||
|
||||
local gr_status
|
||||
gr_status=$($KUBECTL exec "$mysql_pod" -n dbaas -- mysql -N -e \
|
||||
"SELECT MEMBER_HOST, MEMBER_STATE, MEMBER_ROLE FROM performance_schema.replication_group_members" 2>/dev/null) || {
|
||||
add_check "mysql-group-replication" "fail" "Cannot connect to MySQL cluster to check GR status"
|
||||
return
|
||||
}
|
||||
|
||||
local member_count online_count
|
||||
member_count=$(echo "$gr_status" | grep -c . || true)
|
||||
online_count=$(echo "$gr_status" | grep -c "ONLINE" || true)
|
||||
|
||||
if [ "$online_count" -eq "$member_count" ] && [ "$member_count" -ge 3 ]; then
|
||||
add_check "mysql-group-replication" "ok" "All $member_count members ONLINE: $(echo "$gr_status" | tr '\t' ' ' | tr '\n' '; ')"
|
||||
elif [ "$online_count" -lt "$member_count" ]; then
|
||||
add_check "mysql-group-replication" "fail" "Only $online_count/$member_count members ONLINE: $(echo "$gr_status" | tr '\t' ' ' | tr '\n' '; ')"
|
||||
else
|
||||
add_check "mysql-group-replication" "warn" "Cluster has $member_count members (expected 3): $(echo "$gr_status" | tr '\t' ' ' | tr '\n' '; ')"
|
||||
fi
|
||||
}
|
||||
|
||||
# MySQL pod health
|
||||
check_mysql_pods() {
|
||||
if $DRY_RUN; then
|
||||
add_check "mysql-pods" "ok" "DRY RUN: would check MySQL pod status"
|
||||
return
|
||||
fi
|
||||
|
||||
local pod_status
|
||||
pod_status=$($KUBECTL get pods -n dbaas -l app=mysql-cluster -o wide --no-headers 2>/dev/null) || \
|
||||
pod_status=$($KUBECTL get pods -n dbaas --no-headers 2>/dev/null | grep -i mysql) || {
|
||||
add_check "mysql-pods" "warn" "Cannot find MySQL pods in dbaas namespace"
|
||||
return
|
||||
}
|
||||
|
||||
local not_running
|
||||
not_running=$(echo "$pod_status" | grep -v "Running" | grep -v "Completed" || true)
|
||||
|
||||
if [ -z "$not_running" ]; then
|
||||
local count
|
||||
count=$(echo "$pod_status" | grep -c "Running" || true)
|
||||
add_check "mysql-pods" "ok" "$count MySQL pod(s) running in dbaas namespace"
|
||||
else
|
||||
add_check "mysql-pods" "fail" "Unhealthy MySQL pods: $(echo "$not_running" | awk '{print $1": "$3}' | tr '\n' '; ')"
|
||||
fi
|
||||
}
|
||||
|
||||
# CNPG PostgreSQL cluster health
|
||||
check_cnpg() {
|
||||
if $DRY_RUN; then
|
||||
add_check "cnpg-clusters" "ok" "DRY RUN: would check CNPG PostgreSQL cluster health"
|
||||
return
|
||||
fi
|
||||
|
||||
# Check if CNPG CRDs exist
|
||||
local cnpg_clusters
|
||||
cnpg_clusters=$($KUBECTL get cluster.postgresql.cnpg.io --all-namespaces -o json 2>/dev/null) || {
|
||||
add_check "cnpg-clusters" "warn" "CNPG CRD not found or no clusters deployed"
|
||||
return
|
||||
}
|
||||
|
||||
local report
|
||||
report=$(echo "$cnpg_clusters" | python3 -c "
|
||||
import sys, json
|
||||
data = json.load(sys.stdin)
|
||||
results = []
|
||||
all_healthy = True
|
||||
for cluster in data.get('items', []):
|
||||
ns = cluster['metadata']['namespace']
|
||||
name = cluster['metadata']['name']
|
||||
phase = cluster.get('status', {}).get('phase', 'unknown')
|
||||
ready = cluster.get('status', {}).get('readyInstances', 0)
|
||||
instances = cluster.get('spec', {}).get('instances', 0)
|
||||
primary = cluster.get('status', {}).get('currentPrimary', 'unknown')
|
||||
if phase != 'Cluster in healthy state' and phase != 'Healthy':
|
||||
all_healthy = False
|
||||
if ready < instances:
|
||||
all_healthy = False
|
||||
results.append(f'{ns}/{name}: phase={phase} ready={ready}/{instances} primary={primary}')
|
||||
print('HEALTHY' if all_healthy else 'UNHEALTHY')
|
||||
print('; '.join(results))
|
||||
" 2>/dev/null) || report="Failed to parse CNPG status"
|
||||
|
||||
local health_line
|
||||
health_line=$(echo "$report" | head -1)
|
||||
local detail_line
|
||||
detail_line=$(echo "$report" | tail -1)
|
||||
|
||||
if [ "$health_line" = "HEALTHY" ]; then
|
||||
add_check "cnpg-clusters" "ok" "$detail_line"
|
||||
else
|
||||
add_check "cnpg-clusters" "fail" "$detail_line"
|
||||
fi
|
||||
}
|
||||
|
||||
# Database connection counts (MySQL)
|
||||
check_mysql_connections() {
|
||||
if $DRY_RUN; then
|
||||
add_check "mysql-connections" "ok" "DRY RUN: would check MySQL connection counts"
|
||||
return
|
||||
fi
|
||||
|
||||
local mysql_pod
|
||||
mysql_pod=$($KUBECTL get pods -n dbaas -l app=mysql-cluster -o name 2>/dev/null | head -1) || true
|
||||
if [ -z "$mysql_pod" ]; then
|
||||
mysql_pod="sts/mysql-cluster"
|
||||
fi
|
||||
|
||||
local conn_info
|
||||
conn_info=$($KUBECTL exec "$mysql_pod" -n dbaas -- mysql -N -e \
|
||||
"SELECT 'threads_connected', VARIABLE_VALUE FROM performance_schema.global_status WHERE VARIABLE_NAME='Threads_connected' UNION ALL SELECT 'max_connections', VARIABLE_VALUE FROM performance_schema.global_variables WHERE VARIABLE_NAME='max_connections'" 2>/dev/null) || {
|
||||
add_check "mysql-connections" "warn" "Cannot query MySQL connection info"
|
||||
return
|
||||
}
|
||||
|
||||
local threads_connected max_connections
|
||||
threads_connected=$(echo "$conn_info" | grep threads_connected | awk '{print $2}') || threads_connected="unknown"
|
||||
max_connections=$(echo "$conn_info" | grep max_connections | awk '{print $2}') || max_connections="unknown"
|
||||
|
||||
if [ "$threads_connected" != "unknown" ] && [ "$max_connections" != "unknown" ]; then
|
||||
local pct=$((threads_connected * 100 / max_connections))
|
||||
if [ "$pct" -gt 80 ]; then
|
||||
add_check "mysql-connections" "fail" "MySQL connections at ${pct}%: $threads_connected/$max_connections"
|
||||
elif [ "$pct" -gt 60 ]; then
|
||||
add_check "mysql-connections" "warn" "MySQL connections at ${pct}%: $threads_connected/$max_connections"
|
||||
else
|
||||
add_check "mysql-connections" "ok" "MySQL connections: $threads_connected/$max_connections (${pct}%)"
|
||||
fi
|
||||
else
|
||||
add_check "mysql-connections" "warn" "MySQL connections: threads=$threads_connected max=$max_connections"
|
||||
fi
|
||||
}
|
||||
|
||||
# Run all checks
|
||||
check_mysql_gr
|
||||
check_mysql_pods
|
||||
check_cnpg
|
||||
check_mysql_connections
|
||||
|
||||
# Determine overall status
|
||||
OVERALL=$(echo "$CHECKS" | python3 -c "
|
||||
import sys, json
|
||||
checks = json.load(sys.stdin)
|
||||
statuses = [c['status'] for c in checks]
|
||||
if 'fail' in statuses:
|
||||
print('fail')
|
||||
elif 'warn' in statuses:
|
||||
print('warn')
|
||||
else:
|
||||
print('ok')
|
||||
")
|
||||
|
||||
echo "{\"status\": \"$OVERALL\", \"agent\": \"$AGENT\", \"checks\": $CHECKS}" | python3 -m json.tool
|
||||
Loading…
Add table
Add a link
Reference in a new issue