infra/.claude/scripts/deploy-status.sh
Viktor Barzin ff83ec3325 add infrastructure agent team: 8 specialized agents + 14 diagnostic scripts
Agents: devops-engineer, dba, security-engineer, sre, network-engineer,
platform-engineer, observability-engineer, home-automation-engineer.
Scripts: deploy-status, db-health, backup-verify, tls-check, crowdsec-status,
authentik-audit, oom-investigator, resource-report, dns-check, network-health,
nfs-health, truenas-status, platform-status, monitoring-health.
Also: known-issues.md suppression list, cluster-health-checker port-forward fix.
2026-03-15 02:01:07 +00:00

#!/usr/bin/env bash
set -euo pipefail
KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config"
DRY_RUN=false
AGENT="deploy-status"
for arg in "$@"; do
  case "$arg" in
    --dry-run) DRY_RUN=true ;;
  esac
done
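# Usage (only --dry-run is recognised; any other argument is ignored):
#   ./deploy-status.sh            # run all checks against the cluster
#   ./deploy-status.sh --dry-run  # emit placeholder results without calling kubectl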
CHECKS="[]"
add_check() {
  local name="$1" status="$2" message="$3"
  # Pass the values as argv rather than interpolating them into the Python
  # source, so quotes in a message cannot break (or inject into) the script.
  CHECKS=$(echo "$CHECKS" | python3 -c "
import sys, json
checks = json.load(sys.stdin)
checks.append({'name': sys.argv[1], 'status': sys.argv[2], 'message': sys.argv[3]})
json.dump(checks, sys.stdout)
" "$name" "$status" "$message")
}
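# For example, add_check "demo" "ok" "all good" grows CHECKS from [] to:
#   [{"name": "demo", "status": "ok", "message": "all good"}]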
# Check for stalled rollouts (Progressing=False, e.g. deadline exceeded) or
# deployments whose Available condition is False
check_stalled_rollouts() {
  if $DRY_RUN; then
    add_check "stalled-rollouts" "ok" "DRY RUN: would check for stalled deployment rollouts"
    return
  fi
  local stalled
  stalled=$($KUBECTL get deployments --all-namespaces -o json 2>/dev/null | python3 -c "
import sys, json
data = json.load(sys.stdin)
stalled = []
for dep in data.get('items', []):
    ns = dep['metadata']['namespace']
    name = dep['metadata']['name']
    conditions = dep.get('status', {}).get('conditions', [])
    for cond in conditions:
        if cond.get('type') == 'Progressing' and cond.get('status') == 'False':
            reason = cond.get('reason', 'unknown')
            stalled.append(f'{ns}/{name}: {reason}')
        elif cond.get('type') == 'Available' and cond.get('status') == 'False':
            reason = cond.get('reason', 'unknown')
            stalled.append(f'{ns}/{name}: unavailable ({reason})')
if stalled:
    print('; '.join(stalled))
else:
    print('')
" 2>/dev/null) || stalled="Failed to check deployments"
  if [ -z "$stalled" ]; then
    add_check "stalled-rollouts" "ok" "No stalled rollouts detected"
  else
    add_check "stalled-rollouts" "fail" "Stalled rollouts: $stalled"
  fi
}
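# A stalled rollout surfaces here as, e.g., "default/myapp: ProgressDeadlineExceeded"
# (namespace/name plus the Progressing condition's reason; names illustrative).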
# Check for unavailable replicas
check_unavailable_replicas() {
  if $DRY_RUN; then
    add_check "unavailable-replicas" "ok" "DRY RUN: would check for deployments with unavailable replicas"
    return
  fi
  local unavail
  unavail=$($KUBECTL get deployments --all-namespaces -o json 2>/dev/null | python3 -c "
import sys, json
data = json.load(sys.stdin)
issues = []
for dep in data.get('items', []):
    ns = dep['metadata']['namespace']
    name = dep['metadata']['name']
    spec_replicas = dep.get('spec', {}).get('replicas', 1)
    ready = dep.get('status', {}).get('readyReplicas', 0) or 0
    unavailable = dep.get('status', {}).get('unavailableReplicas', 0) or 0
    if unavailable > 0 or ready < spec_replicas:
        issues.append(f'{ns}/{name}: {ready}/{spec_replicas} ready, {unavailable} unavailable')
if issues:
    print('; '.join(issues))
else:
    print('')
" 2>/dev/null) || unavail="Failed to check replicas"
  if [ -z "$unavail" ]; then
    add_check "unavailable-replicas" "ok" "All deployments have desired replicas ready"
  else
    add_check "unavailable-replicas" "warn" "Unavailable replicas: $unavail"
  fi
}
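# Design note: unavailable replicas are only a "warn" because they are often
# transient during a normal rollout; stalled rollouts, image pull errors and
# CrashLoopBackOff pods below are hard "fail"s.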
# Check for image pull errors
check_image_pull_errors() {
  if $DRY_RUN; then
    add_check "image-pull-errors" "ok" "DRY RUN: would check for ImagePullBackOff/ErrImagePull pods"
    return
  fi
  local pull_errors
  pull_errors=$($KUBECTL get pods --all-namespaces -o json 2>/dev/null | python3 -c "
import sys, json
data = json.load(sys.stdin)
errors = []
for pod in data.get('items', []):
    ns = pod['metadata']['namespace']
    name = pod['metadata']['name']
    for cs in pod.get('status', {}).get('containerStatuses', []) + pod.get('status', {}).get('initContainerStatuses', []):
        waiting = cs.get('state', {}).get('waiting', {})
        reason = waiting.get('reason', '')
        if reason in ('ImagePullBackOff', 'ErrImagePull', 'InvalidImageName'):
            image = cs.get('image', 'unknown')
            msg = waiting.get('message', '')[:100]
            errors.append(f'{ns}/{name}: {reason} image={image} ({msg})')
if errors:
    print('; '.join(errors))
else:
    print('')
" 2>/dev/null) || pull_errors="Failed to check image pulls"
  if [ -z "$pull_errors" ]; then
    add_check "image-pull-errors" "ok" "No image pull errors found"
  else
    add_check "image-pull-errors" "fail" "Image pull errors: $pull_errors"
  fi
}
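# The waiting message is truncated to 100 characters above so a single bad
# image cannot flood the summary line.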
# Check for pods with high restart counts (restartCount >= 5; counts are
# cumulative over the container's lifetime, not windowed to the last hour)
check_recent_restarts() {
  if $DRY_RUN; then
    add_check "recent-restarts" "ok" "DRY RUN: would check for pods with high restart counts"
    return
  fi
  local restarts
  restarts=$($KUBECTL get pods --all-namespaces -o json 2>/dev/null | python3 -c "
import sys, json
data = json.load(sys.stdin)
high_restart = []
for pod in data.get('items', []):
    ns = pod['metadata']['namespace']
    name = pod['metadata']['name']
    for cs in pod.get('status', {}).get('containerStatuses', []):
        count = cs.get('restartCount', 0)
        if count >= 5:
            container = cs['name']
            high_restart.append(f'{ns}/{name}:{container} restarts={count}')
if high_restart:
    print('; '.join(sorted(high_restart, key=lambda x: int(x.split('=')[1]), reverse=True)[:20]))
else:
    print('')
" 2>/dev/null) || restarts="Failed to check restarts"
  if [ -z "$restarts" ]; then
    add_check "recent-restarts" "ok" "No pods with 5+ restarts"
  else
    add_check "recent-restarts" "warn" "High restart counts: $restarts"
  fi
}
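# Only the 20 entries with the most restarts are reported, sorted descending,
# to keep the summary bounded.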
# Check CrashLoopBackOff pods
check_crashloop() {
  if $DRY_RUN; then
    add_check "crashloop" "ok" "DRY RUN: would check for CrashLoopBackOff pods"
    return
  fi
  local crashloop
  crashloop=$($KUBECTL get pods --all-namespaces -o json 2>/dev/null | python3 -c "
import sys, json
data = json.load(sys.stdin)
crashes = []
for pod in data.get('items', []):
    ns = pod['metadata']['namespace']
    name = pod['metadata']['name']
    for cs in pod.get('status', {}).get('containerStatuses', []):
        waiting = cs.get('state', {}).get('waiting', {})
        if waiting.get('reason') == 'CrashLoopBackOff':
            container = cs['name']
            restarts = cs.get('restartCount', 0)
            crashes.append(f'{ns}/{name}:{container} restarts={restarts}')
if crashes:
    print('; '.join(crashes))
else:
    print('')
" 2>/dev/null) || crashloop="Failed to check crashloop"
  if [ -z "$crashloop" ]; then
    add_check "crashloop" "ok" "No CrashLoopBackOff pods"
  else
    add_check "crashloop" "fail" "CrashLoopBackOff: $crashloop"
  fi
}
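# CrashLoopBackOff pods usually also appear in recent-restarts above; this
# check flags them separately, and as "fail" rather than "warn", because they
# are actively failing rather than merely restart-prone.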
# Run all checks
check_stalled_rollouts
check_unavailable_replicas
check_image_pull_errors
check_recent_restarts
check_crashloop
# Determine overall status
OVERALL=$(echo "$CHECKS" | python3 -c "
import sys, json
checks = json.load(sys.stdin)
statuses = [c['status'] for c in checks]
if 'fail' in statuses:
    print('fail')
elif 'warn' in statuses:
    print('warn')
else:
    print('ok')
")
echo "{\"status\": \"$OVERALL\", \"agent\": \"$AGENT\", \"checks\": $CHECKS}" | python3 -m json.tool