Agents: devops-engineer, dba, security-engineer, sre, network-engineer, platform-engineer, observability-engineer, home-automation-engineer. Scripts: deploy-status, db-health, backup-verify, tls-check, crowdsec-status, authentik-audit, oom-investigator, resource-report, dns-check, network-health, nfs-health, truenas-status, platform-status, monitoring-health. Also: known-issues.md suppression list, cluster-health-checker port-forward fix.
#!/usr/bin/env bash
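#
# deploy-status: aggregate Kubernetes deployment health into a single JSON report.
# Checks: stalled rollouts, unavailable replicas, image pull errors, high restart
# counts, and CrashLoopBackOff pods. Output shape:
#   {"status": "ok|warn|fail", "agent": "deploy-status", "checks": [...]}
# Pass --dry-run to skip kubectl calls and only report what each check would do.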
set -euo pipefail

KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config"
DRY_RUN=false
AGENT="deploy-status"

for arg in "$@"; do
  case "$arg" in
    --dry-run) DRY_RUN=true ;;
  esac
done

CHECKS="[]"

add_check() {
  local name="$1" status="$2" message="$3"
  CHECKS=$(echo "$CHECKS" | python3 -c "
import sys, json
checks = json.load(sys.stdin)
checks.append({'name': '''$name''', 'status': '''$status''', 'message': '''$message'''})
json.dump(checks, sys.stdout)
")
}
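
# Illustrative usage: add_check "my-check" "warn" "something looks off"
# appends {"name": "my-check", "status": "warn", "message": "..."} to CHECKS.
# Values are spliced into the embedded Python as '''...''' literals, so keep
# them to plain text without runs of quotes.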

# Check for stalled rollouts (Progressing=False, e.g. deadline exceeded, or Available=False)
check_stalled_rollouts() {
  if $DRY_RUN; then
    add_check "stalled-rollouts" "ok" "DRY RUN: would check for stalled deployment rollouts"
    return
  fi

  local stalled
  stalled=$($KUBECTL get deployments --all-namespaces -o json 2>/dev/null | python3 -c "
import sys, json
data = json.load(sys.stdin)
stalled = []
for dep in data.get('items', []):
    ns = dep['metadata']['namespace']
    name = dep['metadata']['name']
    conditions = dep.get('status', {}).get('conditions', [])
    for cond in conditions:
        if cond.get('type') == 'Progressing' and cond.get('status') == 'False':
            reason = cond.get('reason', 'unknown')
            stalled.append(f'{ns}/{name}: {reason}')
        elif cond.get('type') == 'Available' and cond.get('status') == 'False':
            reason = cond.get('reason', 'unknown')
            stalled.append(f'{ns}/{name}: unavailable ({reason})')
if stalled:
    print('; '.join(stalled))
else:
    print('')
" 2>/dev/null) || stalled="Failed to check deployments"

  if [ -z "$stalled" ]; then
    add_check "stalled-rollouts" "ok" "No stalled rollouts detected"
  else
    add_check "stalled-rollouts" "fail" "Stalled rollouts: $stalled"
  fi
}
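
# The remaining checks follow the same pattern: short-circuit under --dry-run,
# pipe cluster state from kubectl into a small python3 filter, and report the
# result via add_check as ok, warn, or fail.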

# Check for unavailable replicas
check_unavailable_replicas() {
  if $DRY_RUN; then
    add_check "unavailable-replicas" "ok" "DRY RUN: would check for deployments with unavailable replicas"
    return
  fi

  local unavail
  unavail=$($KUBECTL get deployments --all-namespaces -o json 2>/dev/null | python3 -c "
import sys, json
data = json.load(sys.stdin)
issues = []
for dep in data.get('items', []):
    ns = dep['metadata']['namespace']
    name = dep['metadata']['name']
    spec_replicas = dep.get('spec', {}).get('replicas', 1)
    ready = dep.get('status', {}).get('readyReplicas', 0) or 0
    unavailable = dep.get('status', {}).get('unavailableReplicas', 0) or 0
    if unavailable > 0 or ready < spec_replicas:
        issues.append(f'{ns}/{name}: {ready}/{spec_replicas} ready, {unavailable} unavailable')
if issues:
    print('; '.join(issues))
else:
    print('')
" 2>/dev/null) || unavail="Failed to check replicas"

  if [ -z "$unavail" ]; then
    add_check "unavailable-replicas" "ok" "All deployments have desired replicas ready"
  else
    add_check "unavailable-replicas" "warn" "Unavailable replicas: $unavail"
  fi
}

# Check for image pull errors
check_image_pull_errors() {
  if $DRY_RUN; then
    add_check "image-pull-errors" "ok" "DRY RUN: would check for ImagePullBackOff/ErrImagePull pods"
    return
  fi

  local pull_errors
  pull_errors=$($KUBECTL get pods --all-namespaces -o json 2>/dev/null | python3 -c "
import sys, json
data = json.load(sys.stdin)
errors = []
for pod in data.get('items', []):
    ns = pod['metadata']['namespace']
    name = pod['metadata']['name']
    for cs in pod.get('status', {}).get('containerStatuses', []) + pod.get('status', {}).get('initContainerStatuses', []):
        waiting = cs.get('state', {}).get('waiting', {})
        reason = waiting.get('reason', '')
        if reason in ('ImagePullBackOff', 'ErrImagePull', 'InvalidImageName'):
            image = cs.get('image', 'unknown')
            msg = waiting.get('message', '')[:100]
            errors.append(f'{ns}/{name}: {reason} image={image} ({msg})')
if errors:
    print('; '.join(errors))
else:
    print('')
" 2>/dev/null) || pull_errors="Failed to check image pulls"

  if [ -z "$pull_errors" ]; then
    add_check "image-pull-errors" "ok" "No image pull errors found"
  else
    add_check "image-pull-errors" "fail" "Image pull errors: $pull_errors"
  fi
}

# Check for pods with high restart counts (5 or more total restarts)
check_recent_restarts() {
  if $DRY_RUN; then
    add_check "recent-restarts" "ok" "DRY RUN: would check for pods with high restart counts"
    return
  fi

  local restarts
  restarts=$($KUBECTL get pods --all-namespaces -o json 2>/dev/null | python3 -c "
import sys, json
data = json.load(sys.stdin)
high_restart = []
for pod in data.get('items', []):
    ns = pod['metadata']['namespace']
    name = pod['metadata']['name']
    for cs in pod.get('status', {}).get('containerStatuses', []):
        count = cs.get('restartCount', 0)
        if count >= 5:
            container = cs['name']
            high_restart.append(f'{ns}/{name}:{container} restarts={count}')
if high_restart:
    print('; '.join(sorted(high_restart, key=lambda x: int(x.split('=')[1]), reverse=True)[:20]))
else:
    print('')
" 2>/dev/null) || restarts="Failed to check restarts"

  if [ -z "$restarts" ]; then
    add_check "recent-restarts" "ok" "No pods with 5+ restarts"
  else
    add_check "recent-restarts" "warn" "High restart counts: $restarts"
  fi
}

# Check CrashLoopBackOff pods
check_crashloop() {
  if $DRY_RUN; then
    add_check "crashloop" "ok" "DRY RUN: would check for CrashLoopBackOff pods"
    return
  fi

  local crashloop
  crashloop=$($KUBECTL get pods --all-namespaces -o json 2>/dev/null | python3 -c "
import sys, json
data = json.load(sys.stdin)
crashes = []
for pod in data.get('items', []):
    ns = pod['metadata']['namespace']
    name = pod['metadata']['name']
    for cs in pod.get('status', {}).get('containerStatuses', []):
        waiting = cs.get('state', {}).get('waiting', {})
        if waiting.get('reason') == 'CrashLoopBackOff':
            container = cs['name']
            restarts = cs.get('restartCount', 0)
            crashes.append(f'{ns}/{name}:{container} restarts={restarts}')
if crashes:
    print('; '.join(crashes))
else:
    print('')
" 2>/dev/null) || crashloop="Failed to check crashloop"

  if [ -z "$crashloop" ]; then
    add_check "crashloop" "ok" "No CrashLoopBackOff pods"
  else
    add_check "crashloop" "fail" "CrashLoopBackOff: $crashloop"
  fi
}

# Run all checks
check_stalled_rollouts
check_unavailable_replicas
check_image_pull_errors
check_recent_restarts
check_crashloop

# Determine overall status
OVERALL=$(echo "$CHECKS" | python3 -c "
import sys, json
checks = json.load(sys.stdin)
statuses = [c['status'] for c in checks]
if 'fail' in statuses:
    print('fail')
elif 'warn' in statuses:
    print('warn')
else:
    print('ok')
")

echo "{\"status\": \"$OVERALL\", \"agent\": \"$AGENT\", \"checks\": $CHECKS}" | python3 -m json.tool
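
# Example invocation (path and pretty-printed output below are illustrative, truncated):
#   ./deploy-status --dry-run
#   {
#       "status": "ok",
#       "agent": "deploy-status",
#       "checks": [
#           {
#               "name": "stalled-rollouts",
#               "status": "ok",
#               "message": "DRY RUN: would check for stalled deployment rollouts"
#           },
#           ...
#       ]
#   }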