add infrastructure agent team: 8 specialized agents + 14 diagnostic scripts
Agents: devops-engineer, dba, security-engineer, sre, network-engineer, platform-engineer, observability-engineer, home-automation-engineer. Scripts: deploy-status, db-health, backup-verify, tls-check, crowdsec-status, authentik-audit, oom-investigator, resource-report, dns-check, network-health, nfs-health, truenas-status, platform-status, monitoring-health. Also: known-issues.md suppression list, cluster-health-checker port-forward fix.
This commit is contained in:
parent
fca4e02c54
commit
ff83ec3325
24 changed files with 3153 additions and 1 deletions
217
.claude/scripts/deploy-status.sh
Executable file
217
.claude/scripts/deploy-status.sh
Executable file
|
|
@ -0,0 +1,217 @@
|
|||
#!/usr/bin/env bash
#
# deploy-status: report Kubernetes deployment health as a JSON document.
#
# Usage: deploy-status.sh [--dry-run]
#   --dry-run  Record what each check would do without querying the cluster.
set -euo pipefail

# Command prefix used for every cluster query. It is expanded unquoted by the
# checks on purpose, so the embedded flags are split into separate words.
# An exported KUBECTL overrides the default, which points at the original
# author's kubeconfig and does not exist on other machines.
KUBECTL="${KUBECTL:-kubectl --kubeconfig /Users/viktorbarzin/code/infra/config}"
DRY_RUN=false
AGENT="deploy-status"

for arg in "$@"; do
  case "$arg" in
    --dry-run) DRY_RUN=true ;;
    *) ;; # Unrecognised arguments are ignored.
  esac
done

# JSON array of check results; add_check appends one object per check.
CHECKS="[]"

#######################################
# Append one check result to the CHECKS JSON array.
#
# The values are handed to Python as command-line arguments rather than being
# spliced into the Python source, so quotes, backslashes and newlines in a
# message are stored verbatim instead of breaking (or being interpreted by)
# the generated program.
#
# Globals:   CHECKS (read and rewritten; must hold a JSON array)
# Arguments: $1 - check name
#            $2 - status string (the callers use ok / warn / fail)
#            $3 - human-readable message
# Returns:   non-zero if python3 fails, e.g. when CHECKS is not valid JSON
#######################################
add_check() {
  local name="$1" status="$2" message="$3"
  CHECKS=$(printf '%s' "$CHECKS" | python3 -c '
import json
import sys

checks = json.load(sys.stdin)
checks.append({"name": sys.argv[1], "status": sys.argv[2], "message": sys.argv[3]})
json.dump(checks, sys.stdout)
' "$name" "$status" "$message")
}

#######################################
# Check for stalled rollouts.
#
# A deployment is reported when its Progressing condition or its Available
# condition has status "False"; the condition's reason is included.
#
# Globals:   DRY_RUN (read), KUBECTL (read), CHECKS (updated via add_check)
# Arguments: none
# Outputs:   records one "stalled-rollouts" result through add_check:
#            ok when nothing is reported, fail otherwise (including when the
#            cluster query itself fails)
#######################################
check_stalled_rollouts() {
  if [[ "$DRY_RUN" == true ]]; then
    add_check "stalled-rollouts" "ok" "DRY RUN: would check for stalled deployment rollouts"
    return
  fi

  local stalled
  # $KUBECTL is intentionally unquoted: it holds the command plus its flags.
  # A failed query is reported on its own instead of being labelled as a
  # stalled rollout.
  # shellcheck disable=SC2086
  if ! stalled=$($KUBECTL get deployments --all-namespaces -o json 2>/dev/null | python3 -c '
import json
import sys

findings = []
for dep in json.load(sys.stdin).get("items", []):
    ns = dep["metadata"]["namespace"]
    name = dep["metadata"]["name"]
    for cond in dep.get("status", {}).get("conditions") or []:
        if cond.get("status") != "False":
            continue
        reason = cond.get("reason", "unknown")
        if cond.get("type") == "Progressing":
            findings.append(f"{ns}/{name}: {reason}")
        elif cond.get("type") == "Available":
            findings.append(f"{ns}/{name}: unavailable ({reason})")
print("; ".join(findings))
' 2>/dev/null); then
    add_check "stalled-rollouts" "fail" "Failed to check deployments"
    return
  fi

  if [[ -z "$stalled" ]]; then
    add_check "stalled-rollouts" "ok" "No stalled rollouts detected"
  else
    add_check "stalled-rollouts" "fail" "Stalled rollouts: $stalled"
  fi
}

#######################################
# Check for unavailable replicas.
#
# A deployment is reported when status.unavailableReplicas is above zero or
# status.readyReplicas is below spec.replicas (treated as 1 when absent).
#
# Globals:   DRY_RUN (read), KUBECTL (read), CHECKS (updated via add_check)
# Arguments: none
# Outputs:   records one "unavailable-replicas" result through add_check:
#            ok when nothing is reported, warn otherwise (including when the
#            cluster query itself fails)
#######################################
check_unavailable_replicas() {
  if [[ "$DRY_RUN" == true ]]; then
    add_check "unavailable-replicas" "ok" "DRY RUN: would check for deployments with unavailable replicas"
    return
  fi

  local unavail
  # $KUBECTL is intentionally unquoted: it holds the command plus its flags.
  # A failed query is reported on its own instead of being labelled as a
  # replica shortfall.
  # shellcheck disable=SC2086
  if ! unavail=$($KUBECTL get deployments --all-namespaces -o json 2>/dev/null | python3 -c '
import json
import sys

issues = []
for dep in json.load(sys.stdin).get("items", []):
    ns = dep["metadata"]["namespace"]
    name = dep["metadata"]["name"]
    status = dep.get("status", {})
    desired = dep.get("spec", {}).get("replicas", 1)
    ready = status.get("readyReplicas", 0) or 0
    unavailable = status.get("unavailableReplicas", 0) or 0
    if unavailable > 0 or ready < desired:
        issues.append(f"{ns}/{name}: {ready}/{desired} ready, {unavailable} unavailable")
print("; ".join(issues))
' 2>/dev/null); then
    add_check "unavailable-replicas" "warn" "Failed to check replicas"
    return
  fi

  if [[ -z "$unavail" ]]; then
    add_check "unavailable-replicas" "ok" "All deployments have desired replicas ready"
  else
    add_check "unavailable-replicas" "warn" "Unavailable replicas: $unavail"
  fi
}

#######################################
# Check for image pull errors.
#
# Scans the regular and init container statuses of every pod and reports
# containers waiting with reason ImagePullBackOff, ErrImagePull or
# InvalidImageName, together with the image and the first 100 characters of
# the waiting message.
#
# Globals:   DRY_RUN (read), KUBECTL (read), CHECKS (updated via add_check)
# Arguments: none
# Outputs:   records one "image-pull-errors" result through add_check:
#            ok when nothing is reported, fail otherwise (including when the
#            cluster query itself fails)
#######################################
check_image_pull_errors() {
  if [[ "$DRY_RUN" == true ]]; then
    add_check "image-pull-errors" "ok" "DRY RUN: would check for ImagePullBackOff/ErrImagePull pods"
    return
  fi

  local pull_errors
  # $KUBECTL is intentionally unquoted: it holds the command plus its flags.
  # A failed query is reported on its own instead of being labelled as an
  # image pull error.
  # shellcheck disable=SC2086
  if ! pull_errors=$($KUBECTL get pods --all-namespaces -o json 2>/dev/null | python3 -c '
import json
import sys

PULL_FAILURES = ("ImagePullBackOff", "ErrImagePull", "InvalidImageName")

errors = []
for pod in json.load(sys.stdin).get("items", []):
    ns = pod["metadata"]["namespace"]
    name = pod["metadata"]["name"]
    status = pod.get("status", {})
    containers = (status.get("containerStatuses") or []) + (status.get("initContainerStatuses") or [])
    for cs in containers:
        waiting = cs.get("state", {}).get("waiting") or {}
        reason = waiting.get("reason", "")
        if reason in PULL_FAILURES:
            image = cs.get("image", "unknown")
            msg = (waiting.get("message") or "")[:100]
            errors.append(f"{ns}/{name}: {reason} image={image} ({msg})")
print("; ".join(errors))
' 2>/dev/null); then
    add_check "image-pull-errors" "fail" "Failed to check image pulls"
    return
  fi

  if [[ -z "$pull_errors" ]]; then
    add_check "image-pull-errors" "ok" "No image pull errors found"
  else
    add_check "image-pull-errors" "fail" "Image pull errors: $pull_errors"
  fi
}

#######################################
# Check for containers with high restart counts.
#
# Reports every container whose restartCount is 5 or more. The count is the
# value kubectl reports for the container, not a count within a time window.
# Init containers are included, matching the image pull check. At most the
# 20 highest counts are listed, highest first.
#
# Globals:   DRY_RUN (read), KUBECTL (read), CHECKS (updated via add_check)
# Arguments: none
# Outputs:   records one "recent-restarts" result through add_check:
#            ok when nothing is reported, warn otherwise (including when the
#            cluster query itself fails)
#######################################
check_recent_restarts() {
  if [[ "$DRY_RUN" == true ]]; then
    add_check "recent-restarts" "ok" "DRY RUN: would check for pods with high restart counts"
    return
  fi

  local restarts
  # $KUBECTL is intentionally unquoted: it holds the command plus its flags.
  # A failed query is reported on its own instead of being labelled as a
  # restart finding.
  # shellcheck disable=SC2086
  if ! restarts=$($KUBECTL get pods --all-namespaces -o json 2>/dev/null | python3 -c '
import json
import sys

THRESHOLD = 5
LIMIT = 20

offenders = []
for pod in json.load(sys.stdin).get("items", []):
    ns = pod["metadata"]["namespace"]
    name = pod["metadata"]["name"]
    status = pod.get("status", {})
    containers = (status.get("containerStatuses") or []) + (status.get("initContainerStatuses") or [])
    for cs in containers:
        count = cs.get("restartCount", 0)
        if count >= THRESHOLD:
            container = cs["name"]
            offenders.append((count, f"{ns}/{name}:{container} restarts={count}"))
# Stable sort on the numeric count keeps discovery order among ties.
offenders.sort(key=lambda item: item[0], reverse=True)
print("; ".join(text for _, text in offenders[:LIMIT]))
' 2>/dev/null); then
    add_check "recent-restarts" "warn" "Failed to check restarts"
    return
  fi

  if [[ -z "$restarts" ]]; then
    add_check "recent-restarts" "ok" "No pods with 5+ restarts"
  else
    add_check "recent-restarts" "warn" "High restart counts: $restarts"
  fi
}

#######################################
# Check CrashLoopBackOff pods.
#
# Reports every container that is waiting with reason CrashLoopBackOff,
# along with its restart count. Init containers are included, matching the
# image pull check, so pods stuck in their init phase are not missed.
#
# Globals:   DRY_RUN (read), KUBECTL (read), CHECKS (updated via add_check)
# Arguments: none
# Outputs:   records one "crashloop" result through add_check:
#            ok when nothing is reported, fail otherwise (including when the
#            cluster query itself fails)
#######################################
check_crashloop() {
  if [[ "$DRY_RUN" == true ]]; then
    add_check "crashloop" "ok" "DRY RUN: would check for CrashLoopBackOff pods"
    return
  fi

  local crashloop
  # $KUBECTL is intentionally unquoted: it holds the command plus its flags.
  # A failed query is reported on its own instead of being labelled as a
  # crash loop.
  # shellcheck disable=SC2086
  if ! crashloop=$($KUBECTL get pods --all-namespaces -o json 2>/dev/null | python3 -c '
import json
import sys

crashes = []
for pod in json.load(sys.stdin).get("items", []):
    ns = pod["metadata"]["namespace"]
    name = pod["metadata"]["name"]
    status = pod.get("status", {})
    containers = (status.get("containerStatuses") or []) + (status.get("initContainerStatuses") or [])
    for cs in containers:
        waiting = cs.get("state", {}).get("waiting") or {}
        if waiting.get("reason") == "CrashLoopBackOff":
            container = cs["name"]
            restarts = cs.get("restartCount", 0)
            crashes.append(f"{ns}/{name}:{container} restarts={restarts}")
print("; ".join(crashes))
' 2>/dev/null); then
    add_check "crashloop" "fail" "Failed to check crashloop"
    return
  fi

  if [[ -z "$crashloop" ]]; then
    add_check "crashloop" "ok" "No CrashLoopBackOff pods"
  else
    add_check "crashloop" "fail" "CrashLoopBackOff: $crashloop"
  fi
}

# Run all checks. Each one appends a single result object to CHECKS.
check_stalled_rollouts
check_unavailable_replicas
check_image_pull_errors
check_recent_restarts
check_crashloop

# Determine overall status: the most severe status present among the
# recorded checks (fail outranks warn, warn outranks ok).
OVERALL=$(printf '%s' "$CHECKS" | python3 -c '
import json
import sys

statuses = {check["status"] for check in json.load(sys.stdin)}
if "fail" in statuses:
    print("fail")
elif "warn" in statuses:
    print("warn")
else:
    print("ok")
')

# Emit the report as pretty-printed JSON on stdout. The document is assembled
# by Python from argv and stdin rather than by string interpolation, so the
# values can never produce malformed JSON.
printf '%s' "$CHECKS" | python3 -c '
import json
import sys

report = {"status": sys.argv[1], "agent": sys.argv[2], "checks": json.load(sys.stdin)}
json.dump(report, sys.stdout, indent=4)
sys.stdout.write("\n")
' "$OVERALL" "$AGENT"
Loading…
Add table
Add a link
Reference in a new issue