infra/.claude/scripts/oom-investigator.sh
Viktor Barzin ff83ec3325 add infrastructure agent team: 8 specialized agents + 14 diagnostic scripts
Agents: devops-engineer, dba, security-engineer, sre, network-engineer,
platform-engineer, observability-engineer, home-automation-engineer.
Scripts: deploy-status, db-health, backup-verify, tls-check, crowdsec-status,
authentik-audit, oom-investigator, resource-report, dns-check, network-health,
nfs-health, truenas-status, platform-status, monitoring-health.
Also: known-issues.md suppression list, cluster-health-checker port-forward fix.
2026-03-15 02:01:07 +00:00

214 lines
7.1 KiB
Bash
Executable file

#!/usr/bin/env bash
# OOM investigator: collects checks about OOMKilled containers and prints them
# as one JSON document.
# Usage: pass --dry-run to record placeholder checks instead of querying the
# cluster; any other argument is ignored.
set -euo pipefail

# kubectl command line pinned to the infra kubeconfig. It is stored as a single
# string and expanded unquoted where it is used, so it splits into the command
# and its flags.
KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config"

# Name reported in the "agent" field of the final JSON.
AGENT="oom-investigator"

# JSON array of check results; add_check appends to it.
CHECKS="[]"

# Flipped to true by --dry-run; it is later executed as a command (true/false).
DRY_RUN=false
for arg in "$@"; do
  if [[ "$arg" == "--dry-run" ]]; then
    DRY_RUN=true
  fi
done
# Append one check result to the global CHECKS JSON array.
# Globals:   CHECKS (read, then replaced with the extended array)
# Arguments: $1 - check name
#            $2 - status (callers in this script use ok, warn or fail)
#            $3 - human-readable message
# The three values are handed to Python as command-line arguments rather than
# being pasted into the Python source, so quotes, backslashes and newlines in a
# message are stored verbatim and can never break or alter the program.
add_check() {
  local name="$1" status="$2" message="$3"
  CHECKS=$(printf '%s' "$CHECKS" | python3 -c '
import json
import sys

checks = json.load(sys.stdin)
name, status, message = sys.argv[1:4]
checks.append({"name": name, "status": status, "message": message})
json.dump(checks, sys.stdout)
' "$name" "$status" "$message")
}
# Find OOMKilled pods across all namespaces
# Globals:   DRY_RUN (read), KUBECTL (read; expanded unquoted on purpose so it
#            splits into the command and its flags), CHECKS (through add_check)
# Outputs:   records exactly one "oom-killed-pods" check through add_check:
#              ok   - no regular or init container has an OOMKilled termination
#              fail - at least one has; every affected container is listed once
#              warn - the pod list could not be fetched or parsed, so the
#                     result is unknown (previously this was reported as ok)
find_oomkilled() {
  if $DRY_RUN; then
    add_check "oom-killed-pods" "ok" "DRY RUN: would check for OOMKilled pods across all namespaces"
    return
  fi

  # The Python helper prints the number of affected containers on line 1 and
  # the "; "-joined summary on line 2. Its traceback is silenced because a
  # failure is surfaced through the recorded check instead.
  local report
  if ! report=$($KUBECTL get pods --all-namespaces -o json | python3 -c '
import json
import sys


def oom_termination(container_status):
    # Look at the current state first so the most recent kill is reported,
    # and stop at the first hit so a container is never counted twice.
    for key in ("state", "lastState"):
        terminated = container_status.get(key, {}).get("terminated", {})
        if terminated.get("reason") == "OOMKilled":
            return terminated
    return None


hits = []
for pod in json.load(sys.stdin).get("items", []):
    status = pod.get("status", {})
    statuses = status.get("containerStatuses", []) + status.get("initContainerStatuses", [])
    for cs in statuses:
        terminated = oom_termination(cs)
        if terminated is None:
            continue
        hits.append("{}/{}:{} (restarts={}, at={})".format(
            pod["metadata"]["namespace"],
            pod["metadata"]["name"],
            cs["name"],
            cs.get("restartCount", 0),
            terminated.get("finishedAt", "unknown"),
        ))
print(len(hits))
print("; ".join(hits))
' 2>/dev/null); then
    add_check "oom-killed-pods" "warn" "Failed to list pods; OOMKilled status unknown"
    return
  fi

  # Split the two-line report without spawning another process.
  local nl=$'\n'
  local count=${report%%"$nl"*}
  local summary=${report#*"$nl"}

  if [ "$count" -eq 0 ]; then
    add_check "oom-killed-pods" "ok" "No OOMKilled pods found"
  else
    add_check "oom-killed-pods" "fail" "Found $count OOMKilled container(s): $summary"
  fi
}
# Check LimitRange defaults in namespaces with OOM events
# Globals:   DRY_RUN (read), KUBECTL (read; expanded unquoted on purpose so it
#            splits into the command and its flags), CHECKS (through add_check)
# Outputs:   records exactly one "limitranges" check through add_check:
#              ok   - no namespace has an OOMKilled container
#              warn - lists, per affected namespace, the Container-type
#                     LimitRange defaults or "no LimitRange"; also used when
#                     the pod list could not be fetched or parsed
check_limitranges() {
  if $DRY_RUN; then
    add_check "limitranges" "ok" "DRY RUN: would check LimitRange defaults"
    return
  fi

  # One namespace per line, sorted and de-duplicated. A failed query is
  # reported as such instead of being mistaken for "nothing to check".
  local namespaces
  if ! namespaces=$($KUBECTL get pods --all-namespaces -o json | python3 -c '
import json
import sys


def was_oom_killed(container_status):
    return any(
        container_status.get(key, {}).get("terminated", {}).get("reason") == "OOMKilled"
        for key in ("lastState", "state")
    )


affected = set()
for pod in json.load(sys.stdin).get("items", []):
    status = pod.get("status", {})
    statuses = status.get("containerStatuses", []) + status.get("initContainerStatuses", [])
    if any(was_oom_killed(cs) for cs in statuses):
        affected.add(pod["metadata"]["namespace"])
for ns in sorted(affected):
    print(ns)
' 2>/dev/null); then
    add_check "limitranges" "warn" "Failed to list pods; cannot determine namespaces with OOMKilled pods"
    return
  fi

  if [ -z "$namespaces" ]; then
    add_check "limitranges" "ok" "No namespaces with OOMKilled pods to check"
    return
  fi

  local lr_info="" ns lr
  while IFS= read -r ns; do
    # kubectl reads from /dev/null so it can never consume the namespace list
    # feeding this loop. The namespace reaches Python as an argument, not as
    # text pasted into the program. A failed lookup is treated like a
    # namespace without a LimitRange (best effort, as before).
    lr=$($KUBECTL get limitrange -n "$ns" -o json </dev/null 2>/dev/null | python3 -c '
import json
import sys

ns = sys.argv[1]
entries = []
for item in json.load(sys.stdin).get("items", []):
    for limit in item.get("spec", {}).get("limits", []):
        if limit.get("type") == "Container":
            defaults = limit.get("default", {})
            entries.append("{}: default memory={}, cpu={}".format(
                ns, defaults.get("memory", "none"), defaults.get("cpu", "none")))
print("; ".join(entries))
' "$ns" 2>/dev/null) || lr=""
    if [ -z "$lr" ]; then
      lr="${ns}: no LimitRange"
    fi
    # Join entries with "; " without leaving a dangling separator at the end.
    lr_info="${lr_info:+${lr_info}; }${lr}"
  done <<< "$namespaces"

  add_check "limitranges" "warn" "LimitRange defaults for OOM namespaces: ${lr_info}"
}
# Check VPA recommendations from Goldilocks
# Globals:   DRY_RUN (read), KUBECTL (read; expanded unquoted on purpose so it
#            splits into the command and its flags), CHECKS (through add_check)
# Outputs:   records exactly one "vpa-recommendations" check through add_check:
#              warn - no VPA objects were listed (also the case when the
#                     listing command itself fails), or the recommendations
#                     could not be read or parsed
#              ok   - the first 20 container recommendations, or a note that
#                     none are available yet
check_vpa_recommendations() {
  if $DRY_RUN; then
    add_check "vpa-recommendations" "ok" "DRY RUN: would check VPA recommendations"
    return
  fi

  # Count listed VPA objects; tr strips the padding some wc builds add.
  local vpa_count
  vpa_count=$($KUBECTL get vpa --all-namespaces --no-headers 2>/dev/null | wc -l | tr -d ' ') || vpa_count=0
  if [ "$vpa_count" -eq 0 ]; then
    add_check "vpa-recommendations" "warn" "No VPA objects found — Goldilocks may not be deployed"
    return
  fi

  # A read/parse failure is a warning, not a healthy result, so it must not be
  # recorded with status ok.
  local vpa_recs
  if ! vpa_recs=$($KUBECTL get vpa --all-namespaces -o json 2>/dev/null | python3 -c '
import json
import sys

recs = []
for vpa in json.load(sys.stdin).get("items", []):
    ns = vpa["metadata"]["namespace"]
    name = vpa["metadata"]["name"]
    recommendation = vpa.get("status", {}).get("recommendation", {})
    for cr in recommendation.get("containerRecommendations", []):
        target = cr.get("target", {})
        recs.append("{}/{}:{} target_mem={} target_cpu={} upper_mem={}".format(
            ns,
            name,
            cr.get("containerName", "unknown"),
            target.get("memory", "n/a"),
            target.get("cpu", "n/a"),
            cr.get("upperBound", {}).get("memory", "n/a"),
        ))
# Only the first 20 entries are reported to keep the message bounded.
print("; ".join(recs[:20]) if recs else "No recommendations available yet")
' 2>/dev/null); then
    add_check "vpa-recommendations" "warn" "Failed to read VPA recommendations"
    return
  fi

  add_check "vpa-recommendations" "ok" "$vpa_recs"
}
# Check resource requests/limits on OOMKilled pods
# Globals:   DRY_RUN (read), KUBECTL (read; expanded unquoted on purpose so it
#            splits into the command and its flags), CHECKS (through add_check)
# Outputs:   records exactly one "pod-resources" check through add_check. For
#            every pod with an OOMKilled container, the memory/cpu requests and
#            limits of all of its regular and init containers are listed.
check_pod_resources() {
  if $DRY_RUN; then
    add_check "pod-resources" "ok" "DRY RUN: would check pod resource specs"
    return
  fi

  local resources
  resources=$($KUBECTL get pods --all-namespaces -o json | python3 -c '
import json
import sys


def terminations(container_status):
    # Previous and current termination records; each is {} when absent.
    return [container_status.get(key, {}).get("terminated", {}) for key in ("lastState", "state")]


rows = []
for pod in json.load(sys.stdin).get("items", []):
    ns = pod["metadata"]["namespace"]
    name = pod["metadata"]["name"]
    status = pod.get("status", {})
    statuses = status.get("containerStatuses", []) + status.get("initContainerStatuses", [])
    oom_seen = any(
        record.get("reason") == "OOMKilled"
        for cs in statuses
        for record in terminations(cs)
    )
    if not oom_seen:
        continue
    spec = pod.get("spec", {})
    for c in spec.get("containers", []) + spec.get("initContainers", []):
        requests = c.get("resources", {}).get("requests", {})
        limits = c.get("resources", {}).get("limits", {})
        rows.append("{}/{}:{} req_mem={} lim_mem={} req_cpu={} lim_cpu={}".format(
            ns,
            name,
            c["name"],
            requests.get("memory", "none"),
            limits.get("memory", "none"),
            requests.get("cpu", "none"),
            limits.get("cpu", "none"),
        ))
print("; ".join(rows) if rows else "No OOMKilled pods to inspect")
' 2>/dev/null) || resources="Failed to check pod resources"

  # Anything other than the "nothing to inspect" note (a listing of resource
  # specs, or the failure text) is recorded as a warning.
  case "$resources" in
    *"No OOMKilled"*) add_check "pod-resources" "ok" "$resources" ;;
    *) add_check "pod-resources" "warn" "$resources" ;;
  esac
}
# Run all checks; each one records its result in CHECKS through add_check.
for check in find_oomkilled check_limitranges check_vpa_recommendations check_pod_resources; do
  "$check"
done

# Determine overall status: the most severe status present wins
# (fail, then warn, otherwise ok).
OVERALL=$(python3 -c '
import json
import sys

seen = [entry["status"] for entry in json.load(sys.stdin)]
print(next((level for level in ("fail", "warn") if level in seen), "ok"))
' <<< "$CHECKS")

# Assemble the final report and let json.tool pretty-print it.
printf '{"status": "%s", "agent": "%s", "checks": %s}\n' "$OVERALL" "$AGENT" "$CHECKS" | python3 -m json.tool