Agents: devops-engineer, dba, security-engineer, sre, network-engineer, platform-engineer, observability-engineer, home-automation-engineer. Scripts: deploy-status, db-health, backup-verify, tls-check, crowdsec-status, authentik-audit, oom-investigator, resource-report, dns-check, network-health, nfs-health, truenas-status, platform-status, monitoring-health. Also: known-issues.md suppression list, cluster-health-checker port-forward fix.
214 lines
7.1 KiB
Bash
Executable file
214 lines
7.1 KiB
Bash
Executable file
#!/usr/bin/env bash
#
# OOM investigator: looks for OOMKilled containers, then reports LimitRange
# defaults, VPA recommendations and pod resource specs as a JSON document on
# stdout.
#
# Usage: pass --dry-run to record placeholder checks without calling kubectl.
# Env:   KUBECTL - optional override for the kubectl command line.
set -euo pipefail

# kubectl command line shared by every check. It is expanded unquoted by the
# callers on purpose so the embedded flags split into separate words. The
# default keeps the original kubeconfig location; exporting KUBECTL lets the
# script run on machines where that path does not exist.
KUBECTL="${KUBECTL:-kubectl --kubeconfig /Users/viktorbarzin/code/infra/config}"
DRY_RUN=false
AGENT="oom-investigator"

for arg in "$@"; do
  case "$arg" in
    --dry-run) DRY_RUN=true ;;
    *) ;; # any other argument is ignored
  esac
done

# JSON array of check results; add_check appends one object per call.
CHECKS="[]"

#######################################
# Append one check result to the global CHECKS JSON array.
# Globals:   CHECKS (read and rewritten) - JSON array of check objects
# Arguments: $1 - check name
#            $2 - status string (callers in this file pass ok, warn or fail)
#            $3 - human-readable message
# Returns:   non-zero if python3 fails, e.g. when CHECKS is not valid JSON
#######################################
add_check() {
  local name="$1" status="$2" message="$3"
  # The values travel through argv instead of being spliced into the Python
  # source, so quotes, backslashes and newlines inside a message can neither
  # break the program nor be interpreted as code.
  CHECKS=$(printf '%s' "$CHECKS" | python3 -c '
import json
import sys

checks = json.load(sys.stdin)
name, status, message = sys.argv[1:4]
checks.append({"name": name, "status": status, "message": message})
json.dump(checks, sys.stdout)
' "$name" "$status" "$message")
}

#######################################
# Find OOMKilled containers across all namespaces and record the result as
# the "oom-killed-pods" check.
# Globals:   DRY_RUN (read), KUBECTL (read, expanded unquoted on purpose)
# Calls:     add_check
# Result:    ok   - no container has an OOMKilled termination
#            fail - at least one has; the message lists
#                   namespace/pod:container with restart count and finish time
#            warn - the pod list could not be fetched or parsed, so the
#                   absence of OOM kills cannot be claimed
#######################################
find_oomkilled() {
  if [[ "$DRY_RUN" == true ]]; then
    add_check "oom-killed-pods" "ok" "DRY RUN: would check for OOMKilled pods across all namespaces"
    return
  fi

  # Fetch first and test the exit status directly, so a kubectl failure is
  # reported instead of being mistaken for an empty, healthy cluster.
  local pods_json
  if ! pods_json=$($KUBECTL get pods --all-namespaces -o json 2>/dev/null); then
    add_check "oom-killed-pods" "warn" "Failed to list pods; cannot check for OOMKilled containers"
    return
  fi

  # The Python program prints the number of affected containers on the first
  # line and, when non-zero, the joined details on the second line.
  local report
  if ! report=$(printf '%s' "$pods_json" | python3 -c '
import json
import sys

found = []
for pod in json.load(sys.stdin).get("items", []):
    meta = pod["metadata"]
    status = pod.get("status", {})
    statuses = status.get("containerStatuses", []) + status.get("initContainerStatuses", [])
    for cs in statuses:
        current = cs.get("state", {}).get("terminated", {})
        last = cs.get("lastState", {}).get("terminated", {})
        # One entry per container: prefer the current termination, fall back
        # to the previous one, so a container is never counted twice.
        oom = next((s for s in (current, last) if s.get("reason") == "OOMKilled"), None)
        if oom is None:
            continue
        found.append("{}/{}:{} (restarts={}, at={})".format(
            meta["namespace"], meta["name"], cs["name"],
            cs.get("restartCount", 0), oom.get("finishedAt", "unknown")))
print(len(found))
if found:
    print("; ".join(found))
' 2>/dev/null); then
    add_check "oom-killed-pods" "warn" "Failed to parse pod list; cannot check for OOMKilled containers"
    return
  fi

  local count="${report%%$'\n'*}"
  if [[ "$count" -eq 0 ]]; then
    add_check "oom-killed-pods" "ok" "No OOMKilled pods found"
  else
    add_check "oom-killed-pods" "fail" "Found $count OOMKilled container(s): ${report#*$'\n'}"
  fi
}

#######################################
# Report the Container-type LimitRange defaults of every namespace that has
# an OOMKilled container, recorded as the "limitranges" check.
# Globals:   DRY_RUN (read), KUBECTL (read, expanded unquoted on purpose)
# Calls:     add_check
# Result:    ok   - no namespace has OOMKilled containers
#            warn - one summary entry per affected namespace, or the pod
#                   list could not be fetched or parsed
#######################################
check_limitranges() {
  if [[ "$DRY_RUN" == true ]]; then
    add_check "limitranges" "ok" "DRY RUN: would check LimitRange defaults"
    return
  fi

  local pods_json
  if ! pods_json=$($KUBECTL get pods --all-namespaces -o json 2>/dev/null); then
    add_check "limitranges" "warn" "Failed to list pods; cannot determine namespaces with OOMKilled pods"
    return
  fi

  # Sorted, newline-separated namespaces that contain an OOMKilled container.
  local namespaces
  if ! namespaces=$(printf '%s' "$pods_json" | python3 -c '
import json
import sys

namespaces = set()
for pod in json.load(sys.stdin).get("items", []):
    status = pod.get("status", {})
    for cs in status.get("containerStatuses", []) + status.get("initContainerStatuses", []):
        reasons = (
            cs.get("lastState", {}).get("terminated", {}).get("reason"),
            cs.get("state", {}).get("terminated", {}).get("reason"),
        )
        if "OOMKilled" in reasons:
            namespaces.add(pod["metadata"]["namespace"])
print("\n".join(sorted(namespaces)))
' 2>/dev/null); then
    add_check "limitranges" "warn" "Failed to parse pod list; cannot determine namespaces with OOMKilled pods"
    return
  fi

  if [[ -z "$namespaces" ]]; then
    add_check "limitranges" "ok" "No namespaces with OOMKilled pods to check"
    return
  fi

  local lr_info="" ns lr_json lr
  while IFS= read -r ns; do
    # stdin is pointed at /dev/null so kubectl can never consume the
    # namespace list that feeds this loop.
    if ! lr_json=$($KUBECTL get limitrange -n "$ns" -o json 2>/dev/null </dev/null); then
      lr_info="${lr_info}${ns}: LimitRange lookup failed; "
      continue
    fi
    # The namespace is handed over via argv; several Container limits are
    # joined with "; " so the message stays on one line.
    lr=$(printf '%s' "$lr_json" | python3 -c '
import json
import sys

ns = sys.argv[1]
lines = []
for item in json.load(sys.stdin).get("items", []):
    for limit in item.get("spec", {}).get("limits", []):
        if limit.get("type") != "Container":
            continue
        defaults = limit.get("default", {})
        lines.append("{}: default memory={}, cpu={}".format(
            ns, defaults.get("memory", "none"), defaults.get("cpu", "none")))
print("; ".join(lines))
' "$ns" 2>/dev/null) || lr=""
    if [[ -n "$lr" ]]; then
      lr_info="${lr_info}${lr}; "
    else
      lr_info="${lr_info}${ns}: no LimitRange; "
    fi
  done <<< "$namespaces"

  # Drop the separator left behind by the last entry.
  add_check "limitranges" "warn" "LimitRange defaults for OOM namespaces: ${lr_info%; }"
}

#######################################
# Summarise VPA container recommendations (the kind Goldilocks creates),
# recorded as the "vpa-recommendations" check.
# Globals:   DRY_RUN (read), KUBECTL (read, expanded unquoted on purpose)
# Calls:     add_check
# Result:    ok   - VPA objects exist; the message lists at most the first 20
#                   recommendations, or says none are available yet
#            warn - no VPA objects were found (including when kubectl itself
#                   fails), or the kubectl output could not be parsed
#######################################
check_vpa_recommendations() {
  if [[ "$DRY_RUN" == true ]]; then
    add_check "vpa-recommendations" "ok" "DRY RUN: would check VPA recommendations"
    return
  fi

  # A single JSON query serves both the existence test and the summary. A
  # kubectl failure is treated like an empty result, matching the message
  # used when there is nothing to list.
  local vpa_json
  if ! vpa_json=$($KUBECTL get vpa --all-namespaces -o json 2>/dev/null); then
    add_check "vpa-recommendations" "warn" "No VPA objects found — Goldilocks may not be deployed"
    return
  fi

  # First output line: number of VPA objects. Second line: the summary text.
  local report
  if ! report=$(printf '%s' "$vpa_json" | python3 -c '
import json
import sys

items = json.load(sys.stdin).get("items", [])
recs = []
for vpa in items:
    meta = vpa["metadata"]
    recommendation = vpa.get("status", {}).get("recommendation", {})
    for cr in recommendation.get("containerRecommendations", []):
        target = cr.get("target", {})
        recs.append("{}/{}:{} target_mem={} target_cpu={} upper_mem={}".format(
            meta["namespace"], meta["name"], cr.get("containerName", "unknown"),
            target.get("memory", "n/a"), target.get("cpu", "n/a"),
            cr.get("upperBound", {}).get("memory", "n/a")))
print(len(items))
print("; ".join(recs[:20]) if recs else "No recommendations available yet")
' 2>/dev/null); then
    # A parse failure is a degraded result and must not be reported as ok.
    add_check "vpa-recommendations" "warn" "Failed to read VPA recommendations"
    return
  fi

  local vpa_count="${report%%$'\n'*}"
  if [[ "$vpa_count" -eq 0 ]]; then
    add_check "vpa-recommendations" "warn" "No VPA objects found — Goldilocks may not be deployed"
    return
  fi

  add_check "vpa-recommendations" "ok" "${report#*$'\n'}"
}

#######################################
# List the memory/CPU requests and limits of every container (regular and
# init) in pods that have at least one OOMKilled container, recorded as the
# "pod-resources" check.
# Globals:   DRY_RUN (read), KUBECTL (read, expanded unquoted on purpose)
# Calls:     add_check
# Result:    ok   - no pod has an OOMKilled container
#            warn - resource specs of the affected pods, or the pod list
#                   could not be fetched or parsed
#######################################
check_pod_resources() {
  if [[ "$DRY_RUN" == true ]]; then
    add_check "pod-resources" "ok" "DRY RUN: would check pod resource specs"
    return
  fi

  local pods_json
  if ! pods_json=$($KUBECTL get pods --all-namespaces -o json 2>/dev/null); then
    add_check "pod-resources" "warn" "Failed to check pod resources"
    return
  fi

  # Empty output means no affected pod; this replaces searching the output
  # text for a sentinel phrase.
  local resources
  if ! resources=$(printf '%s' "$pods_json" | python3 -c '
import json
import sys


def oom_killed(pod):
    status = pod.get("status", {})
    for cs in status.get("containerStatuses", []) + status.get("initContainerStatuses", []):
        for key in ("lastState", "state"):
            if cs.get(key, {}).get("terminated", {}).get("reason") == "OOMKilled":
                return True
    return False


lines = []
for pod in json.load(sys.stdin).get("items", []):
    if not oom_killed(pod):
        continue
    meta = pod["metadata"]
    spec = pod.get("spec", {})
    for c in spec.get("containers", []) + spec.get("initContainers", []):
        res = c.get("resources", {})
        req = res.get("requests", {})
        lim = res.get("limits", {})
        lines.append("{}/{}:{} req_mem={} lim_mem={} req_cpu={} lim_cpu={}".format(
            meta["namespace"], meta["name"], c["name"],
            req.get("memory", "none"), lim.get("memory", "none"),
            req.get("cpu", "none"), lim.get("cpu", "none")))
print("; ".join(lines))
' 2>/dev/null); then
    add_check "pod-resources" "warn" "Failed to check pod resources"
    return
  fi

  if [[ -z "$resources" ]]; then
    add_check "pod-resources" "ok" "No OOMKilled pods to inspect"
  else
    add_check "pod-resources" "warn" "$resources"
  fi
}

# Run all checks; each one appends its result to CHECKS through add_check.
find_oomkilled
check_limitranges
check_vpa_recommendations
check_pod_resources

# Determine overall status: the worst status seen wins (fail, then warn,
# otherwise ok).
OVERALL=$(printf '%s' "$CHECKS" | python3 -c '
import json
import sys

statuses = {check["status"] for check in json.load(sys.stdin)}
print(next((s for s in ("fail", "warn") if s in statuses), "ok"))
')

# Emit the final report. CHECKS is already a JSON array, so it is inserted
# as-is; json.tool validates and pretty-prints the assembled document.
printf '{"status": "%s", "agent": "%s", "checks": %s}\n' "$OVERALL" "$AGENT" "$CHECKS" \
  | python3 -m json.tool