infra/.claude/scripts/oom-investigator.sh

#!/usr/bin/env bash
# Strict mode: abort on errors, on unset variables and on failures anywhere
# inside a pipeline.
set -euo pipefail

# kubectl command line shared by every check. It is stored as one string and
# expanded unquoted at the call sites so that it splits into command + flags.
# NOTE: the kubeconfig location is a hard-coded absolute path.
KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config"

# Agent name embedded in the final JSON report.
AGENT="oom-investigator"

# JSON array (kept as text) that accumulates one object per check result.
CHECKS="[]"

# Set to true by --dry-run; any other argument is ignored.
DRY_RUN=false
for arg; do
  if [[ "$arg" == "--dry-run" ]]; then
    DRY_RUN=true
  fi
done

# Append one check result to the CHECKS JSON array.
# Globals:   CHECKS (read, then replaced with the extended array)
# Arguments: $1 - check name
#            $2 - status (this script uses ok, warn and fail)
#            $3 - human-readable message
add_check() {
  local name="$1" status="$2" message="$3"
  # The three values are handed to Python as argv instead of being pasted into
  # the program text. Quotes, backslashes and newlines in a message are thus
  # stored verbatim rather than breaking (or being interpreted by) the Python
  # source.
  CHECKS=$(printf '%s' "$CHECKS" | python3 -c '
import sys, json
checks = json.load(sys.stdin)
name, status, message = sys.argv[1:4]
checks.append({"name": name, "status": status, "message": message})
json.dump(checks, sys.stdout)
' "$name" "$status" "$message")
}

# Find OOMKilled pods across all namespaces
#
# Records a single "oom-killed-pods" check:
#   ok   - no container reports an OOMKilled termination
#   fail - at least one does; each affected container is listed once
#   warn - the pod list could not be fetched or parsed
# Globals: DRY_RUN, KUBECTL (read); CHECKS (updated through add_check)
find_oomkilled() {
  if $DRY_RUN; then
    add_check "oom-killed-pods" "ok" "DRY RUN: would check for OOMKilled pods across all namespaces"
    return
  fi

  # The Python helper prints the container count on line 1 and the joined
  # summary on line 2, so a single pass over the pod list feeds both values.
  # $KUBECTL is intentionally unquoted: it holds the command plus its flags.
  local report
  if ! report=$($KUBECTL get pods --all-namespaces -o json | python3 -c '
import sys, json

data = json.load(sys.stdin)
hits = []
for pod in data.get("items", []):
    meta = pod["metadata"]
    status = pod.get("status", {})
    tracked = status.get("containerStatuses", []) + status.get("initContainerStatuses", [])
    for cs in tracked:
        # Look at the current termination first, then the previous one, and
        # stop at the first match so a container that was OOMKilled in both
        # states is reported once.
        for key in ("state", "lastState"):
            term = cs.get(key, {}).get("terminated", {})
            if term.get("reason") == "OOMKilled":
                hits.append("{}/{}:{} (restarts={}, at={})".format(
                    meta["namespace"], meta["name"], cs["name"],
                    cs.get("restartCount", 0), term.get("finishedAt", "unknown")))
                break
print(len(hits))
print("; ".join(hits))
' 2>/dev/null); then
    # A failed query must not be reported as a clean result.
    add_check "oom-killed-pods" "warn" "Failed to list pods; OOMKilled status unknown"
    return
  fi

  # Split the two-line report. With zero hits the second line is empty and
  # only the count is used.
  local nl=$'\n' count summary
  count=${report%%${nl}*}
  summary=${report#*${nl}}

  if [[ "$count" -eq 0 ]]; then
    add_check "oom-killed-pods" "ok" "No OOMKilled pods found"
  else
    add_check "oom-killed-pods" "fail" "Found $count OOMKilled container(s): $summary"
  fi
}

# Check LimitRange defaults in namespaces with OOM events
#
# Records a single "limitranges" check:
#   ok   - no namespace contains an OOMKilled container
#   warn - at least one does (the message lists the Container-type default
#          memory/cpu limits per namespace), or the pod list could not be read
# Globals: DRY_RUN, KUBECTL (read); CHECKS (updated through add_check)
check_limitranges() {
  if $DRY_RUN; then
    add_check "limitranges" "ok" "DRY RUN: would check LimitRange defaults"
    return
  fi

  # One namespace per line, sorted, for every namespace that has a container
  # whose current or previous termination reason is OOMKilled.
  # $KUBECTL is intentionally unquoted: it holds the command plus its flags.
  local namespaces
  if ! namespaces=$($KUBECTL get pods --all-namespaces -o json | python3 -c '
import sys, json

data = json.load(sys.stdin)
affected = set()
for pod in data.get("items", []):
    status = pod.get("status", {})
    tracked = status.get("containerStatuses", []) + status.get("initContainerStatuses", [])
    for cs in tracked:
        for key in ("lastState", "state"):
            if cs.get(key, {}).get("terminated", {}).get("reason") == "OOMKilled":
                affected.add(pod["metadata"]["namespace"])
for ns in sorted(affected):
    print(ns)
' 2>/dev/null); then
    # A failed query must not be reported as "nothing to check".
    add_check "limitranges" "warn" "Failed to list pods; LimitRange check skipped"
    return
  fi

  if [[ -z "$namespaces" ]]; then
    add_check "limitranges" "ok" "No namespaces with OOMKilled pods to check"
    return
  fi

  local lr_info="" ns lr
  while IFS= read -r ns; do
    # The namespace is passed to Python as argv rather than spliced into the
    # program text. All Container-type limits of the namespace are joined on
    # one line so the message never contains raw newlines.
    if lr=$($KUBECTL get limitrange -n "$ns" -o json 2>/dev/null | python3 -c '
import sys, json

ns = sys.argv[1]
data = json.load(sys.stdin)
found = []
for item in data.get("items", []):
    for limit in item.get("spec", {}).get("limits", []):
        if limit.get("type") == "Container":
            default = limit.get("default", {})
            found.append("{}: default memory={}, cpu={}".format(
                ns, default.get("memory", "none"), default.get("cpu", "none")))
print("; ".join(found))
' "$ns" 2>/dev/null); then
      if [[ -n "$lr" ]]; then
        lr_info+="${lr}; "
      else
        lr_info+="${ns}: no LimitRange; "
      fi
    else
      # Keep "could not look it up" distinct from "none configured".
      lr_info+="${ns}: LimitRange lookup failed; "
    fi
  done <<< "$namespaces"

  add_check "limitranges" "warn" "LimitRange defaults for OOM namespaces: ${lr_info}"
}

# Check VPA recommendations from Goldilocks
#
# Records a single "vpa-recommendations" check:
#   warn - no VPA objects are listed, or their JSON could not be read/parsed
#   ok   - otherwise; the message carries up to 20 container recommendations
#          (target memory/cpu and upper-bound memory), with a "(+N more)"
#          suffix when the list was cut
# Globals: DRY_RUN, KUBECTL (read); CHECKS (updated through add_check)
check_vpa_recommendations() {
  if $DRY_RUN; then
    add_check "vpa-recommendations" "ok" "DRY RUN: would check VPA recommendations"
    return
  fi

  # Count listed VPA objects. tr strips the padding some wc implementations
  # add. $KUBECTL is intentionally unquoted: it holds the command plus flags.
  local vpa_count
  vpa_count=$($KUBECTL get vpa --all-namespaces --no-headers 2>/dev/null | wc -l | tr -d ' ') || vpa_count=0

  if [[ "$vpa_count" -eq 0 ]]; then
    add_check "vpa-recommendations" "warn" "No VPA objects found — Goldilocks may not be deployed"
    return
  fi

  local vpa_recs
  if vpa_recs=$($KUBECTL get vpa --all-namespaces -o json 2>/dev/null | python3 -c '
import sys, json

LIMIT = 20
data = json.load(sys.stdin)
recs = []
for vpa in data.get("items", []):
    meta = vpa["metadata"]
    containers = vpa.get("status", {}).get("recommendation", {}).get("containerRecommendations", [])
    for cr in containers:
        target = cr.get("target", {})
        recs.append("{}/{}:{} target_mem={} target_cpu={} upper_mem={}".format(
            meta["namespace"], meta["name"], cr.get("containerName", "unknown"),
            target.get("memory", "n/a"), target.get("cpu", "n/a"),
            cr.get("upperBound", {}).get("memory", "n/a")))
if not recs:
    print("No recommendations available yet")
else:
    line = "; ".join(recs[:LIMIT])
    if len(recs) > LIMIT:
        # Make the truncation visible instead of silently dropping entries.
        line += " (+{} more)".format(len(recs) - LIMIT)
    print(line)
' 2>/dev/null); then
    add_check "vpa-recommendations" "ok" "$vpa_recs"
  else
    # A read failure is not a healthy result, so it must not be stored as ok.
    add_check "vpa-recommendations" "warn" "Failed to read VPA recommendations"
  fi
}

# Check resource requests/limits on OOMKilled pods
#
# For every pod with at least one OOMKilled container (current or previous
# state, regular or init), lists the memory/cpu requests and limits of all of
# that pod's containers. Records a single "pod-resources" check: ok when
# there is nothing to inspect, warn otherwise (including when the lookup
# itself fails).
# Globals: DRY_RUN, KUBECTL (read); CHECKS (updated through add_check)
check_pod_resources() {
  if $DRY_RUN; then
    add_check "pod-resources" "ok" "DRY RUN: would check pod resource specs"
    return
  fi

  # $KUBECTL is intentionally unquoted: it holds the command plus its flags.
  local summary
  summary=$($KUBECTL get pods --all-namespaces -o json | python3 -c "
import json
import sys


def oom_killed(pod):
    # True when any tracked container has an OOMKilled terminated state.
    status = pod.get('status', {})
    tracked = status.get('containerStatuses', []) + status.get('initContainerStatuses', [])
    states = []
    for cs in tracked:
        states.append(cs.get('lastState', {}).get('terminated', {}))
        states.append(cs.get('state', {}).get('terminated', {}))
    return any(s.get('reason') == 'OOMKilled' for s in states)


def mem_cpu(container, kind):
    # (memory, cpu) of the requests or limits section, 'none' when absent.
    section = container.get('resources', {}).get(kind, {})
    return section.get('memory', 'none'), section.get('cpu', 'none')


lines = []
for pod in json.load(sys.stdin).get('items', []):
    ns = pod['metadata']['namespace']
    name = pod['metadata']['name']
    if not oom_killed(pod):
        continue
    spec = pod.get('spec', {})
    for c in spec.get('containers', []) + spec.get('initContainers', []):
        cname = c['name']
        req_mem, req_cpu = mem_cpu(c, 'requests')
        lim_mem, lim_cpu = mem_cpu(c, 'limits')
        lines.append('{}/{}:{} req_mem={} lim_mem={} req_cpu={} lim_cpu={}'.format(
            ns, name, cname, req_mem, lim_mem, req_cpu, lim_cpu))
print('; '.join(lines) if lines else 'No OOMKilled pods to inspect')
" 2>/dev/null) || summary="Failed to check pod resources"

  # Only the "nothing to inspect" sentence counts as ok; a resource listing or
  # the failure text is surfaced as a warning.
  case "$summary" in
    *"No OOMKilled"*) add_check "pod-resources" "ok" "$summary" ;;
    *) add_check "pod-resources" "warn" "$summary" ;;
  esac
}

# Execute every check in order; each one appends its result to CHECKS.
find_oomkilled
check_limitranges
check_vpa_recommendations
check_pod_resources

# The overall status is the most severe individual status: fail, then warn,
# and ok when neither is present.
OVERALL=$(python3 -c '
import json
import sys

seen = {check["status"] for check in json.load(sys.stdin)}
print(next((level for level in ("fail", "warn") if level in seen), "ok"))
' <<< "$CHECKS")

# Assemble the report and let json.tool validate and pretty-print it.
printf '{"status": "%s", "agent": "%s", "checks": %s}\n' "$OVERALL" "$AGENT" "$CHECKS" \
  | python3 -m json.tool