fix: restore tree dropped by 6d224861; land stem95su gdrive-sync (10m) [ci skip]

6d224861 came from a --no-checkout worktree whose empty index made the commit drop every file except two. This restores 05b50d2b's full tree and correctly adds stacks/stem95su/gdrive-sync.tf + the service-catalog stem95su entry. Forward-only (parent=6d224861, no force-push); [ci skip] since the live infra was never applied from the broken commit. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-09 08:45:33 +00:00 · 2026-06-09 08:45:33 +00:00 · fd0f4a0365
commit fd0f4a0365
parent 6d224861c4
1166 changed files with 358546 additions and 0 deletions
--- a/.claude/scripts/oom-investigator.sh
+++ b/.claude/scripts/oom-investigator.sh
@ -0,0 +1,214 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config"
+DRY_RUN=false
+AGENT="oom-investigator"
+
+for arg in "$@"; do
+  case "$arg" in
+    --dry-run) DRY_RUN=true ;;
+  esac
+done
+
+CHECKS="[]"
+
+add_check() {
+  local name="$1" status="$2" message="$3"
+  CHECKS=$(echo "$CHECKS" | python3 -c "
+import sys, json
+checks = json.load(sys.stdin)
+checks.append({'name': '''$name''', 'status': '''$status''', 'message': '''$message'''})
+json.dump(checks, sys.stdout)
+")
+}
+
+# Find OOMKilled pods across all namespaces
+find_oomkilled() {
+  if $DRY_RUN; then
+    add_check "oom-killed-pods" "ok" "DRY RUN: would check for OOMKilled pods across all namespaces"
+    return
+  fi
+
+  local oom_pods
+  oom_pods=$($KUBECTL get pods --all-namespaces -o json | python3 -c "
+import sys, json
+data = json.load(sys.stdin)
+results = []
+for pod in data.get('items', []):
+  ns = pod['metadata']['namespace']
+  name = pod['metadata']['name']
+  for cs in pod.get('status', {}).get('containerStatuses', []) + pod.get('status', {}).get('initContainerStatuses', []):
+    last = cs.get('lastState', {}).get('terminated', {})
+    current = cs.get('state', {}).get('terminated', {})
+    for state in [last, current]:
+      if state.get('reason') == 'OOMKilled':
+        container = cs['name']
+        restart_count = cs.get('restartCount', 0)
+        finished = state.get('finishedAt', 'unknown')
+        results.append({'namespace': ns, 'pod': name, 'container': container, 'restarts': restart_count, 'finishedAt': finished})
+json.dump(results, sys.stdout)
+" 2>/dev/null) || oom_pods="[]"
+
+  local count
+  count=$(echo "$oom_pods" | python3 -c "import sys,json; print(len(json.load(sys.stdin)))")
+
+  if [ "$count" -eq 0 ]; then
+    add_check "oom-killed-pods" "ok" "No OOMKilled pods found"
+  else
+    add_check "oom-killed-pods" "fail" "Found $count OOMKilled container(s): $(echo "$oom_pods" | python3 -c "
+import sys,json
+pods = json.load(sys.stdin)
+print('; '.join(f\"{p['namespace']}/{p['pod']}:{p['container']} (restarts={p['restarts']}, at={p['finishedAt']})\" for p in pods))
+")"
+  fi
+}
+
+# Check LimitRange defaults in namespaces with OOM events
+check_limitranges() {
+  if $DRY_RUN; then
+    add_check "limitranges" "ok" "DRY RUN: would check LimitRange defaults"
+    return
+  fi
+
+  local namespaces
+  namespaces=$($KUBECTL get pods --all-namespaces -o json | python3 -c "
+import sys, json
+data = json.load(sys.stdin)
+ns_set = set()
+for pod in data.get('items', []):
+  for cs in pod.get('status', {}).get('containerStatuses', []) + pod.get('status', {}).get('initContainerStatuses', []):
+    for state in [cs.get('lastState', {}).get('terminated', {}), cs.get('state', {}).get('terminated', {})]:
+      if state.get('reason') == 'OOMKilled':
+        ns_set.add(pod['metadata']['namespace'])
+for ns in sorted(ns_set):
+  print(ns)
+" 2>/dev/null) || namespaces=""
+
+  if [ -z "$namespaces" ]; then
+    add_check "limitranges" "ok" "No namespaces with OOMKilled pods to check"
+    return
+  fi
+
+  local lr_info=""
+  while IFS= read -r ns; do
+    local lr
+    lr=$($KUBECTL get limitrange -n "$ns" -o json 2>/dev/null | python3 -c "
+import sys, json
+data = json.load(sys.stdin)
+for item in data.get('items', []):
+  for limit in item.get('spec', {}).get('limits', []):
+    if limit.get('type') == 'Container':
+      default_mem = limit.get('default', {}).get('memory', 'none')
+      default_cpu = limit.get('default', {}).get('cpu', 'none')
+      print(f'$ns: default memory={default_mem}, cpu={default_cpu}')
+" 2>/dev/null) || lr=""
+    if [ -n "$lr" ]; then
+      lr_info="${lr_info}${lr}; "
+    else
+      lr_info="${lr_info}${ns}: no LimitRange; "
+    fi
+  done <<< "$namespaces"
+
+  add_check "limitranges" "warn" "LimitRange defaults for OOM namespaces: ${lr_info}"
+}
+
+# Check VPA recommendations from Goldilocks
+check_vpa_recommendations() {
+  if $DRY_RUN; then
+    add_check "vpa-recommendations" "ok" "DRY RUN: would check VPA recommendations"
+    return
+  fi
+
+  local vpa_count
+  vpa_count=$($KUBECTL get vpa --all-namespaces --no-headers 2>/dev/null | wc -l | tr -d ' ') || vpa_count=0
+
+  if [ "$vpa_count" -eq 0 ]; then
+    add_check "vpa-recommendations" "warn" "No VPA objects found — Goldilocks may not be deployed"
+    return
+  fi
+
+  local vpa_recs
+  vpa_recs=$($KUBECTL get vpa --all-namespaces -o json 2>/dev/null | python3 -c "
+import sys, json
+data = json.load(sys.stdin)
+recs = []
+for vpa in data.get('items', []):
+  ns = vpa['metadata']['namespace']
+  name = vpa['metadata']['name']
+  for cr in vpa.get('status', {}).get('recommendation', {}).get('containerRecommendations', []):
+    container = cr.get('containerName', 'unknown')
+    target_mem = cr.get('target', {}).get('memory', 'n/a')
+    target_cpu = cr.get('target', {}).get('cpu', 'n/a')
+    upper_mem = cr.get('upperBound', {}).get('memory', 'n/a')
+    recs.append(f'{ns}/{name}:{container} target_mem={target_mem} target_cpu={target_cpu} upper_mem={upper_mem}')
+if recs:
+  print('; '.join(recs[:20]))
+else:
+  print('No recommendations available yet')
+" 2>/dev/null) || vpa_recs="Failed to read VPA recommendations"
+
+  add_check "vpa-recommendations" "ok" "$vpa_recs"
+}
+
+# Check resource requests/limits on OOMKilled pods
+check_pod_resources() {
+  if $DRY_RUN; then
+    add_check "pod-resources" "ok" "DRY RUN: would check pod resource specs"
+    return
+  fi
+
+  local resources
+  resources=$($KUBECTL get pods --all-namespaces -o json | python3 -c "
+import sys, json
+data = json.load(sys.stdin)
+results = []
+for pod in data.get('items', []):
+  ns = pod['metadata']['namespace']
+  name = pod['metadata']['name']
+  has_oom = False
+  for cs in pod.get('status', {}).get('containerStatuses', []) + pod.get('status', {}).get('initContainerStatuses', []):
+    for state in [cs.get('lastState', {}).get('terminated', {}), cs.get('state', {}).get('terminated', {})]:
+      if state.get('reason') == 'OOMKilled':
+        has_oom = True
+        break
+  if has_oom:
+    for c in pod.get('spec', {}).get('containers', []) + pod.get('spec', {}).get('initContainers', []):
+      req_mem = c.get('resources', {}).get('requests', {}).get('memory', 'none')
+      lim_mem = c.get('resources', {}).get('limits', {}).get('memory', 'none')
+      req_cpu = c.get('resources', {}).get('requests', {}).get('cpu', 'none')
+      lim_cpu = c.get('resources', {}).get('limits', {}).get('cpu', 'none')
+      results.append(f\"{ns}/{name}:{c['name']} req_mem={req_mem} lim_mem={lim_mem} req_cpu={req_cpu} lim_cpu={lim_cpu}\")
+if results:
+  print('; '.join(results))
+else:
+  print('No OOMKilled pods to inspect')
+" 2>/dev/null) || resources="Failed to check pod resources"
+
+  if echo "$resources" | grep -q "No OOMKilled"; then
+    add_check "pod-resources" "ok" "$resources"
+  else
+    add_check "pod-resources" "warn" "$resources"
+  fi
+}
+
+# Run all checks
+# Each function appends exactly one entry to CHECKS.
+find_oomkilled
+check_limitranges
+check_vpa_recommendations
+check_pod_resources
+
+# Determine overall status
+# The worst status present wins: fail, then warn, otherwise ok.
+OVERALL=$(printf '%s' "$CHECKS" | python3 -c "
+import sys, json
+seen = {c['status'] for c in json.load(sys.stdin)}
+print(next((s for s in ('fail', 'warn') if s in seen), 'ok'))
+")
+
+# Assemble the report and pretty-print it; CHECKS is already JSON and is
+# inserted as-is.
+printf '{"status": "%s", "agent": "%s", "checks": %s}\n' "$OVERALL" "$AGENT" "$CHECKS" | python3 -m json.tool