infra/.claude/scripts/sev-context.sh
Viktor Barzin fd0f4a0365 fix: restore tree dropped by 6d224861; land stem95su gdrive-sync (10m) [ci skip]
6d224861 came from a --no-checkout worktree whose empty index made the
commit drop every file except two. This restores 05b50d2b's full tree and
correctly adds stacks/stem95su/gdrive-sync.tf + the service-catalog stem95su
entry. Forward-only (parent=6d224861, no force-push); [ci skip] since the
live infra was never applied from the broken commit.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-09 08:45:33 +00:00

95 lines
3.8 KiB
Bash
Executable file

#!/usr/bin/env bash
#
# sev-context.sh — collect structured cluster context for post-mortem triage.
#
# Run by the sev-triage agent; every other pipeline stage may call it as well.
#
# Environment (both optional, a non-empty caller value always wins):
#   KUBECONFIG  kubeconfig path; exported so child processes inherit it
#   INFRA_DIR   path of the infra checkout
set -euo pipefail

# ":=" assigns the default only when the variable is unset or empty.
: "${KUBECONFIG:=/Users/viktorbarzin/code/infra/config}"
: "${INFRA_DIR:=/Users/viktorbarzin/code/infra}"
export KUBECONFIG
# Node overview: name, Ready condition status, kubelet version and the raw
# CPU / memory capacity reported by each node.
node_columns='NAME:.metadata.name,STATUS:.status.conditions[?(@.type=="Ready")].status,VERSION:.status.nodeInfo.kubeletVersion,CPU_CAP:.status.capacity.cpu,MEM_CAP:.status.capacity.memory'

printf '%s\n' "=== NODE STATUS ==="
# kubectl's stderr is discarded; when the call fails a marker line is printed
# on stdout instead of aborting the script.
if ! kubectl get nodes -o "custom-columns=${node_columns}" --no-headers 2>/dev/null; then
  printf '%s\n' "ERROR: Cannot reach cluster"
fi
printf '\n'
echo "=== UNHEALTHY PODS ==="
# Pass 1: pods whose phase is neither Running nor Succeeded. STARTED_UTC is the
# absolute .status.startTime rather than a relative age. Best effort: kubectl
# errors are silenced and never abort the script.
kubectl get pods --all-namespaces \
  --field-selector='status.phase!=Running,status.phase!=Succeeded' \
  -o custom-columns='NAMESPACE:.metadata.namespace,POD:.metadata.name,STATUS:.status.phase,RESTARTS:.status.containerStatuses[0].restartCount,STARTED_UTC:.status.startTime,NODE:.spec.nodeName' \
  --no-headers 2>/dev/null || true

# Pass 2: pods that ARE Running but have a container that is not ready or has
# restarted more than 3 times. Emits one tab-separated line per pod:
#   namespace, pod, "Running/NotReady", restarts, start time, node, reason
#
# The Python program is single-quoted so bash never expands anything inside
# it; for that reason it must not contain a single quote character.
# Best effort as well: a missing python3 or a kubectl failure yields no rows.
kubectl get pods --all-namespaces -o json 2>/dev/null | python3 -c '
import json
import sys

try:
    data = json.load(sys.stdin)
except ValueError:
    # Empty or malformed input (for example kubectl failed): nothing to report.
    sys.exit(0)

for pod in data.get("items", []):
    status = pod.get("status", {})
    phase = status.get("phase", "Unknown")
    if phase != "Running":
        continue
    meta = pod["metadata"]
    for cs in status.get("containerStatuses") or []:
        restarts = cs.get("restartCount", 0)
        if restarts <= 3 and cs.get("ready", True):
            continue
        # Prefer the current waiting reason; a container that is running
        # again after crashes only carries a reason in its last termination.
        waiting = (cs.get("state") or {}).get("waiting") or {}
        last_term = (cs.get("lastState") or {}).get("terminated") or {}
        reason = waiting.get("reason") or last_term.get("reason") or ""
        fields = (
            meta["namespace"],
            meta["name"],
            phase + "/NotReady",
            restarts,
            status.get("startTime", "N/A"),
            pod.get("spec", {}).get("nodeName", "N/A"),
            reason,
        )
        print("\t".join(str(field) for field in fields))
        # Report each pod once, for its first offending container.
        break
' 2>/dev/null || true
echo ""
echo "=== RECENT EVENTS (last 2h, Warning/Error only) ==="
# kubectl has no age filter for events, so the 2h window promised by the
# header is enforced here. The cutoff uses the same UTC timestamp layout as
# the LAST_SEEN_UTC column, which lets a plain string comparison order the
# two chronologically.
# GNU date understands -d, BSD/macOS date understands -v; try both. If neither
# works the cutoff stays empty and every row passes the filter.
events_cutoff=$(date -u -d '2 hours ago' '+%Y-%m-%dT%H:%M:%SZ' 2>/dev/null \
  || date -u -v-2H '+%Y-%m-%dT%H:%M:%SZ' 2>/dev/null \
  || true)

# type!=Normal keeps only non-Normal events. Column 5 is LAST_SEEN_UTC; the
# four columns before it never contain blanks, so awk's default field
# splitting is safe even though MESSAGE does. Rows whose timestamp is
# "<none>" compare greater than any date and are kept (their age is unknown).
# tail keeps the 50 newest rows because the list is sorted oldest-first.
kubectl get events --all-namespaces \
  --field-selector='type!=Normal' \
  --sort-by='.lastTimestamp' \
  -o custom-columns='NAMESPACE:.metadata.namespace,TYPE:.type,REASON:.reason,OBJECT:.involvedObject.name,LAST_SEEN_UTC:.lastTimestamp,MESSAGE:.message' \
  --no-headers 2>/dev/null \
  | awk -v cutoff="$events_cutoff" '$5 >= cutoff' \
  | tail -50 || true
echo ""
echo "=== NAMESPACE TO STACK MAPPING ==="
# Map Kubernetes namespaces to stack directories. A stack is every directory
# under $INFRA_DIR/stacks that holds a terragrunt.hcl. Its namespace is a
# heuristic guess: the first double-quoted [a-z0-9-]+ token on a line of the
# stack's main.tf that mentions "namespace". When nothing is found the stack
# name itself is used.
for tg in "$INFRA_DIR"/stacks/*/terragrunt.hcl; do
  # An unmatched glob stays literal; skip it rather than print "* → stacks/*".
  [[ -e "$tg" ]] || continue
  stack_dir=${tg%/*}
  stack_name=${stack_dir##*/}
  ns=""
  if [[ -f "$stack_dir/main.tf" ]]; then
    # POSIX awk instead of "grep -oP": BSD/macOS grep has no -P option.
    ns=$(awk '
      /namespace/ && match($0, /"[a-z0-9-]+"/) {
        print substr($0, RSTART + 1, RLENGTH - 2)
        exit
      }' "$stack_dir/main.tf" 2>/dev/null) || ns=""
  fi
  # Explicit fallback, so the result does not depend on pipefail being set.
  echo "${ns:-$stack_name} → stacks/$stack_name"
done 2>/dev/null | sort -u || true
echo ""
echo "=== SERVICE TIERS ==="
# Derive "service=tier" pairs from service-catalog.md. Lines outside tables
# act as section markers that select the current tier; table rows whose first
# cell is a lowercase identifier are services belonging to that tier.
# A missing catalog simply yields an empty section.
catalog="$INFRA_DIR/.claude/reference/service-catalog.md"
if [[ -f "$catalog" ]]; then
  current_tier=""
  # Row shape: "| name | ..." — only the first cell is captured.
  row_re='^\|[[:space:]]+([a-z0-9_-]+)[[:space:]]+\|'
  # "|| [[ -n $line ]]" also processes a final line that lacks a newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    if [[ "$line" != \|* ]]; then
      # Only non-table lines may switch the tier. Otherwise a word such as
      # "Admin" or "Optional" inside a row's description would silently
      # re-tier that row and every row after it.
      case "$line" in
        *"Tier: core"*) current_tier="core" ;;
        *"Tier: cluster"*) current_tier="cluster" ;;
        *"Admin"*) current_tier="admin" ;;
        *"Active Use"*) current_tier="active" ;;
        *"Optional"* | *"Inactive"*) current_tier="optional" ;;
      esac
      continue
    fi
    # Rows seen before any tier marker have no tier to report.
    [[ -n "$current_tier" && "$line" =~ $row_re ]] || continue
    svc="${BASH_REMATCH[1]}"
    # Separator rows such as "| --- |" satisfy the row pattern too.
    [[ "$svc" =~ ^-+$ ]] && continue
    echo "$svc=$current_tier"
  done < "$catalog"
fi
echo ""
# Close the report with the current time in UTC (YYYY-MM-DDTHH:MM:SSZ), the
# same layout as the timestamps printed by the sections above.
printf '%s\n' "=== CURRENT UTC TIME ==="
date -u +%Y-%m-%dT%TZ