post-mortem v2: pipeline team architecture with 4-stage agents [ci skip]
Split monolithic orchestrator into triage (haiku), historian (sonnet), and report-writer (opus) stages. Each stage gets its own tool budget. Added sev-context.sh for structured cluster context gathering.
This commit is contained in:
parent
66c70ce10f
commit
6efaed096d
5 changed files with 527 additions and 0 deletions
95
.claude/scripts/sev-context.sh
Executable file
95
.claude/scripts/sev-context.sh
Executable file
|
|
@ -0,0 +1,95 @@
|
|||
#!/usr/bin/env bash
|
||||
# sev-context.sh — Gather structured cluster context for post-mortem triage
|
||||
# Used by sev-triage agent and available to all pipeline stages
|
||||
set -euo pipefail
|
||||
|
||||
KUBECONFIG="${KUBECONFIG:-/Users/viktorbarzin/code/infra/config}"
|
||||
INFRA_DIR="${INFRA_DIR:-/Users/viktorbarzin/code/infra}"
|
||||
export KUBECONFIG
|
||||
|
||||
echo "=== NODE STATUS ==="
|
||||
kubectl get nodes -o custom-columns=\
|
||||
'NAME:.metadata.name,STATUS:.status.conditions[?(@.type=="Ready")].status,VERSION:.status.nodeInfo.kubeletVersion,CPU_CAP:.status.capacity.cpu,MEM_CAP:.status.capacity.memory' \
|
||||
--no-headers 2>/dev/null || echo "ERROR: Cannot reach cluster"
|
||||
|
||||
echo ""
|
||||
echo "=== UNHEALTHY PODS ==="
|
||||
# Pods not Running/Succeeded, with UTC start time instead of relative age
|
||||
kubectl get pods --all-namespaces \
|
||||
--field-selector='status.phase!=Running,status.phase!=Succeeded' \
|
||||
-o custom-columns=\
|
||||
'NAMESPACE:.metadata.namespace,POD:.metadata.name,STATUS:.status.phase,RESTARTS:.status.containerStatuses[0].restartCount,STARTED_UTC:.status.startTime,NODE:.spec.nodeName' \
|
||||
--no-headers 2>/dev/null || true
|
||||
|
||||
# Also show pods that are Running but have containers not ready or high restarts
|
||||
kubectl get pods --all-namespaces -o json 2>/dev/null | python3 -c "
|
||||
import json, sys
|
||||
try:
|
||||
data = json.load(sys.stdin)
|
||||
except:
|
||||
sys.exit(0)
|
||||
for pod in data.get('items', []):
|
||||
ns = pod['metadata']['namespace']
|
||||
name = pod['metadata']['name']
|
||||
node = pod['spec'].get('nodeName', 'N/A')
|
||||
start = pod['status'].get('startTime', 'N/A')
|
||||
phase = pod['status'].get('phase', 'Unknown')
|
||||
if phase != 'Running':
|
||||
continue
|
||||
for cs in pod['status'].get('containerStatuses', []):
|
||||
restarts = cs.get('restartCount', 0)
|
||||
ready = cs.get('ready', True)
|
||||
if restarts > 3 or not ready:
|
||||
reason = ''
|
||||
waiting = cs.get('state', {}).get('waiting', {})
|
||||
if waiting:
|
||||
reason = waiting.get('reason', '')
|
||||
print(f'{ns}\t{name}\t{phase}/NotReady\t{restarts}\t{start}\t{node}\t{reason}')
|
||||
break
|
||||
" 2>/dev/null || true
|
||||
|
||||
echo ""
|
||||
echo "=== RECENT EVENTS (last 2h, Warning/Error only) ==="
|
||||
kubectl get events --all-namespaces \
|
||||
--field-selector='type!=Normal' \
|
||||
--sort-by='.lastTimestamp' \
|
||||
-o custom-columns=\
|
||||
'NAMESPACE:.metadata.namespace,TYPE:.type,REASON:.reason,OBJECT:.involvedObject.name,LAST_SEEN_UTC:.lastTimestamp,MESSAGE:.message' \
|
||||
--no-headers 2>/dev/null | tail -50 || true
|
||||
|
||||
echo ""
|
||||
echo "=== NAMESPACE TO STACK MAPPING ==="
|
||||
# Parse terragrunt.hcl files to map k8s namespaces to stack directories
|
||||
for tg in "$INFRA_DIR"/stacks/*/terragrunt.hcl; do
|
||||
stack_dir=$(dirname "$tg")
|
||||
stack_name=$(basename "$stack_dir")
|
||||
# Try to find namespace from the stack - check main.tf for namespace references
|
||||
ns=$(grep -h 'namespace' "$stack_dir"/main.tf 2>/dev/null | grep -oP '"\K[a-z0-9-]+(?=")' | head -1 || echo "$stack_name")
|
||||
echo "$ns → stacks/$stack_name"
|
||||
done 2>/dev/null | sort -u || true
|
||||
|
||||
echo ""
|
||||
echo "=== SERVICE TIERS ==="
|
||||
# Parse service-catalog.md for tier classifications
|
||||
catalog="$INFRA_DIR/.claude/reference/service-catalog.md"
|
||||
if [ -f "$catalog" ]; then
|
||||
current_tier=""
|
||||
while IFS= read -r line; do
|
||||
case "$line" in
|
||||
*"Tier: core"*) current_tier="core" ;;
|
||||
*"Tier: cluster"*) current_tier="cluster" ;;
|
||||
*"Admin"*) current_tier="admin" ;;
|
||||
*"Active Use"*) current_tier="active" ;;
|
||||
*"Optional"*|*"Inactive"*) current_tier="optional" ;;
|
||||
esac
|
||||
if [[ "$line" =~ ^\|[[:space:]]+([a-z0-9_-]+)[[:space:]]+\| && "$current_tier" != "" ]]; then
|
||||
svc="${BASH_REMATCH[1]}"
|
||||
[[ "$svc" == "Service" || "$svc" == "---" ]] && continue
|
||||
echo "$svc=$current_tier"
|
||||
fi
|
||||
done < "$catalog"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "=== CURRENT UTC TIME ==="
|
||||
date -u '+%Y-%m-%dT%H:%M:%SZ'
|
||||
Loading…
Add table
Add a link
Reference in a new issue