infra/.claude/scripts/resource-report.sh

#!/usr/bin/env bash
# Strict mode: abort on command failure, unset variables and failed pipelines.
set -euo pipefail

# kubectl command line (binary plus kubeconfig flag), kept as a single string
# that callers expand unquoted so it word-splits into separate arguments.
KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config"
# Identifier placed in the "agent" field of the final JSON document.
AGENT="resource-report"
# Flipped to true by --dry-run; checks then skip every cluster call.
DRY_RUN=false

# --dry-run is the only recognised flag; any other argument is ignored.
for arg; do
  if [[ "$arg" == "--dry-run" ]]; then
    DRY_RUN=true
  fi
done

# JSON array of check results; add_check appends one object per check.
CHECKS="[]"

#######################################
# Append one check result to the CHECKS JSON array.
# Globals:   CHECKS (read and rewritten)
# Arguments: $1 - check name
#            $2 - status (the script uses "ok", "warn" and "fail")
#            $3 - free-form message; may contain quotes, backslashes, newlines
# Returns:   non-zero if python3 fails or CHECKS is not valid JSON
#######################################
add_check() {
  local name="$1" status="$2" message="$3"
  # The values travel as argv entries rather than being pasted into the Python
  # source, so no character in them can break or alter the program. The Python
  # code is single-quoted so the shell leaves it untouched.
  CHECKS=$(printf '%s' "$CHECKS" | python3 -c '
import json
import sys

checks = json.load(sys.stdin)
checks.append({"name": sys.argv[1], "status": sys.argv[2], "message": sys.argv[3]})
json.dump(checks, sys.stdout)
' "$name" "$status" "$message")
}

#######################################
# Node capacity report: allocatable CPU/memory per node, plus the CPU/memory
# requests and limits of Running and Pending pods grouped by node.
# Globals:   DRY_RUN (read), KUBECTL (read), CHECKS (written via add_check)
# Arguments: none
# Outputs:   records one "node-capacity" check through add_check; the status
#            is "warn" when either kubectl query or its parsing fails
#######################################
check_node_capacity() {
  if [[ "$DRY_RUN" == true ]]; then
    add_check "node-capacity" "ok" "DRY RUN: would report node allocatable vs requests vs limits"
    return
  fi

  # Python helpers shared by both queries. The snippets are single-quoted so
  # the shell never interpolates them; the Python code uses double quotes only.
  local py_helpers='
import json
import sys
from decimal import Decimal

# Kubernetes quantity suffixes: binary (power of two) and decimal SI.
BINARY = {"Ki": 2**10, "Mi": 2**20, "Gi": 2**30, "Ti": 2**40, "Pi": 2**50, "Ei": 2**60}
DECIMAL = {"n": "1e-9", "u": "1e-6", "m": "1e-3", "k": "1e3", "M": "1e6",
           "G": "1e9", "T": "1e12", "P": "1e15", "E": "1e18"}

def parse_quantity(val):
  # Returns the quantity in base units (cores or bytes) as an exact Decimal.
  if not val:
    return Decimal(0)
  val = str(val).strip()
  if val[-2:] in BINARY:
    return Decimal(val[:-2]) * BINARY[val[-2:]]
  if val[-1:] in DECIMAL:
    return Decimal(val[:-1]) * Decimal(DECIMAL[val[-1:]])
  # Plain or scientific notation such as "2", "0.5" or "1e3".
  return Decimal(val)

def parse_cpu(val):
  # Millicores, truncated to an integer.
  return int(parse_quantity(val) * 1000)

def parse_mem(val):
  # Bytes, truncated to an integer.
  return int(parse_quantity(val))

def fmt_mem(b):
  return f"{b / (1024**3):.1f}Gi"

def fmt_cpu(m):
  return f"{m}m"
'

  # One output line per node with its allocatable CPU and memory.
  local py_nodes='
data = json.load(sys.stdin)
for node in data.get("items", []):
  name = node["metadata"]["name"]
  alloc = node.get("status", {}).get("allocatable", {})
  cpu = fmt_cpu(parse_cpu(alloc.get("cpu", "0")))
  mem = fmt_mem(parse_mem(alloc.get("memory", "0")))
  print(f"{name}: cpu_alloc={cpu} mem_alloc={mem}")
'

  # One output line per node with the summed effective pod requests/limits.
  # Per pod the effective value is max(sum over app containers, largest init
  # container), because init containers run one at a time before the app
  # containers start. Init containers with restartPolicy "Always" (sidecars)
  # keep running, so they are added to the app sum; this is an approximation
  # of the scheduler formula and pod overhead is not included.
  local py_pods='
KEYS = (
  ("cpu_req", "requests", "cpu", parse_cpu),
  ("cpu_lim", "limits", "cpu", parse_cpu),
  ("mem_req", "requests", "memory", parse_mem),
  ("mem_lim", "limits", "memory", parse_mem),
)

def amounts(container):
  res = container.get("resources") or {}
  return {key: parse((res.get(section) or {}).get(resource, ""))
          for key, section, resource, parse in KEYS}

def zeros():
  return {key: 0 for key, _, _, _ in KEYS}

data = json.load(sys.stdin)
per_node = {}
for pod in data.get("items", []):
  if pod.get("status", {}).get("phase", "") not in ("Running", "Pending"):
    continue
  spec = pod.get("spec", {})
  # Pods without a nodeName are grouped under "unscheduled".
  totals = per_node.setdefault(spec.get("nodeName", "unscheduled"), zeros())
  running = zeros()
  init_peak = zeros()
  for c in spec.get("containers", []):
    for key, value in amounts(c).items():
      running[key] += value
  for c in spec.get("initContainers", []):
    sidecar = c.get("restartPolicy") == "Always"
    for key, value in amounts(c).items():
      if sidecar:
        running[key] += value
      else:
        init_peak[key] = max(init_peak[key], value)
  for key in totals:
    totals[key] += max(running[key], init_peak[key])

for node in sorted(per_node):
  n = per_node[node]
  print("{}: cpu_req={} cpu_lim={} mem_req={} mem_lim={}".format(
    node, fmt_cpu(n["cpu_req"]), fmt_cpu(n["cpu_lim"]),
    fmt_mem(n["mem_req"]), fmt_mem(n["mem_lim"])))
'

  # A failed query degrades the check to "warn" instead of claiming "ok".
  local status="ok" report usage

  # shellcheck disable=SC2086 # KUBECTL is a command string, split on purpose
  if ! report=$($KUBECTL get nodes -o json \
    | python3 -c "${py_helpers}${py_nodes}" 2>/dev/null); then
    report="Failed to get node capacity"
    status="warn"
  fi

  # shellcheck disable=SC2086 # KUBECTL is a command string, split on purpose
  if ! usage=$($KUBECTL get pods --all-namespaces -o json \
    | python3 -c "${py_helpers}${py_pods}" 2>/dev/null); then
    usage="Failed to get pod resource usage"
    status="warn"
  fi

  add_check "node-capacity" "$status" "Allocatable: ${report} | Usage: ${usage}"
}

#######################################
# Per-namespace ResourceQuota usage: lists "used" against "hard" for every
# resource of every ResourceQuota, capped at 30 entries.
# Globals:   DRY_RUN (read), KUBECTL (read), CHECKS (written via add_check)
# Arguments: none
# Outputs:   records one "resource-quotas" check through add_check; the status
#            is "warn" when kubectl or the JSON parsing fails
#######################################
check_resource_quotas() {
  if [[ "$DRY_RUN" == true ]]; then
    add_check "resource-quotas" "ok" "DRY RUN: would check ResourceQuota usage per namespace"
    return
  fi

  # A single JSON query covers both "are there any quotas" and "what is their
  # usage"; a kubectl failure is reported instead of being read as "none".
  local quota_json
  # shellcheck disable=SC2086 # KUBECTL is a command string, split on purpose
  if ! quota_json=$($KUBECTL get resourcequota --all-namespaces -o json 2>/dev/null); then
    add_check "resource-quotas" "warn" "Failed to read ResourceQuotas"
    return
  fi

  # The Python program is single-quoted so the shell never interpolates it.
  local quota_report
  if ! quota_report=$(printf '%s' "$quota_json" | python3 -c '
import json
import sys

MAX_ENTRIES = 30

items = json.load(sys.stdin).get("items", [])
if not items:
  print("No ResourceQuotas defined in the cluster")
  sys.exit(0)

results = []
for rq in items:
  ns = rq["metadata"]["namespace"]
  name = rq["metadata"]["name"]
  status = rq.get("status", {})
  used = status.get("used", {})
  for resource, hard in status.get("hard", {}).items():
    u = used.get(resource, "0")
    results.append(f"{ns}/{name}: {resource} used={u} hard={hard}")

if not results:
  print("No quota usage data")
else:
  line = "; ".join(results[:MAX_ENTRIES])
  omitted = len(results) - MAX_ENTRIES
  if omitted > 0:
    # Make the truncation visible instead of dropping entries silently.
    line += f" (+{omitted} more)"
  print(line)
' 2>/dev/null); then
    add_check "resource-quotas" "warn" "Failed to read ResourceQuotas"
    return
  fi

  add_check "resource-quotas" "ok" "$quota_report"
}

#######################################
# Top pods by memory usage: the ten largest rows of `kubectl top pods`,
# ordered by the memory column.
# Globals:   DRY_RUN (read), KUBECTL (read), CHECKS (written via add_check)
# Arguments: none
# Outputs:   records one "top-consumers" check through add_check; the status
#            is "warn" when kubectl top fails or returns nothing
#######################################
check_top_consumers() {
  if [[ "$DRY_RUN" == true ]]; then
    add_check "top-consumers" "ok" "DRY RUN: would report top memory-consuming pods"
    return
  fi

  # Capture kubectl on its own so that its failure is detected directly and
  # cannot be confused with a failure further down the pipeline.
  local raw
  # shellcheck disable=SC2086 # KUBECTL is a command string, split on purpose
  if ! raw=$($KUBECTL top pods --all-namespaces --no-headers 2>/dev/null); then
    add_check "top-consumers" "warn" "kubectl top failed — metrics-server may not be available"
    return
  fi

  if [[ -z "$raw" ]]; then
    add_check "top-consumers" "warn" "kubectl top returned no data — metrics-server may not be running"
    return
  fi

  # awk keeps reading after the tenth row (unlike head), so sort is never
  # killed by SIGPIPE, which pipefail would turn into a failure. awk also
  # joins the rows with "; " and leaves no trailing separator.
  local top_pods
  if ! top_pods=$(printf '%s\n' "$raw" \
    | sort -k4 -h -r \
    | awk 'NR <= 10 { printf "%s%s/%s: cpu=%s mem=%s", (NR > 1 ? "; " : ""), $1, $2, $3, $4 }'); then
    add_check "top-consumers" "warn" "Failed to rank kubectl top output"
    return
  fi

  add_check "top-consumers" "ok" "Top 10 by memory: ${top_pods}"
}

# Execute every check; each one appends its result to CHECKS
check_node_capacity
check_resource_quotas
check_top_consumers

# Roll the individual statuses up into one: fail outranks warn, warn outranks ok
OVERALL=$(python3 -c "
import json, sys
seen = {entry['status'] for entry in json.load(sys.stdin)}
print(next((level for level in ('fail', 'warn') if level in seen), 'ok'))
" <<<"$CHECKS")

# Assemble the final document and let json.tool pretty-print it
printf '{"status": "%s", "agent": "%s", "checks": %s}\n' "$OVERALL" "$AGENT" "$CHECKS" \
  | python3 -m json.tool