infra/.claude/scripts/resource-report.sh
Viktor Barzin ff83ec3325 add infrastructure agent team: 8 specialized agents + 14 diagnostic scripts
Agents: devops-engineer, dba, security-engineer, sre, network-engineer,
platform-engineer, observability-engineer, home-automation-engineer.
Scripts: deploy-status, db-health, backup-verify, tls-check, crowdsec-status,
authentik-audit, oom-investigator, resource-report, dns-check, network-health,
nfs-health, truenas-status, platform-status, monitoring-health.
Also: known-issues.md suppression list, cluster-health-checker port-forward fix.
2026-03-15 02:01:07 +00:00

190 lines
5.6 KiB
Bash
Executable file

#!/usr/bin/env bash
#
# resource-report: emits a JSON report of cluster resource checks.
#
# Usage: resource-report.sh [--dry-run]
#   --dry-run   sets DRY_RUN=true; the checks consult this flag and then only
#               describe what they would do instead of calling kubectl.
#
# Environment:
#   INFRA_KUBECONFIG   optional path to the kubeconfig to use; defaults to the
#                      path this script has always used.
set -euo pipefail

# kubectl invocation shared by the checks. It is expanded unquoted
# ($KUBECTL ...) so that it word-splits into command + flags, which means the
# kubeconfig path must not contain whitespace.
KUBECTL="kubectl --kubeconfig ${INFRA_KUBECONFIG:-/Users/viktorbarzin/code/infra/config}"
DRY_RUN=false
AGENT="resource-report"

# Only --dry-run is recognised; any other argument is ignored.
for arg in "$@"; do
  case "$arg" in
    --dry-run) DRY_RUN=true ;;
  esac
done

# JSON array (kept as a string) that accumulates one object per check.
CHECKS="[]"
#######################################
# Appends one check result to the CHECKS JSON array.
# Globals:   CHECKS (read and rewritten)
# Arguments: $1 - check name
#            $2 - status string (the script uses "ok", "warn", "fail")
#            $3 - human-readable message
# Returns:   non-zero if python3 fails (e.g. CHECKS is not valid JSON)
#######################################
add_check() {
  local name="$1" status="$2" message="$3"
  # The values travel to Python as argv entries, never as source text, so
  # quotes, backslashes and newlines in a message cannot break the Python
  # program or alter the resulting JSON.
  CHECKS=$(printf '%s' "${CHECKS:-[]}" | python3 -c '
import json
import sys

checks = json.load(sys.stdin)
checks.append({"name": sys.argv[1], "status": sys.argv[2], "message": sys.argv[3]})
json.dump(checks, sys.stdout)
' "$name" "$status" "$message")
}
# Node capacity report: allocatable vs requests vs limits
#######################################
# Records, per node, the allocatable CPU/memory and the CPU/memory requests
# and limits of the Running/Pending pods assigned to it.
# Globals:   DRY_RUN (read), KUBECTL (read, intentionally word-split)
# Arguments: none
# Outputs:   one "node-capacity" entry via add_check; its status is "warn"
#            when either kubectl query (or the parsing of its output) fails
#######################################
check_node_capacity() {
  if $DRY_RUN; then
    add_check "node-capacity" "ok" "DRY RUN: would report node allocatable vs requests vs limits"
    return
  fi

  # Python helpers shared by both reports. The snippets are single-quoted
  # shell strings, so the Python inside them uses double quotes only.
  local py_common='
import json
import sys
from fractions import Fraction

# Kubernetes quantity suffixes. Two-letter binary suffixes come first so
# that "Mi" is not mistaken for a value ending in a one-letter suffix.
SUFFIXES = (
    ("Ki", 2**10), ("Mi", 2**20), ("Gi", 2**30),
    ("Ti", 2**40), ("Pi", 2**50), ("Ei", 2**60),
    ("n", Fraction(1, 10**9)), ("u", Fraction(1, 10**6)), ("m", Fraction(1, 1000)),
    ("k", 10**3), ("M", 10**6), ("G", 10**9),
    ("T", 10**12), ("P", 10**15), ("E", 10**18),
)

def parse_quantity(val):
    # Exact value of a quantity string in base units; empty/missing -> 0.
    if not val:
        return Fraction(0)
    text = str(val)
    for suffix, mult in SUFFIXES:
        if text.endswith(suffix):
            return Fraction(text[:-len(suffix)]) * mult
    return Fraction(text)

def parse_cpu(val):
    # CPU in millicores.
    return int(parse_quantity(val) * 1000)

def parse_mem(val):
    # Memory in bytes.
    return int(parse_quantity(val))

def fmt_mem(b):
    return "{:.1f}Gi".format(b / (1024**3))

def fmt_cpu(m):
    return "{}m".format(m)
'

  # One line per node with its allocatable CPU and memory.
  local py_nodes='
data = json.load(sys.stdin)
for node in data.get("items", []):
    alloc = node.get("status", {}).get("allocatable", {})
    print("{}: cpu_alloc={} mem_alloc={}".format(
        node["metadata"]["name"],
        fmt_cpu(parse_cpu(alloc.get("cpu", "0"))),
        fmt_mem(parse_mem(alloc.get("memory", "0"))),
    ))
'

  # One line per node with the effective requests/limits of its pods.
  local py_pods='
def effective(spec, kind, resource, parse):
    # Init containers run before the app containers rather than alongside
    # them, so a pod needs the larger of the two phases, not their sum.
    # Init containers with restartPolicy Always (sidecars) keep running and
    # therefore count towards every later phase.
    def amount(container):
        res = container.get("resources") or {}
        return parse((res.get(kind) or {}).get(resource, ""))

    app_total = sum(amount(c) for c in spec.get("containers") or [])
    sidecar_total = 0
    init_peak = 0
    for c in spec.get("initContainers") or []:
        if c.get("restartPolicy") == "Always":
            sidecar_total += amount(c)
            init_peak = max(init_peak, sidecar_total)
        else:
            init_peak = max(init_peak, sidecar_total + amount(c))
    return max(app_total + sidecar_total, init_peak)

data = json.load(sys.stdin)
per_node = {}
for pod in data.get("items", []):
    if pod.get("status", {}).get("phase", "") not in ("Running", "Pending"):
        continue
    spec = pod.get("spec", {})
    # Pods without a nodeName are grouped under "unscheduled".
    totals = per_node.setdefault(
        spec.get("nodeName", "unscheduled"),
        {"cpu_req": 0, "cpu_lim": 0, "mem_req": 0, "mem_lim": 0},
    )
    totals["cpu_req"] += effective(spec, "requests", "cpu", parse_cpu)
    totals["cpu_lim"] += effective(spec, "limits", "cpu", parse_cpu)
    totals["mem_req"] += effective(spec, "requests", "memory", parse_mem)
    totals["mem_lim"] += effective(spec, "limits", "memory", parse_mem)

for node in sorted(per_node):
    t = per_node[node]
    print("{}: cpu_req={} cpu_lim={} mem_req={} mem_lim={}".format(
        node,
        fmt_cpu(t["cpu_req"]), fmt_cpu(t["cpu_lim"]),
        fmt_mem(t["mem_req"]), fmt_mem(t["mem_lim"]),
    ))
'

  # A failed query must not be reported as healthy.
  local outcome="ok"

  local report
  report=$($KUBECTL get nodes -o json \
    | python3 -c "${py_common}${py_nodes}" 2>/dev/null) || {
    report="Failed to get node capacity"
    outcome="warn"
  }

  # Get requests/limits per node
  local usage
  usage=$($KUBECTL get pods --all-namespaces -o json \
    | python3 -c "${py_common}${py_pods}" 2>/dev/null) || {
    usage="Failed to get pod resource usage"
    outcome="warn"
  }

  add_check "node-capacity" "$outcome" "Allocatable: ${report} | Usage: ${usage}"
}
# Per-namespace ResourceQuota usage
#######################################
# Records used-vs-hard values for every ResourceQuota in the cluster.
# Globals:   DRY_RUN (read), KUBECTL (read, intentionally word-split)
# Arguments: none
# Outputs:   one "resource-quotas" entry via add_check; its status is "warn"
#            when the quotas cannot be read, "ok" otherwise
#######################################
check_resource_quotas() {
  if $DRY_RUN; then
    add_check "resource-quotas" "ok" "DRY RUN: would check ResourceQuota usage per namespace"
    return
  fi

  # A single kubectl call feeds the parser. If kubectl fails, python3 gets
  # no valid JSON and exits non-zero, so an unreachable cluster is reported
  # as a failure rather than as "no quotas defined".
  local quota_report
  quota_report=$($KUBECTL get resourcequota --all-namespaces -o json 2>/dev/null \
    | python3 -c '
import json
import sys

# Cap the message length; the number of omitted entries is appended.
MAX_ENTRIES = 30

items = json.load(sys.stdin).get("items", [])
if not items:
    print("No ResourceQuotas defined in the cluster")
    sys.exit(0)

results = []
for rq in items:
    ns = rq["metadata"]["namespace"]
    name = rq["metadata"]["name"]
    status = rq.get("status", {})
    hard = status.get("hard", {})
    used = status.get("used", {})
    for resource, limit in hard.items():
        results.append("{}/{}: {} used={} hard={}".format(
            ns, name, resource, used.get(resource, "0"), limit))

if not results:
    print("No quota usage data")
else:
    summary = "; ".join(results[:MAX_ENTRIES])
    if len(results) > MAX_ENTRIES:
        summary += " (+{} more)".format(len(results) - MAX_ENTRIES)
    print(summary)
' 2>/dev/null) || {
    add_check "resource-quotas" "warn" "Failed to read ResourceQuotas"
    return
  }

  add_check "resource-quotas" "ok" "$quota_report"
}
# Top pods by memory usage
#######################################
# Records the ten pods with the highest memory usage per `kubectl top`.
# Globals:   DRY_RUN (read), KUBECTL (read, intentionally word-split)
# Arguments: none
# Outputs:   one "top-consumers" entry via add_check; its status is "warn"
#            when kubectl top fails or returns nothing, "ok" otherwise
#######################################
check_top_consumers() {
  if $DRY_RUN; then
    add_check "top-consumers" "ok" "DRY RUN: would report top memory-consuming pods"
    return
  fi

  # Run kubectl on its own so that its failure is reported as a warning
  # instead of being folded into an "ok" message.
  local raw
  raw=$($KUBECTL top pods --all-namespaces --no-headers 2>/dev/null) || {
    add_check "top-consumers" "warn" "kubectl top failed — metrics-server may not be available"
    return
  }
  if [[ -z "$raw" ]]; then
    add_check "top-consumers" "warn" "kubectl top returned no data — metrics-server may not be running"
    return
  fi

  # Column 4 is the memory column; -h orders values such as 512Ki/100Mi/2Gi.
  # awk reads its whole input (unlike head), so sort is never cut off by a
  # closed pipe, and it joins the rows with a real "; " separator.
  local top_pods
  top_pods=$(printf '%s\n' "$raw" \
    | sort -k4 -h -r \
    | awk 'NF >= 4 && shown < 10 {
        sep = (shown++ ? "; " : "")
        out = out sep $1 "/" $2 ": cpu=" $3 " mem=" $4
      }
      END { print out }') || top_pods=""

  if [[ -z "$top_pods" ]]; then
    add_check "top-consumers" "warn" "kubectl top returned no data — metrics-server may not be running"
  else
    add_check "top-consumers" "ok" "Top 10 by memory: ${top_pods}"
  fi
}
# Execute each check in turn; every one appends its result to CHECKS.
check_node_capacity
check_resource_quotas
check_top_consumers

# Reduce the individual statuses to the most severe one present:
# "fail" outranks "warn", and "ok" is reported when neither occurs.
OVERALL=$(python3 -c '
import json
import sys

seen = {entry["status"] for entry in json.load(sys.stdin)}
print(next((level for level in ("fail", "warn") if level in seen), "ok"))
' <<<"$CHECKS")

# Assemble the final document and let json.tool pretty-print it.
printf '%s\n' "{\"status\": \"$OVERALL\", \"agent\": \"$AGENT\", \"checks\": $CHECKS}" \
  | python3 -m json.tool