fix: restore tree dropped by 6d224861; land stem95su gdrive-sync (10m) [ci skip]

6d224861 came from a --no-checkout worktree whose empty index made the
commit drop every file except two. This restores 05b50d2b's full tree and
correctly adds stacks/stem95su/gdrive-sync.tf + the service-catalog stem95su
entry. Forward-only (parent=6d224861, no force-push); [ci skip] since the
live infra was never applied from the broken commit.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-06-09 08:45:33 +00:00
parent 6d224861c4
commit fd0f4a0365
1166 changed files with 358546 additions and 0 deletions

View file

@ -0,0 +1,134 @@
#!/usr/bin/env bash
# Authentik audit agent: script-wide settings and command-line handling.
set -euo pipefail

# kubectl command line; expanded unquoted by callers so it splits into words.
KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config"
# Agent name reported in the final JSON document.
AGENT="authentik-audit"
# Namespace that every kubectl query in this script targets.
NAMESPACE="authentik"
# When true, checks report what they would do instead of calling kubectl.
DRY_RUN=false

# --dry-run is the only recognised flag; any other argument is ignored.
for arg; do
  if [[ "$arg" == "--dry-run" ]]; then
    DRY_RUN=true
  fi
done
# Accumulated check results; each element is one serialized JSON object.
checks=()

#######################################
# Escape a string so it can be embedded in a JSON string literal.
# Backslash, double quote, newline, CR and tab get their JSON escapes;
# any other control character is replaced by a space.
# Arguments: $1 - raw text
# Outputs:   escaped text on stdout (no trailing newline)
#######################################
_json_escape() {
  local s=$1
  s=${s//\\/\\\\}   # must run first so later escapes are not doubled
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  s=${s//[[:cntrl:]]/ }
  printf '%s' "$s"
}

#######################################
# Record the outcome of one check.
# Globals:   checks (appended to)
# Arguments: $1 - check name
#            $2 - status (callers in this file pass ok, warn or fail)
#            $3 - human-readable message
#######################################
add_check() {
  local name status message
  name=$(_json_escape "$1")
  status=$(_json_escape "$2")
  message=$(_json_escape "$3")
  checks+=("{\"name\": \"$name\", \"status\": \"$status\", \"message\": \"$message\"}")
}
#######################################
# Print the name of an Authentik server pod (empty line if none is found).
# Globals: KUBECTL, NAMESPACE (read)
# Outputs: pod name on stdout
#######################################
find_authentik_pod() {
  local server_pod
  # Preferred lookup: first pod carrying the authentik server labels.
  if ! server_pod=$($KUBECTL get pods -n "$NAMESPACE" -l app.kubernetes.io/name=authentik,app.kubernetes.io/component=server -o jsonpath='{.items[0].metadata.name}' 2>/dev/null); then
    # Label lookup failed: fall back to the first Running pod whose line
    # matches one of the known server name patterns.
    server_pod=$($KUBECTL get pods -n "$NAMESPACE" --no-headers 2>/dev/null | grep -i "goauthentik-server\|authentik-server" | grep "Running" | head -1 | awk '{print $1}') || true
  fi
  echo "$server_pod"
}
#######################################
# Report whether every Authentik pod in the namespace is Running/Completed.
# Globals:   DRY_RUN, KUBECTL, NAMESPACE (read)
# Records:   "authentik-server" via add_check
#            fail - no pod line mentions "authentik"
#            warn - at least one pod is neither Running nor Completed
#            ok   - otherwise
#######################################
check_server_health() {
  if $DRY_RUN; then
    add_check "authentik-server" "ok" "dry-run: would check goauthentik-server pod health"
    return
  fi

  local pods
  pods=$($KUBECTL get pods -n "$NAMESPACE" --no-headers 2>/dev/null | grep -i "authentik") || {
    add_check "authentik-server" "fail" "No Authentik pods found in namespace ${NAMESPACE}"
    return
  }

  # Count with awk: it always prints exactly one integer and exits 0.
  # `grep -c ... || echo 0` printed "0" twice when nothing matched, which
  # made the numeric test below error out on every healthy run.
  local total not_running
  total=$(awk 'length($0) > 0 { n++ } END { print n + 0 }' <<<"$pods")
  not_running=$(awk 'length($0) > 0 && !/Running/ && !/Completed/ { n++ } END { print n + 0 }' <<<"$pods")

  if [ "$not_running" -gt 0 ]; then
    add_check "authentik-server" "warn" "${not_running}/${total} Authentik pod(s) not running"
  else
    add_check "authentik-server" "ok" "All ${total} Authentik pod(s) running"
  fi
}
#######################################
# Report whether every Authentik outpost pod is Running.
# Globals:   DRY_RUN, KUBECTL, NAMESPACE (read)
# Records:   "authentik-outposts" via add_check
#            warn - no outpost pod found, or some pod is not Running
#            ok   - all outpost pods are Running
#######################################
check_outposts() {
  if $DRY_RUN; then
    add_check "authentik-outposts" "ok" "dry-run: would check Authentik outpost pods"
    return
  fi

  local outpost_pods
  outpost_pods=$($KUBECTL get pods -n "$NAMESPACE" -l app.kubernetes.io/managed-by=goauthentik.io --no-headers 2>/dev/null) || outpost_pods=""
  # Fall back to a name match when the label lookup failed OR returned
  # nothing; an empty-but-successful lookup used to skip the fallback.
  if [ -z "$outpost_pods" ]; then
    outpost_pods=$($KUBECTL get pods -n "$NAMESPACE" --no-headers 2>/dev/null | grep -i "outpost" || true)
  fi
  if [ -z "$outpost_pods" ]; then
    add_check "authentik-outposts" "warn" "No outpost pods found"
    return
  fi

  # awk prints exactly one integer; `grep -c ... || echo 0` printed two
  # lines when the count was zero and broke the numeric comparison.
  local total not_running
  total=$(awk 'length($0) > 0 { n++ } END { print n + 0 }' <<<"$outpost_pods")
  not_running=$(awk 'length($0) > 0 && !/Running/ { n++ } END { print n + 0 }' <<<"$outpost_pods")

  if [ "$not_running" -gt 0 ]; then
    add_check "authentik-outposts" "warn" "${not_running}/${total} outpost pod(s) not running"
  else
    add_check "authentik-outposts" "ok" "All ${total} outpost pod(s) running"
  fi
}
#######################################
# Query the Authentik server pod for user counts and record the result.
# Globals:   DRY_RUN, KUBECTL, NAMESPACE (read)
# Records:   "authentik-users" via add_check
#            warn - no server pod, or both query methods failed
#            ok   - counts parsed, or raw line count of the output
#######################################
check_user_count() {
  if $DRY_RUN; then
    add_check "authentik-users" "ok" "dry-run: would check user count via ak CLI"
    return
  fi

  local server_pod
  server_pod=$(find_authentik_pod)
  if [ -z "$server_pod" ]; then
    add_check "authentik-users" "warn" "No Authentik server pod found to query users"
    return
  fi

  # First try the ak CLI; if that fails, run a Django snippet in the pod
  # that prints "total=N active=M".
  local raw
  if ! raw=$($KUBECTL exec -n "$NAMESPACE" "$server_pod" -- ak user list 2>/dev/null); then
    if ! raw=$($KUBECTL exec -n "$NAMESPACE" "$server_pod" -- python -c "
import django; django.setup()
from authentik.core.models import User
print(f'total={User.objects.count()} active={User.objects.filter(is_active=True).count()}')
" 2>/dev/null); then
      add_check "authentik-users" "warn" "Could not query user count from Authentik"
      return
    fi
  fi

  local total_users
  if echo "$raw" | grep -q "total="; then
    # Output is in the "total=N active=M" form: pull both numbers out.
    local active_users
    total_users=$(echo "$raw" | grep "total=" | sed 's/.*total=\([0-9]*\).*/\1/')
    active_users=$(echo "$raw" | grep "active=" | sed 's/.*active=\([0-9]*\).*/\1/')
    add_check "authentik-users" "ok" "${total_users} total users, ${active_users} active"
    return
  fi

  # Unstructured output: report how many lines came back.
  total_users=$(echo "$raw" | wc -l | tr -d ' ')
  add_check "authentik-users" "ok" "User query returned ${total_users} lines of output"
}
# Run every check; each one appends its result to the global `checks` array.
check_server_health
check_outposts
check_user_count

# Derive the overall status: any "fail" wins immediately, otherwise any
# "warn" downgrades the default "ok".
overall="ok"
for c in "${checks[@]}"; do
  s=$(jq -r '.status' <<<"$c")
  case "$s" in
    fail)
      overall="fail"
      break
      ;;
    warn)
      overall="warn"
      ;;
  esac
done

# Emit the report as a single JSON line; checks are joined with commas.
printf '{"status": "%s", "agent": "%s", "checks": [%s]}\n' \
  "$overall" "$AGENT" "$(IFS=,; echo "${checks[*]}")"

View file

@ -0,0 +1,180 @@
#!/usr/bin/env bash
set -euo pipefail
# authentik-invite.sh - manage Authentik invitations and group membership.
#
# Subcommands:
#   create "Group Name"              single-use invitation, never expires
#   create "Group Name" --days 7     single-use invitation, expires in 7 days
#   assign <username> "Group Name"   add an existing user to a group
#   list                             show pending invitations

# Base URL of the Authentik v3 API and the slug of the enrollment flow that
# invitation links point at.
API="https://authentik.viktorbarzin.me/api/v3"
FLOW_SLUG="invitation-enrollment"

# Directory holding this script, and the directory two levels above it
# (where terraform.tfvars is read from).
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
INFRA_DIR=$(cd "${SCRIPT_DIR}/../.." && pwd)
#######################################
# Print the Authentik API token stored in terraform.tfvars.
# Only the first line that actually assigns `authentik_api_token` is used,
# so comments and similarly named keys can no longer leak extra lines into
# the Authorization header.
# Globals: INFRA_DIR (read)
# Outputs: token on stdout; diagnostic on stderr when it cannot be found
# Returns: 0 on success, 1 when no non-empty token is present
#######################################
get_token() {
  local tfvars="$INFRA_DIR/terraform.tfvars"
  local token
  # Fields are split on double quotes, so $2 is the quoted value.
  if ! token=$(awk -F'"' '
    /^[[:space:]]*authentik_api_token[[:space:]]*=/ { print $2; found = 1; exit }
    END { exit !found }
  ' "$tfvars"); then
    echo "authentik_api_token not found in $tfvars" >&2
    return 1
  fi
  if [[ -z "$token" ]]; then
    echo "authentik_api_token is empty in $tfvars" >&2
    return 1
  fi
  printf '%s\n' "$token"
}
#######################################
# GET an API path and print the response body.
# Globals:   API (read)
# Arguments: $1 - path relative to $API (may include a query string)
# Returns:   curl's exit status (-f makes HTTP errors non-zero)
#######################################
api_get() {
  local endpoint=$1
  local -a curl_args=(-sf -H "Authorization: Bearer $(get_token)")
  curl "${curl_args[@]}" "$API/$endpoint"
}
#######################################
# POST a JSON body to an API path and print the response body.
# Globals:   API (read)
# Arguments: $1 - path relative to $API
#            $2 - request body (JSON text)
# Returns:   curl's exit status (-f makes HTTP errors non-zero)
#######################################
api_post() {
  local endpoint=$1 body=$2
  local -a curl_args=(
    -sf -X POST
    -H "Authorization: Bearer $(get_token)"
    -H "Content-Type: application/json"
  )
  curl "${curl_args[@]}" "$API/$endpoint" -d "$body"
}
#######################################
# PATCH an API path with a JSON body and print the response body.
# Globals:   API (read)
# Arguments: $1 - path relative to $API
#            $2 - request body (JSON text)
# Returns:   curl's exit status (-f makes HTTP errors non-zero)
#######################################
api_patch() {
  local endpoint=$1 body=$2
  local -a curl_args=(
    -sf -X PATCH
    -H "Authorization: Bearer $(get_token)"
    -H "Content-Type: application/json"
  )
  curl "${curl_args[@]}" "$API/$endpoint" -d "$body"
}
#######################################
# Create a single-use invitation that pre-assigns a group, and print the
# enrollment link.
# Globals:   FLOW_SLUG (read)
# Arguments: $1 - group name (stored in the invitation's fixed_data)
#            --days N - optional; invitation expires N days from now
# Outputs:   human-readable summary and the invite URL on stdout
# Returns:   exits 1 on an unknown option or an invalid --days value
#######################################
cmd_create() {
  local group_name="${1:?Usage: create <group-name> [--days N]}"
  local days=""
  shift
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --days)
        days="${2:?--days requires a value}"
        shift 2
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done
  if [[ -n "$days" && ! "$days" =~ ^[0-9]+$ ]]; then
    echo "--days must be a non-negative integer, got: $days" >&2
    exit 1
  fi

  # Resolve the enrollment flow's primary key from its slug.
  local flow_pk
  flow_pk=$(api_get "flows/instances/$FLOW_SLUG/" \
    | python3 -c 'import json, sys; print(json.load(sys.stdin)["pk"])') || return 1

  # Build the payload in Python with values passed via argv. They were
  # previously spliced into the Python source, so a group name containing
  # a single quote produced a syntax error (or ran as code).
  local payload
  payload=$(python3 -c '
import json, re, sys
from datetime import datetime, timedelta, timezone

group, flow, days = sys.argv[1:4]
now = datetime.now(timezone.utc)
slug = re.sub(r"[^a-z0-9-]", "-", group.lower()).strip("-")
data = {
    "name": "invite-" + slug + "-" + now.strftime("%Y%m%d-%H%M"),
    "single_use": True,
    "fixed_data": {"group": group},
    "flow": flow,
}
if days:
    data["expires"] = (now + timedelta(days=int(days))).isoformat()
print(json.dumps(data))
' "$group_name" "$flow_pk" "$days") || return 1

  local result
  result=$(api_post "stages/invitation/invitations/" "$payload") || return 1

  # The invitation's pk is the token used in the enrollment URL.
  local token
  token=$(echo "$result" | python3 -c 'import json, sys; print(json.load(sys.stdin)["pk"])') || return 1

  echo ""
  echo "Invitation created for group: $group_name"
  if [[ -n "$days" ]]; then
    echo "Expires in: $days days"
  else
    echo "Expires: never"
  fi
  echo "Single-use: yes"
  echo ""
  echo "Share this link:"
  echo " https://authentik.viktorbarzin.me/if/flow/$FLOW_SLUG/?itoken=$token"
  echo ""
}
#######################################
# Add an existing user to an existing group.
# Arguments: $1 - username
#            $2 - exact group name
# Outputs:   confirmation line on stdout; NOT_FOUND / AMBIGUOUS on stderr
# Returns:   non-zero when the user or group cannot be resolved
#######################################
cmd_assign() {
  local username="${1:?Usage: assign <username> <group-name>}"
  local group_name="${2:?Usage: assign <username> <group-name>}"

  # URL-encode both search terms; values reach Python via argv rather than
  # being spliced into its source, so quotes in names are harmless.
  local user_query group_query
  user_query=$(python3 -c 'import sys, urllib.parse; print(urllib.parse.quote(sys.argv[1]))' "$username") || return 1
  group_query=$(python3 -c 'import sys, urllib.parse; print(urllib.parse.quote(sys.argv[1]))' "$group_name") || return 1

  # Resolve the user. `search` is a fuzzy match, so prefer the result whose
  # username equals the request; accept a lone non-exact hit; refuse to
  # guess between several candidates.
  local user_pk
  user_pk=$(api_get "core/users/?search=$user_query" | python3 -c '
import json, sys

wanted = sys.argv[1]
results = json.load(sys.stdin)["results"]
exact = [u for u in results if u.get("username") == wanted]
if exact:
    chosen = exact[0]
elif len(results) == 1:
    chosen = results[0]
else:
    print("AMBIGUOUS" if results else "NOT_FOUND", file=sys.stderr)
    sys.exit(1)
print(chosen["pk"])
' "$username") || return 1

  # Resolve the group by exact name and capture its current member list.
  local group_data
  group_data=$(api_get "core/groups/?search=$group_query" | python3 -c '
import json, sys

wanted = sys.argv[1]
matches = [g for g in json.load(sys.stdin)["results"] if g["name"] == wanted]
if not matches:
    print("NOT_FOUND", file=sys.stderr)
    sys.exit(1)
g = matches[0]
print(json.dumps({"pk": g["pk"], "users": g.get("users", [])}))
' "$group_name") || return 1

  local group_pk
  group_pk=$(echo "$group_data" | python3 -c 'import json, sys; print(json.load(sys.stdin)["pk"])') || return 1

  # Append the user to the member list unless already present.
  local updated_users
  updated_users=$(echo "$group_data" | python3 -c '
import json, sys

users = json.load(sys.stdin)["users"]
uid = int(sys.argv[1])
if uid not in users:
    users.append(uid)
print(json.dumps(users))
' "$user_pk") || return 1

  api_patch "core/groups/$group_pk/" "{\"users\": $updated_users}" > /dev/null
  echo "Added $username (pk=$user_pk) to group '$group_name'"
}
#######################################
# Print a table of invitations returned by the API (first 50), followed by
# the total count reported in the response's pagination block.
# Outputs: formatted table on stdout, or "No pending invitations."
#######################################
cmd_list() {
  api_get "stages/invitation/invitations/?page_size=50" | python3 -c '
import json, sys

data = json.load(sys.stdin)
if not data["results"]:
    print("No pending invitations.")
    sys.exit(0)

# One left-aligned column per field; the last column is unpadded.
row = "{:<40} {:<50} {:<12} {:<25} {}"
print(row.format("Token (itoken)", "Name", "Single-Use", "Expires", "Group"))
print("-" * 160)
for inv in data["results"]:
    token = inv["pk"]
    name = inv.get("name", "")
    single = "yes" if inv.get("single_use") else "no"
    expires = inv.get("expires") or "never"
    if expires != "never":
        # Keep only the first 19 characters of the timestamp string.
        expires = expires[:19]
    group = inv.get("fixed_data", {}).get("group", "—")
    print(row.format(token, name, single, expires, group))
print("\nTotal: {}".format(data["pagination"]["count"]))
'
}
# Dispatch on the first argument; anything unrecognised (or no argument at
# all) prints the usage text.
if [[ "${1:-help}" == "create" ]]; then
  shift
  cmd_create "$@"
elif [[ "${1:-help}" == "assign" ]]; then
  shift
  cmd_assign "$@"
elif [[ "${1:-help}" == "list" ]]; then
  cmd_list
else
  printf '%s\n' \
    "Authentik Invitation Manager" \
    "" \
    "Usage:" \
    " $0 create <group-name> [--days N] Create single-use invite link" \
    " $0 assign <username> <group-name> Add user to group" \
    " $0 list Show pending invitations"
fi

566
.claude/scripts/backup-verify.sh Executable file
View file

@ -0,0 +1,566 @@
#!/usr/bin/env bash
# backup-verify.sh - inspect the health of the full 3-2-1 backup chain.
# Covers: LVM snapshots, weekly backup, PVC file copies, pfsense, NFS mirror,
#         offsite sync, DB CronJobs, CNPG backups.
# Usage:  backup-verify.sh [--fix] [--dry-run]
set -euo pipefail

# Command prefixes; both are expanded unquoted by callers so that they split
# into a command plus its options.
# NOTE(review): this kubeconfig path lacks the "infra/" component used by the
# other agent scripts in this commit - confirm which location is correct.
KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/config"
PVE_SSH="ssh -o ConnectTimeout=5 -o BatchMode=yes root@192.168.1.127"

# Agent name reported in the final JSON document.
AGENT="backup-verify"
# --dry-run: every check records "DRY RUN" instead of probing anything.
DRY_RUN=false
# --fix: checks that know a remedy attempt it and record the outcome.
FIX=false

# Unrecognised arguments are ignored.
for arg; do
  if [[ "$arg" == "--dry-run" ]]; then
    DRY_RUN=true
  elif [[ "$arg" == "--fix" ]]; then
    FIX=true
  fi
done
# JSON array (as text) holding every recorded check result.
CHECKS="[]"
# Set to false by check_pve_connectivity when the PVE host cannot be reached.
PVE_REACHABLE=true

#######################################
# Append one result object to the CHECKS JSON array.
# The values are handed to Python as argv. They were previously spliced into
# the Python source inside '''...''', so a message containing ''' or ending
# in a quote was a syntax error and backslash sequences were reinterpreted.
# Globals:   CHECKS (read and rewritten)
# Arguments: $1 - check name
#            $2 - status (callers in this file pass ok, warn or fail)
#            $3 - human-readable message
#######################################
add_check() {
  local name="$1" status="$2" message="$3"
  CHECKS=$(printf '%s' "$CHECKS" | python3 -c '
import json
import sys

checks = json.load(sys.stdin)
checks.append({"name": sys.argv[1], "status": sys.argv[2], "message": sys.argv[3]})
json.dump(checks, sys.stdout)
' "$name" "$status" "$message")
}
#######################################
# Probe the PVE host over SSH once, up front. The Layer 1-3 checks consult
# PVE_REACHABLE and short-circuit to "PVE unreachable" when it is false.
# Globals:   DRY_RUN, PVE_SSH (read); PVE_REACHABLE (set to false on failure)
# Records:   "pve-connectivity" fail via add_check, only when unreachable
#######################################
check_pve_connectivity() {
  if $DRY_RUN; then
    return
  fi
  # Run a no-op remotely; success leaves PVE_REACHABLE untouched.
  if $PVE_SSH "true" 2>/dev/null; then
    return
  fi
  PVE_REACHABLE=false
  add_check "pve-connectivity" "fail" "PVE host (192.168.1.127) unreachable via SSH"
}
# ============================================================
# LAYER 1: LVM Thin Snapshots
# ============================================================
#######################################
# Compare the age of the last LVM snapshot run against 36h / 48h limits.
# The timestamp is read from the Pushgateway metric
# lvm_snapshot_last_run_timestamp, fetched through the PVE host.
# Globals:   DRY_RUN, PVE_REACHABLE, PVE_SSH (read)
# Records:   "lvm-snapshot-freshness" via add_check
#            ok   - younger than 36h
#            warn - between 36h and 48h
#            fail - older, metric missing/unparseable, or PVE unreachable
#######################################
check_lvm_snapshot_freshness() {
  if $DRY_RUN; then add_check "lvm-snapshot-freshness" "ok" "DRY RUN"; return; fi
  if ! $PVE_REACHABLE; then add_check "lvm-snapshot-freshness" "fail" "PVE unreachable"; return; fi

  local ts
  ts=$($PVE_SSH "curl -s http://10.0.20.100:30091/metrics 2>/dev/null | grep '^lvm_snapshot_last_run_timestamp' | head -1 | awk '{print \$2}'" 2>/dev/null) || true
  if [[ -z "$ts" ]]; then
    add_check "lvm-snapshot-freshness" "fail" "No Pushgateway metric found — snapshots may have never run"
    return
  fi

  # Only a plain or scientific-notation number may reach the arithmetic.
  # The value used to be pasted into Python source, where anything else
  # raised an error and aborted the whole script under `set -e`.
  local number_re='^[0-9]+([.][0-9]+)?([eE][+-]?[0-9]+)?$'
  if [[ ! "$ts" =~ $number_re ]]; then
    add_check "lvm-snapshot-freshness" "fail" "Unparseable snapshot timestamp metric: ${ts}"
    return
  fi

  # One awk call prints "<age in hours> <level>".
  local now verdict age_h level
  now=$(date +%s)
  verdict=$(LC_ALL=C awk -v now="$now" -v ts="$ts" 'BEGIN {
    age = now - ts
    level = "fail"
    if (age < 129600) level = "ok"          # 36h
    else if (age < 172800) level = "warn"   # 48h
    printf "%.1f %s", age / 3600, level
  }')
  age_h=${verdict% *}
  level=${verdict#* }

  case "$level" in
    ok) add_check "lvm-snapshot-freshness" "ok" "Last snapshot ${age_h}h ago" ;;
    warn) add_check "lvm-snapshot-freshness" "warn" "Snapshot getting stale: ${age_h}h ago (threshold: 36h)" ;;
    *) add_check "lvm-snapshot-freshness" "fail" "Snapshot stale: ${age_h}h ago (threshold: 48h)" ;;
  esac
}
#######################################
# Report the exit status of the last LVM snapshot run, as published in the
# Pushgateway metric lvm_snapshot_last_status.
# Globals:   DRY_RUN, PVE_REACHABLE, PVE_SSH (read)
# Records:   "lvm-snapshot-status" via add_check
#            ok   - metric is 0 or 0.0
#            warn - metric absent
#            fail - any other value, or PVE unreachable
#######################################
check_lvm_snapshot_status() {
  if $DRY_RUN; then
    add_check "lvm-snapshot-status" "ok" "DRY RUN"
    return
  fi
  if ! $PVE_REACHABLE; then
    add_check "lvm-snapshot-status" "fail" "PVE unreachable"
    return
  fi

  local last_status
  last_status=$($PVE_SSH "curl -s http://10.0.20.100:30091/metrics 2>/dev/null | grep '^lvm_snapshot_last_status' | head -1 | awk '{print \$2}'" 2>/dev/null) || true

  case "$last_status" in
    0 | 0.0)
      add_check "lvm-snapshot-status" "ok" "Last snapshot run succeeded"
      ;;
    "")
      add_check "lvm-snapshot-status" "warn" "No status metric found"
      ;;
    *)
      add_check "lvm-snapshot-status" "fail" "Last snapshot run failed (status=$last_status)"
      ;;
  esac
}
#######################################
# Count logical volumes in VG "pve" whose lvs line contains "_snap_".
# Globals:   DRY_RUN, PVE_REACHABLE, PVE_SSH (read)
# Records:   "lvm-snapshot-count" via add_check
#            ok   - 50 or more
#            warn - between 1 and 49
#            fail - none, count unreadable, or PVE unreachable
#######################################
check_lvm_snapshot_count() {
  if $DRY_RUN; then add_check "lvm-snapshot-count" "ok" "DRY RUN"; return; fi
  if ! $PVE_REACHABLE; then add_check "lvm-snapshot-count" "fail" "PVE unreachable"; return; fi

  # grep -c already prints 0 when nothing matches (and exits 1); the old
  # `|| echo 0` appended a second "0" line that broke the tests below.
  local count
  count=$($PVE_SSH "lvs pve 2>/dev/null | grep -c '_snap_' || true" 2>/dev/null) || count=""
  count=${count%%$'\n'*}   # keep the first line only
  if [[ ! "$count" =~ ^[0-9]+$ ]]; then
    add_check "lvm-snapshot-count" "fail" "Cannot determine snapshot count"
    return
  fi

  if [ "$count" -ge 50 ]; then
    add_check "lvm-snapshot-count" "ok" "${count} snapshots exist"
  elif [ "$count" -gt 0 ]; then
    add_check "lvm-snapshot-count" "warn" "Only ${count} snapshots (expected ≥50)"
  else
    add_check "lvm-snapshot-count" "fail" "No snapshots exist"
  fi
}
#######################################
# Report free space in the pve/data thin pool (100 - data_percent).
# Globals:   DRY_RUN, PVE_REACHABLE, PVE_SSH (read)
# Records:   "lvm-thinpool-free" via add_check
#            ok   - more than 15% free
#            warn - more than 10% free, or usage unreadable
#            fail - 10% free or less, or PVE unreachable
#######################################
check_lvm_thinpool_free() {
  if $DRY_RUN; then add_check "lvm-thinpool-free" "ok" "DRY RUN"; return; fi
  if ! $PVE_REACHABLE; then add_check "lvm-thinpool-free" "fail" "PVE unreachable"; return; fi

  local data_pct
  data_pct=$($PVE_SSH "lvs --noheadings --nosuffix -o data_percent pve/data 2>/dev/null | tr -d ' '" 2>/dev/null) || true

  # Require a plain decimal number. The value used to be pasted into Python
  # source; anything else raised an error that aborted the script.
  local number_re='^[0-9]+([.][0-9]+)?$'
  if [[ ! "$data_pct" =~ $number_re ]]; then
    add_check "lvm-thinpool-free" "warn" "Cannot read thin pool usage"
    return
  fi

  # One awk call prints "<free percent> <level>".
  local verdict free_pct level
  verdict=$(LC_ALL=C awk -v used="$data_pct" 'BEGIN {
    free = 100 - used
    level = "fail"
    if (free > 15) level = "ok"
    else if (free > 10) level = "warn"
    printf "%.1f %s", free, level
  }')
  free_pct=${verdict% *}
  level=${verdict#* }

  case "$level" in
    ok) add_check "lvm-thinpool-free" "ok" "Thin pool ${free_pct}% free" ;;
    warn) add_check "lvm-thinpool-free" "warn" "Thin pool low: ${free_pct}% free (threshold: 15%)" ;;
    *) add_check "lvm-thinpool-free" "fail" "Thin pool critical: ${free_pct}% free (threshold: 10%)" ;;
  esac
}
#######################################
# Verify that lvm-pvc-snapshot.timer is active and enabled on the PVE host;
# with --fix, try to re-enable it.
# Globals:   DRY_RUN, PVE_REACHABLE, PVE_SSH, FIX (read)
# Records:   "lvm-snapshot-timer" (ok/fail) and, when a fix is attempted,
#            "lvm-snapshot-timer-fix" (ok/fail) via add_check
#######################################
check_lvm_snapshot_timer() {
  if $DRY_RUN; then add_check "lvm-snapshot-timer" "ok" "DRY RUN"; return; fi
  if ! $PVE_REACHABLE; then add_check "lvm-snapshot-timer" "fail" "PVE unreachable"; return; fi

  # systemctl exits non-zero for states such as "inactive" or "disabled"
  # while still printing them. Keep that text so the report shows the real
  # state; "unknown" is used only when nothing was printed at all.
  local active enabled
  active=$($PVE_SSH "systemctl is-active lvm-pvc-snapshot.timer 2>/dev/null" 2>/dev/null) || true
  enabled=$($PVE_SSH "systemctl is-enabled lvm-pvc-snapshot.timer 2>/dev/null" 2>/dev/null) || true
  active=${active:-unknown}
  enabled=${enabled:-unknown}

  if [ "$active" = "active" ] && [ "$enabled" = "enabled" ]; then
    add_check "lvm-snapshot-timer" "ok" "Timer active and enabled"
    return
  fi

  add_check "lvm-snapshot-timer" "fail" "Timer: active=$active enabled=$enabled"
  if $FIX; then
    # Explicit if/else: with `cmd && ok || fail`, a failing "ok" recording
    # would also have recorded a bogus fix failure.
    if $PVE_SSH "systemctl enable --now lvm-pvc-snapshot.timer" 2>/dev/null; then
      add_check "lvm-snapshot-timer-fix" "ok" "AUTO-FIX: Timer re-enabled"
    else
      add_check "lvm-snapshot-timer-fix" "fail" "AUTO-FIX: Failed to re-enable timer"
    fi
  fi
}
# ============================================================
# LAYER 2: Weekly Backup (sda) — the checks below use "daily-backup" metric/timer names with a 9-day staleness threshold
# ============================================================
#######################################
# Compare the age of the last backup run against a 9-day limit.
# The timestamp is read from the Pushgateway metric
# daily_backup_last_run_timestamp, fetched through the PVE host.
# Globals:   DRY_RUN, PVE_REACHABLE, PVE_SSH (read)
# Records:   "daily-backup-freshness" via add_check
#            ok   - younger than 9 days
#            fail - older, metric missing/unparseable, or PVE unreachable
#######################################
check_daily_backup_freshness() {
  if $DRY_RUN; then add_check "daily-backup-freshness" "ok" "DRY RUN"; return; fi
  if ! $PVE_REACHABLE; then add_check "daily-backup-freshness" "fail" "PVE unreachable"; return; fi

  local ts
  ts=$($PVE_SSH "curl -s http://10.0.20.100:30091/metrics 2>/dev/null | grep '^daily_backup_last_run_timestamp' | head -1 | awk '{print \$2}'" 2>/dev/null) || true
  if [[ -z "$ts" ]]; then
    add_check "daily-backup-freshness" "fail" "No weekly backup metric — may have never run"
    return
  fi

  # Only a plain or scientific-notation number may reach the arithmetic.
  # The value used to be pasted into Python source, where anything else
  # raised an error and aborted the whole script under `set -e`.
  local number_re='^[0-9]+([.][0-9]+)?([eE][+-]?[0-9]+)?$'
  if [[ ! "$ts" =~ $number_re ]]; then
    add_check "daily-backup-freshness" "fail" "Unparseable backup timestamp metric: ${ts}"
    return
  fi

  # One awk call prints "<age in hours> <level>".
  local now verdict age_h level
  now=$(date +%s)
  verdict=$(LC_ALL=C awk -v now="$now" -v ts="$ts" 'BEGIN {
    age = now - ts
    level = (age < 777600) ? "ok" : "fail"   # 9d
    printf "%.1f %s", age / 3600, level
  }')
  age_h=${verdict% *}
  level=${verdict#* }

  if [ "$level" = "ok" ]; then
    add_check "daily-backup-freshness" "ok" "Last run ${age_h}h ago"
  else
    add_check "daily-backup-freshness" "fail" "Daily backup stale: ${age_h}h ago (threshold: 9d)"
  fi
}
#######################################
# Report the exit status of the last backup run, as published in the
# Pushgateway metric daily_backup_last_status.
# Globals:   DRY_RUN, PVE_REACHABLE, PVE_SSH (read)
# Records:   "daily-backup-status" via add_check
#            ok   - metric is 0 or 0.0
#            warn - metric absent
#            fail - any other value, or PVE unreachable
#######################################
check_daily_backup_status() {
  if $DRY_RUN; then
    add_check "daily-backup-status" "ok" "DRY RUN"
    return
  fi
  if ! $PVE_REACHABLE; then
    add_check "daily-backup-status" "fail" "PVE unreachable"
    return
  fi

  local last_status
  last_status=$($PVE_SSH "curl -s http://10.0.20.100:30091/metrics 2>/dev/null | grep '^daily_backup_last_status' | head -1 | awk '{print \$2}'" 2>/dev/null) || true

  case "$last_status" in
    0 | 0.0)
      add_check "daily-backup-status" "ok" "Last weekly backup succeeded"
      ;;
    "")
      add_check "daily-backup-status" "warn" "No status metric found"
      ;;
    *)
      add_check "daily-backup-status" "fail" "Last weekly backup failed (status=$last_status)"
      ;;
  esac
}
#######################################
# Verify that daily-backup.timer is active and enabled on the PVE host;
# with --fix, try to re-enable it.
# Globals:   DRY_RUN, PVE_REACHABLE, PVE_SSH, FIX (read)
# Records:   "daily-backup-timer" (ok/fail) and, when a fix is attempted,
#            "daily-backup-timer-fix" (ok/fail) via add_check
#######################################
check_daily_backup_timer() {
  if $DRY_RUN; then add_check "daily-backup-timer" "ok" "DRY RUN"; return; fi
  if ! $PVE_REACHABLE; then add_check "daily-backup-timer" "fail" "PVE unreachable"; return; fi

  # systemctl exits non-zero for states such as "inactive" or "disabled"
  # while still printing them. Keep that text so the report shows the real
  # state; "unknown" is used only when nothing was printed at all.
  local active enabled
  active=$($PVE_SSH "systemctl is-active daily-backup.timer 2>/dev/null" 2>/dev/null) || true
  enabled=$($PVE_SSH "systemctl is-enabled daily-backup.timer 2>/dev/null" 2>/dev/null) || true
  active=${active:-unknown}
  enabled=${enabled:-unknown}

  if [ "$active" = "active" ] && [ "$enabled" = "enabled" ]; then
    add_check "daily-backup-timer" "ok" "Timer active and enabled"
    return
  fi

  add_check "daily-backup-timer" "fail" "Timer: active=$active enabled=$enabled"
  if $FIX; then
    # Explicit if/else: with `cmd && ok || fail`, a failing "ok" recording
    # would also have recorded a bogus fix failure.
    if $PVE_SSH "systemctl enable --now daily-backup.timer" 2>/dev/null; then
      add_check "daily-backup-timer-fix" "ok" "AUTO-FIX: Timer re-enabled"
    else
      add_check "daily-backup-timer-fix" "fail" "AUTO-FIX: Failed to re-enable timer"
    fi
  fi
}
#######################################
# Verify that /mnt/backup is a mountpoint on the PVE host; with --fix, try
# to mount it.
# Globals:   DRY_RUN, PVE_REACHABLE, PVE_SSH, FIX (read)
# Records:   "sda-mount" (ok/fail) and, when a fix is attempted,
#            "sda-mount-fix" (ok/fail) via add_check
#######################################
check_sda_mount() {
  if $DRY_RUN; then add_check "sda-mount" "ok" "DRY RUN"; return; fi
  if ! $PVE_REACHABLE; then add_check "sda-mount" "fail" "PVE unreachable"; return; fi

  if $PVE_SSH "mountpoint -q /mnt/backup" 2>/dev/null; then
    add_check "sda-mount" "ok" "/mnt/backup is mounted"
    return
  fi

  add_check "sda-mount" "fail" "/mnt/backup is NOT mounted"
  if $FIX; then
    # Explicit if/else: with `cmd && ok || fail`, a failing "ok" recording
    # would also have recorded a bogus fix failure.
    if $PVE_SSH "mount /mnt/backup" 2>/dev/null; then
      add_check "sda-mount-fix" "ok" "AUTO-FIX: Mounted /mnt/backup"
    else
      add_check "sda-mount-fix" "fail" "AUTO-FIX: Failed to mount /mnt/backup"
    fi
  fi
}
#######################################
# Report how full the filesystem at /mnt/backup is.
# Globals:   DRY_RUN, PVE_REACHABLE, PVE_SSH (read)
# Records:   "sda-disk-usage" via add_check
#            ok   - below 85% used
#            warn - 85-94% used, or usage unreadable
#            fail - 95% or more, or PVE unreachable
#######################################
check_sda_disk_usage() {
  if $DRY_RUN; then add_check "sda-disk-usage" "ok" "DRY RUN"; return; fi
  if ! $PVE_REACHABLE; then add_check "sda-disk-usage" "fail" "PVE unreachable"; return; fi

  local usage_pct
  usage_pct=$($PVE_SSH "df --output=pcent /mnt/backup 2>/dev/null | tail -1 | tr -d ' %'" 2>/dev/null) || true
  # Anything that is not a whole number is "unreadable". Previously such a
  # value made the integer tests error out and fall through to the "fail"
  # branch with a nonsensical percentage.
  if [[ ! "$usage_pct" =~ ^[0-9]+$ ]]; then
    add_check "sda-disk-usage" "warn" "Cannot read /mnt/backup usage"
    return
  fi

  if [ "$usage_pct" -lt 85 ]; then
    add_check "sda-disk-usage" "ok" "Backup disk ${usage_pct}% used"
  elif [ "$usage_pct" -lt 95 ]; then
    add_check "sda-disk-usage" "warn" "Backup disk ${usage_pct}% used (threshold: 85%)"
  else
    add_check "sda-disk-usage" "fail" "Backup disk ${usage_pct}% used (threshold: 95%)"
  fi
}
#######################################
# Check the age of the newest /mnt/backup/pvc-data/????-?? directory on the
# PVE host and report how many entries sit two levels below such directories.
# Globals:   DRY_RUN, PVE_REACHABLE, PVE_SSH (read)
# Records:   "pvc-data-freshness" via add_check
#            ok   - newest directory modified less than 9 days ago
#            fail - older, none found, or PVE unreachable
#######################################
check_pvc_data_freshness() {
  if $DRY_RUN; then
    add_check "pvc-data-freshness" "ok" "DRY RUN"
    return
  fi
  if ! $PVE_REACHABLE; then
    add_check "pvc-data-freshness" "fail" "PVE unreachable"
    return
  fi

  local latest_week pvc_total
  latest_week=$($PVE_SSH "ls -1d /mnt/backup/pvc-data/????-?? 2>/dev/null | tail -1" 2>/dev/null) || true
  pvc_total=$($PVE_SSH "ls -1d /mnt/backup/pvc-data/????-??/*/* 2>/dev/null | wc -l" 2>/dev/null) || pvc_total=0

  if [ -z "$latest_week" ]; then
    add_check "pvc-data-freshness" "fail" "No PVC file copies found on sda"
    return
  fi

  local label age_days
  label=$(basename "$latest_week")
  # Age is computed on the remote side from the directory's mtime; 999 is
  # the sentinel used when that command fails.
  age_days=$($PVE_SSH "echo \$(( (\$(date +%s) - \$(stat -c %Y '$latest_week')) / 86400 ))" 2>/dev/null) || age_days=999

  if [ "$age_days" -lt 9 ]; then
    add_check "pvc-data-freshness" "ok" "PVC copies: week ${label}, ${pvc_total} PVCs, ${age_days}d old"
  else
    add_check "pvc-data-freshness" "fail" "PVC copies stale: week ${label}, ${age_days}d old (threshold: 9d)"
  fi
}
#######################################
# Check that /mnt/backup/nfs-mirror holds at least one *-backup directory
# and that the nfs-mirror directory itself was modified within 9 days.
# Globals:   DRY_RUN, PVE_REACHABLE, PVE_SSH (read)
# Records:   "nfs-mirror-freshness" via add_check (ok or fail)
#######################################
check_nfs_mirror_freshness() {
  if $DRY_RUN; then
    add_check "nfs-mirror-freshness" "ok" "DRY RUN"
    return
  fi
  if ! $PVE_REACHABLE; then
    add_check "nfs-mirror-freshness" "fail" "PVE unreachable"
    return
  fi

  local mirror_dirs age_days
  mirror_dirs=$($PVE_SSH "ls -1d /mnt/backup/nfs-mirror/*-backup 2>/dev/null | wc -l" 2>/dev/null) || mirror_dirs=0
  # Remote stat falls back to epoch 0 when the directory is missing; 999 is
  # the sentinel used when the whole remote command fails.
  age_days=$($PVE_SSH "echo \$(( (\$(date +%s) - \$(stat -c %Y /mnt/backup/nfs-mirror 2>/dev/null || echo 0)) / 86400 ))" 2>/dev/null) || age_days=999

  if [ "$mirror_dirs" -gt 0 ] && [ "$age_days" -lt 9 ]; then
    add_check "nfs-mirror-freshness" "ok" "NFS mirror: ${mirror_dirs} dirs, ${age_days}d old"
    return
  fi
  if [ "$mirror_dirs" -eq 0 ]; then
    add_check "nfs-mirror-freshness" "fail" "No NFS mirror dirs found on sda"
    return
  fi
  add_check "nfs-mirror-freshness" "fail" "NFS mirror stale: ${age_days}d old (threshold: 9d)"
}
#######################################
# Check the age of the most recently modified
# /mnt/backup/pfsense/config-*.xml file on the PVE host.
# Globals:   DRY_RUN, PVE_REACHABLE, PVE_SSH (read)
# Records:   "pfsense-backup-freshness" via add_check
#            ok   - newest file modified less than 9 days ago
#            fail - older, none found, or PVE unreachable
#######################################
check_pfsense_backup_freshness() {
  if $DRY_RUN; then
    add_check "pfsense-backup-freshness" "ok" "DRY RUN"
    return
  fi
  if ! $PVE_REACHABLE; then
    add_check "pfsense-backup-freshness" "fail" "PVE unreachable"
    return
  fi

  local latest
  latest=$($PVE_SSH "ls -t /mnt/backup/pfsense/config-*.xml 2>/dev/null | head -1" 2>/dev/null) || true
  if [ -z "$latest" ]; then
    add_check "pfsense-backup-freshness" "fail" "No pfsense config.xml backups found"
    return
  fi

  local file_name age_days
  file_name=$(basename "$latest")
  # 999 is the sentinel used when the remote age computation fails.
  age_days=$($PVE_SSH "echo \$(( (\$(date +%s) - \$(stat -c %Y '$latest')) / 86400 ))" 2>/dev/null) || age_days=999

  local verdict detail
  if [ "$age_days" -lt 9 ]; then
    verdict="ok"
    detail="pfsense backup: ${file_name}, ${age_days}d old"
  else
    verdict="fail"
    detail="pfsense backup stale: ${file_name}, ${age_days}d old (threshold: 9d)"
  fi
  add_check "pfsense-backup-freshness" "$verdict" "$detail"
}
# ============================================================
# LAYER 3: Offsite Sync
# ============================================================
#######################################
# Compare the age of the last successful offsite sync against a 9-day
# limit. The timestamp comes from the Pushgateway metric line matching
# backup_last_success_timestamp ... offsite-backup-sync.
# Globals:   DRY_RUN, PVE_REACHABLE, PVE_SSH (read)
# Records:   "offsite-sync-freshness" via add_check
#            ok   - younger than 9 days
#            fail - older, metric missing/unparseable, or PVE unreachable
#######################################
check_offsite_sync_freshness() {
  if $DRY_RUN; then add_check "offsite-sync-freshness" "ok" "DRY RUN"; return; fi
  if ! $PVE_REACHABLE; then add_check "offsite-sync-freshness" "fail" "PVE unreachable"; return; fi

  local ts
  ts=$($PVE_SSH "curl -s http://10.0.20.100:30091/metrics 2>/dev/null | grep 'backup_last_success_timestamp.*offsite-backup-sync' | awk '{print \$NF}'" 2>/dev/null) || true
  # The remote pipeline has no `head -1`, so several matching series give
  # several lines; keep the first, as the sibling checks do.
  ts=${ts%%$'\n'*}
  if [[ -z "$ts" ]]; then
    add_check "offsite-sync-freshness" "fail" "No offsite sync metric — may have never run"
    return
  fi

  # Only a plain or scientific-notation number may reach the arithmetic.
  # The value used to be pasted into Python source, where anything else
  # raised an error and aborted the whole script under `set -e`.
  local number_re='^[0-9]+([.][0-9]+)?([eE][+-]?[0-9]+)?$'
  if [[ ! "$ts" =~ $number_re ]]; then
    add_check "offsite-sync-freshness" "fail" "Unparseable offsite sync timestamp metric: ${ts}"
    return
  fi

  # One awk call prints "<age in hours> <level>".
  local now verdict age_h level
  now=$(date +%s)
  verdict=$(LC_ALL=C awk -v now="$now" -v ts="$ts" 'BEGIN {
    age = now - ts
    level = (age < 777600) ? "ok" : "fail"   # 9d
    printf "%.1f %s", age / 3600, level
  }')
  age_h=${verdict% *}
  level=${verdict#* }

  if [ "$level" = "ok" ]; then
    add_check "offsite-sync-freshness" "ok" "Last offsite sync ${age_h}h ago"
  else
    add_check "offsite-sync-freshness" "fail" "Offsite sync stale: ${age_h}h ago (threshold: 9d)"
  fi
}
#######################################
# Report the exit status of the last offsite sync, as published in the
# Pushgateway metric offsite_sync_last_status.
# Globals:   DRY_RUN, PVE_REACHABLE, PVE_SSH (read)
# Records:   "offsite-sync-status" via add_check
#            ok   - metric is 0 or 0.0
#            warn - metric absent
#            fail - any other value, or PVE unreachable
#######################################
check_offsite_sync_status() {
  if $DRY_RUN; then
    add_check "offsite-sync-status" "ok" "DRY RUN"
    return
  fi
  if ! $PVE_REACHABLE; then
    add_check "offsite-sync-status" "fail" "PVE unreachable"
    return
  fi

  local last_status
  last_status=$($PVE_SSH "curl -s http://10.0.20.100:30091/metrics 2>/dev/null | grep '^offsite_sync_last_status' | head -1 | awk '{print \$2}'" 2>/dev/null) || true

  case "$last_status" in
    0 | 0.0)
      add_check "offsite-sync-status" "ok" "Last offsite sync succeeded"
      ;;
    "")
      add_check "offsite-sync-status" "warn" "No offsite sync status metric"
      ;;
    *)
      add_check "offsite-sync-status" "fail" "Last offsite sync failed (status=$last_status)"
      ;;
  esac
}
#######################################
# Verify that offsite-sync-backup.timer is active and enabled on the PVE
# host; with --fix, try to re-enable it.
# Globals:   DRY_RUN, PVE_REACHABLE, PVE_SSH, FIX (read)
# Records:   "offsite-sync-timer" (ok/fail) and, when a fix is attempted,
#            "offsite-sync-timer-fix" (ok/fail) via add_check
#######################################
check_offsite_sync_timer() {
  if $DRY_RUN; then add_check "offsite-sync-timer" "ok" "DRY RUN"; return; fi
  if ! $PVE_REACHABLE; then add_check "offsite-sync-timer" "fail" "PVE unreachable"; return; fi

  # systemctl exits non-zero for states such as "inactive" or "disabled"
  # while still printing them. Keep that text so the report shows the real
  # state; "unknown" is used only when nothing was printed at all.
  local active enabled
  active=$($PVE_SSH "systemctl is-active offsite-sync-backup.timer 2>/dev/null" 2>/dev/null) || true
  enabled=$($PVE_SSH "systemctl is-enabled offsite-sync-backup.timer 2>/dev/null" 2>/dev/null) || true
  active=${active:-unknown}
  enabled=${enabled:-unknown}

  if [ "$active" = "active" ] && [ "$enabled" = "enabled" ]; then
    add_check "offsite-sync-timer" "ok" "Timer active and enabled"
    return
  fi

  add_check "offsite-sync-timer" "fail" "Timer: active=$active enabled=$enabled"
  if $FIX; then
    # Explicit if/else: with `cmd && ok || fail`, a failing "ok" recording
    # would also have recorded a bogus fix failure.
    if $PVE_SSH "systemctl enable --now offsite-sync-backup.timer" 2>/dev/null; then
      add_check "offsite-sync-timer-fix" "ok" "AUTO-FIX: Timer re-enabled"
    else
      add_check "offsite-sync-timer-fix" "fail" "AUTO-FIX: Failed to re-enable timer"
    fi
  fi
}
# ============================================================
# DB BACKUP CRONJOBS
# ============================================================
# Inspect every CronJob in the cluster whose name contains "backup", "etcd"
# or "raft" and record one combined "backup-cronjobs" result.
# Globals: DRY_RUN, KUBECTL (read)
# The embedded Python prints "<STATUS>|<detail>", where detail is a
# "; "-joined list of per-CronJob lines:
#   - suspended CronJob                -> "FAIL ns/name: SUSPENDED"
#   - no lastSuccessfulTime            -> "WARN ns/name: never succeeded"
#   - last success older than its limit -> "FAIL ... (threshold: Nh)"
#   - otherwise                        -> "OK ns/name: Nh ago"
# The limit is taken from the first `thresholds` key found in the name
# (default 216h). STATUS is OK only when every CronJob was fine, else WARN,
# so per-job FAIL lines are recorded with overall status "warn".
check_backup_cronjobs() {
if $DRY_RUN; then add_check "backup-cronjobs" "ok" "DRY RUN"; return; fi
local report
# kubectl's JSON is piped straight into Python; if either side fails the
# report falls back to a generic WARN line.
report=$($KUBECTL get cronjobs --all-namespaces -o json 2>/dev/null | python3 -c "
import sys, json
from datetime import datetime, timezone
data = json.load(sys.stdin)
# CronJobs with backup-related names
backup_cjs = []
for cj in data.get('items', []):
name = cj['metadata']['name']
ns = cj['metadata']['namespace']
if any(k in name.lower() for k in ['backup', 'etcd', 'raft']):
backup_cjs.append(cj)
if not backup_cjs:
print('WARN|No backup CronJobs found')
sys.exit(0)
# Thresholds in hours
thresholds = {
'mysql': 36, 'postgresql': 36, 'immich': 36,
'vault': 216, 'etcd': 216, 'redis': 216,
'vaultwarden': 216, 'plotting': 216, 'headscale': 216,
'prometheus': 840, # 35 days
}
results = []
all_ok = True
now = datetime.now(timezone.utc)
for cj in backup_cjs:
ns = cj['metadata']['namespace']
name = cj['metadata']['name']
last_success = cj.get('status', {}).get('lastSuccessfulTime', '')
suspend = cj.get('spec', {}).get('suspend', False)
# Find matching threshold
threshold_h = 216 # default 9 days
for key, th in thresholds.items():
if key in name.lower():
threshold_h = th
break
if suspend:
all_ok = False
results.append(f'FAIL {ns}/{name}: SUSPENDED')
continue
if not last_success:
results.append(f'WARN {ns}/{name}: never succeeded')
all_ok = False
continue
try:
dt = datetime.fromisoformat(last_success.replace('Z', '+00:00'))
age_h = (now - dt).total_seconds() / 3600
if age_h > threshold_h:
all_ok = False
results.append(f'FAIL {ns}/{name}: {age_h:.0f}h ago (threshold: {threshold_h}h)')
else:
results.append(f'OK {ns}/{name}: {age_h:.0f}h ago')
except Exception:
results.append(f'WARN {ns}/{name}: cannot parse time {last_success}')
all_ok = False
status = 'OK' if all_ok else 'WARN'
print(f'{status}|' + '; '.join(results))
" 2>/dev/null) || report="WARN|Failed to check backup CronJobs"
# Split "<STATUS>|<detail>" at the first "|".
local status_prefix="${report%%|*}"
local detail="${report#*|}"
# Anything other than OK is recorded as a warning.
if [ "$status_prefix" = "OK" ]; then
add_check "backup-cronjobs" "ok" "$detail"
else
add_check "backup-cronjobs" "warn" "$detail"
fi
}
# ============================================================
# CNPG BACKUPS (existing checks, kept as-is)
# ============================================================
# Summarise the most recent CNPG Backup object per cluster and record one
# combined "cnpg-backups" result.
# Globals: DRY_RUN, KUBECTL (read)
# The embedded Python groups Backup objects by "<namespace>/<cluster name>",
# keeps the one with the greatest status.stoppedAt string, and prints
# "<STATUS>|<detail>". STATUS is OK only when every kept backup has a
# parseable stoppedAt that is at most 48h old; otherwise WARN.
check_cnpg_backups() {
if $DRY_RUN; then add_check "cnpg-backups" "ok" "DRY RUN"; return; fi
local backups
# A failing kubectl call (for any reason) is reported as missing CRDs.
backups=$($KUBECTL get backup.postgresql.cnpg.io --all-namespaces -o json 2>/dev/null) || {
add_check "cnpg-backups" "warn" "No CNPG Backup CRDs found"
return
}
local report
report=$(echo "$backups" | python3 -c "
import sys, json
from datetime import datetime, timezone
data = json.load(sys.stdin)
items = data.get('items', [])
if not items:
print('WARN|No CNPG backups found')
sys.exit(0)
clusters = {}
for b in items:
ns = b['metadata']['namespace']
cluster = b.get('spec', {}).get('cluster', {}).get('name', 'unknown')
key = f'{ns}/{cluster}'
stopped = b.get('status', {}).get('stoppedAt', '')
phase = b.get('status', {}).get('phase', 'unknown')
if key not in clusters or stopped > clusters[key].get('stopped', ''):
clusters[key] = {'phase': phase, 'stopped': stopped}
results = []
all_ok = True
now = datetime.now(timezone.utc)
for key, info in sorted(clusters.items()):
if info['stopped']:
try:
dt = datetime.fromisoformat(info['stopped'].replace('Z', '+00:00'))
age_h = (now - dt).total_seconds() / 3600
if age_h > 48: all_ok = False
results.append(f'{key}: {info[\"phase\"]} ({age_h:.1f}h ago)')
except: results.append(f'{key}: {info[\"phase\"]}'); all_ok = False
else:
results.append(f'{key}: {info[\"phase\"]} (no completion)'); all_ok = False
print(f'{\"OK\" if all_ok else \"WARN\"}|' + '; '.join(results))
" 2>/dev/null) || report="WARN|Failed to parse CNPG backups"
# Split "<STATUS>|<detail>" at the first "|".
local status_prefix="${report%%|*}"
local detail="${report#*|}"
# Anything other than OK is recorded as a warning.
if [ "$status_prefix" = "OK" ]; then
add_check "cnpg-backups" "ok" "$detail"
else
add_check "cnpg-backups" "warn" "$detail"
fi
}
# ============================================================
# RUN ALL CHECKS
# ============================================================
# Connectivity is probed first: the Layer 1-3 checks consult PVE_REACHABLE.
check_pve_connectivity

# Layer 1: LVM Thin Snapshots
check_lvm_snapshot_freshness
check_lvm_snapshot_status
check_lvm_snapshot_count
check_lvm_thinpool_free
check_lvm_snapshot_timer

# Layer 2: Weekly Backup (sda)
check_daily_backup_freshness
check_daily_backup_status
check_daily_backup_timer
check_sda_mount
check_sda_disk_usage
check_pvc_data_freshness
check_nfs_mirror_freshness
check_pfsense_backup_freshness

# Layer 3: Offsite Sync
check_offsite_sync_freshness
check_offsite_sync_status
check_offsite_sync_timer

# DB CronJobs + CNPG
check_backup_cronjobs
check_cnpg_backups

# ============================================================
# OUTPUT
# ============================================================
# Overall status is the worst recorded one: fail beats warn beats ok.
OVERALL=$(echo "$CHECKS" | python3 -c '
import json, sys

seen = {c["status"] for c in json.load(sys.stdin)}
if "fail" in seen:
    print("fail")
elif "warn" in seen:
    print("warn")
else:
    print("ok")
')

# Assemble the final document and pretty-print it.
echo "{\"status\": \"$OVERALL\", \"agent\": \"$AGENT\", \"checks\": $CHECKS}" | python3 -m json.tool

View file

@ -0,0 +1,166 @@
#!/usr/bin/env bash
# CrowdSec status agent: script-wide settings and command-line handling.
set -euo pipefail

# kubectl command line; expanded unquoted by callers so it splits into words.
KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config"
# Agent name reported in the final JSON document.
AGENT="crowdsec-status"
# When true, checks report what they would do instead of calling kubectl.
DRY_RUN=false

# --dry-run is the only recognised flag; any other argument is ignored.
for arg; do
  if [[ "$arg" == "--dry-run" ]]; then
    DRY_RUN=true
  fi
done
# Accumulated check results; each element is one serialized JSON object.
checks=()

#######################################
# Escape a string so it can be embedded in a JSON string literal.
# Backslash, double quote, newline, CR and tab get their JSON escapes;
# any other control character is replaced by a space.
# Arguments: $1 - raw text
# Outputs:   escaped text on stdout (no trailing newline)
#######################################
_json_escape() {
  local s=$1
  s=${s//\\/\\\\}   # must run first so later escapes are not doubled
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  s=${s//[[:cntrl:]]/ }
  printf '%s' "$s"
}

#######################################
# Record the outcome of one check.
# Globals:   checks (appended to)
# Arguments: $1 - check name
#            $2 - status (callers in this file pass ok, warn or fail)
#            $3 - human-readable message
#######################################
add_check() {
  local name status message
  name=$(_json_escape "$1")
  status=$(_json_escape "$2")
  message=$(_json_escape "$3")
  checks+=("{\"name\": \"$name\", \"status\": \"$status\", \"message\": \"$message\"}")
}
#######################################
# Print the namespace CrowdSec runs in.
# Lookup order: first pod labelled app.kubernetes.io/name=crowdsec; else the
# first pod line mentioning "crowdsec" (case-insensitive); else the literal
# default "crowdsec".
# Each step is taken when the previous one produced NO OUTPUT. The original
# `a | head | awk || b || echo` chain only advanced on a non-zero exit, so a
# successful lookup with zero matches returned an empty namespace.
# Globals: KUBECTL (read)
# Outputs: namespace name on stdout (never empty)
#######################################
find_crowdsec_namespace() {
  local ns
  ns=$($KUBECTL get pods -A -l app.kubernetes.io/name=crowdsec --no-headers 2>/dev/null \
    | awk 'NR == 1 { print $1 }') || true
  if [[ -z "$ns" ]]; then
    # awk reads to end of input (no early exit) so kubectl never gets SIGPIPE.
    ns=$($KUBECTL get pods -A --no-headers 2>/dev/null \
      | awk 'tolower($0) ~ /crowdsec/ && !seen { print $1; seen = 1 }') || true
  fi
  printf '%s\n' "${ns:-crowdsec}"
}
#######################################
# Check that a CrowdSec LAPI pod exists and is in the Running state.
# Globals:   DRY_RUN, KUBECTL (read)
# Records:   "crowdsec-lapi" via add_check
#            fail - no LAPI pod found, or its STATUS column is not Running
#            ok   - pod is Running
#######################################
check_lapi_health() {
  if $DRY_RUN; then
    add_check "crowdsec-lapi" "ok" "dry-run: would check CrowdSec LAPI pod health"
    return
  fi

  local ns pod_line
  ns=$(find_crowdsec_namespace)

  # First try the LAPI component labels, then a name match on the pod list.
  pod_line=$($KUBECTL get pods -n "$ns" -l app.kubernetes.io/name=crowdsec,app.kubernetes.io/component=lapi --no-headers 2>/dev/null | head -1) || true
  if [ -z "$pod_line" ]; then
    pod_line=$($KUBECTL get pods -n "$ns" --no-headers 2>/dev/null | grep -i "crowdsec.*lapi" | head -1) || true
  fi
  if [ -z "$pod_line" ]; then
    add_check "crowdsec-lapi" "fail" "No CrowdSec LAPI pod found in namespace ${ns}"
    return
  fi

  # Columns of `kubectl get pods --no-headers`: 1 = name, 3 = status.
  local pod_name phase
  pod_name=$(awk '{print $1}' <<<"$pod_line")
  phase=$(awk '{print $3}' <<<"$pod_line")

  if [ "$phase" = "Running" ]; then
    add_check "crowdsec-lapi" "ok" "LAPI pod ${pod_name} is Running"
  else
    add_check "crowdsec-lapi" "fail" "LAPI pod ${pod_name} is ${phase}"
  fi
}
#######################################
# Run `cscli metrics` inside the LAPI pod and record whether it succeeded.
# The command's output is discarded; only its exit status matters.
# Globals:   DRY_RUN, KUBECTL (read)
# Records:   "crowdsec-metrics" via add_check
#            warn - no LAPI pod found, or the command failed
#            ok   - the command exited 0
#######################################
check_cscli_metrics() {
  if $DRY_RUN; then
    add_check "crowdsec-metrics" "ok" "dry-run: would run cscli metrics via kubectl exec"
    return
  fi

  local ns lapi_pod
  ns=$(find_crowdsec_namespace)

  # Resolve the pod by label first; fall back to a name match.
  if ! lapi_pod=$($KUBECTL get pods -n "$ns" -l app.kubernetes.io/name=crowdsec,app.kubernetes.io/component=lapi -o jsonpath='{.items[0].metadata.name}' 2>/dev/null); then
    lapi_pod=$($KUBECTL get pods -n "$ns" --no-headers 2>/dev/null | grep -i "crowdsec.*lapi" | head -1 | awk '{print $1}') || true
  fi
  if [ -z "$lapi_pod" ]; then
    add_check "crowdsec-metrics" "warn" "No LAPI pod found to run cscli metrics"
    return
  fi

  if $KUBECTL exec -n "$ns" "$lapi_pod" -- cscli metrics >/dev/null 2>/dev/null; then
    add_check "crowdsec-metrics" "ok" "cscli metrics returned successfully"
  else
    add_check "crowdsec-metrics" "warn" "Failed to run cscli metrics on ${lapi_pod}"
  fi
}
#######################################
# Count active CrowdSec decisions via `cscli decisions list -o json`.
# Globals:   DRY_RUN, KUBECTL (read)
# Records:   "crowdsec-decisions" via add_check
#            warn - no LAPI pod, the query failed, or its output could not
#                   be parsed
#            ok   - query succeeded (message carries the count)
#######################################
check_decisions() {
  if $DRY_RUN; then
    add_check "crowdsec-decisions" "ok" "dry-run: would check cscli decisions list"
    return
  fi

  local ns lapi_pod
  ns=$(find_crowdsec_namespace)
  lapi_pod=$($KUBECTL get pods -n "$ns" -l app.kubernetes.io/name=crowdsec,app.kubernetes.io/component=lapi -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) || \
    lapi_pod=$($KUBECTL get pods -n "$ns" --no-headers 2>/dev/null | grep -i "crowdsec.*lapi" | head -1 | awk '{print $1}') || true
  if [ -z "$lapi_pod" ]; then
    add_check "crowdsec-decisions" "warn" "No LAPI pod found to check decisions"
    return
  fi

  # A failed query is now a warning, matching check_cscli_metrics. It used
  # to be recorded as "ok", which hid a broken LAPI behind a healthy status.
  local decisions
  if ! decisions=$($KUBECTL exec -n "$ns" "$lapi_pod" -- cscli decisions list -o json 2>/dev/null); then
    add_check "crowdsec-decisions" "warn" "Failed to query decisions on ${lapi_pod}"
    return
  fi

  # A JSON array yields its length; any other JSON value counts as zero.
  local count
  count=$(jq 'if type == "array" then length else 0 end' <<<"$decisions" 2>/dev/null) || count=""
  if [[ ! "$count" =~ ^[0-9]+$ ]]; then
    add_check "crowdsec-decisions" "warn" "Could not parse cscli decisions output"
    return
  fi

  if [ "$count" -gt 0 ]; then
    add_check "crowdsec-decisions" "ok" "${count} active decision(s)"
  else
    add_check "crowdsec-decisions" "ok" "No active decisions"
  fi
}
#######################################
# Compare ready vs desired pod counts across CrowdSec DaemonSets.
# Globals:   DRY_RUN, KUBECTL (read)
# Records:   "crowdsec-agents" via add_check
#            warn - lookup failed, fewer pods ready than desired, or zero
#                   pods desired
#            ok   - every desired pod is ready
#######################################
check_agent_daemonset() {
  if $DRY_RUN; then
    add_check "crowdsec-agents" "ok" "dry-run: would check CrowdSec agent DaemonSet"
    return
  fi

  local ns ds_json
  ns=$(find_crowdsec_namespace)

  # Label-based lookup; if kubectl fails, list every DaemonSet in the
  # namespace and keep those whose name matches "crowdsec".
  if ! ds_json=$($KUBECTL get daemonset -n "$ns" -l app.kubernetes.io/name=crowdsec -o json 2>/dev/null); then
    if ! ds_json=$($KUBECTL get daemonset -n "$ns" -o json 2>/dev/null | jq '{items: [.items[] | select(.metadata.name | test("crowdsec"))]}'); then
      add_check "crowdsec-agents" "warn" "No CrowdSec DaemonSet found"
      return
    fi
  fi

  # Sum the counters over all matching DaemonSets; an empty list sums to 0.
  local want have
  want=$(echo "$ds_json" | jq '[.items[].status.desiredNumberScheduled] | add // 0' 2>/dev/null || echo "0")
  have=$(echo "$ds_json" | jq '[.items[].status.numberReady] | add // 0' 2>/dev/null || echo "0")

  if [ "$have" -lt "$want" ]; then
    add_check "crowdsec-agents" "warn" "CrowdSec agents: ${have}/${want} ready"
    return
  fi
  if [ "$want" -eq 0 ]; then
    add_check "crowdsec-agents" "warn" "No CrowdSec agent DaemonSet pods scheduled"
    return
  fi
  add_check "crowdsec-agents" "ok" "CrowdSec agents: ${have}/${want} ready"
}
# Run every check; each one appends its result to the global `checks` array.
check_lapi_health
check_cscli_metrics
check_decisions
check_agent_daemonset

# Derive the overall status: any "fail" wins immediately, otherwise any
# "warn" downgrades the default "ok".
overall="ok"
for c in "${checks[@]}"; do
  s=$(jq -r '.status' <<<"$c")
  case "$s" in
    fail)
      overall="fail"
      break
      ;;
    warn)
      overall="warn"
      ;;
  esac
done

# Emit the report as a single JSON line; checks are joined with commas.
printf '{"status": "%s", "agent": "%s", "checks": [%s]}\n' \
  "$overall" "$AGENT" "$(IFS=,; echo "${checks[*]}")"

194
.claude/scripts/db-health.sh Executable file
View file

@ -0,0 +1,194 @@
#!/usr/bin/env bash
# db-health: global settings and command-line parsing.
set -euo pipefail

AGENT="db-health"
KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config"

# --dry-run makes every check report what it would do instead of running.
DRY_RUN=false
for arg; do
  if [[ "$arg" == "--dry-run" ]]; then
    DRY_RUN=true
  fi
done

# JSON array (as text) that add_check appends to.
CHECKS="[]"
# Append one check result to the CHECKS JSON array.
# Globals:   CHECKS (read and rewritten)
# Arguments: $1 - check name, $2 - status (ok/warn/fail), $3 - message
add_check() {
  local name="$1" status="$2" message="$3"
  # The values are handed to Python as argv instead of being spliced into the
  # program text, so quotes, backslashes and newlines in a message can neither
  # break the Python syntax nor be reinterpreted as escape sequences.
  CHECKS=$(printf '%s' "$CHECKS" | python3 -c '
import sys, json

name, status, message = sys.argv[1:4]
checks = json.load(sys.stdin)
checks.append({"name": name, "status": status, "message": message})
json.dump(checks, sys.stdout)
' "$name" "$status" "$message")
}
# MySQL InnoDB Cluster - Group Replication status
# Records one "mysql-group-replication" check: fail when any member is not
# ONLINE (or the cluster cannot be queried), ok when all members are ONLINE
# and there are at least 3, warn otherwise.
check_mysql_gr() {
  if $DRY_RUN; then
    add_check "mysql-group-replication" "ok" "DRY RUN: would check MySQL Group Replication status"
    return
  fi

  # Prefer a labelled pod; when neither selector yields one, exec through the
  # StatefulSet name instead.
  local target="" selector
  for selector in app=mysql-cluster app.kubernetes.io/name=mysql; do
    target=$($KUBECTL get pods -n dbaas -l "$selector" -o name 2>/dev/null | head -1) || true
    if [ -n "$target" ]; then
      break
    fi
  done
  : "${target:=sts/mysql-cluster}"

  local rows
  if ! rows=$($KUBECTL exec "$target" -n dbaas -- mysql -N -e \
    "SELECT MEMBER_HOST, MEMBER_STATE, MEMBER_ROLE FROM performance_schema.replication_group_members" 2>/dev/null); then
    add_check "mysql-group-replication" "fail" "Cannot connect to MySQL cluster to check GR status"
    return
  fi

  # One row per member; rows mentioning ONLINE are counted as healthy.
  local total online summary
  total=$(grep -c . <<<"$rows" || true)
  online=$(grep -c "ONLINE" <<<"$rows" || true)
  summary=$(echo "$rows" | tr '\t' ' ' | tr '\n' '; ')

  if [ "$online" -lt "$total" ]; then
    add_check "mysql-group-replication" "fail" "Only $online/$total members ONLINE: $summary"
  elif [ "$total" -ge 3 ]; then
    add_check "mysql-group-replication" "ok" "All $total members ONLINE: $summary"
  else
    add_check "mysql-group-replication" "warn" "Cluster has $total members (expected 3): $summary"
  fi
}
# MySQL pod health
# Records one "mysql-pods" check: warn when no MySQL pod can be found, fail
# when any pod is neither Running nor Completed, ok otherwise.
check_mysql_pods() {
  if $DRY_RUN; then
    add_check "mysql-pods" "ok" "DRY RUN: would check MySQL pod status"
    return
  fi

  # kubectl exits 0 with empty output when the selector matches nothing, so
  # the name-based fallback is driven by emptiness, not by the exit status.
  local pod_status
  pod_status=$($KUBECTL get pods -n dbaas -l app=mysql-cluster -o wide --no-headers 2>/dev/null) || pod_status=""
  if [ -z "$pod_status" ]; then
    pod_status=$($KUBECTL get pods -n dbaas --no-headers 2>/dev/null | grep -i mysql) || pod_status=""
  fi
  if [ -z "$pod_status" ]; then
    add_check "mysql-pods" "warn" "Cannot find MySQL pods in dbaas namespace"
    return
  fi

  local not_running
  not_running=$(printf '%s\n' "$pod_status" | grep -v "Running" | grep -v "Completed" || true)
  if [ -z "$not_running" ]; then
    local count
    count=$(printf '%s\n' "$pod_status" | grep -c "Running" || true)
    add_check "mysql-pods" "ok" "$count MySQL pod(s) running in dbaas namespace"
  else
    # Column 1 is the pod name, column 3 the STATUS column of `kubectl get pods`.
    add_check "mysql-pods" "fail" "Unhealthy MySQL pods: $(printf '%s\n' "$not_running" | awk '{print $1": "$3}' | tr '\n' '; ')"
  fi
}
# CNPG PostgreSQL cluster health
# Records one "cnpg-clusters" check: warn when the CRD is missing or no
# cluster exists, fail when any cluster is not healthy / not fully ready (or
# the status cannot be parsed), ok otherwise.
check_cnpg() {
  if $DRY_RUN; then
    add_check "cnpg-clusters" "ok" "DRY RUN: would check CNPG PostgreSQL cluster health"
    return
  fi

  # Check if CNPG CRDs exist
  local cnpg_clusters
  cnpg_clusters=$($KUBECTL get cluster.postgresql.cnpg.io --all-namespaces -o json 2>/dev/null) || {
    add_check "cnpg-clusters" "warn" "CNPG CRD not found or no clusters deployed"
    return
  }

  # First output line is the verdict (HEALTHY / UNHEALTHY / EMPTY), the last
  # line is the per-cluster detail.
  local report
  report=$(printf '%s' "$cnpg_clusters" | python3 -c "
import sys, json

data = json.load(sys.stdin)
results = []
all_healthy = True
for cluster in data.get('items', []):
    ns = cluster['metadata']['namespace']
    name = cluster['metadata']['name']
    status = cluster.get('status', {})
    phase = status.get('phase', 'unknown')
    # 'or 0' also covers fields that are present but null.
    ready = status.get('readyInstances', 0) or 0
    instances = cluster.get('spec', {}).get('instances', 0) or 0
    primary = status.get('currentPrimary', 'unknown')
    if phase != 'Cluster in healthy state' and phase != 'Healthy':
        all_healthy = False
    if ready < instances:
        all_healthy = False
    results.append(f'{ns}/{name}: phase={phase} ready={ready}/{instances} primary={primary}')
if not results:
    print('EMPTY')
else:
    print('HEALTHY' if all_healthy else 'UNHEALTHY')
print('; '.join(results))
" 2>/dev/null) || report="Failed to parse CNPG status"

  local health_line="${report%%$'\n'*}"
  local detail_line="${report##*$'\n'}"
  case "$health_line" in
    HEALTHY)
      add_check "cnpg-clusters" "ok" "$detail_line"
      ;;
    EMPTY)
      # An empty list previously surfaced as ok with the message "HEALTHY".
      add_check "cnpg-clusters" "warn" "No CNPG clusters deployed"
      ;;
    *)
      add_check "cnpg-clusters" "fail" "$detail_line"
      ;;
  esac
}
# Database connection counts (MySQL)
# Records one "mysql-connections" check from Threads_connected vs
# max_connections: fail above 80%, warn above 60% (or when the values cannot
# be read), ok otherwise.
check_mysql_connections() {
  if $DRY_RUN; then
    add_check "mysql-connections" "ok" "DRY RUN: would check MySQL connection counts"
    return
  fi

  local mysql_pod
  mysql_pod=$($KUBECTL get pods -n dbaas -l app=mysql-cluster -o name 2>/dev/null | head -1) || true
  if [ -z "$mysql_pod" ]; then
    mysql_pod="sts/mysql-cluster"
  fi

  local conn_info
  conn_info=$($KUBECTL exec "$mysql_pod" -n dbaas -- mysql -N -e \
    "SELECT 'threads_connected', VARIABLE_VALUE FROM performance_schema.global_status WHERE VARIABLE_NAME='Threads_connected' UNION ALL SELECT 'max_connections', VARIABLE_VALUE FROM performance_schema.global_variables WHERE VARIABLE_NAME='max_connections'" 2>/dev/null) || {
    add_check "mysql-connections" "warn" "Cannot query MySQL connection info"
    return
  }

  local threads_connected max_connections
  threads_connected=$(printf '%s\n' "$conn_info" | awk '$1 == "threads_connected" {print $2; exit}') || threads_connected=""
  max_connections=$(printf '%s\n' "$conn_info" | awk '$1 == "max_connections" {print $2; exit}') || max_connections=""

  # Anything that is not a plain non-negative integer would make the
  # arithmetic below raise an error and abort the script under `set -e`.
  [[ "$threads_connected" =~ ^[0-9]+$ ]] || threads_connected="unknown"
  [[ "$max_connections" =~ ^[0-9]+$ ]] || max_connections="unknown"

  if [ "$threads_connected" != "unknown" ] && [ "$max_connections" != "unknown" ] \
    && [ "$((10#$max_connections))" -gt 0 ]; then
    # 10# forces base 10 so a leading zero is not parsed as octal.
    local pct=$((10#$threads_connected * 100 / 10#$max_connections))
    if [ "$pct" -gt 80 ]; then
      add_check "mysql-connections" "fail" "MySQL connections at ${pct}%: $threads_connected/$max_connections"
    elif [ "$pct" -gt 60 ]; then
      add_check "mysql-connections" "warn" "MySQL connections at ${pct}%: $threads_connected/$max_connections"
    else
      add_check "mysql-connections" "ok" "MySQL connections: $threads_connected/$max_connections (${pct}%)"
    fi
  else
    add_check "mysql-connections" "warn" "MySQL connections: threads=$threads_connected max=$max_connections"
  fi
}
# Run all checks
for check_fn in check_mysql_gr check_mysql_pods check_cnpg check_mysql_connections; do
  "$check_fn"
done

# Determine overall status
# Severity order: any "fail" beats any "warn", which beats "ok".
OVERALL=$(python3 -c "
import sys, json

seen = {entry['status'] for entry in json.load(sys.stdin)}
print(next((level for level in ('fail', 'warn') if level in seen), 'ok'))
" <<<"$CHECKS")

# Pretty-print the final report.
printf '{"status": "%s", "agent": "%s", "checks": %s}\n' "$OVERALL" "$AGENT" "$CHECKS" \
  | python3 -m json.tool

217
.claude/scripts/deploy-status.sh Executable file
View file

@ -0,0 +1,217 @@
#!/usr/bin/env bash
# deploy-status: global settings and command-line parsing.
set -euo pipefail

AGENT="deploy-status"
KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config"

# --dry-run makes every check report what it would do instead of running.
DRY_RUN=false
for arg; do
  if [[ "$arg" == "--dry-run" ]]; then
    DRY_RUN=true
  fi
done

# JSON array (as text) that add_check appends to.
CHECKS="[]"
# Append one check result to the CHECKS JSON array.
# Globals:   CHECKS (read and rewritten)
# Arguments: $1 - check name, $2 - status (ok/warn/fail), $3 - message
add_check() {
  local name="$1" status="$2" message="$3"
  # Messages here carry kubelet/registry text, which may contain quotes or
  # backslashes. Passing the values as argv (instead of pasting them into the
  # Python source) keeps them from breaking or altering the program.
  CHECKS=$(printf '%s' "$CHECKS" | python3 -c '
import sys, json

name, status, message = sys.argv[1:4]
checks = json.load(sys.stdin)
checks.append({"name": name, "status": status, "message": message})
json.dump(checks, sys.stdout)
' "$name" "$status" "$message")
}
# Stalled rollouts: a Deployment is reported when its Progressing condition
# or its Available condition has status False.
check_stalled_rollouts() {
  if $DRY_RUN; then
    add_check "stalled-rollouts" "ok" "DRY RUN: would check for stalled deployment rollouts"
    return
  fi

  local stalled
  stalled=$($KUBECTL get deployments --all-namespaces -o json 2>/dev/null | python3 -c "
import sys, json

findings = []
for item in json.load(sys.stdin).get('items', []):
    ns = item['metadata']['namespace']
    name = item['metadata']['name']
    for cond in item.get('status', {}).get('conditions', []):
        if cond.get('status') != 'False':
            continue
        kind = cond.get('type')
        why = cond.get('reason', 'unknown')
        if kind == 'Progressing':
            findings.append(f'{ns}/{name}: {why}')
        elif kind == 'Available':
            findings.append(f'{ns}/{name}: unavailable ({why})')
print('; '.join(findings))
" 2>/dev/null) || stalled="Failed to check deployments"

  # Any text at all (findings or the failure marker) is reported as a failure.
  if [ -n "$stalled" ]; then
    add_check "stalled-rollouts" "fail" "Stalled rollouts: $stalled"
    return
  fi
  add_check "stalled-rollouts" "ok" "No stalled rollouts detected"
}
# Unavailable replicas: a Deployment is reported when it has unavailable
# replicas or fewer ready replicas than spec.replicas (default 1).
check_unavailable_replicas() {
  if $DRY_RUN; then
    add_check "unavailable-replicas" "ok" "DRY RUN: would check for deployments with unavailable replicas"
    return
  fi

  local unavail
  unavail=$($KUBECTL get deployments --all-namespaces -o json 2>/dev/null | python3 -c "
import sys, json

short = []
for item in json.load(sys.stdin).get('items', []):
    ns = item['metadata']['namespace']
    name = item['metadata']['name']
    wanted = item.get('spec', {}).get('replicas', 1)
    state = item.get('status', {})
    # 'or 0' turns an explicit null into 0.
    ready = state.get('readyReplicas', 0) or 0
    missing = state.get('unavailableReplicas', 0) or 0
    if missing > 0 or ready < wanted:
        short.append(f'{ns}/{name}: {ready}/{wanted} ready, {missing} unavailable')
print('; '.join(short))
" 2>/dev/null) || unavail="Failed to check replicas"

  if [ -n "$unavail" ]; then
    add_check "unavailable-replicas" "warn" "Unavailable replicas: $unavail"
    return
  fi
  add_check "unavailable-replicas" "ok" "All deployments have desired replicas ready"
}
# Image pull errors: reports every container or init container waiting with
# reason ImagePullBackOff, ErrImagePull or InvalidImageName.
check_image_pull_errors() {
  if $DRY_RUN; then
    add_check "image-pull-errors" "ok" "DRY RUN: would check for ImagePullBackOff/ErrImagePull pods"
    return
  fi

  local pull_errors
  pull_errors=$($KUBECTL get pods --all-namespaces -o json 2>/dev/null | python3 -c "
import sys, json

PULL_REASONS = ('ImagePullBackOff', 'ErrImagePull', 'InvalidImageName')
problems = []
for pod in json.load(sys.stdin).get('items', []):
    ns = pod['metadata']['namespace']
    name = pod['metadata']['name']
    state = pod.get('status', {})
    containers = state.get('containerStatuses', []) + state.get('initContainerStatuses', [])
    for cs in containers:
        waiting = cs.get('state', {}).get('waiting', {})
        reason = waiting.get('reason', '')
        if reason not in PULL_REASONS:
            continue
        image = cs.get('image', 'unknown')
        # Only the first 100 characters of the message are kept.
        msg = waiting.get('message', '')[:100]
        problems.append(f'{ns}/{name}: {reason} image={image} ({msg})')
print('; '.join(problems))
" 2>/dev/null) || pull_errors="Failed to check image pulls"

  if [ -n "$pull_errors" ]; then
    add_check "image-pull-errors" "fail" "Image pull errors: $pull_errors"
    return
  fi
  add_check "image-pull-errors" "ok" "No image pull errors found"
}
# High restart counts: reports containers whose restartCount is 5 or more.
# The count read here is the container's total restartCount; no time window
# is applied. At most the 20 highest counts are listed.
check_recent_restarts() {
  if $DRY_RUN; then
    add_check "recent-restarts" "ok" "DRY RUN: would check for pods with high restart counts"
    return
  fi

  local restarts
  restarts=$($KUBECTL get pods --all-namespaces -o json 2>/dev/null | python3 -c "
import sys, json

noisy = []
for pod in json.load(sys.stdin).get('items', []):
    ns = pod['metadata']['namespace']
    name = pod['metadata']['name']
    for cs in pod.get('status', {}).get('containerStatuses', []):
        count = cs.get('restartCount', 0)
        if count < 5:
            continue
        container = cs['name']
        noisy.append((count, f'{ns}/{name}:{container} restarts={count}'))
# Highest restart count first; ties keep their discovery order.
noisy.sort(key=lambda pair: pair[0], reverse=True)
print('; '.join(text for _, text in noisy[:20]))
" 2>/dev/null) || restarts="Failed to check restarts"

  if [ -n "$restarts" ]; then
    add_check "recent-restarts" "warn" "High restart counts: $restarts"
    return
  fi
  add_check "recent-restarts" "ok" "No pods with 5+ restarts"
}
# CrashLoopBackOff: reports every container currently waiting with reason
# CrashLoopBackOff, together with its restart count.
check_crashloop() {
  if $DRY_RUN; then
    add_check "crashloop" "ok" "DRY RUN: would check for CrashLoopBackOff pods"
    return
  fi

  local crashloop
  crashloop=$($KUBECTL get pods --all-namespaces -o json 2>/dev/null | python3 -c "
import sys, json

looping = []
for pod in json.load(sys.stdin).get('items', []):
    ns = pod['metadata']['namespace']
    name = pod['metadata']['name']
    for cs in pod.get('status', {}).get('containerStatuses', []):
        waiting = cs.get('state', {}).get('waiting', {})
        if waiting.get('reason') != 'CrashLoopBackOff':
            continue
        container = cs['name']
        restarts = cs.get('restartCount', 0)
        looping.append(f'{ns}/{name}:{container} restarts={restarts}')
print('; '.join(looping))
" 2>/dev/null) || crashloop="Failed to check crashloop"

  if [ -n "$crashloop" ]; then
    add_check "crashloop" "fail" "CrashLoopBackOff: $crashloop"
    return
  fi
  add_check "crashloop" "ok" "No CrashLoopBackOff pods"
}
# Run all checks
for check_fn in \
  check_stalled_rollouts \
  check_unavailable_replicas \
  check_image_pull_errors \
  check_recent_restarts \
  check_crashloop; do
  "$check_fn"
done

# Determine overall status
# Severity order: any "fail" beats any "warn", which beats "ok".
OVERALL=$(python3 -c "
import sys, json

seen = {entry['status'] for entry in json.load(sys.stdin)}
print(next((level for level in ('fail', 'warn') if level in seen), 'ok'))
" <<<"$CHECKS")

# Pretty-print the final report.
printf '{"status": "%s", "agent": "%s", "checks": %s}\n' "$OVERALL" "$AGENT" "$CHECKS" \
  | python3 -m json.tool

144
.claude/scripts/dns-check.sh Executable file
View file

@ -0,0 +1,144 @@
#!/usr/bin/env bash
# dns-check: global settings and command-line parsing.
set -euo pipefail

AGENT="dns-check"
KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config"

# Resolvers to compare: internal DNS server (Technitium) and a public one.
INTERNAL_DNS="10.0.20.100"
PUBLIC_DNS="1.1.1.1"

# Services to check
SERVICES=(
  "grafana.viktorbarzin.me"
  "prometheus.viktorbarzin.me"
  "nextcloud.viktorbarzin.me"
  "authentik.viktorbarzin.me"
  "viktorbarzin.me"
)

# --dry-run makes every check report what it would do instead of running.
DRY_RUN=false
for arg; do
  if [[ "$arg" == "--dry-run" ]]; then
    DRY_RUN=true
  fi
done

# One JSON object (as text) per recorded check.
checks=()
# Append one check result, as a JSON object string, to the `checks` array.
# Globals:   checks (appended)
# Arguments: $1 - check name, $2 - status (ok/warn/fail), $3 - message
add_check() {
  local name="$1" status="$2" message="$3"
  # Escape the characters that would otherwise produce invalid JSON
  # (backslash first, so later escapes are not doubled). The
  # `"key": "value"` spacing of the emitted object is unchanged.
  local field escaped
  local -a fields=()
  for field in "$name" "$status" "$message"; do
    escaped=${field//\\/\\\\}
    escaped=${escaped//\"/\\\"}
    escaped=${escaped//$'\n'/\\n}
    escaped=${escaped//$'\r'/\\r}
    escaped=${escaped//$'\t'/\\t}
    fields+=("$escaped")
  done
  checks+=("{\"name\": \"${fields[0]}\", \"status\": \"${fields[1]}\", \"message\": \"${fields[2]}\"}")
}
# Resolve every entry of SERVICES against INTERNAL_DNS and PUBLIC_DNS and
# record one "dns-resolution" check. A missing internal answer is a failure;
# a missing public answer or differing answers are not.
check_dns_resolution() {
  if $DRY_RUN; then
    add_check "dns-resolution" "ok" "dry-run: would resolve ${#SERVICES[@]} services via internal and public DNS"
    return
  fi

  local failed=0 differing=0 resolved=0
  local failure_notes="" differing_notes=""
  local from_internal from_public
  for svc in "${SERVICES[@]}"; do
    # Only the first answer line of each resolver is compared.
    from_internal=$(dig +short "$svc" @"$INTERNAL_DNS" A 2>/dev/null | head -1) || from_internal=""
    from_public=$(dig +short "$svc" @"$PUBLIC_DNS" A 2>/dev/null | head -1) || from_public=""

    if [ -z "$from_internal" ]; then
      failed=$((failed + 1))
      if [ -z "$from_public" ]; then
        failure_notes="${failure_notes}${svc} (both resolvers failed); "
      else
        failure_notes="${failure_notes}${svc} (internal DNS failed); "
      fi
      continue
    fi

    # Internal resolution worked. Public might use CNAME/proxy, so an empty
    # public answer is not necessarily a failure.
    resolved=$((resolved + 1))
    if [ -n "$from_public" ] && [ "$from_internal" != "$from_public" ]; then
      # Mismatch is informational — Cloudflare proxy IPs differ from internal IPs
      differing=$((differing + 1))
      differing_notes="${differing_notes}${svc} (internal=${from_internal} public=${from_public}); "
    fi
  done

  if [ "$failed" -gt 0 ]; then
    add_check "dns-resolution" "fail" "${failed} DNS failures: ${failure_notes}"
  elif [ "$differing" -gt 0 ]; then
    add_check "dns-resolution" "ok" "${resolved}/${#SERVICES[@]} resolved. ${differing} internal/public mismatches (expected with Cloudflare proxy): ${differing_notes}"
  else
    add_check "dns-resolution" "ok" "All ${resolved}/${#SERVICES[@]} services resolved successfully"
  fi
}
# Record a "technitium" check: warn when no Technitium pod is found, fail
# when any found pod is not Running, ok otherwise.
check_technitium_health() {
  if $DRY_RUN; then
    add_check "technitium" "ok" "dry-run: would check Technitium DNS server pod health"
    return
  fi

  # kubectl exits 0 with empty output when the selector matches nothing, so
  # the name-based fallback is driven by emptiness, not by the exit status.
  local tech_pods
  tech_pods=$($KUBECTL get pods -A -l app.kubernetes.io/name=technitium --no-headers 2>/dev/null) || tech_pods=""
  if [ -z "$tech_pods" ]; then
    tech_pods=$($KUBECTL get pods -A --no-headers 2>/dev/null | grep -i technitium) || tech_pods=""
  fi
  if [ -z "$tech_pods" ]; then
    add_check "technitium" "warn" "No Technitium pods found"
    return
  fi

  # awk always prints exactly one number. `grep -c ... || echo "0"` printed
  # "0" twice when nothing matched, which made the integer test below error.
  local not_running
  not_running=$(printf '%s\n' "$tech_pods" | awk 'NF && !/Running/ {n++} END {print n + 0}')
  if [ "$not_running" -gt 0 ]; then
    add_check "technitium" "fail" "Technitium pod(s) not running"
  else
    add_check "technitium" "ok" "Technitium DNS server pod(s) running"
  fi
}
# Record a "coredns" check from the kube-system pods labelled
# k8s-app=kube-dns: warn when they cannot be listed or none exist, fail when
# any is not Running, ok otherwise.
check_coredns_health() {
  if $DRY_RUN; then
    add_check "coredns" "ok" "dry-run: would check CoreDNS pod health"
    return
  fi

  local coredns_pods
  coredns_pods=$($KUBECTL get pods -n kube-system -l k8s-app=kube-dns --no-headers 2>/dev/null) || {
    add_check "coredns" "warn" "Failed to query CoreDNS pods"
    return
  }
  if [ -z "$coredns_pods" ]; then
    add_check "coredns" "warn" "No CoreDNS pods found"
    return
  fi

  # awk always prints exactly one number. `grep -c ... || echo "0"` printed
  # "0" twice when nothing matched, which made the integer test below error.
  local total not_running
  total=$(printf '%s\n' "$coredns_pods" | awk 'NF {n++} END {print n + 0}')
  not_running=$(printf '%s\n' "$coredns_pods" | awk 'NF && !/Running/ {n++} END {print n + 0}')
  if [ "$not_running" -gt 0 ]; then
    add_check "coredns" "fail" "${not_running}/${total} CoreDNS pod(s) not running"
  else
    add_check "coredns" "ok" "All ${total} CoreDNS pod(s) running"
  fi
}
# Run every DNS check; each one appends a JSON object to `checks`.
check_dns_resolution
check_technitium_health
check_coredns_health

# Output JSON
# Overall verdict: the first "fail" wins outright, otherwise any "warn"
# downgrades the default "ok".
overall="ok"
for entry in "${checks[@]}"; do
  entry_status=$(jq -r '.status' <<<"$entry")
  case "$entry_status" in
    fail)
      overall="fail"
      break
      ;;
    warn)
      overall="warn"
      ;;
  esac
done

checks_csv=$(IFS=,; echo "${checks[*]}")
printf '{"status": "%s", "agent": "%s", "checks": [%s]}\n' "$overall" "$AGENT" "$checks_csv"

View file

@ -0,0 +1,281 @@
#!/usr/bin/env bash
# monitoring-health: global settings and command-line parsing.
set -euo pipefail

AGENT="monitoring-health"
MONITORING_NS="monitoring"
KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config"

# --dry-run makes every check report what it would do instead of running.
DRY_RUN=false
for arg; do
  if [[ "$arg" == "--dry-run" ]]; then
    DRY_RUN=true
  fi
done

# One JSON object (as text) per recorded check.
checks=()
# Append one check result, as a JSON object string, to the `checks` array.
# Globals:   checks (appended)
# Arguments: $1 - check name, $2 - status (ok/warn/fail), $3 - message
add_check() {
  local name="$1" status="$2" message="$3"
  # Escape the characters that would otherwise produce invalid JSON
  # (backslash first, so later escapes are not doubled). The
  # `"key": "value"` spacing of the emitted object is unchanged.
  local field escaped
  local -a fields=()
  for field in "$name" "$status" "$message"; do
    escaped=${field//\\/\\\\}
    escaped=${escaped//\"/\\\"}
    escaped=${escaped//$'\n'/\\n}
    escaped=${escaped//$'\r'/\\r}
    escaped=${escaped//$'\t'/\\t}
    fields+=("$escaped")
  done
  checks+=("{\"name\": \"${fields[0]}\", \"status\": \"${fields[1]}\", \"message\": \"${fields[2]}\"}")
}
# Record a "prometheus" check: fail when no server pod is found or it is not
# Running, warn when its health endpoint gives no positive answer, ok (with
# the scrape-target summary) otherwise.
check_prometheus() {
  if $DRY_RUN; then
    add_check "prometheus" "ok" "dry-run: would check Prometheus server health"
    return
  fi

  # Discover Prometheus server pod via labels
  # Each lookup ends in `|| true`: under `set -euo pipefail` a kubectl error
  # or a grep without matches would otherwise abort the whole script before
  # any check is recorded.
  local prom_pod
  prom_pod=$($KUBECTL get pods -n "$MONITORING_NS" -l app.kubernetes.io/name=prometheus,app.kubernetes.io/component=server -o name 2>/dev/null | head -1) || true
  if [ -z "$prom_pod" ]; then
    prom_pod=$($KUBECTL get pods -n "$MONITORING_NS" -l app=prometheus,component=server -o name 2>/dev/null | head -1) || true
  fi
  if [ -z "$prom_pod" ]; then
    prom_pod=$($KUBECTL get pods -n "$MONITORING_NS" -o name 2>/dev/null | grep prometheus-server | head -1) || true
  fi
  if [ -z "$prom_pod" ]; then
    add_check "prometheus" "fail" "No Prometheus server pod found in $MONITORING_NS"
    return
  fi

  local phase
  phase=$($KUBECTL get "$prom_pod" -n "$MONITORING_NS" -o jsonpath='{.status.phase}' 2>/dev/null) || phase=""
  if [ "$phase" != "Running" ]; then
    add_check "prometheus" "fail" "Prometheus server pod phase: ${phase:-unknown}"
    return
  fi

  # Check Prometheus is responding
  # A failed probe leaves the answer empty. The previous "unhealthy" marker
  # itself matched the case-insensitive "healthy" pattern below.
  local prom_healthy
  prom_healthy=$($KUBECTL exec "$prom_pod" -n "$MONITORING_NS" -c prometheus-server -- \
    wget -q -O- "http://localhost:9090/-/healthy" 2>/dev/null) || prom_healthy=""
  if ! grep -qi "ok\|healthy" <<<"$prom_healthy"; then
    add_check "prometheus" "warn" "Prometheus server running but health check unclear"
    return
  fi

  # Check target scraping
  # Fetch and parse are separate steps so a failed fetch produces a single
  # "unknown" instead of two concatenated fallbacks.
  local targets_json targets_up
  targets_json=$($KUBECTL exec "$prom_pod" -n "$MONITORING_NS" -c prometheus-server -- \
    wget -q -O- "http://localhost:9090/api/v1/targets" 2>/dev/null) || targets_json=""
  targets_up=$(printf '%s' "$targets_json" | python3 -c "
import sys, json

try:
    data = json.load(sys.stdin)
    active = data.get('data', {}).get('activeTargets', [])
    up = sum(1 for t in active if t.get('health') == 'up')
    print(f'{up}/{len(active)}')
except Exception:
    print('unknown')
" 2>/dev/null) || targets_up="unknown"
  add_check "prometheus" "ok" "Prometheus server healthy, targets: ${targets_up:-unknown} up"
}
# Record an "alertmanager" check: fail when no pod is found or it is not
# Running, warn when alerts cannot be queried or any alert is active, ok
# otherwise.
check_alertmanager() {
  if $DRY_RUN; then
    add_check "alertmanager" "ok" "dry-run: would check Alertmanager health"
    return
  fi

  # Discover Alertmanager pod
  # `|| true`: under `set -euo pipefail` a grep without matches would
  # otherwise abort the script before the "not found" check is recorded.
  local am_pod
  am_pod=$($KUBECTL get pods -n "$MONITORING_NS" -l app.kubernetes.io/name=alertmanager -o name 2>/dev/null | head -1) || true
  if [ -z "$am_pod" ]; then
    am_pod=$($KUBECTL get pods -n "$MONITORING_NS" -o name 2>/dev/null | grep alertmanager | head -1) || true
  fi
  if [ -z "$am_pod" ]; then
    add_check "alertmanager" "fail" "No Alertmanager pod found in $MONITORING_NS"
    return
  fi

  local phase
  phase=$($KUBECTL get "$am_pod" -n "$MONITORING_NS" -o jsonpath='{.status.phase}' 2>/dev/null) || phase=""
  if [ "$phase" != "Running" ]; then
    add_check "alertmanager" "fail" "Alertmanager pod phase: ${phase:-unknown}"
    return
  fi

  # Check firing alerts
  # Fetch and parse are separate steps: with both in one pipeline a failed
  # fetch printed the Python fallback and the shell fallback, giving
  # "unknown\nunknown", which slipped past the "unknown" test below.
  local alerts_json alert_info
  alerts_json=$($KUBECTL exec "$am_pod" -n "$MONITORING_NS" -- \
    wget -q -O- "http://localhost:9093/api/v2/alerts?active=true" 2>/dev/null) || alerts_json=""
  alert_info=$(printf '%s' "$alerts_json" | python3 -c "
import sys, json

try:
    alerts = json.load(sys.stdin)
    firing = [a for a in alerts if a.get('status', {}).get('state') == 'active']
    print(len(firing))
except Exception:
    print('unknown')
" 2>/dev/null) || alert_info="unknown"
  [[ "$alert_info" =~ ^[0-9]+$ ]] || alert_info="unknown"

  # Check silences
  local silences_json silence_count
  silences_json=$($KUBECTL exec "$am_pod" -n "$MONITORING_NS" -- \
    wget -q -O- "http://localhost:9093/api/v2/silences" 2>/dev/null) || silences_json=""
  silence_count=$(printf '%s' "$silences_json" | python3 -c "
import sys, json

try:
    silences = json.load(sys.stdin)
    active = [s for s in silences if s.get('status', {}).get('state') == 'active']
    print(len(active))
except Exception:
    print('0')
" 2>/dev/null) || silence_count="0"
  [[ "$silence_count" =~ ^[0-9]+$ ]] || silence_count="0"

  if [ "$alert_info" = "unknown" ]; then
    add_check "alertmanager" "warn" "Alertmanager running but could not query alerts"
  else
    local status="ok"
    if [ "$alert_info" -gt 0 ]; then
      status="warn"
    fi
    add_check "alertmanager" "$status" "Alertmanager healthy: $alert_info firing alerts, $silence_count active silences"
  fi
}
# Record a "grafana" check: fail when no pod is found or it is not Running,
# warn when the datasource list cannot be read, ok otherwise.
check_grafana() {
  if $DRY_RUN; then
    add_check "grafana" "ok" "dry-run: would check Grafana health"
    return
  fi

  # Discover Grafana pod
  # `|| true`: under `set -euo pipefail` a grep without matches would
  # otherwise abort the script before the "not found" check is recorded.
  local grafana_pod
  grafana_pod=$($KUBECTL get pods -n "$MONITORING_NS" -l app.kubernetes.io/name=grafana -o name 2>/dev/null | head -1) || true
  if [ -z "$grafana_pod" ]; then
    grafana_pod=$($KUBECTL get pods -n "$MONITORING_NS" -o name 2>/dev/null | grep grafana | grep -v test | head -1) || true
  fi
  if [ -z "$grafana_pod" ]; then
    add_check "grafana" "fail" "No Grafana pod found in $MONITORING_NS"
    return
  fi

  local phase
  phase=$($KUBECTL get "$grafana_pod" -n "$MONITORING_NS" -o jsonpath='{.status.phase}' 2>/dev/null) || phase=""
  if [ "$phase" != "Running" ]; then
    add_check "grafana" "fail" "Grafana pod phase: ${phase:-unknown}"
    return
  fi

  # Check datasource connectivity
  # Fetch and parse are separate steps: with both in one pipeline a failed
  # curl (e.g. an auth error under -f) printed "unknown" twice, which
  # slipped past the "unknown" test below.
  local ds_json ds_info
  ds_json=$($KUBECTL exec "$grafana_pod" -n "$MONITORING_NS" -- \
    curl -sf "http://localhost:3000/api/datasources" 2>/dev/null) || ds_json=""
  ds_info=$(printf '%s' "$ds_json" | python3 -c "
import sys, json

try:
    ds = json.load(sys.stdin)
    names = ', '.join(d.get('name', '?') for d in ds)
    print(f'{len(ds)} datasources: {names}')
except Exception:
    print('unknown')
" 2>/dev/null) || ds_info="unknown"

  if [ -z "$ds_info" ] || [ "$ds_info" = "unknown" ]; then
    add_check "grafana" "warn" "Grafana running but could not query datasources (may need auth)"
  else
    add_check "grafana" "ok" "Grafana healthy, $ds_info"
  fi
}
# Record one "exporter-<name>" check per known exporter: ok when its pod is
# Running, warn when the pod is missing or in another phase. Each exporter
# is looked up in MONITORING_NS first, then in all namespaces.
check_snmp_exporters() {
  if $DRY_RUN; then
    add_check "snmp-exporters" "ok" "dry-run: would check SNMP exporter pods"
    return
  fi

  local exporters=("snmp-exporter" "idrac-redfish-exporter" "proxmox-exporter")
  local exporter pod located ns name phase
  for exporter in "${exporters[@]}"; do
    # `|| true`: under `set -euo pipefail` a grep without matches would
    # otherwise abort the script, so the all-namespaces fallback below could
    # never be reached.
    pod=$($KUBECTL get pods -n "$MONITORING_NS" -o name 2>/dev/null | grep -- "$exporter" | head -1) || true
    if [ -n "$pod" ]; then
      phase=$($KUBECTL get "$pod" -n "$MONITORING_NS" -o jsonpath='{.status.phase}' 2>/dev/null) || phase=""
      if [ "$phase" = "Running" ]; then
        add_check "exporter-$exporter" "ok" "$exporter running"
      else
        add_check "exporter-$exporter" "warn" "$exporter phase: $phase"
      fi
      continue
    fi

    # Try all namespaces
    located=$($KUBECTL get pods --all-namespaces -o custom-columns=NS:.metadata.namespace,NAME:.metadata.name --no-headers 2>/dev/null \
      | grep -- "$exporter" | head -1) || true
    if [ -z "$located" ]; then
      add_check "exporter-$exporter" "warn" "$exporter pod not found"
      continue
    fi

    # The custom-columns output is "<namespace> <pod name>".
    read -r ns name <<<"$located"
    phase=$($KUBECTL get pod "$name" -n "$ns" -o jsonpath='{.status.phase}' 2>/dev/null) || phase=""
    if [ "$phase" = "Running" ]; then
      add_check "exporter-$exporter" "ok" "$exporter running in $ns"
    else
      add_check "exporter-$exporter" "warn" "$exporter phase: $phase in $ns"
    fi
  done
}
# Record a "prometheus-storage" check: warn when no prometheus-server PVC or
# pod can be found, otherwise ok with the `df -h /data` usage summary read
# from inside the pod.
check_prometheus_storage() {
  if $DRY_RUN; then
    add_check "prometheus-storage" "ok" "dry-run: would check Prometheus storage usage"
    return
  fi

  # `|| true`: under `set -euo pipefail` a grep without matches would
  # otherwise abort the script before the warning below is recorded.
  local prom_pvc
  prom_pvc=$($KUBECTL get pvc -n "$MONITORING_NS" -o name 2>/dev/null | grep prometheus-server | head -1) || true
  if [ -z "$prom_pvc" ]; then
    add_check "prometheus-storage" "warn" "No Prometheus server PVC found"
    return
  fi

  # Check storage usage from inside the server pod
  local prom_pod
  prom_pod=$($KUBECTL get pods -n "$MONITORING_NS" -l app.kubernetes.io/name=prometheus,app.kubernetes.io/component=server -o name 2>/dev/null | head -1) || true
  if [ -z "$prom_pod" ]; then
    prom_pod=$($KUBECTL get pods -n "$MONITORING_NS" -o name 2>/dev/null | grep prometheus-server | head -1) || true
  fi
  if [ -z "$prom_pod" ]; then
    add_check "prometheus-storage" "warn" "Could not check Prometheus storage"
    return
  fi

  # The last df line carries the figures: $2 size, $3 used, $5 use%.
  local df_out storage_info=""
  df_out=$($KUBECTL exec "$prom_pod" -n "$MONITORING_NS" -c prometheus-server -- \
    df -h /data 2>/dev/null) || df_out=""
  if [ -n "$df_out" ]; then
    storage_info=$(printf '%s\n' "$df_out" | tail -1 | awk '{printf "%s used of %s (%s)", $3, $2, $5}') || storage_info=""
  fi
  add_check "prometheus-storage" "ok" "Prometheus storage: ${storage_info:-unknown}"
}
# Run checks
for check_fn in \
  check_prometheus \
  check_alertmanager \
  check_grafana \
  check_snmp_exporters \
  check_prometheus_storage; do
  "$check_fn"
done

# Determine overall status
# Each entry is matched on the exact `"status": "<level>"` text that
# add_check emits. The first "fail" wins; otherwise any "warn" downgrades
# the default "ok".
overall="ok"
for entry in "${checks[@]}"; do
  if [[ "$entry" == *'"status": "fail"'* ]]; then
    overall="fail"
    break
  fi
  if [[ "$entry" == *'"status": "warn"'* ]]; then
    overall="warn"
  fi
done

# Output JSON
checks_json=$(IFS=,; echo "${checks[*]}")
printf '{"status": "%s", "agent": "%s", "checks": [%s]}\n' "$overall" "$AGENT" "$checks_json"

166
.claude/scripts/network-health.sh Executable file
View file

@ -0,0 +1,166 @@
#!/usr/bin/env bash
# network-health: global settings and command-line parsing.
set -euo pipefail

AGENT="network-health"
KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config"
# Helper script used for the pfSense status and WireGuard queries.
PFSENSE="python3 /Users/viktorbarzin/code/infra/.claude/pfsense.py"

# --dry-run makes every check report what it would do instead of running.
DRY_RUN=false
for arg; do
  if [[ "$arg" == "--dry-run" ]]; then
    DRY_RUN=true
  fi
done

# One JSON object (as text) per recorded check.
checks=()
# Append one check result, as a JSON object string, to the `checks` array.
# Globals:   checks (appended)
# Arguments: $1 - check name, $2 - status (ok/warn/fail), $3 - message
add_check() {
  local name="$1" status="$2" message="$3"
  # Messages here can embed raw pfsense.py output, so escape the characters
  # that would otherwise produce invalid JSON (backslash first, so later
  # escapes are not doubled). The `"key": "value"` spacing is unchanged.
  local field escaped
  local -a fields=()
  for field in "$name" "$status" "$message"; do
    escaped=${field//\\/\\\\}
    escaped=${escaped//\"/\\\"}
    escaped=${escaped//$'\n'/\\n}
    escaped=${escaped//$'\r'/\\r}
    escaped=${escaped//$'\t'/\\t}
    fields+=("$escaped")
  done
  checks+=("{\"name\": \"${fields[0]}\", \"status\": \"${fields[1]}\", \"message\": \"${fields[2]}\"}")
}
# Record a "pfsense" check from `$PFSENSE status`: fail when the command
# itself fails, warn when its output mentions error/fail/down (any case),
# ok otherwise.
check_pfsense_status() {
  if $DRY_RUN; then
    add_check "pfsense" "ok" "dry-run: would check pfSense system status via pfsense.py"
    return
  fi

  local report
  if ! report=$($PFSENSE status 2>/dev/null); then
    add_check "pfsense" "fail" "Failed to connect to pfSense via pfsense.py"
    return
  fi

  if ! grep -qi "error\|fail\|down" <<<"$report"; then
    add_check "pfsense" "ok" "pfSense system healthy"
    return
  fi
  # Quote the first three output lines, flattened onto one line.
  add_check "pfsense" "warn" "pfSense reported issues: $(echo "$report" | head -3 | tr '\n' ' ')"
}
# Record a "vpn" check from `$PFSENSE wireguard`: warn when the command
# fails or its output mentions error/fail/down (any case), ok otherwise.
check_vpn_status() {
  if $DRY_RUN; then
    add_check "vpn" "ok" "dry-run: would check VPN tunnel status via pfsense.py"
    return
  fi

  local report
  if ! report=$($PFSENSE wireguard 2>/dev/null); then
    add_check "vpn" "warn" "Failed to query VPN status via pfsense.py"
    return
  fi

  if ! grep -qi "error\|fail\|down" <<<"$report"; then
    add_check "vpn" "ok" "VPN tunnels healthy"
    return
  fi
  # Quote the first three output lines, flattened onto one line.
  add_check "vpn" "warn" "VPN issues detected: $(echo "$report" | head -3 | tr '\n' ' ')"
}
# Record a "metallb-speakers" check: warn when no speaker pod is found in
# metallb-system, fail when any speaker is not Running, ok otherwise.
check_metallb_speakers() {
  if $DRY_RUN; then
    add_check "metallb-speakers" "ok" "dry-run: would check MetalLB speaker pod health"
    return
  fi

  local ns="metallb-system"

  # Find MetalLB speaker pods via labels first
  # kubectl exits 0 with empty output when a selector matches nothing, so
  # each fallback is driven by emptiness rather than by the exit status.
  local speaker_pods="" selector
  for selector in app.kubernetes.io/component=speaker component=speaker; do
    speaker_pods=$($KUBECTL get pods -n "$ns" -l "$selector" --no-headers 2>/dev/null) || speaker_pods=""
    if [ -n "$speaker_pods" ]; then
      break
    fi
  done
  if [ -z "$speaker_pods" ]; then
    speaker_pods=$($KUBECTL get pods -n "$ns" --no-headers 2>/dev/null | grep -i speaker) || speaker_pods=""
  fi
  if [ -z "$speaker_pods" ]; then
    add_check "metallb-speakers" "warn" "No MetalLB speaker pods found in ${ns}"
    return
  fi

  # awk always prints exactly one number. `grep -c ... || echo "0"` printed
  # "0" twice when nothing matched, which made the integer test below error.
  local total not_running
  total=$(printf '%s\n' "$speaker_pods" | awk 'NF {n++} END {print n + 0}')
  not_running=$(printf '%s\n' "$speaker_pods" | awk 'NF && !/Running/ {n++} END {print n + 0}')
  if [ "$not_running" -gt 0 ]; then
    add_check "metallb-speakers" "fail" "${not_running}/${total} MetalLB speaker pod(s) not running"
  else
    add_check "metallb-speakers" "ok" "All ${total} MetalLB speaker pod(s) running"
  fi
}
# Record a "metallb-l2" check: warn when L2Advertisement resources cannot be
# listed, none exist, or the controller pod is missing / not Running; ok
# when at least one advertisement exists and a controller pod is Running.
check_metallb_l2() {
  if $DRY_RUN; then
    add_check "metallb-l2" "ok" "dry-run: would check MetalLB L2 advertisements"
    return
  fi

  local ns="metallb-system"

  # Check L2Advertisement CRDs
  local l2_ads
  l2_ads=$($KUBECTL get l2advertisements -n "$ns" -o json 2>/dev/null) || {
    add_check "metallb-l2" "warn" "Could not query L2Advertisement CRDs"
    return
  }

  local count
  count=$(printf '%s' "$l2_ads" | jq '.items | length' 2>/dev/null) || count=0
  [[ "$count" =~ ^[0-9]+$ ]] || count=0
  if [ "$count" -eq 0 ]; then
    add_check "metallb-l2" "warn" "No L2Advertisement resources found"
    return
  fi

  # Check MetalLB controller
  # kubectl exits 0 with empty output when the selector matches nothing, so
  # the name-based fallback is driven by emptiness, not by the exit status.
  local controller
  controller=$($KUBECTL get pods -n "$ns" -l app.kubernetes.io/component=controller --no-headers 2>/dev/null) || controller=""
  if [ -z "$controller" ]; then
    controller=$($KUBECTL get pods -n "$ns" --no-headers 2>/dev/null | grep -i controller) || controller=""
  fi

  if [ -z "$controller" ]; then
    add_check "metallb-l2" "warn" "${count} L2Advertisement(s) found but no controller pod"
  elif grep -q "Running" <<<"$controller"; then
    add_check "metallb-l2" "ok" "${count} L2Advertisement(s) configured, controller running"
  else
    add_check "metallb-l2" "warn" "${count} L2Advertisement(s) found but controller not running"
  fi
}
# Record a "node-connectivity" check: ping each k8s node once and fail when
# any node does not answer, ok otherwise.
check_node_connectivity() {
  if $DRY_RUN; then
    add_check "node-connectivity" "ok" "dry-run: would ping k8s nodes"
    return
  fi

  local nodes=("10.0.20.100" "10.0.20.101" "10.0.20.102" "10.0.20.103" "10.0.20.104")
  local names=("k8s-master" "k8s-node1" "k8s-node2" "k8s-node3" "k8s-node4")

  # `ping -W` takes milliseconds on macOS/BSD but seconds on Linux (iputils).
  # A bare `-W 2` therefore allows only 2 ms on macOS; pick the unit per
  # platform so the wait is two seconds everywhere.
  local -a wait_opt
  case "$(uname -s 2>/dev/null || true)" in
    Darwin | FreeBSD) wait_opt=(-W 2000) ;;
    *) wait_opt=(-W 2) ;;
  esac

  local failures=0
  local failure_details=""
  local i
  for i in "${!nodes[@]}"; do
    if ! ping -c 1 "${wait_opt[@]}" "${nodes[$i]}" >/dev/null 2>&1; then
      failures=$((failures + 1))
      failure_details="${failure_details}${names[$i]}(${nodes[$i]}) "
    fi
  done

  if [ "$failures" -gt 0 ]; then
    add_check "node-connectivity" "fail" "${failures} node(s) unreachable: ${failure_details}"
  else
    add_check "node-connectivity" "ok" "All ${#nodes[@]} nodes reachable"
  fi
}
# Run every network check; each one appends a JSON object to `checks`.
check_pfsense_status
check_vpn_status
check_metallb_speakers
check_metallb_l2
check_node_connectivity

# Output JSON
# Overall verdict: the first "fail" wins outright, otherwise any "warn"
# downgrades the default "ok".
overall="ok"
for entry in "${checks[@]}"; do
  entry_status=$(jq -r '.status' <<<"$entry")
  case "$entry_status" in
    fail)
      overall="fail"
      break
      ;;
    warn)
      overall="warn"
      ;;
  esac
done

checks_csv=$(IFS=,; echo "${checks[*]}")
printf '{"status": "%s", "agent": "%s", "checks": [%s]}\n' "$overall" "$AGENT" "$checks_csv"

174
.claude/scripts/nfs-health.sh Executable file
View file

@ -0,0 +1,174 @@
#!/usr/bin/env bash
# nfs-health: configuration and argument parsing.
# Usage: pass --dry-run to make every check report what it would do instead
# of contacting hosts. Any other argument is ignored.
set -euo pipefail

# Identity and tooling.
AGENT="nfs-health"
KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config"

# Hosts that are probed; NODES entries are "name:ip".
NFS_HOST="192.168.1.127"
SSH_USER="wizard"
NODES=(
  "k8s-master:10.0.20.100"
  "k8s-node1:10.0.20.101"
  "k8s-node2:10.0.20.102"
  "k8s-node3:10.0.20.103"
  "k8s-node4:10.0.20.104"
)

DRY_RUN=false
for arg; do
  if [[ "$arg" == "--dry-run" ]]; then
    DRY_RUN=true
  fi
done

# Accumulates one JSON object (as a string) per check.
checks=()
# Escape a string for use inside a JSON string literal.
# Handles backslash, double quote, newline, carriage return and tab; other
# control characters are passed through unchanged.
_json_escape() {
  local s="$1"
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  printf '%s' "$s"
}

# Append one check result to the global "checks" array as a JSON object.
# Arguments: $1 - check name, $2 - status (ok|warn|fail), $3 - message
# The '"status": "<value>"' spelling is kept exactly because the overall
# status scan at the end of the script greps for it.
add_check() {
  local name status message
  name=$(_json_escape "$1")
  status=$(_json_escape "$2")
  message=$(_json_escape "$3")
  checks+=("{\"name\": \"$name\", \"status\": \"$status\", \"message\": \"$message\"}")
}
# Record whether $NFS_HOST answers a single ping within five seconds.
# Globals: DRY_RUN, NFS_HOST (read)
# Calls:   add_check
check_nfs_reachable() {
  # Start from the pessimistic result and upgrade it when a branch succeeds.
  local verdict="fail"
  local detail="Proxmox NFS at $NFS_HOST is unreachable"

  if $DRY_RUN; then
    verdict="ok"
    detail="dry-run: would ping $NFS_HOST"
  elif timeout 5 ping -c 1 "$NFS_HOST" &>/dev/null; then
    verdict="ok"
    detail="Proxmox NFS at $NFS_HOST is reachable"
  fi

  add_check "nfs-reachable" "$verdict" "$detail"
}
# Count the NFS exports reported by the Proxmox host over SSH.
# Records "ok" with the count, "warn" when none are listed, and "fail" when
# the SSH command itself fails.
# Globals: DRY_RUN, NFS_HOST (read)
# Calls:   add_check
check_nfs_exports() {
  if $DRY_RUN; then
    add_check "nfs-exports" "ok" "dry-run: would check NFS exports on Proxmox"
    return
  fi

  local result
  if ! result=$(timeout 10 ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "root@$NFS_HOST" \
    "exportfs -v 2>/dev/null || cat /etc/exports 2>/dev/null" 2>/dev/null); then
    add_check "nfs-exports" "fail" "Could not check NFS exports on Proxmox via SSH"
    return
  fi

  # grep -c already prints "0" when nothing matches (and exits 1). Appending
  # a second "0" via "|| echo 0" would give "0\n0" and break the numeric
  # test, so only the exit status is neutralised here.
  local export_count
  export_count=$(grep -c '/' <<<"$result" || true)

  if [ "$export_count" -gt 0 ]; then
    add_check "nfs-exports" "ok" "$export_count NFS exports active on Proxmox"
  else
    add_check "nfs-exports" "warn" "No NFS exports found on Proxmox"
  fi
}
# Report how full /srv/nfs and /srv/nfs-ssd are on the Proxmox host.
# One check per filesystem row ("nfs-disk-<mountpoint>"): fail at >= 90%,
# warn at >= 80%, ok otherwise. A single "nfs-disk" warn is recorded when
# the SSH/df call fails or no usable row comes back.
# Globals: DRY_RUN, NFS_HOST (read)
# Calls:   add_check
check_nfs_disk_usage() {
  if $DRY_RUN; then
    add_check "nfs-disk" "ok" "dry-run: would check NFS disk usage"
    return
  fi

  # -P keeps every filesystem on one line even when the device name is long,
  # so the column positions below stay valid.
  local result
  if ! result=$(timeout 10 ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "root@$NFS_HOST" \
    "df -hP /srv/nfs /srv/nfs-ssd 2>/dev/null" 2>/dev/null); then
    add_check "nfs-disk" "warn" "Could not check NFS disk usage"
    return
  fi

  local parsed=0 pct mount
  # Columns: filesystem size used avail use% mountpoint
  while read -r _ _ _ _ pct mount _; do
    pct=${pct%\%}
    # Skips the header line and anything else without a numeric percentage.
    [[ "$pct" =~ ^[0-9]+$ ]] || continue
    parsed=$((parsed + 1))
    if [ "$pct" -ge 90 ]; then
      add_check "nfs-disk-$mount" "fail" "$mount is ${pct}% full"
    elif [ "$pct" -ge 80 ]; then
      add_check "nfs-disk-$mount" "warn" "$mount is ${pct}% full"
    else
      add_check "nfs-disk-$mount" "ok" "$mount is ${pct}% full"
    fi
  done <<<"$result"

  # Do not stay silent when df answered but nothing could be parsed.
  if [ "$parsed" -eq 0 ]; then
    add_check "nfs-disk" "warn" "No filesystem usage rows returned for /srv/nfs or /srv/nfs-ssd"
  fi
}
# Inspect the NFS mounts of one Kubernetes node over SSH and flag stale ones.
# Arguments: $1 - node name (used in the check name and messages)
#            $2 - node IP address to SSH to
# Globals:   DRY_RUN, SSH_USER (read)
# Calls:     add_check
# Records one "nfs-mounts-<node>" check: warn when no mounts are listed or
# SSH fails, fail when any mount point does not answer stat within five
# seconds, ok otherwise.
check_node_nfs_mounts() {
  local node_name="$1" node_ip="$2"

  if $DRY_RUN; then
    add_check "nfs-mounts-$node_name" "ok" "dry-run: would check NFS mounts on $node_name ($node_ip)"
    return
  fi

  # The remote grep exits non-zero when nothing matches, so "no mounts" and
  # "SSH failed" cannot be told apart here.
  local mount_output
  if ! mount_output=$(timeout 15 ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$SSH_USER@$node_ip" \
    "mount | grep nfs" 2>/dev/null </dev/null); then
    add_check "nfs-mounts-$node_name" "warn" "No NFS mounts found or SSH failed on $node_name ($node_ip)"
    return
  fi
  if [ -z "$mount_output" ]; then
    add_check "nfs-mounts-$node_name" "warn" "No NFS mounts found on $node_name"
    return
  fi

  local mount_count
  mount_count=$(wc -l <<<"$mount_output" | tr -d ' ')

  # Check for stale mounts by trying to stat each mount point.
  local stale_count=0
  local stale_mounts=""
  local line mount_point
  while IFS= read -r line; do
    # "mount" prints "<source> on <mountpoint> type ...": field 3 is the path.
    mount_point=$(awk '{print $3}' <<<"$line")
    [ -n "$mount_point" ] || continue
    # stdin is redirected from /dev/null: ssh would otherwise consume the
    # rest of the loop input and only the first mount would be checked.
    if ! timeout 10 ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$SSH_USER@$node_ip" \
      "timeout 5 stat '$mount_point' >/dev/null 2>&1" 2>/dev/null </dev/null; then
      stale_count=$((stale_count + 1))
      stale_mounts="$stale_mounts $mount_point"
    fi
  done <<<"$mount_output"

  if [ "$stale_count" -gt 0 ]; then
    add_check "nfs-mounts-$node_name" "fail" "$stale_count/$mount_count NFS mounts stale on $node_name:$stale_mounts"
  else
    add_check "nfs-mounts-$node_name" "ok" "$mount_count NFS mounts healthy on $node_name"
  fi
}
# Count PersistentVolumeClaims that are not Bound and whose JSON mentions
# "nfs" (case-insensitive), and record the result as "nfs-pvcs".
# Globals: DRY_RUN, KUBECTL (read; KUBECTL is expanded unquoted on purpose,
#          it holds the command plus its flags)
# Calls:   add_check
check_nfs_pvcs() {
  if $DRY_RUN; then
    add_check "nfs-pvcs" "ok" "dry-run: would check NFS-backed PVCs"
    return
  fi

  # The sentinel "error" is produced when the kubectl | python pipeline fails.
  local unbound
  unbound=$(
    $KUBECTL get pvc --all-namespaces --field-selector='status.phase!=Bound' -o json 2>/dev/null \
      | python3 -c "import sys,json; items=json.load(sys.stdin).get('items',[]); nfs=[i for i in items if 'nfs' in json.dumps(i).lower()]; print(len(nfs))" 2>/dev/null \
      || echo "error"
  )

  case "$unbound" in
    error)
      add_check "nfs-pvcs" "warn" "Could not check NFS PVC status"
      ;;
    0)
      add_check "nfs-pvcs" "ok" "All NFS-backed PVCs are bound"
      ;;
    *)
      add_check "nfs-pvcs" "fail" "$unbound NFS-backed PVCs are not bound"
      ;;
  esac
}
# Run checks: host-level probes first, then every node, then the cluster.
check_nfs_reachable
check_nfs_exports
check_nfs_disk_usage
for node_entry in "${NODES[@]}"; do
  # Entries are "name:ip".
  IFS=: read -r node_name node_ip <<<"$node_entry"
  check_node_nfs_mounts "$node_name" "$node_ip"
done
check_nfs_pvcs

# Determine overall status: the first "fail" ends the scan, a "warn"
# downgrades "ok".
overall="ok"
for c in "${checks[@]}"; do
  case "$c" in
    *'"status": "fail"'*)
      overall="fail"
      break
      ;;
    *'"status": "warn"'*)
      overall="warn"
      ;;
  esac
done

# Output JSON
checks_json=$(
  IFS=,
  echo "${checks[*]}"
)
printf '{"status": "%s", "agent": "%s", "checks": [%s]}\n' "$overall" "$AGENT" "$checks_json"

View file

@ -0,0 +1,214 @@
#!/usr/bin/env bash
# oom-investigator: configuration and argument parsing.
# Usage: pass --dry-run to have every check describe what it would do
# without querying the cluster. Other arguments are ignored.
set -euo pipefail

AGENT="oom-investigator"
# Command plus flags; expanded unquoted where it is used.
KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config"

DRY_RUN=false
for arg; do
  if [[ "$arg" == "--dry-run" ]]; then
    DRY_RUN=true
  fi
done

# JSON array (kept as a string) that add_check extends.
CHECKS="[]"
# Append one check result to the JSON array held in the global CHECKS.
# Arguments: $1 - check name, $2 - status (ok|warn|fail), $3 - message
# The values reach Python through argv, never by being pasted into the
# Python source, so quotes, backslashes and newlines in a message cannot
# break the script or run as code.
add_check() {
  local name="$1" status="$2" message="$3"
  CHECKS=$(printf '%s' "$CHECKS" | python3 -c '
import json, sys

checks = json.load(sys.stdin)
checks.append({"name": sys.argv[1], "status": sys.argv[2], "message": sys.argv[3]})
json.dump(checks, sys.stdout)
' "$name" "$status" "$message")
}
# Find OOMKilled containers across all namespaces.
# A container counts when its current or last terminated state has reason
# "OOMKilled"; a container matching on both states is listed twice.
# Records one "oom-killed-pods" check: ok when nothing is found, fail with a
# per-container summary otherwise, and warn when the pod list cannot be
# fetched or parsed (so an unreachable cluster is not reported as healthy).
# Globals: DRY_RUN, KUBECTL (read)
# Calls:   add_check
find_oomkilled() {
  if $DRY_RUN; then
    add_check "oom-killed-pods" "ok" "DRY RUN: would check for OOMKilled pods across all namespaces"
    return
  fi

  local pods_json
  if ! pods_json=$($KUBECTL get pods --all-namespaces -o json); then
    add_check "oom-killed-pods" "warn" "Could not list pods; OOMKilled status unknown"
    return
  fi

  # Python prints the hit count on line 1 and the joined summary on line 2.
  local report
  if ! report=$(python3 -c "
import sys, json

data = json.load(sys.stdin)
hits = []
for pod in data.get('items', []):
    ns = pod['metadata']['namespace']
    name = pod['metadata']['name']
    status = pod.get('status', {})
    for cs in status.get('containerStatuses', []) + status.get('initContainerStatuses', []):
        last = cs.get('lastState', {}).get('terminated', {})
        current = cs.get('state', {}).get('terminated', {})
        for state in [last, current]:
            if state.get('reason') == 'OOMKilled':
                hits.append('%s/%s:%s (restarts=%s, at=%s)' % (
                    ns, name, cs['name'], cs.get('restartCount', 0),
                    state.get('finishedAt', 'unknown')))
print(len(hits))
print('; '.join(hits))
" <<<"$pods_json" 2>/dev/null); then
    add_check "oom-killed-pods" "warn" "Could not parse pod list; OOMKilled status unknown"
    return
  fi

  local count="${report%%$'\n'*}"
  local summary="${report#*$'\n'}"
  if [ "$count" -eq 0 ]; then
    add_check "oom-killed-pods" "ok" "No OOMKilled pods found"
  else
    add_check "oom-killed-pods" "fail" "Found $count OOMKilled container(s): $summary"
  fi
}
# Check LimitRange defaults in namespaces with OOM events.
# For every namespace holding an OOMKilled container, list the default
# memory/cpu of each Container-type LimitRange entry, or "no LimitRange".
# Records one "limitranges" check: ok when no namespace qualifies, warn with
# the summary otherwise, and warn when the pod list cannot be read.
# Globals: DRY_RUN, KUBECTL (read)
# Calls:   add_check
check_limitranges() {
  if $DRY_RUN; then
    add_check "limitranges" "ok" "DRY RUN: would check LimitRange defaults"
    return
  fi

  local pods_json
  if ! pods_json=$($KUBECTL get pods --all-namespaces -o json); then
    add_check "limitranges" "warn" "Could not list pods; LimitRange check skipped"
    return
  fi

  local namespaces
  if ! namespaces=$(python3 -c "
import sys, json

data = json.load(sys.stdin)
ns_set = set()
for pod in data.get('items', []):
    status = pod.get('status', {})
    for cs in status.get('containerStatuses', []) + status.get('initContainerStatuses', []):
        for state in [cs.get('lastState', {}).get('terminated', {}), cs.get('state', {}).get('terminated', {})]:
            if state.get('reason') == 'OOMKilled':
                ns_set.add(pod['metadata']['namespace'])
for ns in sorted(ns_set):
    print(ns)
" <<<"$pods_json" 2>/dev/null); then
    add_check "limitranges" "warn" "Could not parse pod list; LimitRange check skipped"
    return
  fi

  if [ -z "$namespaces" ]; then
    add_check "limitranges" "ok" "No namespaces with OOMKilled pods to check"
    return
  fi

  local lr_info="" ns lr
  while IFS= read -r ns; do
    # The namespace is handed to Python as argv[1] rather than spliced into
    # the source. kubectl gets /dev/null so it cannot eat the loop input.
    lr=$($KUBECTL get limitrange -n "$ns" -o json 2>/dev/null </dev/null | python3 -c "
import sys, json

ns = sys.argv[1]
data = json.load(sys.stdin)
found = []
for item in data.get('items', []):
    for limit in item.get('spec', {}).get('limits', []):
        if limit.get('type') == 'Container':
            default_mem = limit.get('default', {}).get('memory', 'none')
            default_cpu = limit.get('default', {}).get('cpu', 'none')
            found.append(f'{ns}: default memory={default_mem}, cpu={default_cpu}')
print('; '.join(found))
" "$ns" 2>/dev/null) || lr=""
    if [ -n "$lr" ]; then
      lr_info="${lr_info}${lr}; "
    else
      lr_info="${lr_info}${ns}: no LimitRange; "
    fi
  done <<<"$namespaces"

  add_check "limitranges" "warn" "LimitRange defaults for OOM namespaces: ${lr_info%; }"
}
# Check VPA recommendations from Goldilocks.
# Lists up to 20 container recommendations (target memory/cpu and the upper
# memory bound) from all VerticalPodAutoscaler objects.
# Records one "vpa-recommendations" check: warn when no VPA objects can be
# listed or the list cannot be parsed, ok with the summary otherwise.
# Globals: DRY_RUN, KUBECTL (read)
# Calls:   add_check
check_vpa_recommendations() {
  if $DRY_RUN; then
    add_check "vpa-recommendations" "ok" "DRY RUN: would check VPA recommendations"
    return
  fi

  # A failing kubectl is treated like an empty list and gets the same message.
  local vpa_json
  if ! vpa_json=$($KUBECTL get vpa --all-namespaces -o json 2>/dev/null); then
    add_check "vpa-recommendations" "warn" "No VPA objects found — Goldilocks may not be deployed"
    return
  fi

  # Python prints the number of VPA objects on line 1, the summary on line 2.
  local report
  if ! report=$(python3 -c "
import sys, json

data = json.load(sys.stdin)
items = data.get('items', [])
recs = []
for vpa in items:
    ns = vpa['metadata']['namespace']
    name = vpa['metadata']['name']
    for cr in vpa.get('status', {}).get('recommendation', {}).get('containerRecommendations', []):
        container = cr.get('containerName', 'unknown')
        target_mem = cr.get('target', {}).get('memory', 'n/a')
        target_cpu = cr.get('target', {}).get('cpu', 'n/a')
        upper_mem = cr.get('upperBound', {}).get('memory', 'n/a')
        recs.append(f'{ns}/{name}:{container} target_mem={target_mem} target_cpu={target_cpu} upper_mem={upper_mem}')
print(len(items))
print('; '.join(recs[:20]) if recs else 'No recommendations available yet')
" <<<"$vpa_json" 2>/dev/null); then
    # A parse failure is a warning, not a healthy result.
    add_check "vpa-recommendations" "warn" "Failed to read VPA recommendations"
    return
  fi

  local vpa_count="${report%%$'\n'*}"
  local vpa_recs="${report#*$'\n'}"
  if [ "$vpa_count" -eq 0 ]; then
    add_check "vpa-recommendations" "warn" "No VPA objects found — Goldilocks may not be deployed"
    return
  fi
  add_check "vpa-recommendations" "ok" "$vpa_recs"
}
# Check resource requests/limits on OOMKilled pods.
# For every pod with at least one OOMKilled container, list the memory/cpu
# requests and limits of all its containers and init containers.
# Records one "pod-resources" check: ok when no pod qualifies, warn with the
# listing otherwise, and warn when the pod list cannot be read.
# Globals: DRY_RUN, KUBECTL (read)
# Calls:   add_check
check_pod_resources() {
  if $DRY_RUN; then
    add_check "pod-resources" "ok" "DRY RUN: would check pod resource specs"
    return
  fi

  local pods_json
  if ! pods_json=$($KUBECTL get pods --all-namespaces -o json); then
    add_check "pod-resources" "warn" "Failed to check pod resources"
    return
  fi

  # Empty output means "nothing to inspect"; the outcome is decided on that
  # rather than by grepping the human-readable message.
  local resources
  if ! resources=$(python3 -c "
import sys, json


def oom_killed(pod):
    status = pod.get('status', {})
    for cs in status.get('containerStatuses', []) + status.get('initContainerStatuses', []):
        for state in (cs.get('lastState', {}).get('terminated', {}), cs.get('state', {}).get('terminated', {})):
            if state.get('reason') == 'OOMKilled':
                return True
    return False


data = json.load(sys.stdin)
results = []
for pod in data.get('items', []):
    if not oom_killed(pod):
        continue
    ns = pod['metadata']['namespace']
    name = pod['metadata']['name']
    spec = pod.get('spec', {})
    for c in spec.get('containers', []) + spec.get('initContainers', []):
        res = c.get('resources', {})
        results.append('%s/%s:%s req_mem=%s lim_mem=%s req_cpu=%s lim_cpu=%s' % (
            ns, name, c['name'],
            res.get('requests', {}).get('memory', 'none'),
            res.get('limits', {}).get('memory', 'none'),
            res.get('requests', {}).get('cpu', 'none'),
            res.get('limits', {}).get('cpu', 'none')))
print('; '.join(results))
" <<<"$pods_json" 2>/dev/null); then
    add_check "pod-resources" "warn" "Failed to check pod resources"
    return
  fi

  if [ -z "$resources" ]; then
    add_check "pod-resources" "ok" "No OOMKilled pods to inspect"
  else
    add_check "pod-resources" "warn" "$resources"
  fi
}
# Run all checks; each appends one entry to the CHECKS JSON array.
find_oomkilled
check_limitranges
check_vpa_recommendations
check_pod_resources

# Determine overall status: "fail" outranks "warn", which outranks "ok".
# The Python is a flat sequence of statements so it carries no
# indentation-sensitive blocks.
OVERALL=$(python3 -c "
import sys, json
statuses = {c['status'] for c in json.load(sys.stdin)}
print('fail' if 'fail' in statuses else 'warn' if 'warn' in statuses else 'ok')
" <<<"$CHECKS")

# CHECKS is already valid JSON, so it is embedded as-is; json.tool
# pretty-prints the final document.
echo "{\"status\": \"$OVERALL\", \"agent\": \"$AGENT\", \"checks\": $CHECKS}" | python3 -m json.tool

View file

@ -0,0 +1,260 @@
#!/usr/bin/env bash
# platform-status: configuration and argument parsing.
# Usage: pass --dry-run to have every check describe what it would do
# without contacting the cluster or hosts. Other arguments are ignored.
set -euo pipefail

AGENT="platform-status"
# Command plus flags; expanded unquoted where it is used.
KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config"

# Hosts probed outside the cluster API.
PROXMOX_HOST="root@192.168.1.127"
REGISTRY_HOST="10.0.20.10"

DRY_RUN=false
for arg; do
  if [[ "$arg" == "--dry-run" ]]; then
    DRY_RUN=true
  fi
done

# Accumulates one JSON object (as a string) per check.
checks=()
# Escape a string for use inside a JSON string literal.
# Handles backslash, double quote, newline, carriage return and tab; other
# control characters are passed through unchanged.
_json_escape() {
  local s="$1"
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  printf '%s' "$s"
}

# Append one check result to the global "checks" array as a JSON object.
# Arguments: $1 - check name, $2 - status (ok|warn|fail), $3 - message
# The '"status": "<value>"' spelling is kept exactly because the overall
# status scan at the end of the script greps for it.
add_check() {
  local name status message
  name=$(_json_escape "$1")
  status=$(_json_escape "$2")
  message=$(_json_escape "$3")
  checks+=("{\"name\": \"$name\", \"status\": \"$status\", \"message\": \"$message\"}")
}
# Check that a Traefik pod exists and is Running, and count IngressRoutes.
# Records "traefik" (ok/fail) and, when events with reason
# IngressRouteError exist, an extra "traefik-ingressroutes" warn.
# Every kubectl call is guarded: under "set -e"/"pipefail" an unguarded
# failing command substitution would end the script before any JSON is
# printed.
# Globals: DRY_RUN, KUBECTL (read)
# Calls:   add_check
check_traefik() {
  if $DRY_RUN; then
    add_check "traefik" "ok" "dry-run: would check Traefik status"
    return
  fi

  # Discover Traefik pods via labels, trying the recommended label first.
  local traefik_pod="" selector
  for selector in "app.kubernetes.io/name=traefik" "app=traefik"; do
    traefik_pod=$($KUBECTL get pods -n traefik -l "$selector" -o name 2>/dev/null | head -1) || traefik_pod=""
    if [ -n "$traefik_pod" ]; then
      break
    fi
  done
  if [ -z "$traefik_pod" ]; then
    add_check "traefik" "fail" "No Traefik pods found in traefik namespace"
    return
  fi

  local phase
  phase=$($KUBECTL get "$traefik_pod" -n traefik -o jsonpath='{.status.phase}' 2>/dev/null) || phase="Unknown"
  if [ "$phase" = "Running" ]; then
    # Check IngressRoute count; a failing query counts as zero.
    local ir_count
    ir_count=$($KUBECTL get ingressroute --all-namespaces --no-headers 2>/dev/null | wc -l | tr -d ' ') || ir_count=0
    add_check "traefik" "ok" "Traefik running, $ir_count IngressRoutes configured"
  else
    add_check "traefik" "fail" "Traefik pod phase: $phase"
  fi

  # Check for IngressRoutes with errors (TLS or service issues).
  local ir_errors
  ir_errors=$($KUBECTL get events --all-namespaces --field-selector reason=IngressRouteError --no-headers 2>/dev/null | wc -l | tr -d ' ') || ir_errors=0
  if [ "$ir_errors" -gt 0 ]; then
    add_check "traefik-ingressroutes" "warn" "$ir_errors IngressRoute error events found"
  fi
}
# Check that all Kyverno pods are Running and report policy violations.
# Records "kyverno" (ok when every pod is Running, warn otherwise or when
# none are found) and, when PolicyReports sum to a non-zero "fail" count, an
# extra "kyverno-violations" warn.
# Every kubectl call is guarded so a failing query cannot end the script
# under "set -e"/"pipefail".
# Globals: DRY_RUN, KUBECTL (read)
# Calls:   add_check
check_kyverno() {
  if $DRY_RUN; then
    add_check "kyverno" "ok" "dry-run: would check Kyverno status"
    return
  fi

  # Discover Kyverno pods via labels, trying the recommended label first.
  local kyverno_pods
  kyverno_pods=$($KUBECTL get pods -n kyverno -l app.kubernetes.io/name=kyverno -o name 2>/dev/null) || kyverno_pods=""
  if [ -z "$kyverno_pods" ]; then
    kyverno_pods=$($KUBECTL get pods -n kyverno -l app=kyverno -o name 2>/dev/null) || kyverno_pods=""
  fi
  if [ -z "$kyverno_pods" ]; then
    add_check "kyverno" "warn" "No Kyverno pods found"
    return
  fi

  local total=0 ready=0 pod phase
  while IFS= read -r pod; do
    [ -z "$pod" ] && continue
    total=$((total + 1))
    # stdin from /dev/null keeps the command away from the loop input.
    phase=$($KUBECTL get "$pod" -n kyverno -o jsonpath='{.status.phase}' 2>/dev/null </dev/null) || phase=""
    if [ "$phase" = "Running" ]; then
      ready=$((ready + 1))
    fi
  done <<<"$kyverno_pods"

  if [ "$ready" -eq "$total" ]; then
    # Check policy count; a failing query counts as zero.
    local policy_count
    policy_count=$($KUBECTL get clusterpolicy --no-headers 2>/dev/null | wc -l | tr -d ' ') || policy_count=0
    add_check "kyverno" "ok" "$ready/$total Kyverno pods running, $policy_count ClusterPolicies"
  else
    add_check "kyverno" "warn" "$ready/$total Kyverno pods running"
  fi

  # Check for policy violations. The Python prints "0" itself on bad input,
  # so the shell only ignores the pipeline status; echoing another "0" would
  # produce a two-line value and break the numeric test below.
  local violations
  violations=$($KUBECTL get policyreport --all-namespaces -o json 2>/dev/null | python3 -c "
import sys, json
try:
    data = json.load(sys.stdin)
    print(sum(r.get('summary', {}).get('fail', 0) for r in data.get('items', [])))
except Exception:
    print('0')
" 2>/dev/null) || true
  if ! [[ "$violations" =~ ^[0-9]+$ ]]; then
    violations=0
  fi
  if [ "$violations" -gt 0 ]; then
    add_check "kyverno-violations" "warn" "$violations policy violations across namespaces"
  fi
}
# Check that all pods in the goldilocks namespace are Running and count VPAs.
# Records "vpa-goldilocks" (ok when every pod is Running, warn otherwise or
# when none are found) and, when any VPA has updateMode "Auto", an extra
# "vpa-auto-mode" warn.
# Every kubectl call is guarded so a failing query cannot end the script
# under "set -e"/"pipefail".
# Globals: DRY_RUN, KUBECTL (read)
# Calls:   add_check
check_vpa_goldilocks() {
  if $DRY_RUN; then
    add_check "vpa-goldilocks" "ok" "dry-run: would check VPA/Goldilocks status"
    return
  fi

  # Look for labelled Goldilocks pods, then fall back to every pod in the
  # namespace.
  local vpa_pods
  vpa_pods=$($KUBECTL get pods -n goldilocks -l app.kubernetes.io/name=goldilocks -o name 2>/dev/null) || vpa_pods=""
  if [ -z "$vpa_pods" ]; then
    vpa_pods=$($KUBECTL get pods -n goldilocks -o name 2>/dev/null) || vpa_pods=""
  fi
  if [ -z "$vpa_pods" ]; then
    add_check "vpa-goldilocks" "warn" "No Goldilocks pods found"
    return
  fi

  local total=0 ready=0 pod phase
  while IFS= read -r pod; do
    [ -z "$pod" ] && continue
    total=$((total + 1))
    # stdin from /dev/null keeps the command away from the loop input.
    phase=$($KUBECTL get "$pod" -n goldilocks -o jsonpath='{.status.phase}' 2>/dev/null </dev/null) || phase=""
    if [ "$phase" = "Running" ]; then
      ready=$((ready + 1))
    fi
  done <<<"$vpa_pods"

  if [ "$ready" -eq "$total" ]; then
    local vpa_count
    vpa_count=$($KUBECTL get vpa --all-namespaces --no-headers 2>/dev/null | wc -l | tr -d ' ') || vpa_count=0
    add_check "vpa-goldilocks" "ok" "$ready/$total Goldilocks pods running, $vpa_count VPAs configured"
  else
    add_check "vpa-goldilocks" "warn" "$ready/$total Goldilocks pods running"
  fi

  # Check for VPAs with unexpected updateMode. The Python prints "0" itself
  # on bad input, so the shell only ignores the pipeline status.
  local auto_vpas
  auto_vpas=$($KUBECTL get vpa --all-namespaces -o json 2>/dev/null | python3 -c "
import sys, json
try:
    data = json.load(sys.stdin)
    print(sum(1 for i in data.get('items', []) if i.get('spec', {}).get('updatePolicy', {}).get('updateMode', '') == 'Auto'))
except Exception:
    print('0')
" 2>/dev/null) || true
  if ! [[ "$auto_vpas" =~ ^[0-9]+$ ]]; then
    auto_vpas=0
  fi
  if [ "$auto_vpas" -gt 0 ]; then
    add_check "vpa-auto-mode" "warn" "$auto_vpas VPAs set to Auto updateMode (may cause unexpected restarts)"
  fi
}
# Probe the registry API root of the pull-through cache: plain HTTP on port
# 5000 first, then HTTPS on the default port. Records "pull-through-cache".
# Globals: DRY_RUN, REGISTRY_HOST (read)
# Calls:   add_check
check_pull_through_cache() {
  # Start from the pessimistic result and upgrade it when a probe succeeds.
  local state="fail"
  local detail="Pull-through cache registry at $REGISTRY_HOST is unreachable"

  if $DRY_RUN; then
    state="ok"
    detail="dry-run: would check pull-through cache at $REGISTRY_HOST"
  elif timeout 5 curl -sf "http://${REGISTRY_HOST}:5000/v2/" &>/dev/null; then
    state="ok"
    detail="Pull-through cache registry at $REGISTRY_HOST:5000 is healthy"
  elif timeout 5 curl -sf "https://${REGISTRY_HOST}/v2/" &>/dev/null; then
    state="ok"
    detail="Pull-through cache registry at $REGISTRY_HOST is healthy (HTTPS)"
  fi

  add_check "pull-through-cache" "$state" "$detail"
}
# Collect load average, core count and memory usage from the Proxmox host
# over SSH and record them as a single "proxmox" check. The load query
# doubles as the reachability test: when it fails the check is "fail".
# Globals: DRY_RUN, PROXMOX_HOST (read)
# Calls:   add_check
check_proxmox() {
  if $DRY_RUN; then
    add_check "proxmox" "ok" "dry-run: would check Proxmox host resources"
    return
  fi

  # Common prefix of every remote invocation.
  local -a on_host=(timeout 10 ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$PROXMOX_HOST")

  local load_avg
  if ! load_avg=$("${on_host[@]}" \
    "uptime | awk -F'load average:' '{print \$2}' | awk -F, '{print \$1}' | tr -d ' '" 2>/dev/null); then
    add_check "proxmox" "fail" "Could not reach Proxmox host via SSH"
    return
  fi

  # The two follow-up queries fall back to placeholders instead of failing.
  local cores memory
  cores=$("${on_host[@]}" "nproc" 2>/dev/null || echo "1")
  # Check memory
  memory=$("${on_host[@]}" \
    "free -m | awk '/Mem:/{printf \"%d/%dMB (%.0f%%)\", \$3, \$2, \$3/\$2*100}'" 2>/dev/null || echo "unknown")

  add_check "proxmox" "ok" "Proxmox host: load=$load_avg (${cores}cores), mem=$memory"
}
# Check that all MetalLB pods in metallb-system are Running.
# Records one "metallb" check: ok when every pod is Running, warn otherwise
# or when no pods are found.
# Every kubectl call is guarded so a failing query cannot end the script
# under "set -e"/"pipefail".
# Globals: DRY_RUN, KUBECTL (read)
# Calls:   add_check
check_metallb() {
  if $DRY_RUN; then
    add_check "metallb" "ok" "dry-run: would check MetalLB status"
    return
  fi

  # Look for labelled pods, then fall back to every pod in the namespace.
  local metallb_pods
  metallb_pods=$($KUBECTL get pods -n metallb-system -l app.kubernetes.io/name=metallb -o name 2>/dev/null) || metallb_pods=""
  if [ -z "$metallb_pods" ]; then
    metallb_pods=$($KUBECTL get pods -n metallb-system -o name 2>/dev/null) || metallb_pods=""
  fi
  if [ -z "$metallb_pods" ]; then
    add_check "metallb" "warn" "No MetalLB pods found"
    return
  fi

  local total=0 ready=0 pod phase
  while IFS= read -r pod; do
    [ -z "$pod" ] && continue
    total=$((total + 1))
    # stdin from /dev/null keeps the command away from the loop input.
    phase=$($KUBECTL get "$pod" -n metallb-system -o jsonpath='{.status.phase}' 2>/dev/null </dev/null) || phase=""
    if [ "$phase" = "Running" ]; then
      ready=$((ready + 1))
    fi
  done <<<"$metallb_pods"

  if [ "$ready" -eq "$total" ]; then
    add_check "metallb" "ok" "$ready/$total MetalLB pods running"
  else
    add_check "metallb" "warn" "$ready/$total MetalLB pods running"
  fi
}
# Run checks; each appends one or more JSON objects to "checks".
check_traefik
check_kyverno
check_vpa_goldilocks
check_pull_through_cache
check_proxmox
check_metallb

# Determine overall status: the first "fail" ends the scan, a "warn"
# downgrades "ok".
overall="ok"
for c in "${checks[@]}"; do
  case "$c" in
    *'"status": "fail"'*)
      overall="fail"
      break
      ;;
    *'"status": "warn"'*)
      overall="warn"
      ;;
  esac
done

# Output JSON
checks_json=$(
  IFS=,
  echo "${checks[*]}"
)
printf '{"status": "%s", "agent": "%s", "checks": [%s]}\n' "$overall" "$AGENT" "$checks_json"

View file

@ -0,0 +1,190 @@
#!/usr/bin/env bash
# resource-report: configuration and argument parsing.
# Usage: pass --dry-run to have every check describe what it would do
# without querying the cluster. Other arguments are ignored.
set -euo pipefail

AGENT="resource-report"
# Command plus flags; expanded unquoted where it is used.
KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config"

DRY_RUN=false
for arg; do
  if [[ "$arg" == "--dry-run" ]]; then
    DRY_RUN=true
  fi
done

# JSON array (kept as a string) that add_check extends.
CHECKS="[]"
# Append one check result to the JSON array held in the global CHECKS.
# Arguments: $1 - check name, $2 - status (ok|warn|fail), $3 - message
# The values reach Python through argv, never by being pasted into the
# Python source, so quotes, backslashes and newlines in a message cannot
# break the script or run as code.
add_check() {
  local name="$1" status="$2" message="$3"
  CHECKS=$(printf '%s' "$CHECKS" | python3 -c '
import json, sys

checks = json.load(sys.stdin)
checks.append({"name": sys.argv[1], "status": sys.argv[2], "message": sys.argv[3]})
json.dump(checks, sys.stdout)
' "$name" "$status" "$message")
}
# Node capacity report: allocatable vs requests vs limits.
# Records one "node-capacity" check whose message lists, per node, the
# allocatable cpu/memory and the summed cpu/memory requests and limits of
# Running and Pending pods (pods without a node go under "unscheduled").
# The status is ok, or warn when either kubectl/parse step fails.
#
# Quantity parsing accepts binary (Ki..Ei) and decimal (k..E) memory
# suffixes plus the milli suffix. A pod's figure is the larger of the sum
# over its containers and the largest init container, which is how
# Kubernetes derives the effective pod request/limit.
# NOTE(review): sidecar-style init containers (restartPolicy: Always) are
# treated like ordinary init containers here.
# Globals: DRY_RUN, KUBECTL (read)
# Calls:   add_check
check_node_capacity() {
  if $DRY_RUN; then
    add_check "node-capacity" "ok" "DRY RUN: would report node allocatable vs requests vs limits"
    return
  fi

  # Python helpers shared by both reports. The snippet is single-quoted in
  # the shell, so it must not contain single quotes.
  local py_quantities='
import json, sys

MEM_UNITS = (
    ("Ki", 1024), ("Mi", 1024**2), ("Gi", 1024**3),
    ("Ti", 1024**4), ("Pi", 1024**5), ("Ei", 1024**6),
    ("k", 1000), ("M", 1000**2), ("G", 1000**3),
    ("T", 1000**4), ("P", 1000**5), ("E", 1000**6),
)


def parse_cpu(val):
    # CPU quantity -> millicores.
    if not val:
        return 0
    if val.endswith("m"):
        return int(float(val[:-1]))
    return int(float(val) * 1000)


def parse_mem(val):
    # Memory quantity -> bytes.
    if not val:
        return 0
    for suffix, mult in MEM_UNITS:
        if val.endswith(suffix):
            return int(float(val[:-len(suffix)]) * mult)
    if val.endswith("m"):
        return int(float(val[:-1]) / 1000)
    return int(float(val))


def fmt_mem(b):
    return "%.1fGi" % (b / (1024**3))


def fmt_cpu(m):
    return "%dm" % m
'

  local status="ok" report usage

  report=$($KUBECTL get nodes -o json | python3 -c "${py_quantities}"'
data = json.load(sys.stdin)
lines = []
for node in data.get("items", []):
    alloc = node.get("status", {}).get("allocatable", {})
    lines.append("%s: cpu_alloc=%s mem_alloc=%s" % (
        node["metadata"]["name"],
        fmt_cpu(parse_cpu(alloc.get("cpu", "0"))),
        fmt_mem(parse_mem(alloc.get("memory", "0")))))
print("; ".join(lines))
' 2>/dev/null) || {
    report="Failed to get node capacity"
    status="warn"
  }

  # Get requests/limits per node.
  usage=$($KUBECTL get pods --all-namespaces -o json | python3 -c "${py_quantities}"'
KEYS = ("cpu_req", "cpu_lim", "mem_req", "mem_lim")


def amounts(container):
    res = container.get("resources", {})
    req = res.get("requests", {})
    lim = res.get("limits", {})
    return (parse_cpu(req.get("cpu", "")), parse_cpu(lim.get("cpu", "")),
            parse_mem(req.get("memory", "")), parse_mem(lim.get("memory", "")))


data = json.load(sys.stdin)
per_node = {}
for pod in data.get("items", []):
    if pod.get("status", {}).get("phase", "") not in ("Running", "Pending"):
        continue
    spec = pod.get("spec", {})
    totals = per_node.setdefault(spec.get("nodeName", "unscheduled"), dict.fromkeys(KEYS, 0))
    app = [amounts(c) for c in spec.get("containers", [])]
    init = [amounts(c) for c in spec.get("initContainers", [])]
    for idx, key in enumerate(KEYS):
        app_sum = sum(a[idx] for a in app)
        init_max = max((i[idx] for i in init), default=0)
        totals[key] += max(app_sum, init_max)

lines = []
for node in sorted(per_node):
    n = per_node[node]
    lines.append("%s: cpu_req=%s cpu_lim=%s mem_req=%s mem_lim=%s" % (
        node, fmt_cpu(n["cpu_req"]), fmt_cpu(n["cpu_lim"]),
        fmt_mem(n["mem_req"]), fmt_mem(n["mem_lim"])))
print("; ".join(lines))
' 2>/dev/null) || {
    usage="Failed to get pod resource usage"
    status="warn"
  }

  add_check "node-capacity" "$status" "Allocatable: ${report} | Usage: ${usage}"
}
# Per-namespace ResourceQuota usage.
# Lists up to 30 "<ns>/<quota>: <resource> used=<u> hard=<h>" entries.
# Records one "resource-quotas" check: ok with the listing (or a note that
# no quotas exist), warn when the quotas cannot be fetched or parsed.
# Globals: DRY_RUN, KUBECTL (read)
# Calls:   add_check
check_resource_quotas() {
  if $DRY_RUN; then
    add_check "resource-quotas" "ok" "DRY RUN: would check ResourceQuota usage per namespace"
    return
  fi

  local quota_json
  if ! quota_json=$($KUBECTL get resourcequota --all-namespaces -o json 2>/dev/null); then
    add_check "resource-quotas" "warn" "Failed to read ResourceQuotas"
    return
  fi

  # Python prints the number of quota objects on line 1, the listing on
  # line 2.
  local report
  if ! report=$(python3 -c "
import sys, json

data = json.load(sys.stdin)
items = data.get('items', [])
results = []
for rq in items:
    ns = rq['metadata']['namespace']
    name = rq['metadata']['name']
    hard = rq.get('status', {}).get('hard', {})
    used = rq.get('status', {}).get('used', {})
    for resource in hard:
        results.append('%s/%s: %s used=%s hard=%s' % (
            ns, name, resource, used.get(resource, '0'), hard[resource]))
print(len(items))
print('; '.join(results[:30]) if results else 'No quota usage data')
" <<<"$quota_json" 2>/dev/null); then
    add_check "resource-quotas" "warn" "Failed to read ResourceQuotas"
    return
  fi

  local quota_count="${report%%$'\n'*}"
  local quota_report="${report#*$'\n'}"
  if [ "$quota_count" -eq 0 ]; then
    add_check "resource-quotas" "ok" "No ResourceQuotas defined in the cluster"
    return
  fi
  add_check "resource-quotas" "ok" "$quota_report"
}
# Top pods by memory usage.
# Records one "top-consumers" check listing the ten pods with the highest
# memory column of "kubectl top pods", or a warn when the command fails or
# returns nothing.
# Globals: DRY_RUN, KUBECTL (read)
# Calls:   add_check
check_top_consumers() {
  if $DRY_RUN; then
    add_check "top-consumers" "ok" "DRY RUN: would report top memory-consuming pods"
    return
  fi

  # Fetch first and format afterwards so a kubectl failure is seen directly
  # and is not reported as a successful listing.
  local raw
  if ! raw=$($KUBECTL top pods --all-namespaces --no-headers 2>/dev/null); then
    add_check "top-consumers" "warn" "kubectl top failed — metrics-server may not be available"
    return
  fi
  if [ -z "$raw" ]; then
    add_check "top-consumers" "warn" "kubectl top returned no data — metrics-server may not be running"
    return
  fi

  # Columns: namespace pod cpu memory. awk keeps reading after the tenth
  # line, so sort never receives SIGPIPE (which pipefail would turn into a
  # failure), and it joins entries with "; ".
  local top_pods
  top_pods=$(sort -k4,4 -h -r <<<"$raw" \
    | awk 'NR > 10 { next } NR > 1 { printf "; " } { printf "%s/%s: cpu=%s mem=%s", $1, $2, $3, $4 }') || top_pods=""
  if [ -z "$top_pods" ]; then
    add_check "top-consumers" "warn" "Could not rank kubectl top output"
    return
  fi
  add_check "top-consumers" "ok" "Top 10 by memory: ${top_pods}"
}
# Run all checks; each appends one entry to the CHECKS JSON array.
check_node_capacity
check_resource_quotas
check_top_consumers

# Determine overall status: "fail" outranks "warn", which outranks "ok".
# The Python is a flat sequence of statements so it carries no
# indentation-sensitive blocks.
OVERALL=$(python3 -c "
import sys, json
statuses = {c['status'] for c in json.load(sys.stdin)}
print('fail' if 'fail' in statuses else 'warn' if 'warn' in statuses else 'ok')
" <<<"$CHECKS")

# CHECKS is already valid JSON, so it is embedded as-is; json.tool
# pretty-prints the final document.
echo "{\"status\": \"$OVERALL\", \"agent\": \"$AGENT\", \"checks\": $CHECKS}" | python3 -m json.tool

95
.claude/scripts/sev-context.sh Executable file
View file

@ -0,0 +1,95 @@
#!/usr/bin/env bash
# sev-context.sh — Gather structured cluster context for post-mortem triage
# Used by sev-triage agent and available to all pipeline stages
#
# Environment:
#   KUBECONFIG  kubeconfig to use (default: /Users/viktorbarzin/code/infra/config)
#   INFRA_DIR   root of the infra repository (default: /Users/viktorbarzin/code/infra)
# Output: plain-text sections on stdout, each introduced by "=== TITLE ===".
# Every section is best-effort: a failing command leaves its section short
# or empty instead of stopping the script.
set -euo pipefail

KUBECONFIG="${KUBECONFIG:-/Users/viktorbarzin/code/infra/config}"
INFRA_DIR="${INFRA_DIR:-/Users/viktorbarzin/code/infra}"
export KUBECONFIG

echo "=== NODE STATUS ==="
kubectl get nodes -o custom-columns=\
'NAME:.metadata.name,STATUS:.status.conditions[?(@.type=="Ready")].status,VERSION:.status.nodeInfo.kubeletVersion,CPU_CAP:.status.capacity.cpu,MEM_CAP:.status.capacity.memory' \
  --no-headers 2>/dev/null || echo "ERROR: Cannot reach cluster"

echo ""
echo "=== UNHEALTHY PODS ==="
# Pods not Running/Succeeded, with UTC start time instead of relative age
kubectl get pods --all-namespaces \
  --field-selector='status.phase!=Running,status.phase!=Succeeded' \
  -o custom-columns=\
'NAMESPACE:.metadata.namespace,POD:.metadata.name,STATUS:.status.phase,RESTARTS:.status.containerStatuses[0].restartCount,STARTED_UTC:.status.startTime,NODE:.spec.nodeName' \
  --no-headers 2>/dev/null || true

# Also show pods that are Running but have containers not ready or high
# restarts. One tab-separated line per pod, taken from the first container
# that has more than 3 restarts or is not ready.
kubectl get pods --all-namespaces -o json 2>/dev/null | python3 -c "
import json, sys

try:
    data = json.load(sys.stdin)
except Exception:
    sys.exit(0)

for pod in data.get('items', []):
    ns = pod['metadata']['namespace']
    name = pod['metadata']['name']
    node = pod['spec'].get('nodeName', 'N/A')
    start = pod['status'].get('startTime', 'N/A')
    phase = pod['status'].get('phase', 'Unknown')
    if phase != 'Running':
        continue
    for cs in pod['status'].get('containerStatuses', []):
        restarts = cs.get('restartCount', 0)
        ready = cs.get('ready', True)
        if restarts > 3 or not ready:
            reason = ''
            waiting = cs.get('state', {}).get('waiting', {})
            if waiting:
                reason = waiting.get('reason', '')
            print(f'{ns}\t{name}\t{phase}/NotReady\t{restarts}\t{start}\t{node}\t{reason}')
            break
" 2>/dev/null || true

echo ""
# NOTE(review): the title says "last 2h", but nothing below filters by time;
# the section shows the 50 most recent non-Normal events.
echo "=== RECENT EVENTS (last 2h, Warning/Error only) ==="
kubectl get events --all-namespaces \
  --field-selector='type!=Normal' \
  --sort-by='.lastTimestamp' \
  -o custom-columns=\
'NAMESPACE:.metadata.namespace,TYPE:.type,REASON:.reason,OBJECT:.involvedObject.name,LAST_SEEN_UTC:.lastTimestamp,MESSAGE:.message' \
  --no-headers 2>/dev/null | tail -50 || true

echo ""
echo "=== NAMESPACE TO STACK MAPPING ==="
# Parse terragrunt.hcl files to map k8s namespaces to stack directories
for tg in "$INFRA_DIR"/stacks/*/terragrunt.hcl; do
  # An unmatched glob stays literal; skip it instead of printing a bogus row.
  [ -e "$tg" ] || continue
  stack_dir=$(dirname "$tg")
  stack_name=$(basename "$stack_dir")
  # Try to find namespace from the stack: the first double-quoted lowercase
  # token on a main.tf line mentioning "namespace". Plain ERE is used because
  # BSD/macOS grep has no -P. The fallback is applied on an empty result so
  # a pipeline failure can never yield a two-line value.
  ns=$(grep -h 'namespace' "$stack_dir"/main.tf 2>/dev/null \
    | grep -oE '"[a-z0-9-]+"' | head -1 | tr -d '"') || true
  [ -n "$ns" ] || ns="$stack_name"
  echo "$ns → stacks/$stack_name"
done 2>/dev/null | sort -u || true

echo ""
echo "=== SERVICE TIERS ==="
# Parse service-catalog.md for tier classifications: a line containing one
# of the tier markers switches the current tier, and every following table
# row whose first cell is a lowercase identifier is printed as "name=tier".
catalog="$INFRA_DIR/.claude/reference/service-catalog.md"
if [ -f "$catalog" ]; then
  current_tier=""
  while IFS= read -r line; do
    case "$line" in
      *"Tier: core"*) current_tier="core" ;;
      *"Tier: cluster"*) current_tier="cluster" ;;
      *"Admin"*) current_tier="admin" ;;
      *"Active Use"*) current_tier="active" ;;
      *"Optional"* | *"Inactive"*) current_tier="optional" ;;
    esac
    if [[ "$line" =~ ^\|[[:space:]]+([a-z0-9_-]+)[[:space:]]+\| && "$current_tier" != "" ]]; then
      svc="${BASH_REMATCH[1]}"
      # Skip the table header and its separator row.
      [[ "$svc" == "Service" || "$svc" == "---" ]] && continue
      echo "$svc=$current_tier"
    fi
  done <"$catalog"
fi

echo ""
echo "=== CURRENT UTC TIME ==="
date -u '+%Y-%m-%dT%H:%M:%SZ'

143
.claude/scripts/tls-check.sh Executable file
View file

@ -0,0 +1,143 @@
#!/usr/bin/env bash
# tls-check: reports TLS secret expiry and cert-manager health as one JSON line.
# Usage: tls-check.sh [--dry-run]
set -o errexit -o nounset -o pipefail

# Name reported in the "agent" field of the JSON output.
AGENT='tls-check'
# Command prefix for every cluster call; expanded unquoted so it word-splits.
KUBECTL='kubectl --kubeconfig /Users/viktorbarzin/code/infra/config'
# Certificates expiring within this many days are reported as "expiring".
WARN_DAYS=14
# Holds the command name "true" or "false"; the checks run it as `if $DRY_RUN`.
DRY_RUN=false

# Only --dry-run is recognised; any other argument is ignored.
for arg; do
  if [[ "$arg" == "--dry-run" ]]; then
    DRY_RUN=true
  fi
done

# Accumulates one JSON object string per check (filled by add_check).
checks=()
# Append one check result to the global `checks` array as a JSON object string.
# Globals:   checks (appended to)
# Arguments: $1 - check name
#            $2 - status ("ok", "warn" or "fail" at the visible call sites)
#            $3 - human-readable message
# Each value is JSON-escaped first: messages embed command output, and an
# unescaped quote, backslash or newline would make the object unparsable for
# the jq call that later reads `.status`.
add_check() {
  local name="$1" status="$2" message="$3"
  local field value
  for field in name status message; do
    value=${!field}
    value=${value//\\/\\\\}     # backslash first, so later escapes are not doubled
    value=${value//\"/\\\"}
    value=${value//$'\n'/\\n}
    value=${value//$'\r'/\\r}
    value=${value//$'\t'/\\t}
    printf -v "$field" '%s' "$value"
  done
  checks+=("{\"name\": \"$name\", \"status\": \"$status\", \"message\": \"$message\"}")
}
# Scan every kubernetes.io/tls secret and record one "tls-secrets" check that
# summarises certificate expiry.
# Globals:   DRY_RUN, KUBECTL, WARN_DAYS (read); results recorded via add_check
# Arguments: none
# Recorded status:
#   fail - secrets could not be listed/parsed, or at least one cert is expired
#   warn - no TLS secrets exist, a cert expires within WARN_DAYS days, or
#          not a single certificate could be decoded
#   ok   - at least one cert was verified and none is expired or expiring
check_tls_secrets() {
  if $DRY_RUN; then
    add_check "tls-secrets" "ok" "dry-run: would scan all kubernetes.io/tls secrets for expiry"
    return
  fi

  local secrets_json
  secrets_json=$($KUBECTL get secrets -A -o json 2>/dev/null) || {
    add_check "tls-secrets" "fail" "Failed to list secrets"
    return
  }

  # One "namespace/name" line per TLS secret.
  local tls_secrets
  tls_secrets=$(printf '%s\n' "$secrets_json" | jq -r '.items[] | select(.type=="kubernetes.io/tls") | "\(.metadata.namespace)/\(.metadata.name)"' 2>/dev/null) || {
    add_check "tls-secrets" "fail" "Failed to parse secrets JSON"
    return
  }
  if [ -z "$tls_secrets" ]; then
    add_check "tls-secrets" "warn" "No TLS secrets found"
    return
  fi

  local total=0 expiring=0 expired=0 healthy=0 errors=0
  local now_epoch
  now_epoch=$(date +%s)
  local warn_epoch=$((now_epoch + WARN_DAYS * 86400))
  local expiring_list=""

  local secret ns name cert_pem expiry_str expiry_epoch days_left
  while IFS= read -r secret; do
    total=$((total + 1))
    ns="${secret%%/*}"
    name="${secret##*/}"

    cert_pem=$($KUBECTL get secret "$name" -n "$ns" -o jsonpath='{.data.tls\.crt}' 2>/dev/null | base64 -d 2>/dev/null) || {
      errors=$((errors + 1))
      continue
    }

    expiry_str=$(printf '%s\n' "$cert_pem" | openssl x509 -noout -enddate 2>/dev/null | sed 's/notAfter=//') || {
      errors=$((errors + 1))
      continue
    }
    # Without pipefail an openssl failure is masked by sed's exit status, and
    # an empty date string must never reach the date parser below.
    if [ -z "$expiry_str" ]; then
      errors=$((errors + 1))
      continue
    fi

    # Try the BSD/macOS date syntax first, then fall back to GNU date.
    expiry_epoch=$(date -j -f "%b %d %T %Y %Z" "$expiry_str" +%s 2>/dev/null || date -d "$expiry_str" +%s 2>/dev/null) || {
      errors=$((errors + 1))
      continue
    }

    if [ "$expiry_epoch" -lt "$now_epoch" ]; then
      expired=$((expired + 1))
      expiring_list="${expiring_list}EXPIRED: ${ns}/${name}; "
    elif [ "$expiry_epoch" -lt "$warn_epoch" ]; then
      days_left=$(((expiry_epoch - now_epoch) / 86400))
      expiring=$((expiring + 1))
      expiring_list="${expiring_list}${days_left}d: ${ns}/${name}; "
    else
      healthy=$((healthy + 1))
    fi
  done <<< "$tls_secrets"

  if [ "$expired" -gt 0 ]; then
    add_check "tls-secrets" "fail" "${expired} expired, ${expiring} expiring soon, ${healthy} healthy out of ${total} certs. ${expiring_list}"
  elif [ "$expiring" -gt 0 ]; then
    add_check "tls-secrets" "warn" "${expiring} expiring within ${WARN_DAYS}d, ${healthy} healthy out of ${total} certs. ${expiring_list}"
  elif [ "$healthy" -eq 0 ] && [ "$errors" -gt 0 ]; then
    # Nothing was actually verified, so reporting "ok" would be a false all-clear.
    add_check "tls-secrets" "warn" "None of ${total} TLS certs could be verified (${errors} decode errors)"
  else
    add_check "tls-secrets" "ok" "All ${healthy} TLS certs healthy (${errors} decode errors skipped)"
  fi
}
# Record one "cert-manager" check covering pod health in the cert-manager
# namespace and the Ready condition of all Certificate resources.
# Globals:   DRY_RUN, KUBECTL (read); results recorded via add_check
# Arguments: none
# Recorded status:
#   fail - pods could not be queried, none matched the selector, or some pod
#          is neither Running nor Completed
#   warn - certificates could not be queried, or some are not Ready
#   ok   - all pods are up and every certificate is Ready
check_cert_manager() {
  if $DRY_RUN; then
    add_check "cert-manager" "ok" "dry-run: would check cert-manager pod health and certificate CRDs"
    return
  fi

  local cm_pods
  cm_pods=$($KUBECTL get pods -n cert-manager -l app.kubernetes.io/instance=cert-manager --no-headers 2>/dev/null) || {
    add_check "cert-manager" "fail" "Failed to query cert-manager pods"
    return
  }
  # A successful query that returns no rows means nothing is running at all;
  # that must not be reported as healthy.
  if [ -z "$cm_pods" ]; then
    add_check "cert-manager" "fail" "No cert-manager pods found"
    return
  fi

  # awk always prints exactly one integer and exits 0. The previous
  # `grep -c . || echo 0` produced "0<newline>0" when nothing matched, which
  # made the numeric test below print "integer expression expected".
  local not_running
  not_running=$(awk 'length($0) > 0 && !/Running/ && !/Completed/ { n++ } END { print n + 0 }' <<< "$cm_pods")
  if [ "$not_running" -gt 0 ]; then
    add_check "cert-manager" "fail" "${not_running} cert-manager pod(s) not running"
    return
  fi

  # Check for failed certificates (Ready condition explicitly "False").
  local failed_certs
  failed_certs=$($KUBECTL get certificates -A -o json 2>/dev/null | jq -r '.items[] | select(.status.conditions[]? | select(.type=="Ready" and .status=="False")) | "\(.metadata.namespace)/\(.metadata.name)"' 2>/dev/null) || {
    add_check "cert-manager" "warn" "Could not query certificate CRDs"
    return
  }

  if [ -n "$failed_certs" ]; then
    local count shown
    count=$(echo "$failed_certs" | wc -l | tr -d ' ')
    # Show at most five names joined by ", " with no trailing separator.
    shown=$(awk 'NR <= 5 { printf "%s%s", (NR > 1 ? ", " : ""), $0 }' <<< "$failed_certs")
    add_check "cert-manager" "warn" "${count} certificate(s) not ready: ${shown}"
  else
    add_check "cert-manager" "ok" "cert-manager healthy, all certificates ready"
  fi
}
# Run every check; each one appends its result to `checks`.
check_tls_secrets
check_cert_manager

# Output JSON
# Overall status is the worst individual status: any "fail" wins immediately,
# otherwise a "warn" downgrades the default "ok".
overall="ok"
for entry in "${checks[@]}"; do
  entry_status=$(jq -r '.status' <<< "$entry")
  case "$entry_status" in
    fail)
      overall="fail"
      break
      ;;
    warn)
      overall="warn"
      ;;
  esac
done

# Join the per-check objects with commas inside a subshell so the IFS change
# does not leak into the rest of the script.
joined_checks=$(IFS=,; echo "${checks[*]}")
printf '{"status": "%s", "agent": "%s", "checks": [%s]}\n' "$overall" "$AGENT" "$joined_checks"