Agents: devops-engineer, dba, security-engineer, sre, network-engineer, platform-engineer, observability-engineer, home-automation-engineer. Scripts: deploy-status, db-health, backup-verify, tls-check, crowdsec-status, authentik-audit, oom-investigator, resource-report, dns-check, network-health, nfs-health, truenas-status, platform-status, monitoring-health. Also: known-issues.md suppression list, cluster-health-checker port-forward fix.
#!/usr/bin/env bash
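#
# deploy-status: aggregate Kubernetes deployment health into a single JSON report.
# Checks: stalled rollouts, unavailable replicas, image pull errors, high restart
# counts, and CrashLoopBackOff pods. Output shape:
#   {"status": "ok|warn|fail", "agent": "deploy-status", "checks": [...]}
# Pass --dry-run to skip kubectl calls and only report what each check would do.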
set -euo pipefail

KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config"
DRY_RUN=false
AGENT="deploy-status"

for arg in "$@"; do
  case "$arg" in
    --dry-run) DRY_RUN=true ;;
  esac
done

CHECKS="[]"

add_check() {
  local name="$1" status="$2" message="$3"
  CHECKS=$(echo "$CHECKS" | python3 -c "
import sys, json
checks = json.load(sys.stdin)
checks.append({'name': '''$name''', 'status': '''$status''', 'message': '''$message'''})
json.dump(checks, sys.stdout)
")
}
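
# Illustrative usage: add_check "my-check" "warn" "something looks off"
# appends {"name": "my-check", "status": "warn", "message": "..."} to CHECKS.
# Values are spliced into the embedded Python as '''...''' literals, so keep
# them to plain text without runs of quotes.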

# Check for stalled rollouts (Progressing=False, e.g. deadline exceeded, or Available=False)
check_stalled_rollouts() {
  if $DRY_RUN; then
    add_check "stalled-rollouts" "ok" "DRY RUN: would check for stalled deployment rollouts"
    return
  fi

  local stalled
  stalled=$($KUBECTL get deployments --all-namespaces -o json 2>/dev/null | python3 -c "
import sys, json
data = json.load(sys.stdin)
stalled = []
for dep in data.get('items', []):
    ns = dep['metadata']['namespace']
    name = dep['metadata']['name']
    conditions = dep.get('status', {}).get('conditions', [])
    for cond in conditions:
        if cond.get('type') == 'Progressing' and cond.get('status') == 'False':
            reason = cond.get('reason', 'unknown')
            stalled.append(f'{ns}/{name}: {reason}')
        elif cond.get('type') == 'Available' and cond.get('status') == 'False':
            reason = cond.get('reason', 'unknown')
            stalled.append(f'{ns}/{name}: unavailable ({reason})')
if stalled:
    print('; '.join(stalled))
else:
    print('')
" 2>/dev/null) || stalled="Failed to check deployments"

  if [ -z "$stalled" ]; then
    add_check "stalled-rollouts" "ok" "No stalled rollouts detected"
  else
    add_check "stalled-rollouts" "fail" "Stalled rollouts: $stalled"
  fi
}
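
# The remaining checks follow the same pattern: short-circuit under --dry-run,
# pipe cluster state from kubectl into a small python3 filter, and report the
# result via add_check as ok, warn, or fail.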

# Check for unavailable replicas
check_unavailable_replicas() {
  if $DRY_RUN; then
    add_check "unavailable-replicas" "ok" "DRY RUN: would check for deployments with unavailable replicas"
    return
  fi

  local unavail
  unavail=$($KUBECTL get deployments --all-namespaces -o json 2>/dev/null | python3 -c "
import sys, json
data = json.load(sys.stdin)
issues = []
for dep in data.get('items', []):
    ns = dep['metadata']['namespace']
    name = dep['metadata']['name']
    spec_replicas = dep.get('spec', {}).get('replicas', 1)
    ready = dep.get('status', {}).get('readyReplicas', 0) or 0
    unavailable = dep.get('status', {}).get('unavailableReplicas', 0) or 0
    if unavailable > 0 or ready < spec_replicas:
        issues.append(f'{ns}/{name}: {ready}/{spec_replicas} ready, {unavailable} unavailable')
if issues:
    print('; '.join(issues))
else:
    print('')
" 2>/dev/null) || unavail="Failed to check replicas"

  if [ -z "$unavail" ]; then
    add_check "unavailable-replicas" "ok" "All deployments have desired replicas ready"
  else
    add_check "unavailable-replicas" "warn" "Unavailable replicas: $unavail"
  fi
}

# Check for image pull errors
check_image_pull_errors() {
  if $DRY_RUN; then
    add_check "image-pull-errors" "ok" "DRY RUN: would check for ImagePullBackOff/ErrImagePull pods"
    return
  fi

  local pull_errors
  pull_errors=$($KUBECTL get pods --all-namespaces -o json 2>/dev/null | python3 -c "
import sys, json
data = json.load(sys.stdin)
errors = []
for pod in data.get('items', []):
    ns = pod['metadata']['namespace']
    name = pod['metadata']['name']
    for cs in pod.get('status', {}).get('containerStatuses', []) + pod.get('status', {}).get('initContainerStatuses', []):
        waiting = cs.get('state', {}).get('waiting', {})
        reason = waiting.get('reason', '')
        if reason in ('ImagePullBackOff', 'ErrImagePull', 'InvalidImageName'):
            image = cs.get('image', 'unknown')
            msg = waiting.get('message', '')[:100]
            errors.append(f'{ns}/{name}: {reason} image={image} ({msg})')
if errors:
    print('; '.join(errors))
else:
    print('')
" 2>/dev/null) || pull_errors="Failed to check image pulls"

  if [ -z "$pull_errors" ]; then
    add_check "image-pull-errors" "ok" "No image pull errors found"
  else
    add_check "image-pull-errors" "fail" "Image pull errors: $pull_errors"
  fi
}

# Check for pods with high restart counts (5 or more total restarts)
check_recent_restarts() {
  if $DRY_RUN; then
    add_check "recent-restarts" "ok" "DRY RUN: would check for pods with high restart counts"
    return
  fi

  local restarts
  restarts=$($KUBECTL get pods --all-namespaces -o json 2>/dev/null | python3 -c "
import sys, json
data = json.load(sys.stdin)
high_restart = []
for pod in data.get('items', []):
    ns = pod['metadata']['namespace']
    name = pod['metadata']['name']
    for cs in pod.get('status', {}).get('containerStatuses', []):
        count = cs.get('restartCount', 0)
        if count >= 5:
            container = cs['name']
            high_restart.append(f'{ns}/{name}:{container} restarts={count}')
if high_restart:
    print('; '.join(sorted(high_restart, key=lambda x: int(x.split('=')[1]), reverse=True)[:20]))
else:
    print('')
" 2>/dev/null) || restarts="Failed to check restarts"

  if [ -z "$restarts" ]; then
    add_check "recent-restarts" "ok" "No pods with 5+ restarts"
  else
    add_check "recent-restarts" "warn" "High restart counts: $restarts"
  fi
}

# Check CrashLoopBackOff pods
check_crashloop() {
  if $DRY_RUN; then
    add_check "crashloop" "ok" "DRY RUN: would check for CrashLoopBackOff pods"
    return
  fi

  local crashloop
  crashloop=$($KUBECTL get pods --all-namespaces -o json 2>/dev/null | python3 -c "
import sys, json
data = json.load(sys.stdin)
crashes = []
for pod in data.get('items', []):
    ns = pod['metadata']['namespace']
    name = pod['metadata']['name']
    for cs in pod.get('status', {}).get('containerStatuses', []):
        waiting = cs.get('state', {}).get('waiting', {})
        if waiting.get('reason') == 'CrashLoopBackOff':
            container = cs['name']
            restarts = cs.get('restartCount', 0)
            crashes.append(f'{ns}/{name}:{container} restarts={restarts}')
if crashes:
    print('; '.join(crashes))
else:
    print('')
" 2>/dev/null) || crashloop="Failed to check crashloop"

  if [ -z "$crashloop" ]; then
    add_check "crashloop" "ok" "No CrashLoopBackOff pods"
  else
    add_check "crashloop" "fail" "CrashLoopBackOff: $crashloop"
  fi
}

# Run all checks
check_stalled_rollouts
check_unavailable_replicas
check_image_pull_errors
check_recent_restarts
check_crashloop

# Determine overall status
OVERALL=$(echo "$CHECKS" | python3 -c "
import sys, json
checks = json.load(sys.stdin)
statuses = [c['status'] for c in checks]
if 'fail' in statuses:
    print('fail')
elif 'warn' in statuses:
    print('warn')
else:
    print('ok')
")

echo "{\"status\": \"$OVERALL\", \"agent\": \"$AGENT\", \"checks\": $CHECKS}" | python3 -m json.tool
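
# Example invocation (path and pretty-printed output below are illustrative, truncated):
#   ./deploy-status --dry-run
#   {
#       "status": "ok",
#       "agent": "deploy-status",
#       "checks": [
#           {
#               "name": "stalled-rollouts",
#               "status": "ok",
#               "message": "DRY RUN: would check for stalled deployment rollouts"
#           },
#           ...
#       ]
#   }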