infra/scripts/tg

#!/usr/bin/env bash
# scripts/tg — wrapper: decrypt state before, encrypt+commit after mutating ops
# Usage: scripts/tg apply --non-interactive
#        scripts/tg plan
# Auth: `vault login -method=oidc` (token at ~/.vault-token)
# Strict mode: stop on command failure, on use of an unset variable, and on a
# failing stage anywhere in a pipeline.
set -o errexit -o nounset -o pipefail

# Repository root is the parent of the directory that holds this script ($0).
REPO_ROOT=$(
  cd "$(dirname "$0")/.." && pwd
)
# Helper used for the per-stack state decrypt/encrypt steps.
SYNC="${REPO_ROOT}/scripts/state-sync"

# Provider plugin cache, shared across stacks. A caller-supplied
# TF_PLUGIN_CACHE_DIR wins; otherwise default to a directory under $HOME.
: "${TF_PLUGIN_CACHE_DIR:=${HOME}/.terraform.d/plugin-cache}"
TF_PLUGIN_CACHE_MAY_BREAK_DEPENDENCY_LOCK_FILE=1
export TF_PLUGIN_CACHE_DIR TF_PLUGIN_CACHE_MAY_BREAK_DEPENDENCY_LOCK_FILE
# Make sure the cache directory exists before anything tries to use it.
mkdir -p "$TF_PLUGIN_CACHE_DIR"

# Determine stack name from cwd (relative to stacks/)

# _stack_name_for_dir <dir> <stacks_dir>
# Prints the first path component of <dir> below <stacks_dir>, or prints
# nothing when <dir> is not strictly inside <stacks_dir>. Always returns 0.
_stack_name_for_dir() {
  local dir="$1" base="$2" rel
  if [[ "$dir" == "$base"/* ]]; then
    # The prefix is quoted inside the expansion so that glob characters in the
    # repository path ([, *, ?) are matched literally; unquoted, a path such as
    # "/r/[x]/stacks" fails to strip and the stack name comes out empty.
    rel="${dir#"$base"/}"
    printf '%s\n' "${rel%%/*}"
  fi
}

cwd="$(pwd)"
stacks_dir="$REPO_ROOT/stacks"
# Empty when the script is not run from inside stacks/<name>/...
STACK_NAME="$(_stack_name_for_dir "$cwd" "$stacks_dir")"

# ── Tier detection ──
TIER0_STACKS="infra platform cnpg vault dbaas external-secrets"

# is_tier0 <stack>
# Returns 0 when <stack> is exactly one of the space-separated names in
# TIER0_STACKS, 1 otherwise (including for an empty or missing argument).
# The comparison is a literal string match done in-shell: the name is never
# interpreted as a regex or as a command-line option, and no pipeline is
# involved, so `pipefail` cannot turn a match into a failure.
is_tier0() {
  local candidate="${1:-}" name
  local -a tier0_names
  IFS=' ' read -r -a tier0_names <<<"$TIER0_STACKS"
  for name in "${tier0_names[@]}"; do
    if [[ "$name" == "$candidate" ]]; then
      return 0
    fi
  done
  return 1
}

# ── Advisory lock via Vault KV ──
LOCK_MAX_AGE=1800  # 30 minutes — stale lock threshold

# acquire_lock <stack>
# Takes the advisory lock stored at Vault KV path secret/locks/<stack>.
#   - No record, or a record whose `locked` field is not "true": write a new
#     lock record (locked, holder, acquired=<epoch seconds>).
#   - Record younger than LOCK_MAX_AGE seconds: print an error to stderr and
#     return 1 without writing.
#   - Older record: print a warning to stderr and overwrite it.
# Returns the status of the final `vault kv put` otherwise.
# NOTE(review): the read and the write are two separate Vault calls, so two
# runs starting at the same moment can both believe they hold the lock.
acquire_lock() {
  local stack="$1"
  local holder
  # Fall back to "unknown" so a container without hostname/passwd entries can
  # still take the lock.
  holder="pid=$$,host=$(hostname -s 2>/dev/null || echo unknown),user=$(whoami 2>/dev/null || echo unknown)"

  # A failed read (missing key, auth error) is treated as "no lock record".
  local existing locked acquired existing_holder
  existing=$(vault kv get -format=json "secret/locks/$stack" 2>/dev/null || echo '{}')
  locked=$(jq -r '.data.data.locked // "false"' <<<"$existing") || locked="false"
  acquired=$(jq -r '.data.data.acquired // "0"' <<<"$existing") || acquired="0"
  existing_holder=$(jq -r '.data.data.holder // ""' <<<"$existing") || existing_holder=""

  if [ "$locked" = "true" ]; then
    # `acquired` comes from Vault and is about to be used in arithmetic:
    # accept plain digits only. Anything else is treated as timestamp 0,
    # i.e. a stale lock, instead of being evaluated as an expression.
    if [[ ! "$acquired" =~ ^[0-9]+$ ]]; then
      acquired=0
    fi
    local now age
    now=$(date +%s)
    # 10# forces base 10 so leading zeros are not read as octal.
    age=$((now - 10#$acquired))
    if [ "$age" -lt "$LOCK_MAX_AGE" ]; then
      echo "ERROR: Stack '$stack' is locked by: $existing_holder (${age}s ago)" >&2
      echo "       Wait for it to finish or run: vault kv delete secret/locks/$stack" >&2
      return 1
    fi
    echo "WARNING: Breaking stale lock on '$stack' (held ${age}s by $existing_holder)" >&2
  fi

  vault kv put "secret/locks/$stack" locked=true holder="$holder" acquired="$(date +%s)" >/dev/null
}

# release_lock <stack>
# Best-effort removal of the lock record at secret/locks/<stack>. All output
# from vault is discarded and the function always returns 0, so it is safe to
# call from an EXIT trap.
release_lock() {
  local stack_name=$1
  if ! vault kv delete "secret/locks/${stack_name}" &>/dev/null; then
    return 0
  fi
}

# ── Pre-flight: decrypt state (Tier 0) or fetch PG creds (Tier 1) ──
# Only runs when the script was started from inside stacks/<name>/
# (STACK_NAME non-empty).
if [ -n "$STACK_NAME" ]; then
  if is_tier0 "$STACK_NAME"; then
    # Tier 0: SOPS-encrypted local state. Nothing to decrypt until an encrypted
    # state file exists for this stack.
    if [ -f "$REPO_ROOT/state/stacks/$STACK_NAME/terraform.tfstate.enc" ]; then
      "$SYNC" decrypt "$STACK_NAME"
    fi
  else
    # Tier 1: PG backend — fetch credentials from Vault, unless the caller
    # already supplied a connection string.
    if [ -z "${PG_CONN_STR:-}" ]; then
      # Pre-flight: vault CLI must be available. Previously CI failed with a
      # misleading "Cannot read PG credentials" message because the Alpine CI
      # image lacked the vault binary — the 2>/dev/null below swallowed the
      # real "vault: not found" error. Fail fast with a clear message instead.
      if ! command -v vault >/dev/null 2>&1; then
        echo "ERROR: vault CLI not found on PATH. Install it or use an image that includes it (ci/Dockerfile)." >&2
        exit 1
      fi
      # stderr is merged into the capture so the failure branch can show it.
      VAULT_OUT=$(vault read -format=json database/static-creds/pg-terraform-state 2>&1) || {
        echo "ERROR: Cannot read PG credentials from Vault. Vault output follows:" >&2
        echo "$VAULT_OUT" >&2
        echo "" >&2
        echo "Hint: humans run 'vault login -method=oidc'; CI auths via K8s SA (role=ci)." >&2
        exit 1
      }
      # Extract the credentials, requiring non-empty strings (jq -e fails when
      # the filter yields nothing) and percent-encoding them with @uri so that
      # characters such as '@', ':' or '/' cannot break the connection URL.
      # PG_USER / PG_PASS therefore hold the URL-encoded forms.
      # The Vault response is deliberately not echoed here: it holds the password.
      if ! PG_USER=$(jq -er '.data.username | select(type == "string" and length > 0) | @uri' <<<"$VAULT_OUT") ||
         ! PG_PASS=$(jq -er '.data.password | select(type == "string" and length > 0) | @uri' <<<"$VAULT_OUT"); then
        echo "ERROR: Vault response has no usable .data.username / .data.password for the PG state backend." >&2
        exit 1
      fi
      export PG_CONN_STR="postgres://${PG_USER}:${PG_PASS}@10.0.20.200:5432/terraform_state?sslmode=disable"
    fi
  fi
fi

# Flag the run as mutating when any command-line word is exactly one of
# apply, destroy, import or state.
is_mutating=false
for cli_arg in "$@"; do
  if [[ "$cli_arg" == "apply" || "$cli_arg" == "destroy" ||
        "$cli_arg" == "import" || "$cli_arg" == "state" ]]; then
    is_mutating=true
  fi
done

# Flag plan/apply/destroy/refresh runs, i.e. verbs that read or write infra
# state. The flag gates the ingress auth-comment pre-flight check further down;
# other tg verbs (init, fmt, validate) leave it false and skip that check.
is_tf_op=false
for cli_arg in "$@"; do
  for tf_verb in plan apply destroy refresh; do
    if [[ "$cli_arg" == "$tf_verb" ]]; then
      is_tf_op=true
    fi
  done
done

# Anti-exposure guard. Within the current stack, each `auth = "app"` or
# `auth = "none"` must be preceded by a `# auth = "<tier>":` comment saying
# what gates the app, or why the endpoint is deliberately public. References:
# - infra/modules/kubernetes/ingress_factory/main.tf (variable description)
# - infra/.claude/CLAUDE.md "Auth" section
# The check is scoped to the current stack only, so stacks nobody is touching
# are not blocked; the convention starts to apply once a stack is edited.
if [[ -n "$STACK_NAME" ]] && $is_tf_op; then
  # Abort the whole run when the checker reports a violation.
  "$REPO_ROOT/scripts/check-ingress-auth-comments.py" "$REPO_ROOT/stacks/$STACK_NAME" || exit 1
fi

# Acquire lock for mutating operations (Tier 0 only — Tier 1 uses pg_advisory_lock)
if $is_mutating && [ -n "$STACK_NAME" ] && is_tier0 "$STACK_NAME"; then
  if command -v vault &>/dev/null && [ -n "${VAULT_TOKEN:-}" ]; then
    acquire_lock "$STACK_NAME"
    # Install the release trap only after the lock is ours. If acquire_lock
    # fails, `set -e` ends the script before this line, so a lock held by
    # someone else is never deleted on our way out.
    trap 'release_lock "$STACK_NAME"' EXIT
  else
    # Locking needs both the vault CLI and a VAULT_TOKEN in the environment.
    # Say so instead of skipping silently, so the operator knows this run is
    # not guarded against a concurrent one.
    echo "WARNING: advisory lock for '$STACK_NAME' NOT acquired (vault CLI missing or VAULT_TOKEN unset); concurrent runs are not guarded." >&2
  fi
fi

# If running apply with --non-interactive, add -auto-approve for Terraform.
# "$@" is iterated directly rather than through a copy in an array: expanding
# an empty array is an "unbound variable" error under `set -u` on bash < 4.4,
# which would break an invocation without arguments.
has_apply=false
has_non_interactive=false
for arg in "$@"; do
  case "$arg" in
    apply) has_apply=true ;;
    --non-interactive) has_non_interactive=true ;;
  esac
done

# Run terragrunt but keep its exit status instead of letting `set -e` abort
# here: after a failed apply/destroy the state step below must still run,
# otherwise whatever the failed run wrote to the Tier 0 state is never
# re-encrypted or committed.
tg_rc=0
if $has_apply && $has_non_interactive; then
  new_args=()
  for arg in "$@"; do
    new_args+=("$arg")
    # -auto-approve goes right after the `apply` word.
    if [ "$arg" = "apply" ]; then
      new_args+=("-auto-approve")
    fi
  done
  terragrunt "${new_args[@]}" || tg_rc=$?
else
  terragrunt "$@" || tg_rc=$?
fi

# After mutating operations: encrypt+commit (Tier 0) or no-op (Tier 1 — PG is authoritative)
if $is_mutating && [ -n "$STACK_NAME" ] && is_tier0 "$STACK_NAME"; then
  state_enc="state/stacks/$STACK_NAME/terraform.tfstate.enc"
  enc_rc=0
  "$SYNC" encrypt "$STACK_NAME" || enc_rc=$?
  if [ "$enc_rc" -ne 0 ]; then
    echo "ERROR: state encryption for '$STACK_NAME' failed (status $enc_rc); terragrunt exited with status $tg_rc." >&2
    # Prefer terragrunt's own failure status when there is one.
    if [ "$tg_rc" -ne 0 ]; then
      exit "$tg_rc"
    fi
    exit "$enc_rc"
  fi
  cd "$REPO_ROOT"
  git add -- "$state_enc"
  # Limit both the change check and the commit to the state file, so anything
  # else the operator happens to have staged is neither counted as a state
  # change nor swept into the "update encrypted state" commit.
  if ! git diff --cached --quiet -- "$state_enc"; then
    git commit -m "state($STACK_NAME): update encrypted state" -- "$state_enc"
  fi
fi

# Propagate terragrunt's failure status now that the state has been handled.
if [ "$tg_rc" -ne 0 ]; then
  exit "$tg_rc"
fi