infra/scripts/tg
Viktor Barzin e80b2f026f [infra] Migrate Terraform state from local SOPS to PostgreSQL backend
Two-tier state architecture:
- Tier 0 (infra, platform, cnpg, vault, dbaas, external-secrets): local
  state with SOPS encryption in git — unchanged, required for bootstrap.
- Tier 1 (105 app stacks): PostgreSQL backend on CNPG cluster at
  10.0.20.200:5432/terraform_state with native pg_advisory_lock.

Motivation: multi-operator friction (every workstation needed SOPS + age +
git-crypt), bootstrap complexity for new operators, and headless agents/CI
needing the full encryption toolchain just to read state.

Changes:
- terragrunt.hcl: conditional backend (local vs pg) based on tier0 list
- scripts/tg: tier detection, auto-fetch PG creds from Vault for Tier 1,
  skip SOPS and Vault KV locking for Tier 1 stacks
- scripts/state-sync: tier-aware encrypt/decrypt (skips Tier 1)
- scripts/migrate-state-to-pg: one-shot migration script (idempotent)
- stacks/vault/main.tf: pg-terraform-state static role + K8s auth role
  for claude-agent namespace
- stacks/dbaas: terraform_state DB creation + MetalLB LoadBalancer
  service on shared IP 10.0.20.200
- Deleted 107 .tfstate.enc files for migrated Tier 1 stacks
- Cleaned up per-stack tiers.tf (now generated by root terragrunt.hcl)

[ci skip]

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-16 19:33:12 +00:00

134 lines
4.4 KiB
Bash
Executable file

#!/usr/bin/env bash
# scripts/tg — Terragrunt wrapper.
#   Tier 0 stacks: decrypt local state before the run, encrypt+commit it after
#                  mutating operations.
#   Other stacks:  export PG_CONN_STR for the PostgreSQL state backend.
# Usage: scripts/tg apply --non-interactive
#        scripts/tg plan
# Auth:  `vault login -method=oidc` (token at ~/.vault-token)
set -o errexit -o nounset -o pipefail

# Repository root is the parent of the directory this script lives in.
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
SYNC="${REPO_ROOT}/scripts/state-sync"

# Provider plugin cache, shared across stacks; an existing non-empty
# TF_PLUGIN_CACHE_DIR from the caller's environment wins over the default.
: "${TF_PLUGIN_CACHE_DIR:=$HOME/.terraform.d/plugin-cache}"
export TF_PLUGIN_CACHE_DIR
export TF_PLUGIN_CACHE_MAY_BREAK_DEPENDENCY_LOCK_FILE=1
mkdir -p "${TF_PLUGIN_CACHE_DIR}"
# Determine stack name from cwd (relative to stacks/)

# stack_name_from_path CWD STACKS_DIR
# Prints the first path component of CWD below STACKS_DIR, or nothing when CWD
# is not strictly inside STACKS_DIR. Always returns 0.
stack_name_from_path() {
  local dir="$1" base="$2" rel
  [[ "$dir" == "$base"/* ]] || return 0
  # The inner quotes make the prefix literal: without them, glob characters in
  # the repository path ([, *, ?) are treated as a pattern and the prefix is
  # not stripped.
  rel="${dir#"$base"/}"
  printf '%s\n' "${rel%%/*}"
}

cwd="$(pwd)"
stacks_dir="$REPO_ROOT/stacks"
STACK_NAME="$(stack_name_from_path "$cwd" "$stacks_dir")"
# ── Tier detection ──
TIER0_STACKS="infra platform cnpg vault dbaas external-secrets"

# is_tier0 STACK
# Returns 0 when STACK is exactly one of the space-separated names in
# TIER0_STACKS, 1 otherwise. The comparison is a literal string match, so a
# name containing regex metacharacters or starting with '-' can never match
# by accident.
is_tier0() {
  local -a tier0_names
  local name
  read -r -a tier0_names <<<"$TIER0_STACKS"
  for name in "${tier0_names[@]}"; do
    [[ "$name" == "$1" ]] && return 0
  done
  return 1
}
# ── Advisory lock via Vault KV ──
LOCK_MAX_AGE=1800 # 30 minutes — stale lock threshold

# acquire_lock STACK
# Takes the advisory lock stored at secret/locks/STACK in Vault KV.
#   - Returns 1 (writing nothing) when a lock younger than LOCK_MAX_AGE seconds
#     is held, or when the existing lock record cannot be parsed.
#   - Otherwise writes locked=true plus holder and timestamp, breaking a stale
#     lock with a warning first.
# All diagnostics go to stderr.
# NOTE: the read-then-write sequence is not atomic; two callers racing on an
# unlocked stack can both succeed.
acquire_lock() {
  local stack="$1"
  local holder
  holder="pid=$$,host=$(hostname -s),user=$(whoami)"

  # A failed read (e.g. no lock has ever been written) is treated as "no lock".
  local existing
  existing=$(vault kv get -format=json "secret/locks/$stack" 2>/dev/null || echo '{}')

  # Declared separately from the assignments so a jq failure is not masked by
  # `local`; an unreadable record must not be silently overwritten.
  local locked acquired existing_holder
  if ! locked=$(jq -r '.data.data.locked // "false"' <<<"$existing") ||
    ! acquired=$(jq -r '.data.data.acquired // "0"' <<<"$existing") ||
    ! existing_holder=$(jq -r '.data.data.holder // ""' <<<"$existing"); then
    echo "ERROR: Cannot parse lock record for stack '$stack'" >&2
    return 1
  fi

  # A malformed timestamp is handled like a missing one (epoch 0, i.e. stale)
  # instead of breaking the arithmetic below.
  [[ "$acquired" =~ ^[0-9]+$ ]] || acquired=0

  if [[ "$locked" == "true" ]]; then
    local now age
    now=$(date +%s)
    age=$((now - 10#$acquired)) # 10# keeps leading zeros from meaning octal
    if ((age < LOCK_MAX_AGE)); then
      echo "ERROR: Stack '$stack' is locked by: $existing_holder (${age}s ago)" >&2
      echo " Wait for it to finish or run: vault kv delete secret/locks/$stack" >&2
      return 1
    fi
    echo "WARNING: Breaking stale lock on '$stack' (held ${age}s by $existing_holder)" >&2
  fi

  vault kv put "secret/locks/$stack" locked=true holder="$holder" acquired="$(date +%s)" >/dev/null
}
# release_lock STACK
# Deletes the Vault KV entry secret/locks/STACK. Best effort: output is
# discarded and a failed delete is ignored, so this always returns 0.
release_lock() {
  local name="$1"
  if ! vault kv delete "secret/locks/${name}" &>/dev/null; then
    : # deliberately ignored
  fi
}
# ── Pre-flight: decrypt state (Tier 0) or fetch PG creds (Tier 1) ──
# Skipped entirely when the script is not run from inside a stack directory.
if [[ -n "$STACK_NAME" ]]; then
  if is_tier0 "$STACK_NAME"; then
    # Tier 0: SOPS-encrypted local state; nothing to decrypt if no encrypted
    # state file exists for the stack yet.
    if [[ -f "$REPO_ROOT/state/stacks/$STACK_NAME/terraform.tfstate.enc" ]]; then
      "$SYNC" decrypt "$STACK_NAME"
    fi
  elif [[ -z "${PG_CONN_STR:-}" ]]; then
    # Tier 1: PG backend — fetch credentials from Vault unless the caller
    # already supplied PG_CONN_STR.
    PG_CREDS=$(vault read -format=json database/static-creds/pg-terraform-state 2>/dev/null) || {
      echo "ERROR: Cannot read PG credentials from Vault. Run: vault login -method=oidc" >&2
      exit 1
    }
    # `// empty` together with -e makes jq fail on a missing field instead of
    # printing the string "null"; @uri percent-encodes characters that would
    # otherwise corrupt the connection URI (@ : / ? # %).
    if ! PG_USER=$(jq -e -r '(.data.username // empty) | @uri' <<<"$PG_CREDS") ||
      ! PG_PASS=$(jq -e -r '(.data.password // empty) | @uri' <<<"$PG_CREDS"); then
      echo "ERROR: Vault response for database/static-creds/pg-terraform-state is missing username/password" >&2
      exit 1
    fi
    # NOTE(review): sslmode=disable means the connection is not TLS-protected —
    # confirm this is acceptable on this network.
    export PG_CONN_STR="postgres://${PG_USER}:${PG_PASS}@10.0.20.200:5432/terraform_state?sslmode=disable"
  fi
fi
# Detect if this is a mutating operation

# is_mutating_op ARGS...
# Returns 0 when any argument is exactly one of the Terraform subcommands that
# can write state, 1 otherwise. taint/untaint/refresh are included because
# they modify state just as apply does. `state` is matched as a whole, so
# read-only forms such as `state list` also count (harmless over-detection).
is_mutating_op() {
  local word
  for word in "$@"; do
    case "$word" in
      apply | destroy | import | state | taint | untaint | refresh) return 0 ;;
    esac
  done
  return 1
}

# Kept as the literal string true/false: later code runs it as a command.
if is_mutating_op "$@"; then
  is_mutating=true
else
  is_mutating=false
fi
# Take the Vault KV lock before a mutating run on a Tier 0 stack (Tier 1 uses
# pg_advisory_lock), and release it again when the script exits.
# The lock is skipped when the vault CLI is missing or VAULT_TOKEN is
# unset/empty.
# NOTE(review): a token that exists only in ~/.vault-token (plain
# `vault login`) does not satisfy the VAULT_TOKEN check, so such runs are
# unlocked — confirm that is intended.
if [[ "$is_mutating" == "true" && -n "$STACK_NAME" ]] && is_tier0 "$STACK_NAME"; then
  if [[ -n "${VAULT_TOKEN:-}" ]] && command -v vault >/dev/null 2>&1; then
    acquire_lock "$STACK_NAME"
    # Registered only after a successful acquire, so a lock held by someone
    # else is never deleted by this process.
    trap 'release_lock "$STACK_NAME"' EXIT
  fi
fi
# If running apply with --non-interactive, add -auto-approve for Terraform

# run_terragrunt ARGS...
# Invokes terragrunt with ARGS. When ARGS contains both `apply` and
# `--non-interactive`, `-auto-approve` is inserted directly after each `apply`
# argument; otherwise ARGS is passed through untouched. Returns terragrunt's
# exit status.
# Iterates "$@" directly rather than a copied array: expanding an empty array
# under `set -u` is an error on bash older than 4.4, which made a no-argument
# invocation abort before reaching terragrunt.
run_terragrunt() {
  local word
  local saw_apply=false saw_non_interactive=false
  for word in "$@"; do
    case "$word" in
      apply) saw_apply=true ;;
      --non-interactive) saw_non_interactive=true ;;
    esac
  done

  if [[ "$saw_apply" != "true" || "$saw_non_interactive" != "true" ]]; then
    terragrunt "$@"
    return
  fi

  # Never empty here: it contains at least `apply` and `-auto-approve`.
  local -a final_args=()
  for word in "$@"; do
    final_args+=("$word")
    if [[ "$word" == "apply" ]]; then
      final_args+=("-auto-approve")
    fi
  done
  terragrunt "${final_args[@]}"
}

run_terragrunt "$@"
# After mutating operations: encrypt+commit (Tier 0) or no-op (Tier 1 — PG is authoritative)
# NOTE(review): because of `set -e`, this step is reached only when terragrunt
# exited 0; after a failed apply the state is not re-encrypted here — confirm
# that is intended.
if [[ "$is_mutating" == "true" && -n "$STACK_NAME" ]] && is_tier0 "$STACK_NAME"; then
  "$SYNC" encrypt "$STACK_NAME"
  cd "$REPO_ROOT"
  state_file="state/stacks/$STACK_NAME/terraform.tfstate.enc"
  git add -- "$state_file"
  # Both the change check and the commit are limited to the state file, so
  # anything else the operator has staged is neither detected as "a change"
  # nor swept into the state commit.
  if ! git diff --cached --quiet -- "$state_file"; then
    git commit -m "state($STACK_NAME): update encrypted state" -- "$state_file"
  fi
fi