[infra] Migrate Terraform state from local SOPS to PostgreSQL backend

Two-tier state architecture:
- Tier 0 (infra, platform, cnpg, vault, dbaas, external-secrets): local
  state with SOPS encryption in git — unchanged, required for bootstrap.
- Tier 1 (105 app stacks): PostgreSQL backend on CNPG cluster at
  10.0.20.200:5432/terraform_state with native pg_advisory_lock.

Motivation: multi-operator friction (every workstation needed SOPS + age +
git-crypt), bootstrap complexity for new operators, and headless agents/CI
needing the full encryption toolchain just to read state.

Changes:
- terragrunt.hcl: conditional backend (local vs pg) based on tier0 list
- scripts/tg: tier detection, auto-fetch PG creds from Vault for Tier 1,
  skip SOPS and Vault KV locking for Tier 1 stacks
- scripts/state-sync: tier-aware encrypt/decrypt (skips Tier 1)
- scripts/migrate-state-to-pg: one-shot migration script (idempotent)
- stacks/vault/main.tf: pg-terraform-state static role + K8s auth role
  for claude-agent namespace
- stacks/dbaas: terraform_state DB creation + MetalLB LoadBalancer
  service on shared IP 10.0.20.200
- Deleted 107 .tfstate.enc files for migrated Tier 1 stacks
- Cleaned up per-stack tiers.tf (now generated by root terragrunt.hcl)

[ci skip]

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-04-16 19:33:12 +00:00
parent f538115c43
commit e80b2f026f
360 changed files with 844 additions and 302747 deletions

117
scripts/migrate-state-to-pg Executable file
View file

@ -0,0 +1,117 @@
#!/usr/bin/env bash
# scripts/migrate-state-to-pg — One-shot migration from local SOPS state to PG backend.
#
# For every directory under state/stacks/ that is not a Tier 0 stack, runs
# `terragrunt init -migrate-state` in the matching stacks/<name> directory and
# then a verification plan.
#
# Prerequisites: vault login -method=oidc, PG terraform_state DB exists, Vault static role created.
# Required tools: vault, jq, sops, terragrunt.
# Usage: scripts/migrate-state-to-pg [--dry-run]
set -euo pipefail

# Repository root, resolved relative to this script's location (scripts/..).
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
readonly REPO_ROOT
# Terragrunt stack definitions, one sub-directory per stack.
readonly STACKS_DIR="$REPO_ROOT/stacks"
# Local state files, one sub-directory per stack.
readonly STATE_DIR="$REPO_ROOT/state/stacks"
# Stacks that keep SOPS-encrypted local state; every other stack is migrated.
TIER0_STACKS="infra platform cnpg vault dbaas external-secrets"

# is_tier0 NAME
# Returns 0 when NAME is exactly one of the space-separated entries of
# TIER0_STACKS, 1 otherwise. The comparison is a literal string match done in
# the shell itself, so characters such as '.', '*' or a leading '-' in NAME are
# never interpreted as a regex or as a command-line option.
is_tier0() {
  case "$1" in
    # An empty name, or one containing whitespace, can never be a list entry.
    '' | *[[:space:]]*) return 1 ;;
  esac
  case " $TIER0_STACKS " in
    *" $1 "*) return 0 ;;
  esac
  return 1
}
# Parse the single optional flag. Anything unrecognised is rejected, so a typo
# such as "--dryrun" cannot silently start a real migration.
DRY_RUN=false
if [ "$#" -gt 1 ]; then
  echo "Usage: $(basename "$0") [--dry-run]" >&2
  exit 2
fi
case "${1:-}" in
  '') ;;
  --dry-run) DRY_RUN=true ;;
  *)
    echo "ERROR: unknown argument: $1" >&2
    echo "Usage: $(basename "$0") [--dry-run]" >&2
    exit 2
    ;;
esac
# Fetch PG credentials from Vault and export PG_CONN_STR for the pg backend.
echo "==> Fetching PG credentials from Vault..."
PG_CREDS=$(vault read -format=json database/static-creds/pg-terraform-state) || {
  echo "ERROR: Cannot read PG credentials. Run: vault login -method=oidc" >&2
  exit 1
}
# `// empty` yields an empty string for a missing/null field instead of the
# literal text "null", so the check below can detect an incomplete response.
PG_USER=$(printf '%s' "$PG_CREDS" | jq -r '(.data.username // empty)')
PG_PASS=$(printf '%s' "$PG_CREDS" | jq -r '(.data.password // empty)')
if [ -z "$PG_USER" ] || [ -z "$PG_PASS" ]; then
  echo "ERROR: Vault response is missing data.username or data.password" >&2
  exit 1
fi
# Percent-encode both parts: characters such as '@', ':', '/' or '#' in a
# generated password would otherwise corrupt the connection URI. The secret is
# fed to jq on stdin so it never appears in a process argument list.
pg_user_enc=$(printf '%s' "$PG_CREDS" | jq -r '.data.username | @uri')
pg_pass_enc=$(printf '%s' "$PG_CREDS" | jq -r '.data.password | @uri')
# NOTE(review): sslmode=disable means the connection is not TLS-protected —
# confirm this is acceptable for the network path to 10.0.20.200.
export PG_CONN_STR="postgres://${pg_user_enc}:${pg_pass_enc}@10.0.20.200:5432/terraform_state?sslmode=disable"
echo " PG_CONN_STR set (user: $PG_USER)"
# Turn on the Terraform provider plugin cache. An already-set
# TF_PLUGIN_CACHE_DIR is respected; otherwise the cache lives under
# ~/.terraform.d/plugin-cache. The directory is created if it is missing.
: "${TF_PLUGIN_CACHE_DIR:=$HOME/.terraform.d/plugin-cache}"
TF_PLUGIN_CACHE_MAY_BREAK_DEPENDENCY_LOCK_FILE=1
export TF_PLUGIN_CACHE_DIR TF_PLUGIN_CACHE_MAY_BREAK_DEPENDENCY_LOCK_FILE
mkdir -p "$TF_PLUGIN_CACHE_DIR"
# Outcome tallies reported in the final summary.
migrated=0 failed=0 skipped=0
# Space-prefixed list of stack names whose migration failed.
failed_stacks=""

# Counter bumpers. The arithmetic is wrapped in the `:` builtin so each helper
# always returns 0, whatever the counter's value (safe under `set -e`).
inc_migrated() {
  : "$((migrated += 1))"
}
inc_failed() {
  : "$((failed += 1))"
}
inc_skipped() {
  : "$((skipped += 1))"
}
# Iterate over all stack directories that have state.
#
# Per stack: skip Tier 0 / stateless / orphaned entries, decrypt the SOPS state
# if only the encrypted copy exists, run `terragrunt init -migrate-state`, then
# verify with a plan. Results are tallied through the inc_* helpers.

# Private per-run directory for init/plan logs; kept after exit for inspection.
log_dir=""
if ! $DRY_RUN; then
  log_dir="$(mktemp -d "${TMPDIR:-/tmp}/tg-migrate.XXXXXX")"
fi

for state_dir in "$STATE_DIR"/*/; do
  # An unmatched glob stays literal; ignore anything that is not a directory.
  [ -d "$state_dir" ] || continue
  state_dir="${state_dir%/}"
  stack="$(basename "$state_dir")"
  enc_state="$state_dir/terraform.tfstate.enc"
  plain_state="$state_dir/terraform.tfstate"

  # Skip Tier 0
  if is_tier0 "$stack"; then
    echo "--- SKIP (Tier 0): $stack"
    inc_skipped
    continue
  fi

  # Skip stacks with no state file
  if [ ! -f "$enc_state" ] && [ ! -f "$plain_state" ]; then
    echo "--- SKIP (no state): $stack"
    inc_skipped
    continue
  fi

  # Skip stacks with no corresponding stack directory
  if [ ! -d "$STACKS_DIR/$stack" ]; then
    echo "--- SKIP (no stack dir): $stack"
    inc_skipped
    continue
  fi

  echo "==> Migrating: $stack"
  if $DRY_RUN; then
    echo " [dry-run] Would migrate $stack"
    inc_skipped
    continue
  fi

  # Decrypt state if needed (sops is called directly — state-sync skips Tier 1).
  # The plaintext is written to a temporary file and only renamed into place on
  # success: a failed decrypt must not leave a truncated terraform.tfstate
  # behind, because a re-run would treat that file as the state to migrate.
  if [ -f "$enc_state" ] && [ ! -f "$plain_state" ]; then
    tmp_state="$(mktemp "$state_dir/terraform.tfstate.XXXXXX")" || {
      echo " WARNING: decrypt failed, skipping"
      inc_skipped
      continue
    }
    if sops -d --input-type json --output-type json "$enc_state" > "$tmp_state"; then
      mv -f "$tmp_state" "$plain_state"
    else
      rm -f "$tmp_state"
      echo " WARNING: decrypt failed, skipping"
      inc_skipped
      continue
    fi
  fi

  # Migrate state
  if ! cd "$STACKS_DIR/$stack"; then
    echo " FAILED: cannot enter $STACKS_DIR/$stack"
    inc_failed
    failed_stacks="$failed_stacks $stack"
    continue
  fi
  init_log="$log_dir/$stack.init.log"
  plan_log="$log_dir/$stack.plan.log"

  # `pipefail` (set at the top of the script) makes a terragrunt failure
  # visible even though its output is piped through tee.
  if terragrunt init -upgrade -migrate-state -force-copy -input=false 2>&1 | tee "$init_log"; then
    echo " init OK"
    # Verify using the -detailed-exitcode contract rather than grepping the
    # output: 0 = no changes, 2 = changes present, anything else = plan error.
    plan_rc=0
    terragrunt plan -detailed-exitcode -input=false > "$plan_log" 2>&1 || plan_rc=$?
    case "$plan_rc" in
      0) echo " plan OK — no drift" ;;
      2) echo " WARNING: plan shows changes (may be normal drift, not migration issue)" ;;
      *) echo " WARNING: plan failed (exit $plan_rc), migration not verified (see $plan_log)" ;;
    esac
    # init succeeded, so the state has been copied regardless of the plan result.
    inc_migrated
  else
    echo " FAILED: init error (see $init_log)"
    inc_failed
    failed_stacks="$failed_stacks $stack"
  fi
done
# Final report: per-outcome counts plus the names of any failed stacks.
echo ""
echo "========================================"
echo "Migration complete"
echo " Migrated: $migrated"
echo " Failed: $failed"
echo " Skipped: $skipped"
if [ -n "$failed_stacks" ]; then
  echo " Failed stacks:$failed_stacks"
fi
echo "========================================"

# Exit non-zero when any stack failed so CI jobs and wrapper scripts can
# detect a partial migration instead of treating the run as a success.
if [ "${failed:-0}" -gt 0 ]; then
  exit 1
fi

View file

@ -24,6 +24,12 @@ stack_name_from_dir() {
basename "$1"
}
# Tier 0 stacks keep SOPS-encrypted local state; Tier 1 uses PG backend
TIER0_STACKS="infra platform cnpg vault dbaas external-secrets"

# is_tier0 NAME
# Returns 0 when NAME is exactly one of the space-separated entries of
# TIER0_STACKS, 1 otherwise. The comparison is a literal string match done in
# the shell itself, so characters such as '.', '*' or a leading '-' in NAME are
# never interpreted as a regex or as a command-line option.
is_tier0() {
  case "$1" in
    # An empty name, or one containing whitespace, can never be a list entry.
    '' | *[[:space:]]*) return 1 ;;
  esac
  case " $TIER0_STACKS " in
    *" $1 "*) return 0 ;;
  esac
  return 1
}
# Read age recipients from .sops.yaml
AGE_RECIPIENTS="$(python3 -c "
import yaml, sys
@ -74,24 +80,38 @@ decrypt_state() {
case "$cmd" in
encrypt)
if [ -n "$stack" ]; then
encrypt_state "$STATE_DIR/$stack"
if is_tier0 "$stack"; then
encrypt_state "$STATE_DIR/$stack"
else
echo "state-sync: skipping Tier 1 stack '$stack' (PG backend)" >&2
fi
else
for dir in "$STATE_DIR"/*/; do
encrypt_state "$dir"
_name="$(stack_name_from_dir "$dir")"
if is_tier0 "$_name"; then
encrypt_state "$dir"
fi
done
fi
;;
decrypt)
if [ -n "$stack" ]; then
decrypt_state "$STATE_DIR/$stack"
if is_tier0 "$stack"; then
decrypt_state "$STATE_DIR/$stack"
else
echo "state-sync: skipping Tier 1 stack '$stack' (PG backend)" >&2
fi
else
for dir in "$STATE_DIR"/*/; do
decrypt_state "$dir"
_name="$(stack_name_from_dir "$dir")"
if is_tier0 "$_name"; then
decrypt_state "$dir"
fi
done
fi
;;
commit)
# Encrypt all changed state, then git add + commit
# Only Tier 0 stacks have encrypted state in git
"$0" encrypt
cd "$REPO_ROOT"
git add state/stacks/*/terraform.tfstate.enc
@ -101,6 +121,8 @@ case "$cmd" in
;;
help)
echo "Usage: state-sync {encrypt|decrypt|commit} [stack-name]"
echo "Operates on Tier 0 stacks only (infra, platform, cnpg, vault, dbaas, external-secrets)."
echo "Tier 1 stacks use the PG backend and don't need local state sync."
echo "Encrypt uses per-stack Vault Transit key (transit/keys/sops-state-<stack>)."
echo "Decrypt uses Vault Transit if logged in, falls back to age key."
;;

View file

@ -22,6 +22,12 @@ if [[ "$cwd" == "$stacks_dir"/* ]]; then
STACK_NAME="${rel%%/*}"
fi
# ── Tier detection ──
TIER0_STACKS="infra platform cnpg vault dbaas external-secrets"

# is_tier0 NAME
# Returns 0 when NAME is exactly one of the space-separated entries of
# TIER0_STACKS, 1 otherwise. The comparison is a literal string match done in
# the shell itself, so characters such as '.', '*' or a leading '-' in NAME are
# never interpreted as a regex or as a command-line option.
is_tier0() {
  case "$1" in
    # An empty name, or one containing whitespace, can never be a list entry.
    '' | *[[:space:]]*) return 1 ;;
  esac
  case " $TIER0_STACKS " in
    *" $1 "*) return 0 ;;
  esac
  return 1
}
# ── Advisory lock via Vault KV ──
LOCK_MAX_AGE=1800 # 30 minutes — stale lock threshold
acquire_lock() {
@ -56,9 +62,25 @@ release_lock() {
vault kv delete "secret/locks/$stack" >/dev/null 2>&1 || true
}
# Decrypt state before any operation
if [ -n "$STACK_NAME" ] && [ -f "$REPO_ROOT/state/stacks/$STACK_NAME/terraform.tfstate.enc" ]; then
"$SYNC" decrypt "$STACK_NAME"
# ── Pre-flight: decrypt state (Tier 0) or fetch PG creds (Tier 1) ──
if [ -n "$STACK_NAME" ]; then
if is_tier0 "$STACK_NAME"; then
# Tier 0: SOPS-encrypted local state
if [ -f "$REPO_ROOT/state/stacks/$STACK_NAME/terraform.tfstate.enc" ]; then
"$SYNC" decrypt "$STACK_NAME"
fi
else
# Tier 1: PG backend — fetch credentials from Vault
if [ -z "${PG_CONN_STR:-}" ]; then
PG_CREDS=$(vault read -format=json database/static-creds/pg-terraform-state 2>/dev/null) || {
echo "ERROR: Cannot read PG credentials from Vault. Run: vault login -method=oidc" >&2
exit 1
}
PG_USER=$(echo "$PG_CREDS" | jq -r .data.username)
PG_PASS=$(echo "$PG_CREDS" | jq -r .data.password)
export PG_CONN_STR="postgres://${PG_USER}:${PG_PASS}@10.0.20.200:5432/terraform_state?sslmode=disable"
fi
fi
fi
# Detect if this is a mutating operation
@ -69,8 +91,8 @@ for arg in "$@"; do
esac
done
# Acquire lock for mutating operations
if $is_mutating && [ -n "$STACK_NAME" ]; then
# Acquire lock for mutating operations (Tier 0 only — Tier 1 uses pg_advisory_lock)
if $is_mutating && [ -n "$STACK_NAME" ] && is_tier0 "$STACK_NAME"; then
if command -v vault &>/dev/null && [ -n "${VAULT_TOKEN:-}" ]; then
acquire_lock "$STACK_NAME"
trap 'release_lock "$STACK_NAME"' EXIT
@ -101,8 +123,8 @@ else
terragrunt "$@"
fi
# After mutating operations, encrypt and commit
if $is_mutating && [ -n "$STACK_NAME" ]; then
# After mutating operations: encrypt+commit (Tier 0) or no-op (Tier 1 — PG is authoritative)
if $is_mutating && [ -n "$STACK_NAME" ] && is_tier0 "$STACK_NAME"; then
"$SYNC" encrypt "$STACK_NAME"
cd "$REPO_ROOT"
git add "state/stacks/$STACK_NAME/terraform.tfstate.enc"