The Woodpecker CI pipeline has been silently failing to apply Tier 1
stacks since the state-migration commit e80b2f02 because the Alpine
CI image never had the vault CLI. `scripts/tg` swallowed stderr with
`2>/dev/null` and surfaced a misleading "Cannot read PG credentials
from Vault" message — the real error was `sh: vault: not found`.
Verified with an in-cluster probe: woodpecker/default SA + role=ci
already gets the terraform-state policy and has read capability on
database/static-creds/pg-terraform-state. Auth was never the problem;
the vault binary just wasn't there.
- ci/Dockerfile: install the vault CLI, pinned to v1.18.1 (matches server)
- scripts/tg: pre-flight check + surface real vault output on failure
- Next build-ci-image.yml run rebuilds :latest with vault included;
subsequent default.yml runs unblock monitoring apply (code-aoxk)
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
145 lines
5 KiB
Bash
Executable file
145 lines
5 KiB
Bash
Executable file
#!/usr/bin/env bash
|
|
# scripts/tg — wrapper: decrypt state before, encrypt+commit after mutating ops
|
|
# Usage: scripts/tg apply --non-interactive
|
|
# scripts/tg plan
|
|
# Auth: `vault login -method=oidc` (token at ~/.vault-token)
|
|
set -euo pipefail
|
|
|
|
# Absolute path of the repository root: the parent of the directory holding
# this script.
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
# State sync helper; this script invokes it with `decrypt` / `encrypt`.
SYNC="$REPO_ROOT/scripts/state-sync"

# Enable provider cache (shared across stacks). A caller-supplied
# TF_PLUGIN_CACHE_DIR wins over the default under $HOME.
export TF_PLUGIN_CACHE_DIR="${TF_PLUGIN_CACHE_DIR:-$HOME/.terraform.d/plugin-cache}"
export TF_PLUGIN_CACHE_MAY_BREAK_DEPENDENCY_LOCK_FILE=1
mkdir -p "$TF_PLUGIN_CACHE_DIR"

# stack_name_from_path DIR BASE
# Prints the first path component of DIR below BASE (e.g. DIR=BASE/foo/bar
# prints "foo"). Prints nothing when DIR is not strictly inside BASE.
# BASE is quoted inside the prefix-strip expansion so that glob characters in
# the repository path ([, ], *, ?) are compared literally, not as a pattern.
stack_name_from_path() {
  local dir="$1" base="$2" rel
  if [[ "$dir" == "$base"/* ]]; then
    rel="${dir#"$base"/}"
    printf '%s\n' "${rel%%/*}"
  fi
}

# Determine stack name from cwd (relative to stacks/). Empty when the script
# is run from anywhere outside stacks/<name>/.
cwd="$(pwd)"
stacks_dir="$REPO_ROOT/stacks"
STACK_NAME="$(stack_name_from_path "$cwd" "$stacks_dir")"
|
|
|
|
# ── Tier detection ──
# Space-separated names of the stacks treated as Tier 0 by this script.
TIER0_STACKS="infra platform cnpg vault dbaas external-secrets"

# is_tier0 NAME
# Exit status 0 when NAME is exactly one of the words in TIER0_STACKS,
# non-zero otherwise (including for an empty or missing NAME).
is_tier0() {
  local candidate="${1:-}"
  # An empty name is never a stack; without this guard it would match any
  # empty line produced by consecutive spaces in the list.
  [[ -n "$candidate" ]] || return 1
  # -F: compare literally, so "v.ult" or ".*" cannot match via regex.
  # --: a name starting with "-" is not parsed as a grep option.
  printf '%s\n' "$TIER0_STACKS" | tr ' ' '\n' | grep -qxF -- "$candidate"
}
|
|
|
|
# ── Advisory lock via Vault KV ──
LOCK_MAX_AGE=1800 # 30 minutes — stale lock threshold

# acquire_lock STACK
# Records an advisory lock for STACK at secret/locks/STACK in Vault KV.
#   - Returns 1 (with an error on stderr) when a lock younger than
#     LOCK_MAX_AGE seconds is already recorded, or when the lock record
#     cannot be parsed.
#   - Overwrites a lock older than LOCK_MAX_AGE after printing a warning.
#   - Otherwise returns the exit status of the final `vault kv put`.
# The check and the write are two separate Vault calls, so this is advisory
# only: two callers racing between the read and the write can both succeed.
acquire_lock() {
  local stack="$1"
  local lock_key="secret/locks/$stack"
  # Declared separately from the assignments below so that a failing command
  # substitution is not masked by the exit status of `local`.
  local holder existing locked acquired existing_holder now age

  holder="pid=$$,host=$(hostname -s 2>/dev/null || echo unknown),user=$(whoami 2>/dev/null || echo unknown)"

  # A failed read (e.g. no lock record exists yet) is treated as "no lock".
  existing=$(vault kv get -format=json "$lock_key" 2>/dev/null) || existing='{}'

  # Fail closed: if the record cannot be parsed we cannot tell whether the
  # stack is locked, so do not take the lock.
  if ! locked=$(jq -r '.data.data.locked // "false"' <<<"$existing") ||
    ! acquired=$(jq -r '.data.data.acquired // "0"' <<<"$existing") ||
    ! existing_holder=$(jq -r '.data.data.holder // ""' <<<"$existing"); then
    echo "ERROR: Cannot parse lock record for stack '$stack' (is jq installed?)" >&2
    return 1
  fi

  # Anything other than a plain non-negative integer is treated as epoch 0
  # (i.e. stale) instead of being fed to shell arithmetic, where it would be
  # evaluated as a variable name and abort under `set -u`.
  [[ "$acquired" =~ ^[0-9]+$ ]] || acquired=0

  if [[ "$locked" == "true" ]]; then
    now=$(date +%s)
    # 10# forces base 10 so a value with leading zeros is not read as octal.
    age=$((now - 10#$acquired))
    if ((age < LOCK_MAX_AGE)); then
      echo "ERROR: Stack '$stack' is locked by: $existing_holder (${age}s ago)" >&2
      echo " Wait for it to finish or run: vault kv delete secret/locks/$stack" >&2
      return 1
    fi
    echo "WARNING: Breaking stale lock on '$stack' (held ${age}s by $existing_holder)" >&2
  fi

  vault kv put "$lock_key" locked=true holder="$holder" acquired="$(date +%s)" >/dev/null
}
|
|
|
|
# release_lock STACK
# Best-effort removal of the lock record at secret/locks/STACK.
# All output from vault is discarded and the function always returns 0, so it
# is safe to call from an EXIT trap.
release_lock() {
  local stack_name="$1"
  local lock_key="secret/locks/${stack_name}"
  if ! vault kv delete "$lock_key" &>/dev/null; then
    : # deliberately ignored: releasing the lock must never fail the caller
  fi
  return 0
}
|
|
|
|
# ── Pre-flight: decrypt state (Tier 0) or fetch PG creds (Tier 1) ──
# Skipped entirely when STACK_NAME is empty.
if [ -n "$STACK_NAME" ]; then
  if is_tier0 "$STACK_NAME"; then
    # Tier 0: SOPS-encrypted local state. Decrypt only if an encrypted state
    # file already exists for this stack.
    if [ -f "$REPO_ROOT/state/stacks/$STACK_NAME/terraform.tfstate.enc" ]; then
      "$SYNC" decrypt "$STACK_NAME"
    fi
  elif [ -z "${PG_CONN_STR:-}" ]; then
    # Tier 1: PG backend — fetch credentials from Vault, unless the caller
    # already exported PG_CONN_STR.

    # Pre-flight: vault CLI must be available. Previously CI failed with a
    # misleading "Cannot read PG credentials" message because the Alpine CI
    # image lacked the vault binary and the real "vault: not found" error was
    # discarded. Fail fast with a clear message instead.
    if ! command -v vault >/dev/null 2>&1; then
      echo "ERROR: vault CLI not found on PATH. Install it or use an image that includes it (ci/Dockerfile)." >&2
      exit 1
    fi

    # Keep vault's stderr out of VAULT_OUT: the stdout is parsed as JSON
    # below, and any diagnostic text mixed into it would make jq fail even
    # though the read succeeded. stderr goes to a temp file so it can still
    # be shown when the read fails.
    vault_err_file="$(mktemp)"
    if ! VAULT_OUT="$(vault read -format=json database/static-creds/pg-terraform-state 2>"$vault_err_file")"; then
      echo "ERROR: Cannot read PG credentials from Vault. Vault output follows:" >&2
      cat "$vault_err_file" >&2
      if [ -n "$VAULT_OUT" ]; then
        echo "$VAULT_OUT" >&2
      fi
      echo "" >&2
      echo "Hint: humans run 'vault login -method=oidc'; CI auths via K8s SA (role=ci)." >&2
      rm -f -- "$vault_err_file"
      exit 1
    fi
    rm -f -- "$vault_err_file"

    # `// empty` turns a missing/null field into an empty string instead of
    # the literal text "null", so the check below can catch it.
    PG_USER="$(jq -r '.data.username // empty' <<<"$VAULT_OUT")" || PG_USER=""
    PG_PASS="$(jq -r '.data.password // empty' <<<"$VAULT_OUT")" || PG_PASS=""
    if [ -z "$PG_USER" ] || [ -z "$PG_PASS" ]; then
      echo "ERROR: Vault response for database/static-creds/pg-terraform-state has no usable username/password (or jq could not parse it)." >&2
      exit 1
    fi

    # NOTE(review): the credentials are interpolated into the URL as-is; this
    # assumes they contain no URL-reserved characters — confirm against the
    # Vault password policy.
    export PG_CONN_STR="postgres://${PG_USER}:${PG_PASS}@10.0.20.200:5432/terraform_state?sslmode=disable"
  fi
fi
|
|
|
|
# Flag the invocation as mutating when any argument is exactly one of
# apply, destroy, import or state. Every argument is inspected, whatever its
# position on the command line.
is_mutating=false
for cli_word in "$@"; do
  if [[ "$cli_word" == "apply" || "$cli_word" == "destroy" ||
    "$cli_word" == "import" || "$cli_word" == "state" ]]; then
    is_mutating=true
  fi
done
|
|
|
|
# Take the Vault advisory lock before a mutating operation on a Tier 0 stack
# (Tier 1 uses pg_advisory_lock). The lock is attempted only when the vault
# CLI is on PATH and VAULT_TOKEN is set in the environment; otherwise the
# operation proceeds unlocked. Once acquired, the lock is released by an EXIT
# trap.
# NOTE(review): the VAULT_TOKEN check looks like it skips locking for logins
# that only leave a token file (~/.vault-token) — confirm this is intended.
if [[ "$is_mutating" == "true" && -n "$STACK_NAME" ]] && is_tier0 "$STACK_NAME" &&
  command -v vault &>/dev/null && [[ -n "${VAULT_TOKEN:-}" ]]; then
  acquire_lock "$STACK_NAME"
  trap 'release_lock "$STACK_NAME"' EXIT
fi
|
|
|
|
# If running apply with --non-interactive, add -auto-approve for Terraform.
# All arguments (including --non-interactive) are forwarded to terragrunt;
# "-auto-approve" is inserted directly after each "apply" argument.
#
# The loops iterate "$@" directly rather than a copied array: expanding an
# empty array under `set -u` aborts on bash older than 4.4, which made the
# wrapper crash when invoked with no arguments.
has_apply=false
has_non_interactive=false
for arg in "$@"; do
  case "$arg" in
    apply) has_apply=true ;;
    --non-interactive) has_non_interactive=true ;;
  esac
done

if $has_apply && $has_non_interactive; then
  # new_args is never empty here: it contains at least "apply".
  new_args=()
  for arg in "$@"; do
    new_args+=("$arg")
    if [ "$arg" = "apply" ]; then
      new_args+=("-auto-approve")
    fi
  done
  terragrunt "${new_args[@]}"
else
  terragrunt "$@"
fi
|
|
|
|
# After mutating operations: encrypt+commit (Tier 0) or no-op (Tier 1 — PG is
# authoritative).
# NOTE(review): with `set -e` in effect, a non-zero exit from terragrunt ends
# the script before this section runs, so state changed by a partially failed
# operation is not re-encrypted here — confirm that is intended.
if $is_mutating && [ -n "$STACK_NAME" ] && is_tier0 "$STACK_NAME"; then
  "$SYNC" encrypt "$STACK_NAME"
  cd "$REPO_ROOT"
  state_file="state/stacks/$STACK_NAME/terraform.tfstate.enc"
  git add -- "$state_file"
  # Limit both the "anything to commit?" check and the commit itself to the
  # state file. Without the pathspec, unrelated changes already staged in the
  # index would trigger a commit and be included in it.
  if ! git diff --cached --quiet -- "$state_file"; then
    git commit -m "state($STACK_NAME): update encrypted state" -- "$state_file"
  fi
fi
|