# Unified infra CI pipeline — detects changed stacks and applies only those.
# Platform stacks and app stacks handled in one pipeline with proper ordering.
#
# Optimizations over the previous split pipeline:
# - Custom CI image (no apk/wget per step)
# - Shallow clone (depth=2 for git diff HEAD~1)
# - TF_PLUGIN_CACHE_DIR (shared provider cache)
# - Serial apply with Vault advisory locks (prevents user/CI race conditions)
# - Step consolidation (2 steps instead of 4)
# - Changed-stacks-only detection (skips no-op applies)
# - Global-file fallback (modules/config changes trigger full apply)
# - Lock-aware: skips stacks locked by users instead of failing
#
# Scratch files written to the workspace by the `apply` step:
#   .platform_all    every platform stack name, one per line
#   .all_changed     stack dirs touched by HEAD~1..HEAD, one per line
#   .platform_apply  platform stacks to apply, one per line
#   .app_apply       app stacks to apply, one per line
#   .platform_failed space-separated platform stacks whose apply failed

when:
  event: push
  branch: master

clone:
  git:
    image: woodpeckerci/plugin-git
    settings:
      depth: 2
      attempts: 5
      backoff: 10s

steps:
  - name: apply
    image: forgejo.viktorbarzin.me/viktor/infra-ci:latest
    pull: true
    backend_options:
      kubernetes:
        resources:
          requests:
            memory: 3Gi
          limits:
            memory: 6Gi
    environment:
      SLACK_WEBHOOK:
        from_secret: slack_webhook
      # Each `- |` command runs in a fresh shell, so we can't rely on an
      # `export VAULT_ADDR=...` in the auth command persisting — pin it at
      # step level. VAULT_TOKEN is still per-command; we persist it to
      # ~/.vault-token (auto-read by `vault` CLI) so downstream commands
      # don't need explicit token propagation.
      VAULT_ADDR: http://vault-active.vault.svc.cluster.local:8200
    commands:
      # ── Skip CI commits ──
      - |
        if echo "$CI_COMMIT_MESSAGE" | grep -q '\[CI SKIP\]\|\[ci skip\]'; then
          echo "Commit has [CI SKIP], exiting"
          exit 0
        fi

      # ── git-crypt unlock ──
      # The key is read from the `git-crypt-key` ConfigMap via the API server.
      # TLS is verified against the mounted service-account CA (the same CA
      # the kubeconfig generated below trusts for this server) instead of
      # being disabled with `-k`.
      - |
        SA_TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)
        # Subshell keeps the restrictive umask local to writing the key file.
        (
          umask 077
          curl -sf --cacert /var/run/secrets/kubernetes.io/serviceaccount/ca.crt \
            "https://10.0.20.100:6443/api/v1/namespaces/woodpecker/configmaps/git-crypt-key" \
            -H "Authorization:Bearer $SA_TOKEN" | jq -r .data.key | base64 -d > /tmp/key
        )
        # Remove the key on both paths, and fail explicitly: with
        # `unlock && rm` a failed unlock is not caught by errexit and the
        # key would be left on disk.
        if ! git-crypt unlock /tmp/key; then
          rm -f /tmp/key
          echo "ERROR: git-crypt unlock failed" >&2
          exit 1
        fi
        rm -f /tmp/key

      # ── Vault auth ──
      - |
        SA_TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)
        VAULT_TOKEN=$(curl -s -X POST "$VAULT_ADDR/v1/auth/kubernetes/login" \
          -d "{\"role\":\"ci\",\"jwt\":\"$SA_TOKEN\"}" | jq -r .auth.client_token)
        if [ -z "$VAULT_TOKEN" ] || [ "$VAULT_TOKEN" = "null" ]; then
          echo "ERROR: Vault K8s auth failed (role=ci, ns=woodpecker)" >&2
          exit 1
        fi
        # Persist for downstream `- |` blocks (each runs in a fresh shell,
        # so exporting VAULT_TOKEN wouldn't help). `vault`, `scripts/tg`,
        # and `scripts/state-sync` all fall through to ~/.vault-token when
        # the env var is unset.
        umask 077; printf '%s' "$VAULT_TOKEN" > "$HOME/.vault-token"

      # ── Generate kubeconfig from projected SA token ──
      # terragrunt.hcl injects `-var kube_config_path=/config` for every
      # terraform invocation, so we need a kubeconfig file at that path. The
      # `default` SA in the woodpecker namespace is cluster-admin (via the
      # `woodpecker-default` ClusterRoleBinding), so the projected token is
      # sufficient to apply any stack. Using `tokenFile` (not an inline token)
      # so the provider re-reads it if kubelet rotates the projected token
      # mid-pipeline.
      - |
        cat > config <<'EOF'
        apiVersion: v1
        kind: Config
        clusters:
        - name: kubernetes
          cluster:
            server: https://10.0.20.100:6443
            certificate-authority: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        contexts:
        - name: ci
          context:
            cluster: kubernetes
            user: ci
        current-context: ci
        users:
        - name: ci
          user:
            tokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
        EOF
        chmod 600 config
        # Sanity check: kubeconfig works
        kubectl --kubeconfig=config get ns kube-system -o name >/dev/null

      # ── Detect changed stacks ──
      # Writes .platform_apply and .app_apply (one stack name per line).
      - |
        PLATFORM_STACKS="dbaas authentik crowdsec monitoring nvidia mailserver cloudflared kyverno metallb redis traefik technitium headscale rbac k8s-portal vaultwarden reverse-proxy metrics-server vpa nfs-csi iscsi-csi cnpg sealed-secrets uptime-kuma wireguard xray infra-maintenance platform vault reloader descheduler external-secrets"
        # One name per line so membership can be tested as an exact,
        # whole-line, fixed-string match (`grep -xF`). A word match (`-w`)
        # against the space-separated list would also accept fragments of
        # hyphenated names (e.g. `secrets` inside `sealed-secrets`).
        echo "$PLATFORM_STACKS" | tr ' ' '\n' > .platform_all

        # Ensure we have enough history for diff (clone may be shallow)
        if ! git rev-parse HEAD~1 >/dev/null 2>&1; then
          echo "WARNING: HEAD~1 not available (shallow clone?) — fetching more history"
          git fetch --deepen=1 origin master 2>/dev/null || true
        fi

        # If still no parent, apply all platform stacks as a safe fallback
        if ! git rev-parse HEAD~1 >/dev/null 2>&1; then
          echo "Cannot determine changed files — applying ALL platform stacks"
          cp .platform_all .platform_apply
          > .app_apply
        else
          # Second path component of every changed file under stacks/, deduplicated.
          git diff --name-only HEAD~1 HEAD | grep '^stacks/' | cut -d/ -f2 | sort -u > .all_changed

          # Check if global files changed (triggers full platform apply)
          GLOBAL_CHANGED=$(git diff --name-only HEAD~1 HEAD | grep -E '^(modules/|config\.tfvars|terragrunt\.hcl)' || true)
          if [ -n "$GLOBAL_CHANGED" ]; then
            echo "Global files changed — applying ALL platform stacks"
            cp .platform_all .platform_apply
          else
            # Detect platform stacks that changed
            > .platform_apply
            while read -r stack; do
              if grep -qxF -- "$stack" .platform_all; then
                echo "$stack" >> .platform_apply
              fi
            done < .all_changed
          fi

          # Detect app stacks that changed
          > .app_apply
          while read -r stack; do
            if grep -qxF -- "$stack" .platform_all; then
              continue  # Skip platform stacks
            fi
            if [ ! -f "stacks/$stack/terragrunt.hcl" ]; then
              continue  # Skip non-terragrunt dirs
            fi
            echo "$stack" >> .app_apply
          done < .all_changed
        fi

        PLATFORM_COUNT=$(wc -l < .platform_apply | tr -d ' ')
        APP_COUNT=$(wc -l < .app_apply | tr -d ' ')
        echo "Platform stacks to apply: $PLATFORM_COUNT"
        echo "App stacks to apply: $APP_COUNT"
        cat .platform_apply .app_apply

      # ── Pre-warm provider cache ──
      # Best-effort `terragrunt init` in the first stack to be applied.
      - |
        if [ -s .platform_apply ] || [ -s .app_apply ]; then
          FIRST_STACK=$(cat .platform_apply .app_apply 2>/dev/null | head -1)
          if [ -n "$FIRST_STACK" ]; then
            echo "Pre-warming provider cache from stacks/$FIRST_STACK..."
            # Subshell so the working directory is unchanged afterwards,
            # whether or not the cd/init succeeds.
            (
              cd "stacks/$FIRST_STACK" &&
                terragrunt init --terragrunt-non-interactive -input=false 2>&1 | tail -3
            ) || true
          fi
        fi

      # ── Apply platform stacks (serial, with Vault advisory locks) ──
      - |
        FAILED_PLATFORM_STACKS=""
        if [ -s .platform_apply ]; then
          echo "=== Applying platform stacks (serial, locked) ==="
          while read -r stack; do
            echo "[$stack] Starting apply..."
            # errexit is suspended so a failing apply is recorded, not fatal.
            # stdin comes from /dev/null: the loop reads the stack list on
            # stdin, and anything `tg` read from it would swallow the
            # remaining stack names.
            set +e
            OUTPUT=$(cd "stacks/$stack" && ../../scripts/tg apply --non-interactive </dev/null 2>&1)
            EXIT=$?
            set -e
            if [ $EXIT -ne 0 ]; then
              if echo "$OUTPUT" | grep -q "is locked by"; then
                echo "[$stack] SKIPPED (locked by another session)"
              else
                echo "$OUTPUT" | tail -50
                echo "[$stack] FAILED (exit $EXIT)"
                FAILED_PLATFORM_STACKS="$FAILED_PLATFORM_STACKS $stack"
              fi
            else
              echo "$OUTPUT" | tail -3
              echo "[$stack] OK"
            fi
          done < .platform_apply
        fi
        # Deferred until after app stacks so both lists get a chance to run.
        echo "$FAILED_PLATFORM_STACKS" > .platform_failed

      # ── Apply app stacks (serial, with Vault advisory locks) ──
      - |
        FAILED_APP_STACKS=""
        if [ -s .app_apply ]; then
          echo "=== Applying app stacks (serial, locked) ==="
          while read -r stack; do
            echo "[$stack] Starting apply..."
            # Same pattern as the platform loop: errexit suspended around the
            # apply, and stdin detached from the stack list being read.
            set +e
            OUTPUT=$(cd "stacks/$stack" && ../../scripts/tg apply --non-interactive </dev/null 2>&1)
            EXIT=$?
            set -e
            if [ $EXIT -ne 0 ]; then
              if echo "$OUTPUT" | grep -q "is locked by"; then
                echo "[$stack] SKIPPED (locked by another session)"
              else
                echo "$OUTPUT" | tail -50
                echo "[$stack] FAILED (exit $EXIT)"
                FAILED_APP_STACKS="$FAILED_APP_STACKS $stack"
              fi
            else
              echo "$OUTPUT" | tail -3
              echo "[$stack] OK"
            fi
          done < .app_apply
        fi

        # Fail the step loudly so the pipeline `default` workflow state
        # reflects reality — the service-upgrade agent and CI alert cascade
        # both rely on this (see bd code-e1x). Lock-skipped stacks are NOT
        # counted as failures.
        # Trim only leading/trailing blanks: deleting every space would fuse
        # the failed stack names into a single unreadable word.
        FAILED_PLATFORM=$(cat .platform_failed 2>/dev/null | sed 's/^ *//; s/ *$//')
        if [ -n "$FAILED_PLATFORM" ] || [ -n "$FAILED_APP_STACKS" ]; then
          echo "=== FAILED STACKS: platform=[$FAILED_PLATFORM ] apps=[$FAILED_APP_STACKS ] ==="
          exit 1
        fi

      # ── Commit and push state changes ──
      - |
        mkdir -p ~/.ssh && ssh-keyscan -H github.com >> ~/.ssh/known_hosts 2>/dev/null
        chmod 400 secrets/deploy_key
        # Every git command below that talks to the remote uses the deploy key.
        export GIT_SSH_COMMAND='ssh -i ./secrets/deploy_key -o IdentitiesOnly=yes'
        git add stacks/ state/ .woodpecker/ 2>/dev/null || true
        git remote set-url origin git@github.com:ViktorBarzin/infra.git
        if git diff --cached --quiet; then
          echo "No changes to commit"
          exit 0
        fi
        git commit -m "Woodpecker CI deploy [CI SKIP]"
        git fetch origin master
        if ! git rebase origin/master; then
          echo "ERROR: Git rebase failed — state commits could not be pushed"
          echo "Manual intervention required: pull, resolve conflicts, push"
          git rebase --abort || true
          exit 1
        fi
        git push origin master

      # ── Slack notification ──
      - |
        PLATFORM_COUNT=$(wc -l < .platform_apply 2>/dev/null | tr -d ' ')
        APP_COUNT=$(wc -l < .app_apply 2>/dev/null | tr -d ' ')
        curl -s -X POST -H 'Content-type: application/json' \
          --data "{\"channel\":\"general\",\"text\":\"Woodpecker CI: infra pipeline ${CI_PIPELINE_STATUS} (platform:${PLATFORM_COUNT}, apps:${APP_COUNT})\"}" \
          "$SLACK_WEBHOOK" || true

  # Slack on failure (runs even if apply step fails)
  - name: notify-failure
    image: curlimages/curl
    commands:
      - |
        curl -s -X POST -H 'Content-type: application/json' \
          --data "{\"channel\":\"general\",\"text\":\":red_circle: Woodpecker CI: infra pipeline FAILED\"}" \
          "$SLACK_WEBHOOK" || true
    environment:
      SLACK_WEBHOOK:
        from_secret: slack_webhook
    when:
      status: [failure]