Merge remote-tracking branch 'origin/master' into wizard/authentik-perf-fix

2026-06-28 11:38:07 +00:00 · 2026-06-28 11:38:07 +00:00 · 5fb2004de5
commit 5fb2004de5
parent f10bb71562 ec681ba6e1
2 changed files with 93 additions and 30 deletions
--- a/.woodpecker/default.yml
+++ b/.woodpecker/default.yml
@@ -65,6 +65,21 @@ steps:
      # don't need explicit token propagation.
      VAULT_ADDR: http://vault-active.vault.svc.cluster.local:8200
    commands:
+      # ── Forge guard: apply ONLY on the canonical Forgejo forge ──
+      # infra is registered in Woodpecker on BOTH the Forgejo canonical repo and
+      # the legacy GitHub mirror, and BOTH fire this push pipeline. Without this
+      # guard both run `terragrunt apply` on every push and race each other for
+      # the per-stack PG state lock — the dominant cause of the "Error acquiring
+      # the state lock" failures + push-supersede "killed" runs. The GitHub-mirror
+      # registration keeps running the CRONS (drift-detection, renew-tls, …) — only
+      # its duplicate push-apply no-ops here. Fail-open: any run whose URLs do not
+      # contain github.com (incl. neither env var set) still applies, as before.
+      - |
+        if echo "${CI_REPO_URL:-}${CI_FORGE_URL:-}" | grep -qi 'github\.com'; then
+          echo "[forge-guard] GitHub-mirror push — apply runs only on the Forgejo canonical repo (avoids double-apply + state-lock races). Skipping."
+          exit 0
+        fi
+
      # ── Skip CI commits ──
      - |
        if echo "$CI_COMMIT_MESSAGE" | grep -q '\[CI SKIP\]\|\[ci skip\]'; then
@@ -220,22 +235,33 @@ steps:
            # (2026-06-27 — see docs/architecture/ci-cd.md)
            if [ "$stack" = "vault" ]; then echo "[vault] SKIPPED (Tier-0, human-applied via OIDC)"; continue; fi
            echo "[$stack] Starting apply..."
-            set +e
-            OUTPUT=$(cd "stacks/$stack" && ../../scripts/tg apply --non-interactive 2>&1)
-            EXIT=$?
-            set -e
-            if [ $EXIT -ne 0 ]; then
-              if echo "$OUTPUT" | grep -q "is locked by"; then
-                echo "[$stack] SKIPPED (locked by another session)"
-              else
-                echo "$OUTPUT" | tail -50
-                echo "[$stack] FAILED (exit $EXIT)"
-                FAILED_PLATFORM_STACKS="$FAILED_PLATFORM_STACKS $stack"
+            ATTEMPT=0
+            while :; do
+              ATTEMPT=$((ATTEMPT + 1))
+              set +e
+              OUTPUT=$(cd "stacks/$stack" && ../../scripts/tg apply --non-interactive 2>&1)
+              EXIT=$?
+              set -e
+              if [ $EXIT -eq 0 ]; then
+                echo "$OUTPUT" | tail -3; echo "[$stack] OK"; break
              fi
-            else
-              echo "$OUTPUT" | tail -3
-              echo "[$stack] OK"
-            fi
+              # Lock contention → SKIP, not fail. Match BOTH the Tier-0 Vault lock
+              # ("is locked by", from scripts/tg) AND the Tier-1 PG-backend lock
+              # ("Error acquiring the state lock" / "already locked"). The PG case
+              # was previously counted as a failure — the #1 source of false reds.
+              if echo "$OUTPUT" | grep -qE 'is locked by|Error acquiring the state lock|already locked'; then
+                echo "[$stack] SKIPPED (locked by another session/run)"; break
+              fi
+              # Transient: provider-registry download timeout / Vault 5xx → bounded
+              # retry (max 3 attempts, 15s apart). Deliberately NOT helm atomic-timeouts
+              # or config errors (missing arg, invalid index) — those must fail fast:
+              # a retry can't fix them and can worsen a stuck helm release.
+              if [ $ATTEMPT -lt 3 ] && echo "$OUTPUT" | grep -qE 'Failed to install provider|Client\.Timeout exceeded while awaiting headers|error reading from Vault.*Code: 5[0-9][0-9]'; then
+                echo "[$stack] transient error (attempt $ATTEMPT/3) — retrying in 15s..."; sleep 15; continue
+              fi
+              echo "$OUTPUT" | tail -50; echo "[$stack] FAILED (exit $EXIT)"
+              FAILED_PLATFORM_STACKS="$FAILED_PLATFORM_STACKS $stack"; break
+            done
          done < .platform_apply
        fi
        # Deferred until after app stacks so both lists get a chance to run.
@@ -248,22 +274,27 @@ steps:
          echo "=== Applying app stacks (serial, locked) ==="
          while read -r stack; do
            echo "[$stack] Starting apply..."
-            set +e
-            OUTPUT=$(cd "stacks/$stack" && ../../scripts/tg apply --non-interactive 2>&1)
-            EXIT=$?
-            set -e
-            if [ $EXIT -ne 0 ]; then
-              if echo "$OUTPUT" | grep -q "is locked by"; then
-                echo "[$stack] SKIPPED (locked by another session)"
-              else
-                echo "$OUTPUT" | tail -50
-                echo "[$stack] FAILED (exit $EXIT)"
-                FAILED_APP_STACKS="$FAILED_APP_STACKS $stack"
+            ATTEMPT=0
+            while :; do
+              ATTEMPT=$((ATTEMPT + 1))
+              set +e
+              OUTPUT=$(cd "stacks/$stack" && ../../scripts/tg apply --non-interactive 2>&1)
+              EXIT=$?
+              set -e
+              if [ $EXIT -eq 0 ]; then
+                echo "$OUTPUT" | tail -3; echo "[$stack] OK"; break
              fi
-            else
-              echo "$OUTPUT" | tail -3
-              echo "[$stack] OK"
-            fi
+              # Lock contention → SKIP, not fail (Tier-0 Vault + Tier-1 PG; see platform loop).
+              if echo "$OUTPUT" | grep -qE 'is locked by|Error acquiring the state lock|already locked'; then
+                echo "[$stack] SKIPPED (locked by another session/run)"; break
+              fi
+              # Transient provider-download / Vault 5xx → bounded retry (see platform loop).
+              if [ $ATTEMPT -lt 3 ] && echo "$OUTPUT" | grep -qE 'Failed to install provider|Client\.Timeout exceeded while awaiting headers|error reading from Vault.*Code: 5[0-9][0-9]'; then
+                echo "[$stack] transient error (attempt $ATTEMPT/3) — retrying in 15s..."; sleep 15; continue
+              fi
+              echo "$OUTPUT" | tail -50; echo "[$stack] FAILED (exit $EXIT)"
+              FAILED_APP_STACKS="$FAILED_APP_STACKS $stack"; break
+            done
          done < .app_apply
        fi
        # Fail the step loudly so the pipeline `default` workflow state
--- a/docs/architecture/ci-cd.md
+++ b/docs/architecture/ci-cd.md
@@ -234,6 +234,38 @@ Woodpecker is **deploy + cluster-touching steps only**:

 **No build/test pipeline exists on any repo.** Do not (re)introduce one.

+### `default.yml` apply: dual-registration de-dup + reliability (2026-06-28)
+
+The `infra` repo is registered in Woodpecker on **both** the canonical Forgejo
+repo (id 82) and the legacy GitHub mirror (id 1), and **both fire `default.yml`
+on every push**. Left unguarded, two `terragrunt apply` runs race each other for
+the per-stack PG state lock — historically the #1 source of
+`Error acquiring the state lock` failures and push-supersede "killed" runs.
+
+- **Forge guard** (first command in the `apply` step): the push-apply runs **only
+  on the canonical Forgejo forge**; on the GitHub mirror it logs `[forge-guard]`
+  and exits 0. Detection: `CI_REPO_URL` + `CI_FORGE_URL` contains `github.com`
+  (case-insensitive) → skip. Fail-open: any other forge, or neither variable
+  set, still applies. The mirror keeps running the **crons** (drift-detection,
+  renew-tls, …), which live on repo 1 — only its duplicate push-apply no-ops.
+  (Crons were NOT moved; deactivating repo 1 would have killed them.)
+- **Lock-skip matches both tiers**: a stack whose apply hits a lock is SKIPPED,
+  not failed. The grep now matches the Tier-0 Vault message (`is locked by`) **and**
+  the Tier-1 PG-backend message (`Error acquiring the state lock` / `already
+  locked`) — the PG case was previously miscounted as a hard failure.
+- **Transient retry** (bounded: at most 3 attempts, 15 s apart): only
+  provider-registry download timeouts (`Failed to install provider` /
+  `Client.Timeout`) and Vault 5xx read errors are retried. Config errors (missing
+  arg, invalid index) and helm `atomic` timeouts are NOT retried — they fail fast.
+
+A pre-apply, off-infra validate gate was evaluated and rejected.
+`terraform validate` runs without state but catches ~0 of the observed failures:
+provider-config-from-Vault-data errors, server-side-apply conflicts, helm install
+failures, and lock contention are all invisible to static validation. `plan`
+cannot run off-infra at all (no Vault/PG access). A separate in-pipeline plan
+gate was also dropped as redundant: `terragrunt apply` already fails at its plan
+phase, without mutating anything, on config errors.
+
 ### Woodpecker API

 Uses **numeric repo IDs** (`/api/repos/<id>/pipelines`), NOT owner/name paths