Merge remote-tracking branch 'origin/master' into wizard/authentik-perf-fix
This commit is contained in:
commit
5fb2004de5
2 changed files with 93 additions and 30 deletions
|
|
@ -65,6 +65,21 @@ steps:
|
|||
# don't need explicit token propagation.
|
||||
VAULT_ADDR: http://vault-active.vault.svc.cluster.local:8200
|
||||
commands:
|
||||
# ── Forge guard: apply ONLY on the canonical Forgejo forge ──
|
||||
# infra is registered in Woodpecker on BOTH the Forgejo canonical repo and
|
||||
# the legacy GitHub mirror, and BOTH fire this push pipeline. Without this
|
||||
# guard both run `terragrunt apply` on every push and race each other for
|
||||
# the per-stack PG state lock — the dominant cause of the "Error acquiring
|
||||
# the state lock" failures + push-supersede "killed" runs. The GitHub-mirror
|
||||
# registration keeps running the CRONS (drift-detection, renew-tls, …) — only
|
||||
# its duplicate push-apply no-ops here. Fail-open: an unknown forge (neither
|
||||
# env var set) still applies, preserving prior behaviour.
|
||||
- |
|
||||
if echo "${CI_REPO_URL:-}${CI_FORGE_URL:-}" | grep -qi 'github\.com'; then
|
||||
echo "[forge-guard] GitHub-mirror push — apply runs only on the Forgejo canonical repo (avoids double-apply + state-lock races). Skipping."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# ── Skip CI commits ──
|
||||
- |
|
||||
if echo "$CI_COMMIT_MESSAGE" | grep -q '\[CI SKIP\]\|\[ci skip\]'; then
|
||||
|
|
@ -220,22 +235,33 @@ steps:
|
|||
# (2026-06-27 — see docs/architecture/ci-cd.md)
|
||||
if [ "$stack" = "vault" ]; then echo "[vault] SKIPPED (Tier-0, human-applied via OIDC)"; continue; fi
|
||||
echo "[$stack] Starting apply..."
|
||||
set +e
|
||||
OUTPUT=$(cd "stacks/$stack" && ../../scripts/tg apply --non-interactive 2>&1)
|
||||
EXIT=$?
|
||||
set -e
|
||||
if [ $EXIT -ne 0 ]; then
|
||||
if echo "$OUTPUT" | grep -q "is locked by"; then
|
||||
echo "[$stack] SKIPPED (locked by another session)"
|
||||
else
|
||||
echo "$OUTPUT" | tail -50
|
||||
echo "[$stack] FAILED (exit $EXIT)"
|
||||
FAILED_PLATFORM_STACKS="$FAILED_PLATFORM_STACKS $stack"
|
||||
ATTEMPT=0
|
||||
while :; do
|
||||
ATTEMPT=$((ATTEMPT + 1))
|
||||
set +e
|
||||
OUTPUT=$(cd "stacks/$stack" && ../../scripts/tg apply --non-interactive 2>&1)
|
||||
EXIT=$?
|
||||
set -e
|
||||
if [ $EXIT -eq 0 ]; then
|
||||
echo "$OUTPUT" | tail -3; echo "[$stack] OK"; break
|
||||
fi
|
||||
else
|
||||
echo "$OUTPUT" | tail -3
|
||||
echo "[$stack] OK"
|
||||
fi
|
||||
# Lock contention → SKIP, not fail. Match BOTH the Tier-0 Vault lock
|
||||
# ("is locked by", from scripts/tg) AND the Tier-1 PG-backend lock
|
||||
# ("Error acquiring the state lock" / "already locked"). The PG case
|
||||
# was previously counted as a failure — the #1 source of false reds.
|
||||
if echo "$OUTPUT" | grep -qE 'is locked by|Error acquiring the state lock|already locked'; then
|
||||
echo "[$stack] SKIPPED (locked by another session/run)"; break
|
||||
fi
|
||||
# Transient: provider-registry download timeout / Vault 5xx → bounded
|
||||
# retry. Deliberately NOT helm atomic-timeouts or config errors
|
||||
# (missing arg, invalid index) — those must fail fast, retry can't fix
|
||||
# them and can worsen a stuck helm release.
|
||||
if [ $ATTEMPT -lt 3 ] && echo "$OUTPUT" | grep -qE 'Failed to install provider|Client\.Timeout exceeded while awaiting headers|error reading from Vault.*Code: 5[0-9][0-9]'; then
|
||||
echo "[$stack] transient error (attempt $ATTEMPT/3) — retrying in 15s..."; sleep 15; continue
|
||||
fi
|
||||
echo "$OUTPUT" | tail -50; echo "[$stack] FAILED (exit $EXIT)"
|
||||
FAILED_PLATFORM_STACKS="$FAILED_PLATFORM_STACKS $stack"; break
|
||||
done
|
||||
done < .platform_apply
|
||||
fi
|
||||
# Deferred until after app stacks so both lists get a chance to run.
|
||||
|
|
@ -248,22 +274,27 @@ steps:
|
|||
echo "=== Applying app stacks (serial, locked) ==="
|
||||
while read -r stack; do
|
||||
echo "[$stack] Starting apply..."
|
||||
set +e
|
||||
OUTPUT=$(cd "stacks/$stack" && ../../scripts/tg apply --non-interactive 2>&1)
|
||||
EXIT=$?
|
||||
set -e
|
||||
if [ $EXIT -ne 0 ]; then
|
||||
if echo "$OUTPUT" | grep -q "is locked by"; then
|
||||
echo "[$stack] SKIPPED (locked by another session)"
|
||||
else
|
||||
echo "$OUTPUT" | tail -50
|
||||
echo "[$stack] FAILED (exit $EXIT)"
|
||||
FAILED_APP_STACKS="$FAILED_APP_STACKS $stack"
|
||||
ATTEMPT=0
|
||||
while :; do
|
||||
ATTEMPT=$((ATTEMPT + 1))
|
||||
set +e
|
||||
OUTPUT=$(cd "stacks/$stack" && ../../scripts/tg apply --non-interactive 2>&1)
|
||||
EXIT=$?
|
||||
set -e
|
||||
if [ $EXIT -eq 0 ]; then
|
||||
echo "$OUTPUT" | tail -3; echo "[$stack] OK"; break
|
||||
fi
|
||||
else
|
||||
echo "$OUTPUT" | tail -3
|
||||
echo "[$stack] OK"
|
||||
fi
|
||||
# Lock contention → SKIP, not fail (Tier-0 Vault + Tier-1 PG; see platform loop).
|
||||
if echo "$OUTPUT" | grep -qE 'is locked by|Error acquiring the state lock|already locked'; then
|
||||
echo "[$stack] SKIPPED (locked by another session/run)"; break
|
||||
fi
|
||||
# Transient provider-download / Vault 5xx → bounded retry (see platform loop).
|
||||
if [ $ATTEMPT -lt 3 ] && echo "$OUTPUT" | grep -qE 'Failed to install provider|Client\.Timeout exceeded while awaiting headers|error reading from Vault.*Code: 5[0-9][0-9]'; then
|
||||
echo "[$stack] transient error (attempt $ATTEMPT/3) — retrying in 15s..."; sleep 15; continue
|
||||
fi
|
||||
echo "$OUTPUT" | tail -50; echo "[$stack] FAILED (exit $EXIT)"
|
||||
FAILED_APP_STACKS="$FAILED_APP_STACKS $stack"; break
|
||||
done
|
||||
done < .app_apply
|
||||
fi
|
||||
# Fail the step loudly so the pipeline `default` workflow state
|
||||
|
|
|
|||
|
|
@ -234,6 +234,38 @@ Woodpecker is **deploy + cluster-touching steps only**:
|
|||
|
||||
**No build/test pipeline exists on any repo.** Do not (re)introduce one.
|
||||
|
||||
### `default.yml` apply: dual-registration de-dup + reliability (2026-06-28)
|
||||
|
||||
infra is registered in Woodpecker on **both** the canonical Forgejo repo (id 82)
|
||||
and the legacy GitHub mirror (id 1), and **both fire `default.yml` on every
|
||||
push**. Left unguarded, two `terragrunt apply` runs race each other for the
|
||||
per-stack PG state lock — historically the #1 source of `Error acquiring the
|
||||
state lock` failures and push-supersede "killed" runs.
|
||||
|
||||
- **Forge guard** (first command in the `apply` step): the push-apply runs **only
|
||||
on the canonical Forgejo forge**; on the GitHub mirror it logs `[forge-guard]`
|
||||
and runs `exit 0`. Detection: `CI_REPO_URL`/`CI_FORGE_URL` contains `github.com` →
|
||||
skip. Fail-open (unknown forge still applies). The mirror keeps running the
|
||||
**crons** (drift-detection, renew-tls, …), which live on repo 1 — only its
|
||||
duplicate push-apply no-ops. (Crons were NOT moved; deactivating repo 1 would
|
||||
have killed them.)
|
||||
- **Lock-skip matches both tiers**: a stack whose apply hits a lock is SKIPPED,
|
||||
not failed. The grep now matches the Tier-0 Vault message (`is locked by`) **and**
|
||||
the Tier-1 PG-backend message (`Error acquiring the state lock` / `already
|
||||
locked`) — the PG case was previously miscounted as a hard failure.
|
||||
- **Transient retry** (bounded, 3 attempts): only provider-registry download
|
||||
timeouts (`Failed to install provider` / `Client.Timeout`) and Vault 5xx are
|
||||
retried. Config errors (missing arg, invalid index) and helm `atomic` timeouts
|
||||
are NOT retried — they fail fast.
|
||||
|
||||
A pre-apply off-infra validate gate was evaluated and rejected: `terraform
|
||||
validate` runs without state but catches almost none of the observed failures (they are
|
||||
provider-config-from-Vault-data, server-side-apply conflicts, helm install failures, and
|
||||
lock contention — all invisible to static validate), and `plan` cannot run
|
||||
off-infra (no Vault/PG access). `terragrunt apply` already fails at its plan
|
||||
phase without mutating on config errors, so a separate in-pipeline plan-gate was
|
||||
also dropped as redundant.
|
||||
|
||||
### Woodpecker API
|
||||
|
||||
Uses **numeric repo IDs** (`/api/repos/<id>/pipelines`), NOT owner/name paths
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue