diff --git a/.woodpecker/app-stacks.yml b/.woodpecker/app-stacks.yml deleted file mode 100644 index f4e63df9..00000000 --- a/.woodpecker/app-stacks.yml +++ /dev/null @@ -1,122 +0,0 @@ -when: - event: push - branch: master - # Only trigger when application stack files change - path: - include: - - 'stacks/**' - exclude: - - '.woodpecker/**' - -clone: - git: - image: woodpeckerci/plugin-git - settings: - attempts: 5 - backoff: 10s - -steps: - - name: detect-changes - image: alpine - commands: - - apk add --no-cache git - # Detect which stacks changed in the latest commit - - | - CHANGED=$(git diff --name-only HEAD~1 HEAD 2>/dev/null | grep '^stacks/' | cut -d/ -f2 | sort -u || true) - if [ -z "$CHANGED" ]; then - echo "No stack changes detected" - echo "" > .stacks_to_apply - exit 0 - fi - # Exclude platform stacks (handled by default.yml) - PLATFORM="dbaas authentik crowdsec monitoring nvidia mailserver cloudflared kyverno metallb redis traefik technitium headscale rbac k8s-portal vaultwarden reverse-proxy metrics-server vpa nfs-csi iscsi-csi cnpg sealed-secrets uptime-kuma wireguard xray infra-maintenance platform vault reloader descheduler external-secrets" - APPLY="" - for stack in $CHANGED; do - if echo "$PLATFORM" | grep -qw "$stack"; then - echo "Skipping platform stack: $stack" - continue - fi - if [ ! 
-f "stacks/$stack/terragrunt.hcl" ]; then - echo "Skipping $stack (no terragrunt.hcl)" - continue - fi - APPLY="$APPLY $stack" - done - echo "$APPLY" > .stacks_to_apply - echo "Stacks to apply:$APPLY" - - - name: prepare - image: alpine - commands: - - "apk update && apk add jq curl git git-crypt" - - | - curl -k https://10.0.20.100:6443/api/v1/namespaces/woodpecker/configmaps/git-crypt-key -H "Authorization:Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" | jq -r .data.key | base64 -d > /tmp/key - - "git-crypt unlock /tmp/key && rm /tmp/key" - - | - SA_TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token) - VAULT_TOKEN=$(curl -s -X POST http://vault-active.vault.svc.cluster.local:8200/v1/auth/kubernetes/login \ - -d "{\"role\":\"ci\",\"jwt\":\"$SA_TOKEN\"}" | jq -r .auth.client_token) - echo "export VAULT_TOKEN=$VAULT_TOKEN" > .vault-env - echo "export VAULT_ADDR=http://vault-active.vault.svc.cluster.local:8200" >> .vault-env - when: - evaluate: 'CI_COMMIT_MESSAGE != "" && !contains(CI_COMMIT_MESSAGE, "[CI SKIP]")' - - - name: terragrunt-apply - image: alpine - backend_options: - kubernetes: - resources: - requests: - memory: 2Gi - limits: - memory: 4Gi - commands: - - "apk update && apk add curl unzip git openssh-client" - - "wget -qO /tmp/terraform.zip https://releases.hashicorp.com/terraform/1.5.7/terraform_1.5.7_linux_amd64.zip" - - "unzip -o /tmp/terraform.zip -d /usr/local/bin/ && chmod 755 /usr/local/bin/terraform" - - "wget -qO /usr/local/bin/terragrunt https://github.com/gruntwork-io/terragrunt/releases/download/v0.99.4/terragrunt_linux_amd64" - - "chmod 755 /usr/local/bin/terragrunt" - - "source .vault-env" - - | - STACKS=$(cat .stacks_to_apply) - if [ -z "$STACKS" ]; then - echo "No app stacks to apply" - exit 0 - fi - FAILED="" - for stack in $STACKS; do - echo "=== Applying: $stack ===" - (cd stacks/$stack && terragrunt apply --non-interactive -auto-approve) & - done - wait - when: - evaluate: 'CI_COMMIT_MESSAGE != "" && 
!contains(CI_COMMIT_MESSAGE, "[CI SKIP]")'
-
-  - name: cleanup-and-push
-    image: alpine
-    commands:
-      - "rm -f .vault-env"
-      - "apk update && apk add openssh-client git git-crypt"
-      - "mkdir -p ~/.ssh && ssh-keyscan -H github.com >> ~/.ssh/known_hosts"
-      - "chmod 400 secrets/deploy_key"
-      - "git add stacks/ state/ .woodpecker/ || true"
-      - "git remote set-url origin git@github.com:ViktorBarzin/infra.git"
-      - "git commit -m 'Woodpecker CI app-stacks deploy commit [CI SKIP]' || echo 'No changes'"
-      - "GIT_SSH_COMMAND='ssh -i ./secrets/deploy_key -o IdentitiesOnly=yes' git pull --rebase origin master || true"
-      - "GIT_SSH_COMMAND='ssh -i ./secrets/deploy_key -o IdentitiesOnly=yes' git push origin master"
-    when:
-      status: [success, failure]
-
-  - name: slack
-    image: curlimages/curl
-    commands:
-      - |
-        STACKS=$(cat .stacks_to_apply 2>/dev/null || echo "none")
-        curl -s -X POST -H 'Content-type: application/json' \
-          --data "{\"channel\":\"general\",\"text\":\"Woodpecker CI: app-stacks pipeline ${CI_PIPELINE_STATUS} (stacks:${STACKS})\"}" \
-          "$SLACK_WEBHOOK" || true
-    environment:
-      SLACK_WEBHOOK:
-        from_secret: slack_webhook
-    when:
-      status: [success, failure]
diff --git a/.woodpecker/build-ci-image.yml b/.woodpecker/build-ci-image.yml
new file mode 100644
index 00000000..95295a27
--- /dev/null
+++ b/.woodpecker/build-ci-image.yml
@@ -0,0 +1,41 @@
+# Build the CI tools Docker image used by all infra pipelines.
+# Triggers on changes to ci/Dockerfile or manual dispatch.
+
+when:
+  event: [push, manual]  # "manual" allows an on-demand rebuild without touching the Dockerfile
+  branch: master
+  path:
+    include:
+      - 'ci/Dockerfile'  # NOTE(review): the build context is all of ci/, but only this file triggers a rebuild
+
+steps:
+  - name: build-and-push
+    image: woodpeckerci/plugin-docker-buildx
+    settings:
+      repo: registry.viktorbarzin.me:5050/infra-ci
+      dockerfile: ci/Dockerfile
+      context: ci/
+      tags:
+        - latest  # moving tag; default.yml and drift-detection.yml run infra-ci:latest with pull: true
+        - "${CI_COMMIT_SHA:0:8}"  # first 8 characters of the commit SHA
+      platforms: linux/amd64
+      registry: registry.viktorbarzin.me:5050
+      logins:
+        - registry: registry.viktorbarzin.me:5050
+          username:
+            from_secret: registry_user
+          password:
+            from_secret: registry_password
+
+  - name: slack
+    image: curlimages/curl
+    commands:
+      - |
+        curl -s -X POST -H 'Content-type: application/json' \
+          --data "{\"text\":\"CI image built: registry.viktorbarzin.me:5050/infra-ci:${CI_COMMIT_SHA:0:8}\"}" \
+          "$SLACK_WEBHOOK" || true
+    environment:
+      SLACK_WEBHOOK:
+        from_secret: slack_webhook
+    when:
+      status: [success]  # announce only when the preceding build-and-push step succeeded
diff --git a/.woodpecker/default.yml b/.woodpecker/default.yml
index 5b96f74f..cd2272f2 100644
--- a/.woodpecker/default.yml
+++ b/.woodpecker/default.yml
@@ -1,3 +1,15 @@
+# Unified infra CI pipeline — detects changed stacks and applies only those.
+# Platform stacks and app stacks handled in one pipeline with proper ordering.
+#
+# Optimizations over the previous split pipeline:
+#   - Custom CI image (no apk/wget per step)
+#   - Shallow clone (depth=2 for git diff HEAD~1)
+#   - TF_PLUGIN_CACHE_DIR (shared provider cache)
+#   - Concurrency limit (xargs -P 4)
+#   - Step consolidation (2 steps instead of 4)
+#   - Changed-stacks-only detection (skips no-op applies)
+#   - Global-file fallback (modules/config changes trigger full apply)
+
 when:
   event: push
   branch: master
@@ -6,28 +18,14 @@ clone:
   git:
     image: woodpeckerci/plugin-git
     settings:
+      depth: 2
       attempts: 5
       backoff: 10s
 
 steps:
-  - name: prepare
-    image: alpine
-    commands:
-      - "apk update && apk add jq curl git git-crypt"
-      # git-crypt for secrets/ directory (TLS certs, deploy key)
-      - |
-        curl -k https://10.0.20.100:6443/api/v1/namespaces/woodpecker/configmaps/git-crypt-key -H "Authorization:Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" | jq -r .data.key | base64 -d > /tmp/key
-      - "git-crypt unlock /tmp/key && rm /tmp/key"
-      # Vault: authenticate via K8s service account JWT
-      - |
-        SA_TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)
-        VAULT_TOKEN=$(curl -s -X POST http://vault-active.vault.svc.cluster.local:8200/v1/auth/kubernetes/login \
-          -d "{\"role\":\"ci\",\"jwt\":\"$SA_TOKEN\"}" | jq -r .auth.client_token)
-        echo "export VAULT_TOKEN=$VAULT_TOKEN" > .vault-env
-        echo "export VAULT_ADDR=http://vault-active.vault.svc.cluster.local:8200" >> .vault-env
-
-  - name: terragrunt-apply
-    image: alpine
+  - name: apply
+    image: registry.viktorbarzin.me:5050/infra-ci:latest
+    pull: true
     backend_options:
       kubernetes:
         resources:
@@ -35,51 +33,162 @@ steps:
             memory: 3Gi
           limits:
             memory: 6Gi
+    environment:
+      SLACK_WEBHOOK:
+        from_secret: slack_webhook
     commands:
-      - "apk update && apk add curl unzip git openssh-client"
-      # Install Terraform
-      - "wget -qO /tmp/terraform.zip https://releases.hashicorp.com/terraform/1.5.7/terraform_1.5.7_linux_amd64.zip"
-      - "unzip -o /tmp/terraform.zip -d /usr/local/bin/ && chmod 755 /usr/local/bin/terraform"
-      # Install Terragrunt
-      - "wget -qO /usr/local/bin/terragrunt https://github.com/gruntwork-io/terragrunt/releases/download/v0.99.4/terragrunt_linux_amd64"
-      - "chmod 755 /usr/local/bin/terragrunt"
-      # Source Vault token
-      - "source .vault-env"
-      # Apply all platform stacks in parallel
+      # ── Skip CI commits ──
       - |
-        for stack in dbaas authentik crowdsec monitoring nvidia mailserver cloudflared kyverno \
-          metallb redis traefik technitium headscale rbac k8s-portal vaultwarden \
-          reverse-proxy metrics-server vpa nfs-csi iscsi-csi cnpg sealed-secrets \
-          uptime-kuma wireguard xray infra-maintenance platform; do
-          (cd stacks/$stack && terragrunt apply --non-interactive -auto-approve) &
+        if echo "$CI_COMMIT_MESSAGE" | grep -q '\[CI SKIP\]\|\[ci skip\]'; then
+          echo "Commit has [CI SKIP], exiting"
+          exit 0
+        fi
+
+      # ── git-crypt unlock ──
+      - |
+        SA_TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)
+        curl -sk "https://10.0.20.100:6443/api/v1/namespaces/woodpecker/configmaps/git-crypt-key" \
+          -H "Authorization:Bearer $SA_TOKEN" | jq -r .data.key | base64 -d > /tmp/key
+        git-crypt unlock /tmp/key && rm /tmp/key
+
+      # ── Vault auth ──
+      - |
+        SA_TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)
+        export VAULT_ADDR=http://vault-active.vault.svc.cluster.local:8200
+        export VAULT_TOKEN=$(curl -s -X POST "$VAULT_ADDR/v1/auth/kubernetes/login" \
+          -d "{\"role\":\"ci\",\"jwt\":\"$SA_TOKEN\"}" | jq -r .auth.client_token)
+
+      # ── Detect changed stacks ──
+      - |
+        PLATFORM_STACKS="dbaas authentik crowdsec monitoring nvidia mailserver cloudflared kyverno metallb redis traefik technitium headscale rbac k8s-portal vaultwarden reverse-proxy metrics-server vpa nfs-csi iscsi-csi cnpg sealed-secrets uptime-kuma wireguard xray infra-maintenance platform vault reloader descheduler external-secrets"
+
+        # Check if global files changed (triggers full platform apply)
+        GLOBAL_CHANGED=$(git diff --name-only HEAD~1 HEAD 2>/dev/null | grep -E '^(modules/|config\.tfvars|terragrunt\.hcl)' || true)
+
+        if [ -n "$GLOBAL_CHANGED" ]; then
+          echo "Global files changed — applying ALL platform stacks"
+          echo "$PLATFORM_STACKS" | tr ' ' '\n' > .platform_apply
+        else
+          # Detect platform stacks that changed
+          git diff --name-only HEAD~1 HEAD 2>/dev/null | grep '^stacks/' | cut -d/ -f2 | sort -u > .all_changed
+          > .platform_apply
+          while read -r stack; do
+            # Exact, literal name match: grep -w would also accept "proxy" for "reverse-proxy"
+            if echo "$PLATFORM_STACKS" | tr ' ' '\n' | grep -qxF -- "$stack"; then
+              echo "$stack" >> .platform_apply
+            fi
+          done < .all_changed
+        fi
+
+        # Detect app stacks that changed
+        > .app_apply
+        git diff --name-only HEAD~1 HEAD 2>/dev/null | grep '^stacks/' | cut -d/ -f2 | sort -u | while read -r stack; do
+          if echo "$PLATFORM_STACKS" | tr ' ' '\n' | grep -qxF -- "$stack"; then
+            continue  # Skip platform stacks
+          fi
+          if [ ! -f "stacks/$stack/terragrunt.hcl" ]; then
+            continue  # Skip non-terragrunt dirs
+          fi
+          echo "$stack" >> .app_apply
         done
-        wait
 
-  - name: cleanup-and-push
-    image: alpine
-    commands:
-      - "rm -f .vault-env"
-      - "apk update && apk add openssh-client git git-crypt"
-      - "mkdir -p ~/.ssh && ssh-keyscan -H github.com >> ~/.ssh/known_hosts"
-      - "chmod 400 secrets/deploy_key"
-      # Only add specific paths — never git add .
-      - "git add stacks/ state/ .woodpecker/ || true"
-      - "git remote set-url origin git@github.com:ViktorBarzin/infra.git"
-      - "git commit -m 'Woodpecker CI deploy commit [CI SKIP]' || echo 'No changes'"
-      - "GIT_SSH_COMMAND='ssh -i ./secrets/deploy_key -o IdentitiesOnly=yes' git pull --rebase origin master || true"
-      - "GIT_SSH_COMMAND='ssh -i ./secrets/deploy_key -o IdentitiesOnly=yes' git push origin master"
-    when:
-      status: [success, failure]
+        PLATFORM_COUNT=$(wc -l < .platform_apply | tr -d ' ')
+        APP_COUNT=$(wc -l < .app_apply | tr -d ' ')
+        echo "Platform stacks to apply: $PLATFORM_COUNT"
+        echo "App stacks to apply: $APP_COUNT"
+        cat .platform_apply .app_apply
 
-  - name: slack
+      # ── Pre-warm provider cache ──
+      - |
+        if [ -s .platform_apply ] || [ -s .app_apply ]; then
+          # cat, not head: given two files, head prints "==> name <==" banners
+          FIRST_STACK=$(cat .platform_apply .app_apply 2>/dev/null | head -1)
+          if [ -n "$FIRST_STACK" ]; then
+            echo "Pre-warming provider cache from stacks/$FIRST_STACK..."
+            (cd "stacks/$FIRST_STACK" && terragrunt init --non-interactive -input=false 2>&1 | tail -3) || true
+          fi
+        fi
+
+      # ── Apply platform stacks (with concurrency limit) ──
+      - |
+        if [ -s .platform_apply ]; then
+          echo "=== Applying platform stacks (max 4 parallel) ==="
+          # Log to a file, then tail it: piping terragrunt straight into tail would
+          # report the exit status of tail, so a failed apply would print OK.
+          cat .platform_apply | xargs -P 4 -I{} sh -c '
+            echo "[{}] Starting apply..."
+            (cd stacks/{} && terragrunt apply --non-interactive -auto-approve) > "/tmp/apply-{}.log" 2>&1
+            EXIT=$?
+            tail -5 "/tmp/apply-{}.log"
+            if [ $EXIT -ne 0 ]; then
+              echo "[{}] FAILED (exit $EXIT)"
+              echo "{}" >> .apply_failed
+            else
+              echo "[{}] OK"
+            fi
+          '
+        fi
+
+      # ── Apply app stacks (with concurrency limit) ──
+      - |
+        if [ -s .app_apply ]; then
+          echo "=== Applying app stacks (max 4 parallel) ==="
+          # Log to a file, then tail it: piping terragrunt straight into tail would
+          # report the exit status of tail, so a failed apply would print OK.
+          cat .app_apply | xargs -P 4 -I{} sh -c '
+            echo "[{}] Starting apply..."
+            (cd stacks/{} && terragrunt apply --non-interactive -auto-approve) > "/tmp/apply-{}.log" 2>&1
+            EXIT=$?
+            tail -5 "/tmp/apply-{}.log"
+            if [ $EXIT -ne 0 ]; then
+              echo "[{}] FAILED (exit $EXIT)"
+              echo "{}" >> .apply_failed
+            else
+              echo "[{}] OK"
+            fi
+          '
+        fi
+
+      # ── Commit and push state changes ──
+      - |
+        mkdir -p ~/.ssh && ssh-keyscan -H github.com >> ~/.ssh/known_hosts 2>/dev/null
+        chmod 400 secrets/deploy_key
+        git add stacks/ state/ .woodpecker/ 2>/dev/null || true
+        git remote set-url origin git@github.com:ViktorBarzin/infra.git
+        # No early exit here: an exit would end the whole step and skip the commands below
+        if git diff --cached --quiet; then
+          echo "No changes to commit"
+        else
+          git commit -m "Woodpecker CI deploy [CI SKIP]"
+          GIT_SSH_COMMAND='ssh -i ./secrets/deploy_key -o IdentitiesOnly=yes' git fetch origin master
+          GIT_SSH_COMMAND='ssh -i ./secrets/deploy_key -o IdentitiesOnly=yes' git rebase origin/master || true
+          GIT_SSH_COMMAND='ssh -i ./secrets/deploy_key -o IdentitiesOnly=yes' git push origin master
+        fi
+
+      # ── Final status + Slack notification ──
+      - |
+        # Fail only now, after state was pushed; the notify-failure step sends the alert
+        if [ -s .apply_failed ]; then
+          echo "Failed stacks: $(tr '\n' ' ' < .apply_failed)"
+          exit 1
+        fi
+        PLATFORM_COUNT=$(wc -l < .platform_apply 2>/dev/null | tr -d ' ')
+        APP_COUNT=$(wc -l < .app_apply 2>/dev/null | tr -d ' ')
+        # Shell-local counters are written without braces: Woodpecker pre-processes brace expressions
+        curl -s -X POST -H 'Content-type: application/json' \
+          --data "{\"channel\":\"general\",\"text\":\"Woodpecker CI: infra pipeline ${CI_PIPELINE_STATUS} (platform:$PLATFORM_COUNT, apps:$APP_COUNT)\"}" \
+          "$SLACK_WEBHOOK" || true
+
+  # Slack on failure (runs even if apply step fails)
+  - name: notify-failure
     image: curlimages/curl
     commands:
       - |
         curl -s -X POST -H 'Content-type: application/json' \
-          --data "{\"channel\":\"general\",\"text\":\"Woodpecker CI: infra pipeline ${CI_PIPELINE_STATUS}\"}" \
+          --data "{\"channel\":\"general\",\"text\":\":red_circle: Woodpecker CI: infra pipeline FAILED\"}" \
           "$SLACK_WEBHOOK" || true
     environment:
       SLACK_WEBHOOK:
         from_secret: slack_webhook
     when:
-      status: [success, failure]
+      status: [failure]
diff --git a/.woodpecker/drift-detection.yml b/.woodpecker/drift-detection.yml
new file mode 100644
index 00000000..f1d491a4
--- /dev/null
+++ b/.woodpecker/drift-detection.yml
@@ -0,0 +1,80 @@
+# Daily drift detection — runs terraform plan on all stacks and alerts on drift.
+# Triggered by Woodpecker cron schedule "drift-detection" (must be registered in Woodpecker UI/API).
+
+when:
+  event: cron
+  cron: drift-detection
+
+clone:
+  git:
+    image: woodpeckerci/plugin-git
+    settings:
+      depth: 1
+      attempts: 3
+
+steps:
+  - name: detect-drift
+    image: registry.viktorbarzin.me:5050/infra-ci:latest
+    pull: true
+    backend_options:
+      kubernetes:
+        resources:
+          requests:
+            memory: 2Gi
+          limits:
+            memory: 4Gi
+    environment:
+      SLACK_WEBHOOK:
+        from_secret: slack_webhook
+    commands:
+      # ── git-crypt unlock ──
+      - |
+        SA_TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)
+        curl -sk "https://10.0.20.100:6443/api/v1/namespaces/woodpecker/configmaps/git-crypt-key" \
+          -H "Authorization:Bearer $SA_TOKEN" | jq -r .data.key | base64 -d > /tmp/key
+        git-crypt unlock /tmp/key && rm /tmp/key
+
+      # ── Vault auth ──
+      - |
+        SA_TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)
+        export VAULT_ADDR=http://vault-active.vault.svc.cluster.local:8200
+        export VAULT_TOKEN=$(curl -s -X POST "$VAULT_ADDR/v1/auth/kubernetes/login" \
+          -d "{\"role\":\"ci\",\"jwt\":\"$SA_TOKEN\"}" | jq -r .auth.client_token)
+
+      # ── Run terraform plan on all stacks ──
+      - |
+        DRIFTED=""
+        CLEAN=0
+        ERRORS=""
+
+        for stack_dir in stacks/*/; do
+          stack=$(basename "$stack_dir")
+          [ -f "$stack_dir/terragrunt.hcl" ] || continue
+
+          echo -n "[$stack] planning... "
+          EXIT=0  # set via "||" below so a non-zero plan cannot abort the loop under errexit
+          OUTPUT=$(cd "$stack_dir" && terragrunt plan -detailed-exitcode -input=false 2>&1) || EXIT=$?
+
+          case $EXIT in
+            0) echo "OK (no changes)"; CLEAN=$((CLEAN + 1)) ;;
+            2) echo "DRIFT DETECTED"; DRIFTED="$DRIFTED $stack" ;;
+            *) echo "ERROR (exit $EXIT)"; echo "$OUTPUT" | tail -20; ERRORS="$ERRORS $stack" ;;
+          esac
+        done
+
+        echo ""
+        echo "=== Drift Detection Summary ==="
+        echo "Clean: $CLEAN stacks"
+        echo "Drift: $${DRIFTED:-none}"  # doubled dollar: keep the brace expression for the shell
+        echo "Errors: $${ERRORS:-none}"
+
+        # ── Slack alert if drift found ──
+        if [ -n "$DRIFTED" ]; then
+          curl -s -X POST -H 'Content-type: application/json' \
+            --data "{\"channel\":\"general\",\"text\":\":warning: Drift detected in:$${DRIFTED}\nClean: $${CLEAN} stacks. Errors:$${ERRORS:-none}\"}" \
+            "$SLACK_WEBHOOK" || true
+        else
+          curl -s -X POST -H 'Content-type: application/json' \
+            --data "{\"channel\":\"general\",\"text\":\":white_check_mark: Drift detection: all $${CLEAN} stacks clean$${ERRORS:+. Errors: $ERRORS}\"}" \
+            "$SLACK_WEBHOOK" || true
+        fi
diff --git a/ci/Dockerfile b/ci/Dockerfile
new file mode 100644
index 00000000..aefc8407
--- /dev/null
+++ b/ci/Dockerfile
@@ -0,0 +1,42 @@
+FROM alpine:3.20
+
+# Pin versions to match CI requirements
+ARG TERRAFORM_VERSION=1.5.7
+ARG TERRAGRUNT_VERSION=0.99.4
+ARG SOPS_VERSION=3.9.4
+ARG KUBECTL_VERSION=1.34.0
+
+# Install system packages (single layer)
+RUN apk add --no-cache \
+    bash curl git git-crypt jq openssh-client openssl unzip \
+    && rm -rf /var/cache/apk/*
+
+# Terraform
+RUN curl -fsSL "https://releases.hashicorp.com/terraform/${TERRAFORM_VERSION}/terraform_${TERRAFORM_VERSION}_linux_amd64.zip" \
+    -o /tmp/terraform.zip \
+    && unzip /tmp/terraform.zip -d /usr/local/bin/ \
+    && rm /tmp/terraform.zip \
+    && terraform version
+
+# Terragrunt
+RUN curl -fsSL "https://github.com/gruntwork-io/terragrunt/releases/download/v${TERRAGRUNT_VERSION}/terragrunt_linux_amd64" \
+    -o /usr/local/bin/terragrunt \
+    && chmod +x /usr/local/bin/terragrunt \
+    && terragrunt --version
+
+# SOPS (for state encryption)
+RUN curl -fsSL "https://github.com/getsops/sops/releases/download/v${SOPS_VERSION}/sops-v${SOPS_VERSION}.linux.amd64" \
+    -o /usr/local/bin/sops \
+    && chmod +x /usr/local/bin/sops
+
+# kubectl
+RUN curl -fsSL "https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl" \
+    -o /usr/local/bin/kubectl \
+    && chmod +x /usr/local/bin/kubectl
+
+# Provider cache directory (shared across stacks)
+ENV TF_PLUGIN_CACHE_DIR=/tmp/terraform-plugin-cache
+ENV TF_PLUGIN_CACHE_MAY_BREAK_DEPENDENCY_LOCK_FILE=1
+RUN mkdir -p /tmp/terraform-plugin-cache
+
+WORKDIR /workspace
diff --git a/scripts/tg b/scripts/tg
index 2461ab75..fc2c0671 100755
--- a/scripts/tg
+++ b/scripts/tg
@@ -1,23 +1,68 @@
 #!/usr/bin/env bash
 # scripts/tg — wrapper: decrypt state before, encrypt+commit after mutating ops
 # Usage: scripts/tg apply --non-interactive
-#        scripts/tg run --all -- plan
+#        scripts/tg plan
 # Auth: `vault login -method=oidc` (token at ~/.vault-token)
 set -euo pipefail
 
 REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
 SYNC="$REPO_ROOT/scripts/state-sync"
 
+# Enable provider cache (shared across stacks)
+export TF_PLUGIN_CACHE_DIR="${TF_PLUGIN_CACHE_DIR:-$HOME/.terraform.d/plugin-cache}"
+export TF_PLUGIN_CACHE_MAY_BREAK_DEPENDENCY_LOCK_FILE=1
+mkdir -p "$TF_PLUGIN_CACHE_DIR"
+
 # Determine stack name from cwd (relative to stacks/)
 STACK_NAME=""
 cwd="$(pwd)"
 stacks_dir="$REPO_ROOT/stacks"
 if [[ "$cwd" == "$stacks_dir"/* ]]; then
-  # Get first path component relative to stacks/
   rel="${cwd#$stacks_dir/}"
   STACK_NAME="${rel%%/*}"
 fi
 
+# ── Advisory lock via Vault KV ──
+# Best-effort only: acquire_lock reads then writes, which is not atomic, so two
+# runs that start at the same instant can both believe they hold the lock.
+LOCK_MAX_AGE=1800  # 30 minutes — stale lock threshold
+
+# acquire_lock STACK — take the lock; return 1 if someone else holds a fresh one.
+acquire_lock() {
+  local stack="$1"
+  local vault_addr="${VAULT_ADDR:-https://vault.viktorbarzin.me}"
+  local holder="pid=$$,host=$(hostname -s),user=$(whoami)"
+
+  # Check if lock exists and is not stale (a missing or unreadable lock reads as unlocked)
+  local existing
+  existing=$(VAULT_ADDR="$vault_addr" vault kv get -format=json "secret/locks/$stack" 2>/dev/null || echo '{}')
+  local locked=$(echo "$existing" | jq -r '.data.data.locked // "false"')
+  local acquired=$(echo "$existing" | jq -r '.data.data.acquired // "0"')
+  local existing_holder=$(echo "$existing" | jq -r '.data.data.holder // ""')
+
+  if [ "$locked" = "true" ]; then
+    # A non-numeric timestamp would crash the arithmetic under `set -e`; treat it as stale.
+    case "$acquired" in ''|*[!0-9]*) acquired=0 ;; esac
+    local now=$(date +%s)
+    local age=$((now - 10#$acquired))  # 10# forces base 10 even with leading zeros
+    if [ "$age" -lt "$LOCK_MAX_AGE" ]; then
+      echo "ERROR: Stack '$stack' is locked by: $existing_holder (${age}s ago)"
+      echo "       Wait for it to finish or run: vault kv delete secret/locks/$stack"
+      return 1
+    fi
+    echo "WARNING: Breaking stale lock on '$stack' (held ${age}s by $existing_holder)"
+  fi
+
+  VAULT_ADDR="$vault_addr" vault kv put "secret/locks/$stack" locked=true holder="$holder" acquired="$(date +%s)" >/dev/null
+}
+
+# release_lock STACK — drop the lock; errors are ignored because this runs from the EXIT trap.
+release_lock() {
+  local stack="$1"
+  local vault_addr="${VAULT_ADDR:-https://vault.viktorbarzin.me}"
+  VAULT_ADDR="$vault_addr" vault kv delete "secret/locks/$stack" >/dev/null 2>&1 || true
+}
+
 # Decrypt state before any operation
 if [ -n "$STACK_NAME" ] && [ -f "$REPO_ROOT/state/stacks/$STACK_NAME/terraform.tfstate.enc" ]; then
   "$SYNC" decrypt "$STACK_NAME"
@@ -31,6 +76,14 @@ for arg in "$@"; do
   esac
 done
 
+# Acquire lock for mutating operations (token from the environment or ~/.vault-token)
+if $is_mutating && [ -n "$STACK_NAME" ]; then
+  if command -v vault &>/dev/null && { [ -n "${VAULT_TOKEN:-}" ] || [ -s "$HOME/.vault-token" ]; }; then
+    acquire_lock "$STACK_NAME"
+    trap 'release_lock "$STACK_NAME"' EXIT
+  fi
+fi
+
 # If running apply with --non-interactive, add -auto-approve for Terraform
 args=("$@")
 has_apply=false
@@ -43,7 +96,6 @@ for arg in "${args[@]}"; do
 done
 
 if $has_apply && $has_non_interactive; then
-  # Rebuild args: insert -auto-approve after apply
   new_args=()
   for arg in "${args[@]}"; do
     new_args+=("$arg")