# infra/.woodpecker/default.yml

# Unified infra CI pipeline — detects changed stacks and applies only those.
# Platform stacks and app stacks handled in one pipeline with proper ordering.
#
# Optimizations over the previous split pipeline:
# - Custom CI image (no apk/wget per step)
# - Shallow clone (depth=2 for git diff HEAD~1)
# - TF_PLUGIN_CACHE_DIR (shared provider cache)
# - Serial apply with Vault advisory locks (prevents user/CI race conditions)
# - Step consolidation (2 steps instead of 4)
# - Changed-stacks-only detection (skips no-op applies)
# - Global-file fallback (changes under modules/, config.tfvars or
#   terragrunt.hcl trigger an apply of ALL platform stacks)
# - Lock-aware: skips stacks locked by users instead of failing

# Trigger: run this workflow only for pushes to the master branch.
when:
  event: push
  branch: master

# Custom clone step that keeps the checkout shallow.
clone:
  git:
    image: woodpeckerci/plugin-git
    settings:
      # Two commits = HEAD plus its parent, which is what the
      # `git diff HEAD~1 HEAD` change detection in the apply step relies on.
      depth: 2
      # Clone retry settings — presumably up to 5 tries with a 10s backoff;
      # confirm the exact semantics against the plugin-git documentation.
      attempts: 5
      backoff: 10s

steps:
  # Single step that unlocks the repo, authenticates to Vault and Kubernetes,
  # detects changed stacks, applies them, pushes resulting changes and
  # reports to Slack. All of that lives in the `commands` list below.
  - name: apply
    image: forgejo.viktorbarzin.me/viktor/infra-ci:latest
    # The image tag is the mutable `latest` — presumably `pull: true` is set so
    # a rebuilt CI image is picked up on every run; confirm.
    pull: true
    backend_options:
      kubernetes:
        resources:
          # Memory request/limit for this step's pod on the Kubernetes backend.
          requests:
            memory: 3Gi
          limits:
            memory: 6Gi
    environment:
      # Slack incoming-webhook URL, injected from the `slack_webhook` secret.
      SLACK_WEBHOOK:
        from_secret: slack_webhook
      # Each `- |` command runs in a fresh shell, so we can't rely on an
      # `export VAULT_ADDR=...` in the auth command persisting — pin it at
      # step level. VAULT_TOKEN is still per-command; we persist it to
      # ~/.vault-token (auto-read by `vault` CLI) so downstream commands
      # don't need explicit token propagation.
      VAULT_ADDR: http://vault-active.vault.svc.cluster.local:8200
    commands:
      # ── Skip CI commits ──
      # Bail out early when the commit message carries a skip marker (the
      # state-commit command in this step tags its own commits [CI SKIP]).
      # NOTE(review): if every `- |` entry really runs in its own shell, this
      # `exit 0` only ends this entry — confirm later entries are skipped too.
      - |
        case "$CI_COMMIT_MESSAGE" in
          *'[CI SKIP]'*|*'[ci skip]'*)
            echo "Commit has [CI SKIP], exiting"
            exit 0
            ;;
        esac

      # ── git-crypt unlock ──
      # Fetch the git-crypt key from the `git-crypt-key` ConfigMap (namespace
      # `woodpecker`) through the Kubernetes API, authenticated with the pod's
      # service-account token, unlock the repo, and always delete the key file
      # afterwards — whether or not the unlock succeeded.
      - |
        SA_DIR=/var/run/secrets/kubernetes.io/serviceaccount
        SA_TOKEN=$(cat "$SA_DIR/token")
        KEY_FILE=/tmp/key
        # - --cacert: verify the API server against the in-cluster CA (the same
        #   CA the generated kubeconfig trusts) instead of disabling TLS with -k.
        # - -f: an HTTP error yields no body, so an error document is never
        #   decoded as if it were a key.
        # - `// empty`: a missing .data.key yields nothing rather than "null".
        # - subshell + umask 077: the key is never world-readable in /tmp and
        #   the umask change does not leak past this command.
        # Failures fall through to the emptiness check below.
        (
          umask 077
          curl -sf --cacert "$SA_DIR/ca.crt" \
            "https://10.0.20.100:6443/api/v1/namespaces/woodpecker/configmaps/git-crypt-key" \
            -H "Authorization:Bearer $SA_TOKEN" | jq -r '.data.key // empty' | base64 -d > "$KEY_FILE"
        ) || true
        if [ ! -s "$KEY_FILE" ]; then
          rm -f "$KEY_FILE"
          echo "ERROR: could not fetch git-crypt key (configmap woodpecker/git-crypt-key)" >&2
          exit 1
        fi
        # Capture the status explicitly: with a bare `unlock && rm`, a failed
        # unlock leaves the key on disk, and when the script runs under
        # `set -e` the failure of a non-final command in an && list does not
        # abort on its own.
        UNLOCK_STATUS=0
        git-crypt unlock "$KEY_FILE" || UNLOCK_STATUS=$?
        rm -f "$KEY_FILE"
        if [ "$UNLOCK_STATUS" -ne 0 ]; then
          echo "ERROR: git-crypt unlock failed (exit $UNLOCK_STATUS)" >&2
          exit 1
        fi

      # ── Vault auth ──
      # Exchange the pod's service-account JWT for a Vault token through the
      # Kubernetes auth endpoint (role `ci`) and store it where the later
      # commands can find it.
      - |
        K8S_JWT=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)
        LOGIN_BODY=$(printf '{"role":"ci","jwt":"%s"}' "$K8S_JWT")
        VAULT_TOKEN=$(curl -s -X POST "$VAULT_ADDR/v1/auth/kubernetes/login" -d "$LOGIN_BODY" \
          | jq -r .auth.client_token)
        # An empty result or a literal "null" both mean the login did not
        # return a client token.
        case "$VAULT_TOKEN" in
          ''|null)
            echo "ERROR: Vault K8s auth failed (role=ci, ns=woodpecker)" >&2
            exit 1
            ;;
        esac
        # Every `- |` entry gets a fresh shell, so an exported VAULT_TOKEN
        # would not survive. `vault`, `scripts/tg` and `scripts/state-sync`
        # fall back to ~/.vault-token when the variable is unset, so the token
        # is written there with owner-only permissions.
        umask 077
        printf '%s' "$VAULT_TOKEN" > "$HOME/.vault-token"

      # ── Generate kubeconfig from projected SA token ──
      # terragrunt.hcl passes `-var kube_config_path=<repo>/config` to every
      # terraform invocation, so a kubeconfig has to exist at that path. The
      # woodpecker namespace's `default` SA is cluster-admin (through the
      # `woodpecker-default` ClusterRoleBinding), which makes the projected
      # token enough to apply any stack. `tokenFile` is used rather than an
      # inline token so the provider re-reads it if kubelet rotates the
      # projected token mid-pipeline.
      - |
        K8S_API=https://10.0.20.100:6443
        SA_DIR=/var/run/secrets/kubernetes.io/serviceaccount
        # Unquoted heredoc: only $K8S_API and $SA_DIR are expanded in the body.
        cat > config <<EOF
        apiVersion: v1
        kind: Config
        clusters:
          - name: kubernetes
            cluster:
              server: $K8S_API
              certificate-authority: $SA_DIR/ca.crt
        contexts:
          - name: ci
            context:
              cluster: kubernetes
              user: ci
        current-context: ci
        users:
          - name: ci
            user:
              tokenFile: $SA_DIR/token
        EOF
        chmod 600 config
        # Smoke test: fail right here if the kubeconfig cannot reach the API.
        kubectl --kubeconfig=config get ns kube-system -o name >/dev/null

      # ── Detect changed stacks ──
      # Produces two newline-separated lists consumed by the later commands:
      #   .platform_apply — platform stacks to apply
      #   .app_apply      — app (non-platform) stacks to apply
      # Rules:
      #   * no parent commit available  -> all platform stacks, no app stacks
      #   * a global file changed       -> all platform stacks + changed apps
      #   * otherwise                   -> only stacks touched under stacks/
      # NOTE(review): only HEAD~1..HEAD is diffed, so for a push carrying
      # several commits, stacks touched only by the earlier commits are not
      # detected — confirm this is acceptable.
      - |
        PLATFORM_STACKS="dbaas authentik crowdsec monitoring nvidia mailserver cloudflared kyverno metallb redis traefik technitium headscale rbac k8s-portal vaultwarden reverse-proxy metrics-server vpa nfs-csi iscsi-csi cnpg sealed-secrets uptime-kuma wireguard xray infra-maintenance platform vault reloader descheduler external-secrets"

        # Exact-name membership test against PLATFORM_STACKS. A word-boundary
        # regex match (`grep -qw`) is not enough here: `-` is not a word
        # character, so names such as `secrets`, `csi` or `proxy` would match
        # `sealed-secrets`, `nfs-csi` or `reverse-proxy` and be misclassified.
        is_platform_stack() {
          case " $PLATFORM_STACKS " in
            *" $1 "*) return 0 ;;
            *) return 1 ;;
          esac
        }

        # Ensure we have enough history for diff (clone may be shallow)
        if ! git rev-parse HEAD~1 >/dev/null 2>&1; then
          echo "WARNING: HEAD~1 not available (shallow clone?) — fetching more history"
          git fetch --deepen=1 origin master 2>/dev/null || true
        fi

        # Start from empty lists so every branch below leaves both files present.
        > .platform_apply
        > .app_apply

        if ! git rev-parse HEAD~1 >/dev/null 2>&1; then
          # Still no parent: apply all platform stacks as a safe fallback.
          echo "Cannot determine changed files — applying ALL platform stacks"
          echo "$PLATFORM_STACKS" | tr ' ' '\n' > .platform_apply
        else
          # Diff once and reuse the result; a failing diff aborts the command
          # instead of silently producing empty lists.
          CHANGED_FILES=$(git diff --name-only HEAD~1 HEAD)

          # Second path component of every changed file under stacks/.
          printf '%s\n' "$CHANGED_FILES" | grep '^stacks/' | cut -d/ -f2 | sort -u > .all_changed

          # Global files (shared modules / root config) trigger a full platform apply.
          GLOBAL_CHANGED=$(printf '%s\n' "$CHANGED_FILES" | grep -E '^(modules/|config\.tfvars|terragrunt\.hcl)' || true)
          if [ -n "$GLOBAL_CHANGED" ]; then
            echo "Global files changed — applying ALL platform stacks"
            echo "$PLATFORM_STACKS" | tr ' ' '\n' > .platform_apply
          fi

          # Classify each changed stack exactly once.
          while read -r stack; do
            if [ -z "$stack" ]; then
              continue
            fi
            if is_platform_stack "$stack"; then
              # Already listed in full when a global file changed.
              if [ -z "$GLOBAL_CHANGED" ]; then
                echo "$stack" >> .platform_apply
              fi
            elif [ -f "stacks/$stack/terragrunt.hcl" ]; then
              # App stack; directories without a terragrunt.hcl are skipped.
              echo "$stack" >> .app_apply
            fi
          done < .all_changed
        fi

        PLATFORM_COUNT=$(wc -l < .platform_apply | tr -d ' ')
        APP_COUNT=$(wc -l < .app_apply | tr -d ' ')
        echo "Platform stacks to apply: $PLATFORM_COUNT"
        echo "App stacks to apply: $APP_COUNT"
        cat .platform_apply .app_apply

      # ── Pre-warm provider cache ──
      # Run `terragrunt init` once in the first stack scheduled for apply
      # (platform list first, then apps) and show the last three lines of its
      # output. Presumably this fills a shared provider cache configured
      # outside this file — confirm.
      - |
        WARM_STACK=""
        if [ -s .platform_apply ] || [ -s .app_apply ]; then
          WARM_STACK=$(cat .platform_apply .app_apply 2>/dev/null | head -n 1)
        fi
        if [ -n "$WARM_STACK" ]; then
          echo "Pre-warming provider cache from stacks/$WARM_STACK..."
          cd "stacks/$WARM_STACK" && terragrunt init --terragrunt-non-interactive -input=false 2>&1 | tail -n 3 && cd ../..
        fi

      # ── Apply platform stacks (serial, with Vault advisory locks) ──
      # Applies every stack listed in .platform_apply, one at a time, through
      # `scripts/tg apply`. A failing stack whose output contains
      # "is locked by" is reported as skipped; any other failure is recorded.
      # The list of failed stacks is written to .platform_failed and acted on
      # by the app-stack command, so this command itself never exits non-zero
      # because of a failed apply.
      - |
        FAILED_PLATFORM_STACKS=""
        if [ -s .platform_apply ]; then
          echo "=== Applying platform stacks (serial, locked) ==="
          while read -r stack; do
            # A blank line would otherwise become an apply inside `stacks/`.
            if [ -z "$stack" ]; then
              continue
            fi
            echo "[$stack] Starting apply..."
            # Errors are handled by hand below, so suspend errexit around the apply.
            set +e
            # </dev/null: the loop's stdin is .platform_apply. Without it,
            # anything in the apply that reads stdin would swallow the
            # remaining stack names and silently end the loop early.
            OUTPUT=$(cd "stacks/$stack" && ../../scripts/tg apply --non-interactive </dev/null 2>&1)
            EXIT=$?
            set -e
            if [ $EXIT -ne 0 ]; then
              if echo "$OUTPUT" | grep -q "is locked by"; then
                echo "[$stack] SKIPPED (locked by another session)"
              else
                echo "$OUTPUT" | tail -50
                echo "[$stack] FAILED (exit $EXIT)"
                FAILED_PLATFORM_STACKS="$FAILED_PLATFORM_STACKS $stack"
              fi
            else
              echo "$OUTPUT" | tail -3
              echo "[$stack] OK"
            fi
          done < .platform_apply
        fi
        # Deferred until after app stacks so both lists get a chance to run.
        echo "$FAILED_PLATFORM_STACKS" > .platform_failed

      # ── Apply app stacks (serial, with Vault advisory locks) ──
      # Applies every stack listed in .app_apply, one at a time, through
      # `scripts/tg apply`, then fails the step if any platform stack (read
      # from .platform_failed) or app stack failed. Stacks skipped because of
      # a lock are not counted as failures.
      - |
        FAILED_APP_STACKS=""
        if [ -s .app_apply ]; then
          echo "=== Applying app stacks (serial, locked) ==="
          while read -r stack; do
            # A blank line would otherwise become an apply inside `stacks/`.
            if [ -z "$stack" ]; then
              continue
            fi
            echo "[$stack] Starting apply..."
            # Errors are handled by hand below, so suspend errexit around the apply.
            set +e
            # </dev/null: the loop's stdin is .app_apply. Without it, anything
            # in the apply that reads stdin would swallow the remaining stack
            # names and silently end the loop early.
            OUTPUT=$(cd "stacks/$stack" && ../../scripts/tg apply --non-interactive </dev/null 2>&1)
            EXIT=$?
            set -e
            if [ $EXIT -ne 0 ]; then
              if echo "$OUTPUT" | grep -q "is locked by"; then
                echo "[$stack] SKIPPED (locked by another session)"
              else
                echo "$OUTPUT" | tail -50
                echo "[$stack] FAILED (exit $EXIT)"
                FAILED_APP_STACKS="$FAILED_APP_STACKS $stack"
              fi
            else
              echo "$OUTPUT" | tail -3
              echo "[$stack] OK"
            fi
          done < .app_apply
        fi
        # Fail the step loudly so the pipeline `default` workflow state
        # reflects reality — the service-upgrade agent and CI alert cascade
        # both rely on this (see bd code-e1x). Lock-skipped stacks are NOT
        # counted as failures.
        # Trim only the surrounding blanks: deleting every space would glue
        # several failed stack names into one unreadable word in the summary.
        FAILED_PLATFORM=$(sed -e 's/^ *//' -e 's/ *$//' .platform_failed 2>/dev/null || true)
        # NOTE(review): exiting here presumably means the later commit/push
        # command does not run, so changes produced by stacks that applied
        # successfully are not committed on a partially failed run — confirm
        # this is intended.
        if [ -n "$FAILED_PLATFORM" ] || [ -n "$FAILED_APP_STACKS" ]; then
          echo "=== FAILED STACKS: platform=[$FAILED_PLATFORM ] apps=[$FAILED_APP_STACKS ] ==="
          exit 1
        fi

      # ── Commit and push state changes ──
      # Stage whatever changed under stacks/, state/ and .woodpecker/, commit
      # it with a [CI SKIP] marker, rebase onto the freshly fetched
      # origin/master and push to GitHub over SSH with the repo's deploy key.
      - |
        mkdir -p ~/.ssh && ssh-keyscan -H github.com >> ~/.ssh/known_hosts 2>/dev/null
        # ssh refuses private keys that are readable by others.
        chmod 400 secrets/deploy_key
        # One SSH identity for every git call below (only fetch and push
        # actually talk to the remote).
        export GIT_SSH_COMMAND='ssh -i ./secrets/deploy_key -o IdentitiesOnly=yes'
        git add stacks/ state/ .woodpecker/ 2>/dev/null || true
        git remote set-url origin git@github.com:ViktorBarzin/infra.git
        # Nothing staged -> nothing to push.
        if git diff --cached --quiet; then
          echo "No changes to commit"
          exit 0
        fi
        git commit -m "Woodpecker CI deploy [CI SKIP]"
        git fetch origin master
        if ! git rebase origin/master; then
          echo "ERROR: Git rebase failed — state commits could not be pushed"
          echo "Manual intervention required: pull, resolve conflicts, push"
          # Leave the work tree out of the half-finished rebase before failing.
          git rebase --abort || true
          exit 1
        fi
        git push origin master

      # ── Slack notification ──
      # Post a one-line summary (pipeline status plus the number of platform
      # and app stacks that were scheduled) to the Slack webhook. Best-effort:
      # a failed post never fails the step.
      - |
        PLATFORM_COUNT=$(wc -l < .platform_apply 2>/dev/null | tr -d ' ')
        APP_COUNT=$(wc -l < .app_apply 2>/dev/null | tr -d ' ')
        # The two counts are referenced WITHOUT braces on purpose: Woodpecker
        # pre-processes brace-form dollar expressions in this file before the
        # shell ever runs, and names it does not know are replaced with an
        # empty string — so brace-form references to these shell variables
        # never reach the shell. CI_PIPELINE_STATUS is a Woodpecker built-in,
        # so its brace form is left to the pre-processor.
        curl -s -X POST -H 'Content-type: application/json' \
          --data "{\"channel\":\"general\",\"text\":\"Woodpecker CI: infra pipeline ${CI_PIPELINE_STATUS} (platform:$PLATFORM_COUNT, apps:$APP_COUNT)\"}" \
          "$SLACK_WEBHOOK" || true

  # Failure alert: posts a fixed message to Slack, and only runs when the
  # pipeline status is `failure` (e.g. the apply step exited non-zero).
  - name: notify-failure
    image: curlimages/curl
    when:
      status:
        - failure
    environment:
      SLACK_WEBHOOK:
        from_secret: slack_webhook
    commands:
      # Best-effort post; the payload is static, so it is single-quoted.
      - |
        curl -s -X POST -H 'Content-type: application/json' \
          --data '{"channel":"general","text":":red_circle: Woodpecker CI: infra pipeline FAILED"}' \
          "$SLACK_WEBHOOK" || true