fix: restore tree dropped by 6d224861; land stem95su gdrive-sync (10m) [ci skip]

6d224861 came from a --no-checkout worktree whose empty index made the commit drop every file except two. This restores 05b50d2b's full tree and correctly adds stacks/stem95su/gdrive-sync.tf + the service-catalog stem95su entry. Forward-only (parent=6d224861, no force-push); [ci skip] since the live infra was never applied from the broken commit. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-09 08:45:33 +00:00 · 2026-06-09 08:45:33 +00:00 · fd0f4a0365
commit fd0f4a0365
parent 6d224861c4
1166 changed files with 358546 additions and 0 deletions
--- a/.woodpecker/build-ci-image.yml
+++ b/.woodpecker/build-ci-image.yml
@ -0,0 +1,88 @@
+# Build the CI tools Docker image used by all infra pipelines.
+# Triggers on push that touches ci/Dockerfile, or manual (API/UI) so
+# rebuilds after a registry incident don't need a cosmetic Dockerfile edit.
+
+when:
+  - event: push
+    branch: master
+    path:
+      include:
+        - 'ci/Dockerfile'
+  - event: manual
+
+steps:
+  - name: build-and-push
+    image: woodpeckerci/plugin-docker-buildx
+    settings:
+      # Phase 4 of forgejo-registry-consolidation 2026-05-07 —
+      # registry.viktorbarzin.me dropped, Forgejo is the only target.
+      repo:
+        - forgejo.viktorbarzin.me/viktor/infra-ci
+      dockerfile: ci/Dockerfile
+      context: ci/
+      tags:
+        - latest
+        - "${CI_COMMIT_SHA:0:8}"
+      platforms: linux/amd64
+      logins:
+        - registry: forgejo.viktorbarzin.me
+          username:
+            from_secret: forgejo_user
+          password:
+            from_secret: forgejo_push_token
+
+  # Post-push integrity check is now redundant with the every-15min
+  # forgejo-integrity-probe in stacks/monitoring/, which walks
+  # /v2/_catalog + HEADs every blob across the entire Forgejo registry.
+  # If a corruption pattern emerges that the periodic probe misses,
+  # restore a verify step similar to the pre-Phase-4 version (see
+  # commit 49f4956f) but pointed at forgejo.viktorbarzin.me.
+
+  # Break-glass tarball: save the just-pushed infra-ci image to disk on the
+  # registry VM (10.0.20.10) so we can `docker load` it back into a node
+  # when Forgejo is unreachable. Pulls from Forgejo (the only registry now).
+  # Best-effort — failure here doesn't fail the pipeline.
+  # Recovery procedure: docs/runbooks/forgejo-registry-breakglass.md.
+  - name: breakglass-tarball
+    image: alpine:3.20
+    failure: ignore
+    environment:
+      REGISTRY_SSH_KEY:
+        from_secret: registry_ssh_key
+      FORGEJO_USER:
+        from_secret: forgejo_user
+      FORGEJO_PASS:
+        from_secret: forgejo_push_token
+    commands:
+      - apk add --no-cache openssh-client
+      - mkdir -p ~/.ssh && chmod 700 ~/.ssh
+      - printf '%s\n' "$REGISTRY_SSH_KEY" > ~/.ssh/id_ed25519
+      - chmod 600 ~/.ssh/id_ed25519
+      - ssh-keyscan -t ed25519 10.0.20.10 >> ~/.ssh/known_hosts 2>/dev/null
+      - SHA=${CI_COMMIT_SHA:0:8}
+      # FORGEJO_USER / FORGEJO_PASS exist only in this step's container; ssh
+      # does not forward them to the registry VM. The username is therefore
+      # expanded locally, and the password is streamed over ssh's stdin into
+      # `docker login --password-stdin` so it never shows up in a remote
+      # command line. (`ssh -n` is dropped because it would detach stdin.)
+      # docker login runs first so nothing else consumes the piped password.
+      # Unescaped variables ($SHA, $FORGEJO_USER) expand here; escaped ones
+      # (\$IMAGE) expand on the registry VM.
+      - |
+        printf '%s\n' "$FORGEJO_PASS" | ssh -o BatchMode=yes root@10.0.20.10 "
+          set -e
+          docker login forgejo.viktorbarzin.me -u '$FORGEJO_USER' --password-stdin
+          mkdir -p /opt/registry/data/private/_breakglass
+          IMAGE=forgejo.viktorbarzin.me/viktor/infra-ci:$SHA
+          docker pull \$IMAGE
+          docker save \$IMAGE | gzip > /opt/registry/data/private/_breakglass/infra-ci-$SHA.tar.gz
+          ln -sfn infra-ci-$SHA.tar.gz /opt/registry/data/private/_breakglass/infra-ci-latest.tar.gz
+          ls -t /opt/registry/data/private/_breakglass/infra-ci-*.tar.gz \
+            | grep -v 'latest' | tail -n +6 | xargs -r rm -v
+          ls -lh /opt/registry/data/private/_breakglass/
+        "
+
+  - name: slack
+    image: curlimages/curl
+    commands:
+      - |
+        curl -s -X POST -H 'Content-type: application/json' \
+          --data "{\"text\":\"CI image built: forgejo.viktorbarzin.me/viktor/infra-ci:${CI_COMMIT_SHA:0:8} (and registry-private mirror)\"}" \
+          "$SLACK_WEBHOOK" || true
+    environment:
+      SLACK_WEBHOOK:
+        from_secret: slack_webhook
+    when:
+      status: [success]
--- a/.woodpecker/build-cli.yml
+++ b/.woodpecker/build-cli.yml
@ -0,0 +1,42 @@
+when:
+  event: push
+
+clone:
+  git:
+    image: woodpeckerci/plugin-git
+    settings:
+      attempts: 5
+      backoff: 10s
+
+steps:
+  - name: build-image
+    image: woodpeckerci/plugin-docker-buildx
+    settings:
+      username: "viktorbarzin"
+      password:
+        from_secret: dockerhub-pat
+      # Phase 4 of forgejo-registry-consolidation 2026-05-07 —
+      # registry.viktorbarzin.me:5050 decommissioned. Push to DockerHub
+      # (the public-facing infra image) AND Forgejo (the cluster pull
+      # source). Same image, two locations.
+      repo:
+        - viktorbarzin/infra
+        - forgejo.viktorbarzin.me/viktor/infra
+      logins:
+        - registry: https://index.docker.io/v1/
+          username: viktorbarzin
+          password:
+            from_secret: dockerhub-pat
+        - registry: forgejo.viktorbarzin.me
+          username:
+            from_secret: forgejo_user
+          password:
+            from_secret: forgejo_push_token
+      dockerfile: cli/Dockerfile
+      context: cli
+      auto_tag: true
+      # cache_from/cache_to removed: registry cache corruption causes
+      # "short read: expected 32 bytes" BuildKit errors. Inline cache
+      # will be re-populated once a clean image is pushed.
+      # cache_from: "registry.viktorbarzin.me:5050/infra:latest"
+      # cache_to: "type=inline"
--- a/.woodpecker/default.yml
+++ b/.woodpecker/default.yml
@ -0,0 +1,270 @@
+# Unified infra CI pipeline — detects changed stacks and applies only those.
+# Platform stacks and app stacks handled in one pipeline with proper ordering.
+#
+# Optimizations over the previous split pipeline:
+# - Custom CI image (no apk/wget per step)
+# - Shallow clone (depth=2 for git diff HEAD~1)
+# - TF_PLUGIN_CACHE_DIR (shared provider cache)
+# - Serial apply with Vault advisory locks (prevents user/CI race conditions)
+# - Step consolidation (2 steps instead of 4)
+# - Changed-stacks-only detection (skips no-op applies)
+# - Global-file fallback (modules/config changes trigger full apply)
+# - Lock-aware: skips stacks locked by users instead of failing
+
+when:
+  event: push
+  branch: master
+
+clone:
+  git:
+    image: woodpeckerci/plugin-git
+    settings:
+      depth: 2
+      attempts: 5
+      backoff: 10s
+
+steps:
+  - name: apply
+    image: forgejo.viktorbarzin.me/viktor/infra-ci:latest
+    pull: true
+    backend_options:
+      kubernetes:
+        resources:
+          requests:
+            memory: 3Gi
+          limits:
+            memory: 6Gi
+    environment:
+      SLACK_WEBHOOK:
+        from_secret: slack_webhook
+      # Each `- |` command runs in a fresh shell, so we can't rely on an
+      # `export VAULT_ADDR=...` in the auth command persisting — pin it at
+      # step level. VAULT_TOKEN is still per-command; we persist it to
+      # ~/.vault-token (auto-read by `vault` CLI) so downstream commands
+      # don't need explicit token propagation.
+      VAULT_ADDR: http://vault-active.vault.svc.cluster.local:8200
+    commands:
+      # ── Skip CI commits ──
+      - |
+        if echo "$CI_COMMIT_MESSAGE" | grep -q '\[CI SKIP\]\|\[ci skip\]'; then
+          echo "Commit has [CI SKIP], exiting"
+          exit 0
+        fi
+
+      # ── git-crypt unlock ──
+      # Fetches the git-crypt key from the `git-crypt-key` ConfigMap via the
+      # Kubernetes API (authenticated with the pod's service-account token)
+      # and unlocks the working tree.
+      - |
+        SA_TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)
+        curl -sk "https://10.0.20.100:6443/api/v1/namespaces/woodpecker/configmaps/git-crypt-key" \
+          -H "Authorization:Bearer $SA_TOKEN" | jq -r .data.key | base64 -d > /tmp/key
+        # Explicit check instead of `unlock && rm`: in an AND-list a failing
+        # left-hand command does not trip `sh -e`, so a failed unlock used to
+        # be silently ignored (and the key file left behind). Abort instead —
+        # nothing downstream should run against still-encrypted files.
+        if ! git-crypt unlock /tmp/key; then
+          rm -f /tmp/key
+          echo "ERROR: git-crypt unlock failed" >&2
+          exit 1
+        fi
+        rm -f /tmp/key
+
+      # ── Vault auth ──
+      - |
+        SA_TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)
+        VAULT_TOKEN=$(curl -s -X POST "$VAULT_ADDR/v1/auth/kubernetes/login" \
+          -d "{\"role\":\"ci\",\"jwt\":\"$SA_TOKEN\"}" | jq -r .auth.client_token)
+        if [ -z "$VAULT_TOKEN" ] || [ "$VAULT_TOKEN" = "null" ]; then
+          echo "ERROR: Vault K8s auth failed (role=ci, ns=woodpecker)" >&2
+          exit 1
+        fi
+        # Persist for downstream `- |` blocks (each runs in a fresh shell,
+        # so exporting VAULT_TOKEN wouldn't help). `vault`, `scripts/tg`,
+        # and `scripts/state-sync` all fall through to ~/.vault-token when
+        # the env var is unset.
+        umask 077; printf '%s' "$VAULT_TOKEN" > "$HOME/.vault-token"
+
+      # ── Generate kubeconfig from projected SA token ──
+      # terragrunt.hcl injects `-var kube_config_path=<repo>/config` for every
+      # terraform invocation, so we need a kubeconfig file at that path. The
+      # `default` SA in the woodpecker namespace is cluster-admin (via the
+      # `woodpecker-default` ClusterRoleBinding), so the projected token is
+      # sufficient to apply any stack. Using `tokenFile` (not an inline token)
+      # so the provider re-reads it if kubelet rotates the projected token
+      # mid-pipeline.
+      - |
+        cat > config <<'EOF'
+        apiVersion: v1
+        kind: Config
+        clusters:
+          - name: kubernetes
+            cluster:
+              server: https://10.0.20.100:6443
+              certificate-authority: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+        contexts:
+          - name: ci
+            context:
+              cluster: kubernetes
+              user: ci
+        current-context: ci
+        users:
+          - name: ci
+            user:
+              tokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+        EOF
+        chmod 600 config
+        # Sanity check: kubeconfig works
+        kubectl --kubeconfig=config get ns kube-system -o name >/dev/null
+
+      # ── Detect changed stacks ──
+      # Writes two work lists consumed by the later commands of this step:
+      #   .platform_apply — platform stacks to apply (one name per line)
+      #   .app_apply      — app stacks to apply (one name per line)
+      - |
+        PLATFORM_STACKS="dbaas authentik crowdsec monitoring nvidia mailserver cloudflared kyverno metallb redis traefik technitium headscale rbac k8s-portal vaultwarden reverse-proxy metrics-server vpa nfs-csi iscsi-csi cnpg sealed-secrets uptime-kuma wireguard xray infra-maintenance platform vault reloader descheduler external-secrets"
+
+        # Exact, literal membership test against PLATFORM_STACKS.
+        # `grep -w` is not suitable here: `-` counts as a word boundary, so a
+        # stack called `portal`, `secrets` or `csi` would match `k8s-portal`,
+        # `sealed-secrets` or `nfs-csi`, and the stack name would also be
+        # interpreted as a regex. -x = whole line, -F = fixed string.
+        is_platform_stack() {
+          printf '%s\n' $PLATFORM_STACKS | grep -qxF -- "$1"
+        }
+
+        # Ensure we have enough history for diff (clone may be shallow)
+        if ! git rev-parse HEAD~1 >/dev/null 2>&1; then
+          echo "WARNING: HEAD~1 not available (shallow clone?) — fetching more history"
+          git fetch --deepen=1 origin master 2>/dev/null || true
+        fi
+
+        # If still no parent, apply all platform stacks as a safe fallback
+        if ! git rev-parse HEAD~1 >/dev/null 2>&1; then
+          echo "Cannot determine changed files — applying ALL platform stacks"
+          echo "$PLATFORM_STACKS" | tr ' ' '\n' > .platform_apply
+          > .app_apply
+        else
+          # Check if global files changed (triggers full platform apply)
+          GLOBAL_CHANGED=$(git diff --name-only HEAD~1 HEAD | grep -E '^(modules/|config\.tfvars|terragrunt\.hcl)' || true)
+
+          if [ -n "$GLOBAL_CHANGED" ]; then
+            echo "Global files changed — applying ALL platform stacks"
+            echo "$PLATFORM_STACKS" | tr ' ' '\n' > .platform_apply
+          else
+            # Detect platform stacks that changed
+            git diff --name-only HEAD~1 HEAD | grep '^stacks/' | cut -d/ -f2 | sort -u > .all_changed
+            > .platform_apply
+            while read -r stack; do
+              if is_platform_stack "$stack"; then
+                echo "$stack" >> .platform_apply
+              fi
+            done < .all_changed
+          fi
+
+          # Detect app stacks that changed
+          > .app_apply
+          git diff --name-only HEAD~1 HEAD | grep '^stacks/' | cut -d/ -f2 | sort -u | while read -r stack; do
+            if is_platform_stack "$stack"; then
+              continue  # Skip platform stacks
+            fi
+            if [ ! -f "stacks/$stack/terragrunt.hcl" ]; then
+              continue  # Skip non-terragrunt dirs
+            fi
+            echo "$stack" >> .app_apply
+          done
+        fi
+
+        PLATFORM_COUNT=$(wc -l < .platform_apply | tr -d ' ')
+        APP_COUNT=$(wc -l < .app_apply | tr -d ' ')
+        echo "Platform stacks to apply: $PLATFORM_COUNT"
+        echo "App stacks to apply: $APP_COUNT"
+        cat .platform_apply .app_apply
+
+      # ── Pre-warm provider cache ──
+      - |
+        if [ -s .platform_apply ] || [ -s .app_apply ]; then
+          FIRST_STACK=$(cat .platform_apply .app_apply 2>/dev/null | head -1)
+          if [ -n "$FIRST_STACK" ]; then
+            echo "Pre-warming provider cache from stacks/$FIRST_STACK..."
+            cd "stacks/$FIRST_STACK" && terragrunt init --terragrunt-non-interactive -input=false 2>&1 | tail -3 && cd ../..
+          fi
+        fi
+
+      # ── Apply platform stacks (serial, with Vault advisory locks) ──
+      - |
+        FAILED_PLATFORM_STACKS=""
+        if [ -s .platform_apply ]; then
+          echo "=== Applying platform stacks (serial, locked) ==="
+          while read -r stack; do
+            echo "[$stack] Starting apply..."
+            set +e
+            OUTPUT=$(cd "stacks/$stack" && ../../scripts/tg apply --non-interactive 2>&1)
+            EXIT=$?
+            set -e
+            if [ $EXIT -ne 0 ]; then
+              if echo "$OUTPUT" | grep -q "is locked by"; then
+                echo "[$stack] SKIPPED (locked by another session)"
+              else
+                echo "$OUTPUT" | tail -50
+                echo "[$stack] FAILED (exit $EXIT)"
+                FAILED_PLATFORM_STACKS="$FAILED_PLATFORM_STACKS $stack"
+              fi
+            else
+              echo "$OUTPUT" | tail -3
+              echo "[$stack] OK"
+            fi
+          done < .platform_apply
+        fi
+        # Deferred until after app stacks so both lists get a chance to run.
+        echo "$FAILED_PLATFORM_STACKS" > .platform_failed
+
+      # ── Apply app stacks (serial, with Vault advisory locks) ──
+      # Applies every stack listed in .app_apply, then fails the step if any
+      # platform stack (recorded in .platform_failed by the previous command)
+      # or any app stack failed.
+      - |
+        FAILED_APP_STACKS=""
+        if [ -s .app_apply ]; then
+          echo "=== Applying app stacks (serial, locked) ==="
+          while read -r stack; do
+            echo "[$stack] Starting apply..."
+            # errexit is suspended so a failing apply is recorded instead of
+            # aborting the loop.
+            set +e
+            OUTPUT=$(cd "stacks/$stack" && ../../scripts/tg apply --non-interactive 2>&1)
+            EXIT=$?
+            set -e
+            if [ $EXIT -ne 0 ]; then
+              if echo "$OUTPUT" | grep -q "is locked by"; then
+                echo "[$stack] SKIPPED (locked by another session)"
+              else
+                echo "$OUTPUT" | tail -50
+                echo "[$stack] FAILED (exit $EXIT)"
+                FAILED_APP_STACKS="$FAILED_APP_STACKS $stack"
+              fi
+            else
+              echo "$OUTPUT" | tail -3
+              echo "[$stack] OK"
+            fi
+          done < .app_apply
+        fi
+        # Fail the step loudly so the pipeline `default` workflow state
+        # reflects reality — the service-upgrade agent and CI alert cascade
+        # both rely on this (see bd code-e1x). Lock-skipped stacks are NOT
+        # counted as failures.
+        # Trim only leading/trailing blanks: deleting every space would glue
+        # several failed stack names into one unreadable word in the summary.
+        FAILED_PLATFORM=$(cat .platform_failed 2>/dev/null | sed 's/^ *//; s/ *$//')
+        if [ -n "$FAILED_PLATFORM" ] || [ -n "$FAILED_APP_STACKS" ]; then
+          echo "=== FAILED STACKS: platform=[$FAILED_PLATFORM ] apps=[$FAILED_APP_STACKS ] ==="
+          exit 1
+        fi
+
+      # ── Commit and push state changes ──
+      - |
+        mkdir -p ~/.ssh && ssh-keyscan -H github.com >> ~/.ssh/known_hosts 2>/dev/null
+        chmod 400 secrets/deploy_key
+        git add stacks/ state/ .woodpecker/ 2>/dev/null || true
+        git remote set-url origin git@github.com:ViktorBarzin/infra.git
+        git diff --cached --quiet && echo "No changes to commit" && exit 0
+        git commit -m "Woodpecker CI deploy [CI SKIP]"
+        GIT_SSH_COMMAND='ssh -i ./secrets/deploy_key -o IdentitiesOnly=yes' git fetch origin master
+        if ! GIT_SSH_COMMAND='ssh -i ./secrets/deploy_key -o IdentitiesOnly=yes' git rebase origin/master; then
+          echo "ERROR: Git rebase failed — state commits could not be pushed"
+          echo "Manual intervention required: pull, resolve conflicts, push"
+          GIT_SSH_COMMAND='ssh -i ./secrets/deploy_key -o IdentitiesOnly=yes' git rebase --abort || true
+          exit 1
+        fi
+        GIT_SSH_COMMAND='ssh -i ./secrets/deploy_key -o IdentitiesOnly=yes' git push origin master
+
+      # ── Slack notification ──
+      # Posts the pipeline status plus the number of platform/app stacks that
+      # were selected for apply. Best-effort: a Slack failure never fails CI.
+      - |
+        PLATFORM_COUNT=$(wc -l < .platform_apply 2>/dev/null | tr -d ' ')
+        APP_COUNT=$(wc -l < .app_apply 2>/dev/null | tr -d ' ')
+        # Only CI_PIPELINE_STATUS keeps the brace form: Woodpecker substitutes
+        # brace-form variables when it compiles the config and would blank
+        # shell-local variables written that way, so the two counters use the
+        # plain form and are expanded by the shell at run time.
+        curl -s -X POST -H 'Content-type: application/json' \
+          --data "{\"channel\":\"general\",\"text\":\"Woodpecker CI: infra pipeline ${CI_PIPELINE_STATUS} (platform:$PLATFORM_COUNT, apps:$APP_COUNT)\"}" \
+          "$SLACK_WEBHOOK" || true
+
+  # Slack on failure (runs even if apply step fails)
+  - name: notify-failure
+    image: curlimages/curl
+    commands:
+      - |
+        curl -s -X POST -H 'Content-type: application/json' \
+          --data "{\"channel\":\"general\",\"text\":\":red_circle: Woodpecker CI: infra pipeline FAILED\"}" \
+          "$SLACK_WEBHOOK" || true
+    environment:
+      SLACK_WEBHOOK:
+        from_secret: slack_webhook
+    when:
+      status: [failure]
--- a/.woodpecker/drift-detection.yml
+++ b/.woodpecker/drift-detection.yml
@ -0,0 +1,151 @@
+# Daily drift detection — runs terraform plan on all stacks and alerts on drift.
+# Triggered by Woodpecker cron schedule "drift-detection" (must be registered in Woodpecker UI/API).
+
+when:
+  event: cron
+  cron: drift-detection
+
+clone:
+  git:
+    image: woodpeckerci/plugin-git
+    settings:
+      depth: 1
+      attempts: 3
+
+steps:
+  - name: detect-drift
+    image: forgejo.viktorbarzin.me/viktor/infra-ci:latest
+    pull: true
+    backend_options:
+      kubernetes:
+        resources:
+          requests:
+            memory: 2Gi
+          limits:
+            memory: 4Gi
+    environment:
+      SLACK_WEBHOOK:
+        from_secret: slack_webhook
+    commands:
+      # ── git-crypt unlock ──
+      # Fetches the git-crypt key from the `git-crypt-key` ConfigMap via the
+      # Kubernetes API and unlocks the working tree.
+      - |
+        SA_TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)
+        curl -sk "https://10.0.20.100:6443/api/v1/namespaces/woodpecker/configmaps/git-crypt-key" \
+          -H "Authorization:Bearer $SA_TOKEN" | jq -r .data.key | base64 -d > /tmp/key
+        # Explicit check instead of `unlock && rm`: in an AND-list a failing
+        # left-hand command does not trip `sh -e`, so a failed unlock used to
+        # be silently ignored (and the key file left behind).
+        if ! git-crypt unlock /tmp/key; then
+          rm -f /tmp/key
+          echo "ERROR: git-crypt unlock failed" >&2
+          exit 1
+        fi
+        rm -f /tmp/key
+
+      # ── Vault auth ──
+      # Logs in with the pod's service-account JWT (role `ci`). Mirrors the
+      # apply pipeline in default.yml: abort when no token comes back rather
+      # than letting every stack's plan fail later, and persist the token to
+      # ~/.vault-token in addition to exporting it.
+      - |
+        SA_TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)
+        export VAULT_ADDR=http://vault-active.vault.svc.cluster.local:8200
+        VAULT_TOKEN=$(curl -s -X POST "$VAULT_ADDR/v1/auth/kubernetes/login" \
+          -d "{\"role\":\"ci\",\"jwt\":\"$SA_TOKEN\"}" | jq -r .auth.client_token)
+        if [ -z "$VAULT_TOKEN" ] || [ "$VAULT_TOKEN" = "null" ]; then
+          echo "ERROR: Vault K8s auth failed (role=ci, ns=woodpecker)" >&2
+          exit 1
+        fi
+        export VAULT_TOKEN
+        # Subshell keeps the restrictive umask from leaking into later commands.
+        ( umask 077; printf '%s' "$VAULT_TOKEN" > "$HOME/.vault-token" )
+
+      # ── Generate kubeconfig from projected SA token ──
+      # See default.yml for rationale. terragrunt.hcl injects
+      # `-var kube_config_path=<repo>/config` for every terraform invocation,
+      # so we need a kubeconfig file at that path. The woodpecker default SA
+      # is cluster-admin, so the projected token is sufficient.
+      - |
+        cat > config <<'EOF'
+        apiVersion: v1
+        kind: Config
+        clusters:
+          - name: kubernetes
+            cluster:
+              server: https://10.0.20.100:6443
+              certificate-authority: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+        contexts:
+          - name: ci
+            context:
+              cluster: kubernetes
+              user: ci
+        current-context: ci
+        users:
+          - name: ci
+            user:
+              tokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+        EOF
+        chmod 600 config
+        kubectl --kubeconfig=config get ns kube-system -o name >/dev/null
+
+      # ── Run terraform plan on all stacks ──
+      # Emits two timestamps per drifted stack so the Pushgateway/Prometheus
+      # side can compute drift-age-hours via `time() - drift_stack_first_seen`.
+      #
+      # NOTE: shell variables in this script deliberately use the plain
+      # `$VAR` form only. Woodpecker substitutes brace-form variables when it
+      # compiles the config, which would blank shell-local values.
+      - |
+        DRIFTED=""
+        CLEAN=0
+        ERRORS=""
+        NOW=$(date +%s)
+        PUSHGATEWAY=http://prometheus-prometheus-pushgateway.monitoring:9091
+        # Metrics accumulator — one exposition line per metric, appended per
+        # stack and pushed as a single batch after the loop.
+        METRICS_FILE=$(mktemp)
+
+        for stack_dir in stacks/*/; do
+          stack=$(basename "$stack_dir")
+          [ -f "$stack_dir/terragrunt.hcl" ] || continue
+
+          echo -n "[$stack] planning... "
+          # -detailed-exitcode: 0 = no changes, 1 = error, 2 = changes present.
+          # The status is captured through `||` so that a non-zero plan does
+          # not abort the whole step when the script runs under `sh -e`.
+          EXIT=0
+          OUTPUT=$(cd "$stack_dir" && terragrunt plan -detailed-exitcode -input=false 2>&1) || EXIT=$?
+
+          case $EXIT in
+            0)
+              echo "OK (no changes)"
+              CLEAN=$((CLEAN + 1))
+              # drift_stack_state=0 means clean; age-hours irrelevant so we
+              # still push 0 so per-stack gauges don't go stale.
+              echo "drift_stack_state{stack=\"$stack\"} 0" >> "$METRICS_FILE"
+              echo "drift_stack_age_hours{stack=\"$stack\"} 0" >> "$METRICS_FILE"
+              ;;
+            2)
+              echo "DRIFT DETECTED"
+              DRIFTED="$DRIFTED $stack"
+              # Fetch first-seen timestamp from Pushgateway (preserve across runs).
+              # The gateway exposes pushed series with its grouping labels
+              # (job, instance) next to `stack`, and may print the value in
+              # exponent notation — so match on metric name + stack label and
+              # normalise the value to an integer.
+              FIRST_SEEN=$(curl -s "$PUSHGATEWAY/metrics" \
+                | awk -v s="$stack" 'index($0, "drift_stack_first_seen{") == 1 && index($0, "stack=\"" s "\"") { printf "%d\n", $NF; exit }')
+              # Missing, zero or non-numeric → this run is the first sighting.
+              case "$FIRST_SEEN" in
+                ''|0|*[!0-9]*) FIRST_SEEN="$NOW" ;;
+              esac
+              AGE_HOURS=$(( (NOW - FIRST_SEEN) / 3600 ))
+              echo "drift_stack_state{stack=\"$stack\"} 1" >> "$METRICS_FILE"
+              echo "drift_stack_first_seen{stack=\"$stack\"} $FIRST_SEEN" >> "$METRICS_FILE"
+              echo "drift_stack_age_hours{stack=\"$stack\"} $AGE_HOURS" >> "$METRICS_FILE"
+              ;;
+            *)
+              # 1 is terraform's documented error code; anything else (e.g. a
+              # killed process) is treated as an error too rather than dropped.
+              echo "ERROR (exit $EXIT)"
+              echo "$OUTPUT" | tail -20
+              ERRORS="$ERRORS $stack"
+              echo "drift_stack_state{stack=\"$stack\"} 2" >> "$METRICS_FILE"
+              ;;
+          esac
+        done
+
+        # Summary counters — single gauge per run.
+        DRIFT_COUNT=$(echo "$DRIFTED" | wc -w)
+        ERROR_COUNT=$(echo "$ERRORS" | wc -w)
+        echo "drift_stack_count $DRIFT_COUNT" >> "$METRICS_FILE"
+        echo "drift_error_count $ERROR_COUNT" >> "$METRICS_FILE"
+        echo "drift_clean_count $CLEAN" >> "$METRICS_FILE"
+        echo "drift_detection_last_run_timestamp $NOW" >> "$METRICS_FILE"
+
+        # ── Push to Pushgateway ──
+        # One batched push keeps the run atomic: either all metrics land or none.
+        curl -s --data-binary @"$METRICS_FILE" \
+          "$PUSHGATEWAY/metrics/job/drift-detection" \
+          || echo "(pushgateway unavailable, metrics lost for this run)"
+        rm -f "$METRICS_FILE"
+
+        # Human-readable lists; "none" when a list is empty.
+        DRIFT_TEXT="$DRIFTED"
+        [ -n "$DRIFT_TEXT" ] || DRIFT_TEXT="none"
+        ERROR_TEXT="$ERRORS"
+        [ -n "$ERROR_TEXT" ] || ERROR_TEXT="none"
+
+        echo ""
+        echo "=== Drift Detection Summary ==="
+        echo "Clean: $CLEAN stacks"
+        echo "Drift: $DRIFT_TEXT"
+        echo "Errors: $ERROR_TEXT"
+
+        # ── Slack alert if drift found ──
+        if [ -n "$DRIFTED" ]; then
+          curl -s -X POST -H 'Content-type: application/json' \
+            --data "{\"channel\":\"general\",\"text\":\":warning: Drift detected in:$DRIFTED\nClean: $CLEAN stacks. Errors:$ERROR_TEXT\"}" \
+            "$SLACK_WEBHOOK" || true
+        else
+          ERROR_SUFFIX=""
+          if [ -n "$ERRORS" ]; then
+            ERROR_SUFFIX=". Errors: $ERRORS"
+          fi
+          curl -s -X POST -H 'Content-type: application/json' \
+            --data "{\"channel\":\"general\",\"text\":\":white_check_mark: Drift detection: all $CLEAN stacks clean$ERROR_SUFFIX\"}" \
+            "$SLACK_WEBHOOK" || true
+        fi
--- a/.woodpecker/issue-automation.yml
+++ b/.woodpecker/issue-automation.yml
@ -0,0 +1,78 @@
+when:
+  event: manual
+
+clone:
+  git:
+    image: woodpeckerci/plugin-git
+    settings:
+      depth: 2
+
+steps:
+  - name: run-issue-responder
+    image: alpine:3.20
+    commands:
+      - apk add --no-cache curl jq
+      # Authenticate to Vault via K8s SA JWT
+      - |
+        SA_TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)
+        VAULT_RESP=$(curl -sf -X POST http://vault-active.vault.svc.cluster.local:8200/v1/auth/kubernetes/login \
+          -d "{\"role\":\"ci\",\"jwt\":\"$$SA_TOKEN\"}")
+        VAULT_TOKEN=$(echo "$$VAULT_RESP" | jq -r .auth.client_token)
+        if [ -z "$$VAULT_TOKEN" ] || [ "$$VAULT_TOKEN" = "null" ]; then
+          echo "ERROR: Vault authentication failed"
+          exit 1
+        fi
+        echo "Vault authenticated"
+      # Fetch API token for claude-agent-service
+      - |
+        AGENT_TOKEN=$(curl -sf -H "X-Vault-Token: $$VAULT_TOKEN" \
+          http://vault-active.vault.svc.cluster.local:8200/v1/secret/data/claude-agent-service | \
+          jq -r '.data.data.api_bearer_token')
+        if [ -z "$$AGENT_TOKEN" ] || [ "$$AGENT_TOKEN" = "null" ]; then
+          echo "ERROR: Failed to fetch agent API token"
+          exit 1
+        fi
+        echo "Agent token fetched"
+      # Submit job to claude-agent-service
+      # Builds the prompt from the ISSUE_* pipeline variables, POSTs it to the
+      # agent service's /execute endpoint and keeps the returned job id in
+      # JOB_ID for the polling command that follows.
+      - |
+        # Read the pipeline variables from the step environment at run time.
+        # The doubled dollar sign keeps Woodpecker's config pre-processor from
+        # pasting the raw values into this script text, where a crafted issue
+        # title could inject shell syntax.
+        ISSUE_NUM="$${ISSUE_NUMBER:-}"
+        ISSUE_TITLE="$${ISSUE_TITLE:-}"
+        ISSUE_LABELS="$${ISSUE_LABELS:-}"
+        ISSUE_URL="$${ISSUE_URL:-}"
+
+        if [ -z "$$ISSUE_NUM" ]; then
+          echo "ERROR: No issue number provided"
+          exit 1
+        fi
+
+        echo "Processing issue #$$ISSUE_NUM: $$ISSUE_TITLE"
+
+        # jq --arg handles the JSON escaping of the free-text fields.
+        PAYLOAD=$(jq -n \
+          --arg prompt "Process GitHub Issue #$$ISSUE_NUM: $$ISSUE_TITLE. Labels: $$ISSUE_LABELS. URL: $$ISSUE_URL. Read the issue body via GitHub API, investigate, and take appropriate action." \
+          --arg agent ".claude/agents/issue-responder" \
+          '{prompt: $prompt, agent: $agent, max_budget_usd: 10, timeout_seconds: 1800}')
+
+        RESP=$(curl -sf -X POST \
+          -H "Authorization: Bearer $$AGENT_TOKEN" \
+          -H "Content-Type: application/json" \
+          -d "$$PAYLOAD" \
+          http://claude-agent-service.claude-agent.svc.cluster.local:8080/execute)
+
+        JOB_ID=$(echo "$$RESP" | jq -r '.job_id')
+        # Without a job id the poll loop would query /jobs/null until timeout.
+        if [ -z "$$JOB_ID" ] || [ "$$JOB_ID" = "null" ]; then
+          echo "ERROR: agent service response contained no job_id"
+          exit 1
+        fi
+        echo "Job submitted: $$JOB_ID"
+      # Poll for completion (30min max)
+      # Checks the job every 15 seconds, up to 120 times. Exits 0 when the job
+      # reports `completed`, 1 for any other non-`running` status or timeout.
+      - |
+        FAILS=0
+        for i in $(seq 1 120); do
+          sleep 15
+          # A single failed request (network blip, service restart) must not
+          # abort a long-running watch; give up only after 5 failures in a row.
+          if ! RESULT=$(curl -sf \
+            -H "Authorization: Bearer $$AGENT_TOKEN" \
+            http://claude-agent-service.claude-agent.svc.cluster.local:8080/jobs/$$JOB_ID); then
+            FAILS=$(expr "$$FAILS" + 1)
+            echo "[$$i/120] Poll request failed ($$FAILS in a row)"
+            if [ "$$FAILS" -ge 5 ]; then
+              echo "ERROR: agent service unreachable for 5 consecutive polls"
+              exit 1
+            fi
+            continue
+          fi
+          FAILS=0
+          STATUS=$(echo "$$RESULT" | jq -r '.status')
+          echo "[$$i/120] Status: $$STATUS"
+          if [ "$$STATUS" != "running" ]; then
+            echo "$$RESULT" | jq .
+            if [ "$$STATUS" = "completed" ]; then exit 0; else exit 1; fi
+          fi
+        done
+        echo "ERROR: Job timed out after 30 minutes"
+        exit 1
--- a/.woodpecker/k8s-portal.yml
+++ b/.woodpecker/k8s-portal.yml
@ -0,0 +1,49 @@
+when:
+  event: push
+  branch: master
+  path:
+    include:
+      - "stacks/platform/modules/k8s-portal/files/**"
+
+clone:
+  git:
+    image: woodpeckerci/plugin-git
+    settings:
+      attempts: 5
+      backoff: 10s
+
+steps:
+  - name: build-and-push
+    image: woodpeckerci/plugin-docker-buildx
+    settings:
+      username: "viktorbarzin"
+      password:
+        from_secret: dockerhub-pat
+      repo: viktorbarzin/k8s-portal
+      dockerfile: stacks/platform/modules/k8s-portal/files/Dockerfile
+      context: stacks/platform/modules/k8s-portal/files
+      platforms:
+        - linux/amd64
+      tag: ["${CI_PIPELINE_NUMBER}", "latest"]
+      cache_from: "viktorbarzin/k8s-portal:latest"
+      cache_to: "type=inline"
+
+  - name: deploy
+    image: bitnami/kubectl:latest
+    commands:
+      - "kubectl set image deployment/k8s-portal portal=viktorbarzin/k8s-portal:${CI_PIPELINE_NUMBER} -n k8s-portal"
+      - "kubectl rollout status deployment/k8s-portal -n k8s-portal --timeout=120s"
+      - "echo 'k8s-portal deployed successfully (build ${CI_PIPELINE_NUMBER})'"
+
+  - name: slack
+    image: curlimages/curl
+    commands:
+      - |
+        curl -s -X POST -H 'Content-type: application/json' \
+          --data "{\"text\":\"K8s Portal: build #${CI_PIPELINE_NUMBER} ${CI_PIPELINE_STATUS}\"}" \
+          "$SLACK_WEBHOOK" || true
+    environment:
+      SLACK_WEBHOOK:
+        from_secret: slack_webhook
+    when:
+      status: [success, failure]
--- a/.woodpecker/postmortem-todos.yml
+++ b/.woodpecker/postmortem-todos.yml
@ -0,0 +1,32 @@
+when:
+  event: push
+  branch: master
+  path:
+    include:
+      - 'docs/post-mortems/*.md'
+    exclude:
+      - '.woodpecker/**'
+
+clone:
+  git:
+    image: woodpeckerci/plugin-git
+    settings:
+      depth: 5
+
+steps:
+  - name: parse-and-implement
+    image: python:3.12-alpine
+    commands:
+      - apk add --no-cache jq curl git
+      - sh scripts/postmortem-pipeline.sh
+
+  # Best-effort Slack notice once the post-mortem pipeline has finished,
+  # whether it succeeded or failed.
+  - name: notify-slack
+    image: alpine
+    environment:
+      SLACK_WEBHOOK:
+        from_secret: slack_webhook
+    commands:
+      - apk add --no-cache curl
+      # The `slack_webhook` secret is used as the complete webhook URL by the
+      # other pipelines in this repo; prefixing it with the hooks.slack.com
+      # base again produced an invalid URL that `|| true` silently hid.
+      - |
+        curl -sf -X POST "$SLACK_WEBHOOK" \
+          -H 'Content-Type: application/json' \
+          -d '{"text": "Post-mortem TODO pipeline completed"}' || true
+    when:
+      - status: [success, failure]
--- a/.woodpecker/provision-user.yml
+++ b/.woodpecker/provision-user.yml
@ -0,0 +1,160 @@
+# Manual-only pipeline: the sole trigger is a `manual` event.
+when:
+  event: "manual"
+
+# plugin-git clone with retry settings: 5 attempts, 10s backoff.
+clone:
+  git:
+    image: woodpeckerci/plugin-git
+    settings:
+      attempts: 5
+      backoff: "10s"
+
+steps:
+  # Gate the pipeline on its two inputs. USERNAME and EMAIL must both be
+  # non-empty and match the patterns below; otherwise the step exits 1.
+  # On success the values are written to .provision-env (as
+  # PROVISION_USERNAME / PROVISION_EMAIL) for later steps to source.
+  # NOTE(review): the username pattern accepts `_`, yet update-vault-kv also
+  # records the username as a namespace name; Kubernetes namespace names may
+  # not contain underscores — confirm whether `_` should be allowed here.
+  - name: validate-inputs
+    image: alpine
+    commands:
+      - |
+        if [ -z "$USERNAME" ] || [ -z "$EMAIL" ]; then
+          echo "ERROR: USERNAME and EMAIL variables are required"
+          echo "Trigger with: POST /api/repos/1/pipelines {branch:master, variables:{USERNAME:x, EMAIL:y}}"
+          exit 1
+        fi
+        # Validate username: lowercase alphanumeric + dash/underscore, 2-63 chars
+        if ! echo "$USERNAME" | grep -qE '^[a-z0-9][a-z0-9_-]{0,61}[a-z0-9]$'; then
+          echo "ERROR: USERNAME must be 2-63 chars, lowercase alphanumeric/dash/underscore"
+          exit 1
+        fi
+        # Validate email: basic format check
+        if ! echo "$EMAIL" | grep -qE '^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'; then
+          echo "ERROR: EMAIL must be a valid email address"
+          exit 1
+        fi
+        echo "Provisioning user: $USERNAME ($EMAIL)"
+        echo "export PROVISION_USERNAME='$USERNAME'" > .provision-env
+        echo "export PROVISION_EMAIL='$EMAIL'" >> .provision-env
+
+  # Unlock the git-crypt-protected files and obtain a Vault token for the
+  # following steps. The token and Vault address are handed over through
+  # .vault-env in the workspace.
+  - name: prepare
+    image: alpine
+    commands:
+      - "apk update && apk add jq curl git git-crypt"
+      # git-crypt for secrets/ directory: fetch the base64-encoded key from
+      # the `git-crypt-key` ConfigMap using the pod's service-account token.
+      - |
+        curl -k https://10.0.20.100:6443/api/v1/namespaces/woodpecker/configmaps/git-crypt-key \
+          -H "Authorization:Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" \
+          | jq -r .data.key | base64 -d > /tmp/key
+      # `;` rather than `&&`: the key file is removed even if unlock fails.
+      - "git-crypt unlock /tmp/key; rm -f /tmp/key"
+      # Vault: authenticate via K8s service account JWT
+      - |
+        SA_TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)
+        VAULT_TOKEN=$(curl -s -X POST http://vault-active.vault.svc.cluster.local:8200/v1/auth/kubernetes/login \
+          -d "{\"role\":\"ci\",\"jwt\":\"$SA_TOKEN\"}" | jq -r '.auth.client_token // empty')
+        # Stop here on a failed login instead of handing a "null"/empty token
+        # to the steps that read and write Vault.
+        if [ -z "$VAULT_TOKEN" ]; then
+          echo "ERROR: Vault Kubernetes login did not return a client token"
+          exit 1
+        fi
+        echo "export VAULT_TOKEN=$VAULT_TOKEN" > .vault-env
+        echo "export VAULT_ADDR=http://vault-active.vault.svc.cluster.local:8200" >> .vault-env
+
+  # Register the new user in the `k8s_users` map that is stored (as a JSON
+  # string) inside the Vault KV v2 secret `secret/platform`. A user that is
+  # already present is left untouched.
+  - name: update-vault-kv
+    image: alpine
+    commands:
+      - "apk update && apk add jq curl"
+      # Read-modify-write of the whole platform secret
+      - |
+        # `./` prefix: POSIX `.` resolves slash-less names via $PATH, not the CWD.
+        . ./.provision-env && . ./.vault-env
+        CURRENT=$(curl -s -H "X-Vault-Token: $VAULT_TOKEN" \
+          "$VAULT_ADDR/v1/secret/data/platform" | jq -r '.data.data // empty')
+
+        # A failed read (bad token, Vault unreachable, missing secret) must
+        # abort here: otherwise the write below would replace the entire
+        # platform secret with a payload containing only k8s_users.
+        if [ -z "$CURRENT" ]; then
+          echo "ERROR: could not read secret/platform from Vault — aborting before write"
+          exit 1
+        fi
+
+        # Parse current k8s_users (stored as JSON string)
+        CURRENT_USERS=$(echo "$CURRENT" | jq -r '.k8s_users')
+
+        # Check if user already exists
+        if echo "$CURRENT_USERS" | jq -e --arg u "$PROVISION_USERNAME" '.[$u]' >/dev/null 2>&1; then
+          echo "User $PROVISION_USERNAME already exists in k8s_users — skipping Vault KV update"
+          exit 0
+        fi
+
+        # Add new user with convention defaults
+        UPDATED_USERS=$(echo "$CURRENT_USERS" | jq --arg u "$PROVISION_USERNAME" --arg e "$PROVISION_EMAIL" \
+          '. + {($u): {"role":"namespace-owner","email":$e,"namespaces":[$u],"domains":[],"quota":{"cpu_requests":"2","memory_requests":"4Gi","memory_limits":"8Gi","pods":"20"}}}')
+
+        # Write back full platform secret with updated k8s_users (as JSON string)
+        PAYLOAD=$(echo "$CURRENT" | jq --arg users "$UPDATED_USERS" '.k8s_users = $users')
+
+        RESPONSE=$(curl -s -X POST -H "X-Vault-Token: $VAULT_TOKEN" \
+          "$VAULT_ADDR/v1/secret/data/platform" \
+          -d "{\"data\": $PAYLOAD}")
+        echo "$RESPONSE" | jq . || true
+
+        # A successful KV v2 write reports the new version under .data.version;
+        # an error response carries no such field.
+        if ! echo "$RESPONSE" | jq -e '.data.version' >/dev/null 2>&1; then
+          echo "ERROR: Vault write failed — $PROVISION_USERNAME was NOT added to k8s_users"
+          exit 1
+        fi
+
+        echo "Added $PROVISION_USERNAME to k8s_users in Vault"
+
+  # Ensure an Authentik group named `sops-<username>` exists and add the user
+  # to it. The Authentik API token is read from the Vault secret
+  # `secret/data/viktor` (field `authentik_api_token`).
+  # If the user cannot be found in Authentik the step exits 0 with a warning,
+  # leaving group membership to be assigned manually.
+  - name: create-authentik-groups
+    image: alpine
+    commands:
+      - "apk update && apk add jq curl"
+      # Script outline: look up or create the group, look up the user by
+      # username, then PATCH the group's `users` list with the user's pk
+      # appended (de-duplicated via jq `unique`).
+      - |
+        source .provision-env && source .vault-env
+
+        # Get Authentik API token from Vault
+        AUTHENTIK_TOKEN=$(curl -s -H "X-Vault-Token: $VAULT_TOKEN" \
+          "$VAULT_ADDR/v1/secret/data/viktor" | jq -r '.data.data.authentik_api_token')
+        AUTHENTIK_URL="https://authentik.viktorbarzin.me"
+
+        # Create sops-USERNAME group if it doesn't exist
+        SOPS_GROUP="sops-$PROVISION_USERNAME"
+        EXISTING=$(curl -s -H "Authorization: Bearer $AUTHENTIK_TOKEN" \
+          "$AUTHENTIK_URL/api/v3/core/groups/?name=$SOPS_GROUP" | jq -r '.results | length')
+
+        if [ "$EXISTING" = "0" ]; then
+          GROUP_PAYLOAD=$(jq -n --arg name "$SOPS_GROUP" '{"name": $name, "is_superuser": false}')
+          GROUP_PK=$(curl -s -X POST -H "Authorization: Bearer $AUTHENTIK_TOKEN" \
+            -H "Content-Type: application/json" \
+            "$AUTHENTIK_URL/api/v3/core/groups/" \
+            -d "$GROUP_PAYLOAD" | jq -r '.pk')
+          echo "Created Authentik group $SOPS_GROUP (pk=$GROUP_PK)"
+        else
+          GROUP_PK=$(curl -s -H "Authorization: Bearer $AUTHENTIK_TOKEN" \
+            "$AUTHENTIK_URL/api/v3/core/groups/?name=$SOPS_GROUP" | jq -r '.results[0].pk')
+          echo "Authentik group $SOPS_GROUP already exists (pk=$GROUP_PK)"
+        fi
+
+        # Find the user by username
+        USER_PK=$(curl -s -H "Authorization: Bearer $AUTHENTIK_TOKEN" \
+          "$AUTHENTIK_URL/api/v3/core/users/?username=$PROVISION_USERNAME" | jq -r '.results[0].pk')
+
+        if [ "$USER_PK" = "null" ] || [ -z "$USER_PK" ]; then
+          echo "WARNING: User $PROVISION_USERNAME not found in Authentik — group assignment skipped"
+          echo "The user may not have signed up yet. Groups will need manual assignment."
+          exit 0
+        fi
+
+        # Add user to sops group
+        CURRENT_MEMBERS=$(curl -s -H "Authorization: Bearer $AUTHENTIK_TOKEN" \
+          "$AUTHENTIK_URL/api/v3/core/groups/$GROUP_PK/" | jq -r '.users')
+        UPDATED_MEMBERS=$(echo "$CURRENT_MEMBERS" | jq --argjson uid "$USER_PK" '. + [$uid] | unique')
+
+        curl -s -X PATCH -H "Authorization: Bearer $AUTHENTIK_TOKEN" \
+          -H "Content-Type: application/json" \
+          "$AUTHENTIK_URL/api/v3/core/groups/$GROUP_PK/" \
+          -d "{\"users\": $UPDATED_MEMBERS}" | jq .
+
+        echo "Added user $PROVISION_USERNAME (pk=$USER_PK) to group $SOPS_GROUP"
+
+  # Print the follow-up instructions to the pipeline log: the vault, rbac and
+  # woodpecker stacks still have to be applied by hand.
+  - name: notify-apply-needed
+    image: curlimages/curl
+    commands:
+      - |
+        . .provision-env
+        cat <<EOF
+        User $PROVISION_USERNAME added to Vault KV and Authentik sops group.
+        Manual step needed: apply vault + rbac + woodpecker stacks.
+          cd stacks/vault && ../../scripts/tg apply --non-interactive
+          cd stacks/rbac && ../../scripts/tg apply --non-interactive
+          cd stacks/woodpecker && ../../scripts/tg apply --non-interactive
+        EOF
+
+  # Report the outcome to Slack. Runs after successful and failed pipelines,
+  # so the message text depends on the pipeline status instead of always
+  # claiming the user was provisioned. Best-effort: `|| true` keeps a Slack
+  # error from failing the step.
+  - name: slack
+    image: curlimages/curl
+    commands:
+      - |
+        # .provision-env is absent when validate-inputs failed. Test for it
+        # first: sourcing a missing file with `.` can abort a POSIX shell
+        # before `|| true` is evaluated. `./` avoids the $PATH lookup.
+        if [ -f ./.provision-env ]; then
+          . ./.provision-env
+        fi
+        if [ "${CI_PIPELINE_STATUS}" = "success" ]; then
+          TEXT="Woodpecker CI: User provisioned — $PROVISION_USERNAME added to Vault KV + Authentik. Run: cd stacks/vault && ../../scripts/tg apply --non-interactive && cd ../rbac && ../../scripts/tg apply --non-interactive"
+        else
+          TEXT="Woodpecker CI: User provisioning ${CI_PIPELINE_STATUS} — $PROVISION_USERNAME was NOT fully provisioned, check the pipeline logs"
+        fi
+        curl -s -X POST -H 'Content-type: application/json' \
+          --data "{\"channel\":\"general\",\"text\":\"$TEXT\"}" \
+          "$SLACK_WEBHOOK" || true
+    environment:
+      SLACK_WEBHOOK:
+        from_secret: slack_webhook
+    when:
+      status: [success, failure]
--- a/.woodpecker/pve-nfs-exports-sync.yml
+++ b/.woodpecker/pve-nfs-exports-sync.yml
@ -0,0 +1,63 @@
+# Sync infra/scripts/pve-nfs-exports → PVE host /etc/exports on change.
+#
+# Wave 6b of the state-drift consolidation plan: move the "scp + exportfs -ra"
+# deploy step out of runbook-human-hands and into CI so the Proxmox NFS export
+# table tracks git.
+#
+# Trigger: push to master that touches `scripts/pve-nfs-exports`. The file
+# header documents the deploy invocation; this pipeline codifies it.
+#
+# Credentials:
+#   - pve_ssh_key: Woodpecker repo-secret (ed25519 keypair provisioned
+#     2026-04-18 as `woodpecker-pve-nfs-exports-sync`). Public key lives in
+#     /root/.ssh/authorized_keys on the PVE host. Private key mirrored in
+#     Vault `secret/woodpecker/pve_ssh_key` for recovery.
+
+# Deploy on master pushes that modify the exports file, or on a manual run.
+when:
+  - event: push
+    branch: master
+    path: "scripts/pve-nfs-exports"
+  - event: manual
+
+# plugin-git clone with depth 1 and 3 attempts.
+clone:
+  git:
+    image: woodpeckerci/plugin-git
+    settings:
+      attempts: 3
+      depth: 1
+
+steps:
+  # Copy scripts/pve-nfs-exports to /etc/exports on the PVE host
+  # (192.168.1.127) over SSH as root, then reload the export table with
+  # `exportfs -ra`.
+  - name: deploy
+    image: alpine:3.20
+    environment:
+      # Only the SSH key is injected here; the Slack webhook is confined to
+      # the `slack` step, which is the only step that uses it.
+      PVE_SSH_KEY:
+        from_secret: pve_ssh_key
+    commands:
+      - apk add --no-cache openssh-client
+      - mkdir -p ~/.ssh && chmod 700 ~/.ssh
+      - printf '%s\n' "$PVE_SSH_KEY" > ~/.ssh/id_ed25519
+      - chmod 600 ~/.ssh/id_ed25519
+      # Trust on first use: CI's ~/.ssh/known_hosts is ephemeral, so the host
+      # key is re-scanned on every run (this does not pin a known-good key).
+      - ssh-keyscan -t ed25519 192.168.1.127 >> ~/.ssh/known_hosts 2>/dev/null
+      # Diff what we'd ship, so pipeline logs show the intended change.
+      # Best-effort: neither a failed fetch nor a non-empty diff stops the deploy.
+      - echo '---diff---' && ssh -o BatchMode=yes root@192.168.1.127 "cat /etc/exports" > /tmp/remote.exports || true
+      - diff -u /tmp/remote.exports scripts/pve-nfs-exports || true
+      - echo '---applying---'
+      - scp -o BatchMode=yes scripts/pve-nfs-exports root@192.168.1.127:/etc/exports
+      # Re-export everything and print the first few active exports as a
+      # sanity check.
+      - ssh -o BatchMode=yes root@192.168.1.127 "exportfs -ra && exportfs -s | head -5"
+      - echo '---done---'
+
+  # Post the pipeline status to the `general` Slack channel, for successful
+  # and failed runs alike; `|| true` keeps a Slack error from failing the step.
+  - name: slack
+    image: curlimages/curl:8.11.0
+    when:
+      status:
+        - success
+        - failure
+    environment:
+      SLACK_WEBHOOK:
+        from_secret: slack_webhook
+    commands:
+      - |
+        curl -s -X POST -H 'Content-type: application/json' \
+          --data "{\"channel\":\"general\",\"text\":\"PVE /etc/exports sync: ${CI_PIPELINE_STATUS}\"}" \
+          "$SLACK_WEBHOOK" || true
--- a/.woodpecker/registry-config-sync.yml
+++ b/.woodpecker/registry-config-sync.yml
@ -0,0 +1,156 @@
+# Sync modules/docker-registry/* → /opt/registry/ on docker-registry VM
+# (10.0.20.10) on change, and bounce containers + nginx when needed.
+#
+# Replaces the manual "ssh + scp + docker compose up -d" that was required
+# after the 2026-04-19 `registry:2 → registry:2.8.3` pin landed. The deploy
+# flow is now: edit a file in modules/docker-registry/ → git push → this
+# pipeline runs → registry VM picks up the change.
+#
+# Trigger: push to master that touches any managed file (see `when.path`),
+# or a manual run via Woodpecker UI / API.
+#
+# Credentials:
+#   - registry_ssh_key: Woodpecker repo-secret (ed25519 keypair provisioned
+#     2026-04-19 as `woodpecker-registry-config-sync`). Public key lives in
+#     /root/.ssh/authorized_keys on 10.0.20.10. Private key mirrored in
+#     Vault `secret/woodpecker/registry_ssh_key` (subkeys private_key /
+#     public_key / known_hosts_entry) for recovery.
+#
+# Why nginx is bounced after every compose recreate: nginx caches upstream
+# DNS at startup, so if any registry-* container gets recreated (new IP on
+# the docker bridge), nginx keeps forwarding to a stale address. nginx is
+# therefore restarted as the last step whenever containers were recreated
+# or its own config changed — see docs/runbooks/registry-vm.md § "Bouncing
+# registry containers — the nginx DNS trap".
+
+# Run on master pushes that touch one of the managed registry files, or on a
+# manual trigger.
+when:
+  - event: push
+    branch: master
+    path:
+      include:
+        - "modules/docker-registry/docker-compose.yml"
+        - "modules/docker-registry/fix-broken-blobs.sh"
+        - "modules/docker-registry/cleanup-tags.sh"
+        - "modules/docker-registry/nginx_registry.conf"
+        - "modules/docker-registry/config-private.yml"
+  - event: manual
+
+# plugin-git clone with depth 1 and 3 attempts.
+clone:
+  git:
+    image: woodpeckerci/plugin-git
+    settings:
+      attempts: 3
+      depth: 1
+
+steps:
+  # Sync the managed files from modules/docker-registry/ to /opt/registry/ on
+  # 10.0.20.10 over SSH as root. Stages: detect which files differ from the
+  # remote copies, copy the files, restart what the changes require, then
+  # run a short verification on the VM.
+  - name: deploy
+    image: alpine:3.20
+    environment:
+      REGISTRY_SSH_KEY:
+        from_secret: registry_ssh_key
+    commands:
+      - apk add --no-cache openssh-client rsync
+      - mkdir -p ~/.ssh && chmod 700 ~/.ssh
+      - printf '%s\n' "$REGISTRY_SSH_KEY" > ~/.ssh/id_ed25519
+      - chmod 600 ~/.ssh/id_ed25519
+      # Trust on first use: CI's ~/.ssh/known_hosts is ephemeral, so the host
+      # key is re-scanned on every run (this does not pin a known-good key).
+      - ssh-keyscan -t ed25519 10.0.20.10 >> ~/.ssh/known_hosts 2>/dev/null
+      - echo '---detecting changed files---'
+      # Compare each managed file with its remote copy and record the names
+      # that differ in /tmp/changed for the following commands.
+      - |
+        # Mirror the remote state of each file so we can diff and decide what bounces.
+        CHANGED=""
+        for f in docker-compose.yml fix-broken-blobs.sh cleanup-tags.sh nginx_registry.conf config-private.yml; do
+          LOCAL="modules/docker-registry/$f"
+          REMOTE="/opt/registry/$f"
+          if [ ! -f "$LOCAL" ]; then
+            echo "skip $f (not in repo)"
+            continue
+          fi
+          # Pull the remote copy into /tmp for a diff. ssh -n avoids stdin-hogging.
+          REMOTE_CONTENT=$(ssh -n -o BatchMode=yes root@10.0.20.10 "cat $REMOTE 2>/dev/null || true")
+          LOCAL_CONTENT=$(cat "$LOCAL")
+          if [ "$LOCAL_CONTENT" = "$REMOTE_CONTENT" ]; then
+            echo "unchanged: $f"
+          else
+            echo "---diff: $f ---"
+            echo "$REMOTE_CONTENT" > /tmp/remote.txt
+            diff -u /tmp/remote.txt "$LOCAL" | head -40 || true
+            CHANGED="$CHANGED $f"
+          fi
+        done
+        echo "CHANGED_FILES=$CHANGED"
+        printf '%s' "$CHANGED" > /tmp/changed
+      - echo '---applying---'
+      # Copy all five managed files when at least one differs, and make the
+      # two scripts executable on the VM.
+      # NOTE(review): if Woodpecker runs a step's commands as a single shell
+      # script, the `exit 0` below also skips the bounce and verify commands
+      # — confirm that is intended.
+      - |
+        CHANGED=$(cat /tmp/changed)
+        if [ -z "$CHANGED" ]; then
+          echo "No files changed — exiting cleanly (manual run with no drift)."
+          exit 0
+        fi
+        # Ship every managed file unconditionally — scp is cheap, idempotency is safe.
+        scp -o BatchMode=yes \
+          modules/docker-registry/docker-compose.yml \
+          modules/docker-registry/fix-broken-blobs.sh \
+          modules/docker-registry/cleanup-tags.sh \
+          modules/docker-registry/nginx_registry.conf \
+          modules/docker-registry/config-private.yml \
+          root@10.0.20.10:/opt/registry/
+        ssh -n -o BatchMode=yes root@10.0.20.10 '
+          chmod +x /opt/registry/fix-broken-blobs.sh /opt/registry/cleanup-tags.sh
+        '
+      - echo '---bouncing containers + nginx---'
+      # Decide from /tmp/changed what to restart: a compose or registry
+      # config change triggers `docker compose pull` + `up -d` followed by an
+      # nginx restart; an nginx config change restarts only nginx; script-only
+      # changes restart nothing.
+      - |
+        CHANGED=$(cat /tmp/changed)
+        # Compose-visible files: docker-compose.yml (image tag, mounts) and
+        # config-private.yml (registry config → needs registry-private reload).
+        BOUNCE_COMPOSE=0
+        BOUNCE_NGINX=0
+        echo "$CHANGED" | grep -q "docker-compose.yml" && BOUNCE_COMPOSE=1
+        echo "$CHANGED" | grep -q "config-private.yml" && BOUNCE_COMPOSE=1
+        echo "$CHANGED" | grep -q "nginx_registry.conf" && BOUNCE_NGINX=1
+
+        if [ "$BOUNCE_COMPOSE" = "1" ]; then
+          echo "compose-visible change → pull + up -d"
+          ssh -n -o BatchMode=yes root@10.0.20.10 '
+            cd /opt/registry
+            docker compose pull 2>&1 | tail -5
+            docker compose up -d 2>&1 | tail -20
+          '
+          # Any compose recreate requires nginx DNS refresh too.
+          BOUNCE_NGINX=1
+        fi
+
+        if [ "$BOUNCE_NGINX" = "1" ]; then
+          echo "bouncing nginx to flush upstream DNS cache"
+          ssh -n -o BatchMode=yes root@10.0.20.10 '
+            docker restart registry-nginx
+            sleep 3
+            docker ps --format "{{.Names}}\t{{.Image}}\t{{.Status}}" | grep -E "registry-"
+          '
+        fi
+
+        if [ "$BOUNCE_COMPOSE" = "0" ] && [ "$BOUNCE_NGINX" = "0" ]; then
+          echo "only script files changed (cron-picks-up semantics) — no bounce needed"
+        fi
+      - echo '---verify---'
+      # On the VM: print the HTTP status of an unauthenticated request to
+      # https://127.0.0.1:5050/v2/ and the last lines of a dry-run of
+      # fix-broken-blobs.sh.
+      # NOTE(review): fix-broken-blobs.sh is invoked through python3 despite
+      # its .sh name — presumably the file is a Python script; confirm.
+      - |
+        ssh -n -o BatchMode=yes root@10.0.20.10 '
+          echo "=== catalog ==="
+          # Prove auth + routing survived.
+          curl -sk -o /dev/null -w "catalog (unauth → 401 expected): HTTP %{http_code}\n" \
+            https://127.0.0.1:5050/v2/
+          echo "=== integrity scan (dry-run) ==="
+          python3 /opt/registry/fix-broken-blobs.sh --dry-run 2>&1 | tail -5
+        '
+
+  # Post the pipeline status to the `general` Slack channel, for successful
+  # and failed runs alike; `|| true` keeps a Slack error from failing the step.
+  - name: slack
+    image: curlimages/curl:8.11.0
+    when:
+      status:
+        - success
+        - failure
+    environment:
+      SLACK_WEBHOOK:
+        from_secret: slack_webhook
+    commands:
+      - |
+        curl -s -X POST -H 'Content-type: application/json' \
+          --data "{\"channel\":\"general\",\"text\":\"Registry config sync on 10.0.20.10: ${CI_PIPELINE_STATUS}\"}" \
+          "$SLACK_WEBHOOK" || true
--- a/.woodpecker/renew-tls.yml
+++ b/.woodpecker/renew-tls.yml
@ -0,0 +1,79 @@
+# Scheduled pipeline: runs only for the cron job named
+# `renew-tls-certificate`.
+when:
+  event: cron
+  cron: "renew-tls-certificate"
+
+# plugin-git clone with retry settings: 5 attempts, 10s backoff.
+clone:
+  git:
+    image: woodpeckerci/plugin-git
+    settings:
+      attempts: 5
+      backoff: "10s"
+
+steps:
+  # Unlock the git-crypt-protected files: fetch the base64-encoded key from
+  # the `git-crypt-key` ConfigMap (authenticating with the pod's
+  # service-account token), unlock the checkout, then delete the key file.
+  - name: prepare
+    image: alpine
+    commands:
+      - apk update && apk add jq curl git git-crypt
+      - |
+        curl -k https://10.0.20.100:6443/api/v1/namespaces/woodpecker/configmaps/git-crypt-key \
+          -H "Authorization:Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" \
+          | jq -r .data.key | base64 -d > /tmp/key
+      - git-crypt unlock /tmp/key && rm /tmp/key
+
+  # Install certbot, curl and jq, then run the renewal script with the
+  # Technitium and Cloudflare credentials exposed as environment variables.
+  - name: renew-tls
+    image: alpine
+    commands:
+      - apk update && apk add certbot curl jq
+      - ./modules/kubernetes/setup_tls_secret/renew2.sh
+    environment:
+      CLOUDFLARE_TOKEN:
+        from_secret: CLOUDFLARE_TOKEN
+      CLOUDFLARE_ZONE_ID:
+        from_secret: CLOUDFLARE_ZONE_ID
+      TECHNITIUM_API_KEY:
+        from_secret: TECHNITIUM_API_KEY
+
+  # Commit whatever changed under secrets/ and state/ and push it to
+  # origin/master on GitHub over SSH, authenticating with the deploy key
+  # stored at secrets/deploy_key. A run without changes is tolerated
+  # (`|| echo 'No changes'`).
+  # NOTE(review): this step runs before verify-cert, so a certificate that
+  # later fails verification has already been pushed — confirm the ordering
+  # is intended.
+  # NOTE(review): no git user.name/user.email is configured in this step; a
+  # commit failure for that reason would also be reported as 'No changes' —
+  # verify that an identity is available in the workspace.
+  - name: commit-certs
+    image: alpine
+    commands:
+      - "apk update && apk add openssh-client git git-crypt"
+      - "mkdir -p ~/.ssh && ssh-keyscan -H github.com >> ~/.ssh/known_hosts"
+      # ssh refuses private keys that are readable by other users.
+      - "chmod 400 secrets/deploy_key"
+      # Only add specific paths — never git add .
+      - "git add secrets/ state/ || true"
+      - "git remote set-url origin git@github.com:ViktorBarzin/infra.git"
+      - "git commit -m 'Woodpecker CI Update TLS Certificates Commit' || echo 'No changes'"
+      # Rebase onto the remote tip first so the push is a fast-forward.
+      - "GIT_SSH_COMMAND='ssh -i ./secrets/deploy_key -o IdentitiesOnly=yes' git pull --rebase origin master"
+      - "GIT_SSH_COMMAND='ssh -i ./secrets/deploy_key -o IdentitiesOnly=yes' git push origin master"
+
+  # Fail the pipeline if secrets/fullchain.pem expires within the next
+  # 604800 seconds (7 days).
+  - name: verify-cert
+    image: alpine
+    commands:
+      - apk update && apk add openssl
+      - openssl x509 -checkend 604800 -noout -in secrets/fullchain.pem
+      - echo 'Certificate is valid for at least 7 more days'
+
+  # Apply the renewed certificate as the `tls-secret` TLS Secret in every
+  # namespace whose name does not start with `kube-`. Per-namespace failures
+  # are tolerated (the step stays best-effort), but they are shown in the log
+  # and listed in the final summary instead of being silently discarded.
+  - name: update-tls-source-secret
+    image: alpine
+    commands:
+      - "apk update && apk add curl"
+      # -f: fail on an HTTP error instead of saving the error page as `kubectl`.
+      - "curl -fLO https://dl.k8s.io/release/v1.31.0/bin/linux/amd64/kubectl && chmod +x kubectl && mv kubectl /usr/local/bin/"
+      - |
+        # Render the Secret once with a placeholder namespace; the loop swaps
+        # in the real namespace before each apply.
+        SECRET_YAML=$(kubectl create secret tls tls-secret \
+          --cert=secrets/fullchain.pem --key=secrets/privkey.pem \
+          --namespace=placeholder --dry-run=client -o yaml)
+        FAILED=""
+        for ns in $(kubectl get ns -o jsonpath='{.items[*].metadata.name}' | tr ' ' '\n' | grep -v '^kube-'); do
+          if ! echo "$SECRET_YAML" | sed "s/namespace: placeholder/namespace: $ns/" | kubectl apply -f -; then
+            FAILED="$FAILED $ns"
+          fi
+        done
+        # Report honestly: name the namespaces that were not updated.
+        if [ -n "$FAILED" ]; then
+          echo "WARNING: tls-secret could not be applied in:$FAILED"
+        else
+          echo 'TLS secret updated in all namespaces'
+        fi
+
+  # Post the renewal outcome to the `general` Slack channel, for successful
+  # and failed runs alike; `|| true` keeps a Slack error from failing the step.
+  - name: slack
+    image: curlimages/curl
+    when:
+      status:
+        - success
+        - failure
+    environment:
+      SLACK_WEBHOOK:
+        from_secret: slack_webhook
+    commands:
+      - |
+        curl -s -X POST -H 'Content-type: application/json' \
+          --data "{\"channel\":\"general\",\"text\":\"Woodpecker CI: TLS certificate renewal ${CI_PIPELINE_STATUS}\"}" \
+          "$SLACK_WEBHOOK" || true