fix: restore tree dropped by 6d224861; land stem95su gdrive-sync (10m) [ci skip]
6d224861 came from a --no-checkout worktree whose empty index made the
commit drop every file except two. This restores 05b50d2b's full tree and
correctly adds stacks/stem95su/gdrive-sync.tf + the service-catalog stem95su
entry. Forward-only (parent=6d224861, no force-push); [ci skip] since the
live infra was never applied from the broken commit.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
6d224861c4
commit
fd0f4a0365
1166 changed files with 358546 additions and 0 deletions
88
.woodpecker/build-ci-image.yml
Normal file
88
.woodpecker/build-ci-image.yml
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
# Build the CI tools Docker image used by all infra pipelines.
|
||||
# Triggers on push that touches ci/Dockerfile, or manual (API/UI) so
|
||||
# rebuilds after a registry incident don't need a cosmetic Dockerfile edit.
|
||||
|
||||
when:
|
||||
- event: push
|
||||
branch: master
|
||||
path:
|
||||
include:
|
||||
- 'ci/Dockerfile'
|
||||
- event: manual
|
||||
|
||||
steps:
|
||||
- name: build-and-push
|
||||
image: woodpeckerci/plugin-docker-buildx
|
||||
settings:
|
||||
# Phase 4 of forgejo-registry-consolidation 2026-05-07 —
|
||||
# registry.viktorbarzin.me dropped, Forgejo is the only target.
|
||||
repo:
|
||||
- forgejo.viktorbarzin.me/viktor/infra-ci
|
||||
dockerfile: ci/Dockerfile
|
||||
context: ci/
|
||||
tags:
|
||||
- latest
|
||||
- "${CI_COMMIT_SHA:0:8}"
|
||||
platforms: linux/amd64
|
||||
logins:
|
||||
- registry: forgejo.viktorbarzin.me
|
||||
username:
|
||||
from_secret: forgejo_user
|
||||
password:
|
||||
from_secret: forgejo_push_token
|
||||
|
||||
# Post-push integrity check is now redundant with the every-15min
|
||||
# forgejo-integrity-probe in stacks/monitoring/, which walks
|
||||
# /v2/_catalog + HEADs every blob across the entire Forgejo registry.
|
||||
# If a corruption pattern emerges that the periodic probe misses,
|
||||
# restore a verify step similar to the pre-Phase-4 version (see
|
||||
# commit 49f4956f) but pointed at forgejo.viktorbarzin.me.
|
||||
|
||||
# Break-glass tarball: save the just-pushed infra-ci image to disk on the
|
||||
# registry VM (10.0.20.10) so we can `docker load` it back into a node
|
||||
# when Forgejo is unreachable. Pulls from Forgejo (the only registry now).
|
||||
# Best-effort — failure here doesn't fail the pipeline.
|
||||
# Recovery procedure: docs/runbooks/forgejo-registry-breakglass.md.
|
||||
- name: breakglass-tarball
  image: alpine:3.20
  # Best-effort step: a failure here is ignored and does not fail the pipeline.
  failure: ignore
  environment:
    REGISTRY_SSH_KEY:
      from_secret: registry_ssh_key
    FORGEJO_USER:
      from_secret: forgejo_user
    FORGEJO_PASS:
      from_secret: forgejo_push_token
  commands:
    - apk add --no-cache openssh-client
    - mkdir -p ~/.ssh && chmod 700 ~/.ssh
    - printf '%s\n' "$REGISTRY_SSH_KEY" > ~/.ssh/id_ed25519
    - chmod 600 ~/.ssh/id_ed25519
    - ssh-keyscan -t ed25519 10.0.20.10 >> ~/.ssh/known_hosts 2>/dev/null
    - SHA=${CI_COMMIT_SHA:0:8}
    # FORGEJO_USER / FORGEJO_PASS are defined only in this step's environment,
    # not on the registry VM. They therefore have to be expanded HERE: escaping
    # them (\$FORGEJO_PASS) defers expansion to the remote shell, where they
    # are unset and `docker login` receives empty credentials.
    # The password is piped through ssh's stdin (so `ssh -n`, which would
    # replace stdin with /dev/null, is not used) and consumed by
    # `docker login --password-stdin`; it never appears on a command line.
    # $SHA is expanded locally; \$IMAGE is expanded by the remote shell.
    - |
      printf '%s' "$FORGEJO_PASS" | ssh -o BatchMode=yes root@10.0.20.10 "
        set -e
        mkdir -p /opt/registry/data/private/_breakglass
        IMAGE=forgejo.viktorbarzin.me/viktor/infra-ci:$SHA
        docker login forgejo.viktorbarzin.me -u '$FORGEJO_USER' --password-stdin
        docker pull \$IMAGE
        docker save \$IMAGE | gzip > /opt/registry/data/private/_breakglass/infra-ci-$SHA.tar.gz
        ln -sfn infra-ci-$SHA.tar.gz /opt/registry/data/private/_breakglass/infra-ci-latest.tar.gz
        ls -t /opt/registry/data/private/_breakglass/infra-ci-*.tar.gz \
          | grep -v 'latest' | tail -n +6 | xargs -r rm -v
        ls -lh /opt/registry/data/private/_breakglass/
      "
|
||||
|
||||
- name: slack
|
||||
image: curlimages/curl
|
||||
commands:
|
||||
- |
|
||||
curl -s -X POST -H 'Content-type: application/json' \
|
||||
--data "{\"text\":\"CI image built: forgejo.viktorbarzin.me/viktor/infra-ci:${CI_COMMIT_SHA:0:8} (and registry-private mirror)\"}" \
|
||||
"$SLACK_WEBHOOK" || true
|
||||
environment:
|
||||
SLACK_WEBHOOK:
|
||||
from_secret: slack_webhook
|
||||
when:
|
||||
status: [success]
|
||||
42
.woodpecker/build-cli.yml
Normal file
42
.woodpecker/build-cli.yml
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
when:
|
||||
event: push
|
||||
|
||||
clone:
|
||||
git:
|
||||
image: woodpeckerci/plugin-git
|
||||
settings:
|
||||
attempts: 5
|
||||
backoff: 10s
|
||||
|
||||
steps:
|
||||
- name: build-image
|
||||
image: woodpeckerci/plugin-docker-buildx
|
||||
settings:
|
||||
username: "viktorbarzin"
|
||||
password:
|
||||
from_secret: dockerhub-pat
|
||||
# Phase 4 of forgejo-registry-consolidation 2026-05-07 —
|
||||
# registry.viktorbarzin.me:5050 decommissioned. Push to DockerHub
|
||||
# (the public-facing infra image) AND Forgejo (the cluster pull
|
||||
# source). Same image, two locations.
|
||||
repo:
|
||||
- viktorbarzin/infra
|
||||
- forgejo.viktorbarzin.me/viktor/infra
|
||||
logins:
|
||||
- registry: https://index.docker.io/v1/
|
||||
username: viktorbarzin
|
||||
password:
|
||||
from_secret: dockerhub-pat
|
||||
- registry: forgejo.viktorbarzin.me
|
||||
username:
|
||||
from_secret: forgejo_user
|
||||
password:
|
||||
from_secret: forgejo_push_token
|
||||
dockerfile: cli/Dockerfile
|
||||
context: cli
|
||||
auto_tag: true
|
||||
# cache_from/cache_to removed: registry cache corruption causes
|
||||
# "short read: expected 32 bytes" BuildKit errors. Inline cache
|
||||
# will be re-populated once a clean image is pushed.
|
||||
# cache_from: "registry.viktorbarzin.me:5050/infra:latest"
|
||||
# cache_to: "type=inline"
|
||||
270
.woodpecker/default.yml
Normal file
270
.woodpecker/default.yml
Normal file
|
|
@ -0,0 +1,270 @@
|
|||
# Unified infra CI pipeline — detects changed stacks and applies only those.
|
||||
# Platform stacks and app stacks handled in one pipeline with proper ordering.
|
||||
#
|
||||
# Optimizations over the previous split pipeline:
|
||||
# - Custom CI image (no apk/wget per step)
|
||||
# - Shallow clone (depth=2 for git diff HEAD~1)
|
||||
# - TF_PLUGIN_CACHE_DIR (shared provider cache)
|
||||
# - Serial apply with Vault advisory locks (prevents user/CI race conditions)
|
||||
# - Step consolidation (2 steps instead of 4)
|
||||
# - Changed-stacks-only detection (skips no-op applies)
|
||||
# - Global-file fallback (modules/config changes trigger full apply)
|
||||
# - Lock-aware: skips stacks locked by users instead of failing
|
||||
|
||||
when:
|
||||
event: push
|
||||
branch: master
|
||||
|
||||
clone:
|
||||
git:
|
||||
image: woodpeckerci/plugin-git
|
||||
settings:
|
||||
depth: 2
|
||||
attempts: 5
|
||||
backoff: 10s
|
||||
|
||||
steps:
|
||||
- name: apply
|
||||
image: forgejo.viktorbarzin.me/viktor/infra-ci:latest
|
||||
pull: true
|
||||
backend_options:
|
||||
kubernetes:
|
||||
resources:
|
||||
requests:
|
||||
memory: 3Gi
|
||||
limits:
|
||||
memory: 6Gi
|
||||
environment:
|
||||
SLACK_WEBHOOK:
|
||||
from_secret: slack_webhook
|
||||
# Each `- |` command runs in a fresh shell, so we can't rely on an
|
||||
# `export VAULT_ADDR=...` in the auth command persisting — pin it at
|
||||
# step level. VAULT_TOKEN is still per-command; we persist it to
|
||||
# ~/.vault-token (auto-read by `vault` CLI) so downstream commands
|
||||
# don't need explicit token propagation.
|
||||
VAULT_ADDR: http://vault-active.vault.svc.cluster.local:8200
|
||||
commands:
|
||||
# ── Skip CI commits ──
|
||||
- |
|
||||
if echo "$CI_COMMIT_MESSAGE" | grep -q '\[CI SKIP\]\|\[ci skip\]'; then
|
||||
echo "Commit has [CI SKIP], exiting"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# ── git-crypt unlock ──
|
||||
- |
|
||||
SA_TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)
|
||||
curl -sk "https://10.0.20.100:6443/api/v1/namespaces/woodpecker/configmaps/git-crypt-key" \
|
||||
-H "Authorization:Bearer $SA_TOKEN" | jq -r .data.key | base64 -d > /tmp/key
|
||||
git-crypt unlock /tmp/key && rm /tmp/key
|
||||
|
||||
# ── Vault auth ──
|
||||
- |
|
||||
SA_TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)
|
||||
VAULT_TOKEN=$(curl -s -X POST "$VAULT_ADDR/v1/auth/kubernetes/login" \
|
||||
-d "{\"role\":\"ci\",\"jwt\":\"$SA_TOKEN\"}" | jq -r .auth.client_token)
|
||||
if [ -z "$VAULT_TOKEN" ] || [ "$VAULT_TOKEN" = "null" ]; then
|
||||
echo "ERROR: Vault K8s auth failed (role=ci, ns=woodpecker)" >&2
|
||||
exit 1
|
||||
fi
|
||||
# Persist for downstream `- |` blocks (each runs in a fresh shell,
|
||||
# so exporting VAULT_TOKEN wouldn't help). `vault`, `scripts/tg`,
|
||||
# and `scripts/state-sync` all fall through to ~/.vault-token when
|
||||
# the env var is unset.
|
||||
umask 077; printf '%s' "$VAULT_TOKEN" > "$HOME/.vault-token"
|
||||
|
||||
# ── Generate kubeconfig from projected SA token ──
|
||||
# terragrunt.hcl injects `-var kube_config_path=<repo>/config` for every
|
||||
# terraform invocation, so we need a kubeconfig file at that path. The
|
||||
# `default` SA in the woodpecker namespace is cluster-admin (via the
|
||||
# `woodpecker-default` ClusterRoleBinding), so the projected token is
|
||||
# sufficient to apply any stack. Using `tokenFile` (not an inline token)
|
||||
# so the provider re-reads it if kubelet rotates the projected token
|
||||
# mid-pipeline.
|
||||
- |
|
||||
cat > config <<'EOF'
|
||||
apiVersion: v1
|
||||
kind: Config
|
||||
clusters:
|
||||
- name: kubernetes
|
||||
cluster:
|
||||
server: https://10.0.20.100:6443
|
||||
certificate-authority: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
|
||||
contexts:
|
||||
- name: ci
|
||||
context:
|
||||
cluster: kubernetes
|
||||
user: ci
|
||||
current-context: ci
|
||||
users:
|
||||
- name: ci
|
||||
user:
|
||||
tokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||
EOF
|
||||
chmod 600 config
|
||||
# Sanity check: kubeconfig works
|
||||
kubectl --kubeconfig=config get ns kube-system -o name >/dev/null
|
||||
|
||||
# ── Detect changed stacks ──
|
||||
- |
  # Builds two work lists for the apply commands that follow:
  #   .platform_apply — platform stacks to apply (one name per line)
  #   .app_apply      — app stacks to apply (one name per line)
  PLATFORM_STACKS="dbaas authentik crowdsec monitoring nvidia mailserver cloudflared kyverno metallb redis traefik technitium headscale rbac k8s-portal vaultwarden reverse-proxy metrics-server vpa nfs-csi iscsi-csi cnpg sealed-secrets uptime-kuma wireguard xray infra-maintenance platform vault reloader descheduler external-secrets"

  # is_platform_stack NAME — succeeds only when NAME is exactly one of the
  # entries in PLATFORM_STACKS. Uses a fixed-string, whole-line match:
  # `grep -w` treats `-` as a word boundary and the name as a regex, so an
  # app stack called e.g. `portal`, `secrets` or `k8s` would be misclassified
  # as a platform stack (matching inside `k8s-portal`, `sealed-secrets`, ...).
  is_platform_stack() {
    printf '%s\n' $PLATFORM_STACKS | grep -qxF -- "$1"
  }

  # Ensure we have enough history for diff (clone may be shallow)
  if ! git rev-parse HEAD~1 >/dev/null 2>&1; then
    echo "WARNING: HEAD~1 not available (shallow clone?) — fetching more history"
    git fetch --deepen=1 origin master 2>/dev/null || true
  fi

  # If still no parent, apply all platform stacks as a safe fallback
  if ! git rev-parse HEAD~1 >/dev/null 2>&1; then
    echo "Cannot determine changed files — applying ALL platform stacks"
    echo "$PLATFORM_STACKS" | tr ' ' '\n' > .platform_apply
    > .app_apply
  else
    # Check if global files changed (triggers full platform apply)
    GLOBAL_CHANGED=$(git diff --name-only HEAD~1 HEAD | grep -E '^(modules/|config\.tfvars|terragrunt\.hcl)' || true)

    if [ -n "$GLOBAL_CHANGED" ]; then
      echo "Global files changed — applying ALL platform stacks"
      echo "$PLATFORM_STACKS" | tr ' ' '\n' > .platform_apply
    else
      # Detect platform stacks that changed
      git diff --name-only HEAD~1 HEAD | grep '^stacks/' | cut -d/ -f2 | sort -u > .all_changed
      > .platform_apply
      while read -r stack; do
        if is_platform_stack "$stack"; then
          echo "$stack" >> .platform_apply
        fi
      done < .all_changed
    fi

    # Detect app stacks that changed
    > .app_apply
    git diff --name-only HEAD~1 HEAD | grep '^stacks/' | cut -d/ -f2 | sort -u | while read -r stack; do
      if is_platform_stack "$stack"; then
        continue  # Skip platform stacks
      fi
      if [ ! -f "stacks/$stack/terragrunt.hcl" ]; then
        continue  # Skip non-terragrunt dirs
      fi
      echo "$stack" >> .app_apply
    done
  fi

  PLATFORM_COUNT=$(wc -l < .platform_apply | tr -d ' ')
  APP_COUNT=$(wc -l < .app_apply | tr -d ' ')
  echo "Platform stacks to apply: $PLATFORM_COUNT"
  echo "App stacks to apply: $APP_COUNT"
  cat .platform_apply .app_apply
|
||||
|
||||
# ── Pre-warm provider cache ──
|
||||
- |
|
||||
if [ -s .platform_apply ] || [ -s .app_apply ]; then
|
||||
FIRST_STACK=$(cat .platform_apply .app_apply 2>/dev/null | head -1)
|
||||
if [ -n "$FIRST_STACK" ]; then
|
||||
echo "Pre-warming provider cache from stacks/$FIRST_STACK..."
|
||||
cd "stacks/$FIRST_STACK" && terragrunt init --terragrunt-non-interactive -input=false 2>&1 | tail -3 && cd ../..
|
||||
fi
|
||||
fi
|
||||
|
||||
# ── Apply platform stacks (serial, with Vault advisory locks) ──
|
||||
- |
  # Applies each stack listed in .platform_apply, one at a time.
  # A stack whose output contains "is locked by" is reported as skipped;
  # any other non-zero exit is recorded in FAILED_PLATFORM_STACKS.
  FAILED_PLATFORM_STACKS=""
  if [ -s .platform_apply ]; then
    echo "=== Applying platform stacks (serial, locked) ==="
    while read -r stack; do
      echo "[$stack] Starting apply..."
      # errexit is suspended so a failing apply is recorded instead of
      # aborting the loop.
      set +e
      # stdin is redirected from /dev/null: the loop itself reads
      # .platform_apply on stdin, and anything under tg that reads stdin
      # would otherwise swallow the remaining stack names.
      OUTPUT=$(cd "stacks/$stack" && ../../scripts/tg apply --non-interactive 2>&1 </dev/null)
      EXIT=$?
      set -e
      if [ $EXIT -ne 0 ]; then
        if echo "$OUTPUT" | grep -q "is locked by"; then
          echo "[$stack] SKIPPED (locked by another session)"
        else
          echo "$OUTPUT" | tail -50
          echo "[$stack] FAILED (exit $EXIT)"
          FAILED_PLATFORM_STACKS="$FAILED_PLATFORM_STACKS $stack"
        fi
      else
        echo "$OUTPUT" | tail -3
        echo "[$stack] OK"
      fi
    done < .platform_apply
  fi
  # Deferred until after app stacks so both lists get a chance to run.
  echo "$FAILED_PLATFORM_STACKS" > .platform_failed
|
||||
|
||||
# ── Apply app stacks (serial, with Vault advisory locks) ──
|
||||
- |
  # Applies each stack listed in .app_apply, one at a time, then fails the
  # step if any platform stack (read back from .platform_failed) or app
  # stack failed.
  FAILED_APP_STACKS=""
  if [ -s .app_apply ]; then
    echo "=== Applying app stacks (serial, locked) ==="
    while read -r stack; do
      echo "[$stack] Starting apply..."
      # errexit is suspended so a failing apply is recorded instead of
      # aborting the loop.
      set +e
      # stdin is redirected from /dev/null: the loop itself reads .app_apply
      # on stdin, and anything under tg that reads stdin would otherwise
      # swallow the remaining stack names.
      OUTPUT=$(cd "stacks/$stack" && ../../scripts/tg apply --non-interactive 2>&1 </dev/null)
      EXIT=$?
      set -e
      if [ $EXIT -ne 0 ]; then
        if echo "$OUTPUT" | grep -q "is locked by"; then
          echo "[$stack] SKIPPED (locked by another session)"
        else
          echo "$OUTPUT" | tail -50
          echo "[$stack] FAILED (exit $EXIT)"
          FAILED_APP_STACKS="$FAILED_APP_STACKS $stack"
        fi
      else
        echo "$OUTPUT" | tail -3
        echo "[$stack] OK"
      fi
    done < .app_apply
  fi
  # Fail the step loudly so the pipeline `default` workflow state
  # reflects reality — the service-upgrade agent and CI alert cascade
  # both rely on this (see bd code-e1x). Lock-skipped stacks are NOT
  # counted as failures.
  # Whitespace is squeezed and trimmed rather than deleted, so several
  # failed platform stacks stay readable as "a b" instead of "ab".
  FAILED_PLATFORM=$(cat .platform_failed 2>/dev/null | tr -s '[:space:]' ' ' | sed -e 's/^ //' -e 's/ $//')
  if [ -n "$FAILED_PLATFORM" ] || [ -n "$FAILED_APP_STACKS" ]; then
    echo "=== FAILED STACKS: platform=[$FAILED_PLATFORM ] apps=[$FAILED_APP_STACKS ] ==="
    exit 1
  fi
|
||||
|
||||
# ── Commit and push state changes ──
|
||||
- |
|
||||
mkdir -p ~/.ssh && ssh-keyscan -H github.com >> ~/.ssh/known_hosts 2>/dev/null
|
||||
chmod 400 secrets/deploy_key
|
||||
git add stacks/ state/ .woodpecker/ 2>/dev/null || true
|
||||
git remote set-url origin git@github.com:ViktorBarzin/infra.git
|
||||
git diff --cached --quiet && echo "No changes to commit" && exit 0
|
||||
git commit -m "Woodpecker CI deploy [CI SKIP]"
|
||||
GIT_SSH_COMMAND='ssh -i ./secrets/deploy_key -o IdentitiesOnly=yes' git fetch origin master
|
||||
if ! GIT_SSH_COMMAND='ssh -i ./secrets/deploy_key -o IdentitiesOnly=yes' git rebase origin/master; then
|
||||
echo "ERROR: Git rebase failed — state commits could not be pushed"
|
||||
echo "Manual intervention required: pull, resolve conflicts, push"
|
||||
GIT_SSH_COMMAND='ssh -i ./secrets/deploy_key -o IdentitiesOnly=yes' git rebase --abort || true
|
||||
exit 1
|
||||
fi
|
||||
GIT_SSH_COMMAND='ssh -i ./secrets/deploy_key -o IdentitiesOnly=yes' git push origin master
|
||||
|
||||
# ── Slack notification ──
|
||||
- |
|
||||
PLATFORM_COUNT=$(wc -l < .platform_apply 2>/dev/null | tr -d ' ')
|
||||
APP_COUNT=$(wc -l < .app_apply 2>/dev/null | tr -d ' ')
|
||||
curl -s -X POST -H 'Content-type: application/json' \
|
||||
--data "{\"channel\":\"general\",\"text\":\"Woodpecker CI: infra pipeline ${CI_PIPELINE_STATUS} (platform:${PLATFORM_COUNT}, apps:${APP_COUNT})\"}" \
|
||||
"$SLACK_WEBHOOK" || true
|
||||
|
||||
# Slack on failure (runs even if apply step fails)
|
||||
- name: notify-failure
|
||||
image: curlimages/curl
|
||||
commands:
|
||||
- |
|
||||
curl -s -X POST -H 'Content-type: application/json' \
|
||||
--data "{\"channel\":\"general\",\"text\":\":red_circle: Woodpecker CI: infra pipeline FAILED\"}" \
|
||||
"$SLACK_WEBHOOK" || true
|
||||
environment:
|
||||
SLACK_WEBHOOK:
|
||||
from_secret: slack_webhook
|
||||
when:
|
||||
status: [failure]
|
||||
151
.woodpecker/drift-detection.yml
Normal file
151
.woodpecker/drift-detection.yml
Normal file
|
|
@ -0,0 +1,151 @@
|
|||
# Daily drift detection — runs terraform plan on all stacks and alerts on drift.
|
||||
# Triggered by Woodpecker cron schedule "drift-detection" (must be registered in Woodpecker UI/API).
|
||||
|
||||
when:
|
||||
event: cron
|
||||
cron: drift-detection
|
||||
|
||||
clone:
|
||||
git:
|
||||
image: woodpeckerci/plugin-git
|
||||
settings:
|
||||
depth: 1
|
||||
attempts: 3
|
||||
|
||||
steps:
|
||||
- name: detect-drift
|
||||
image: forgejo.viktorbarzin.me/viktor/infra-ci:latest
|
||||
pull: true
|
||||
backend_options:
|
||||
kubernetes:
|
||||
resources:
|
||||
requests:
|
||||
memory: 2Gi
|
||||
limits:
|
||||
memory: 4Gi
|
||||
environment:
|
||||
SLACK_WEBHOOK:
|
||||
from_secret: slack_webhook
|
||||
commands:
|
||||
# ── git-crypt unlock ──
|
||||
- |
|
||||
SA_TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)
|
||||
curl -sk "https://10.0.20.100:6443/api/v1/namespaces/woodpecker/configmaps/git-crypt-key" \
|
||||
-H "Authorization:Bearer $SA_TOKEN" | jq -r .data.key | base64 -d > /tmp/key
|
||||
git-crypt unlock /tmp/key && rm /tmp/key
|
||||
|
||||
# ── Vault auth ──
|
||||
- |
  # Log in to Vault with the pod's service-account JWT (role "ci") and make
  # the resulting token available to the commands that follow.
  SA_TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)
  export VAULT_ADDR=http://vault-active.vault.svc.cluster.local:8200
  VAULT_TOKEN=$(curl -s -X POST "$VAULT_ADDR/v1/auth/kubernetes/login" \
    -d "{\"role\":\"ci\",\"jwt\":\"$SA_TOKEN\"}" | jq -r .auth.client_token)
  # jq prints the literal string "null" when the login response carries no
  # token; stop here instead of planning every stack with a bogus token.
  if [ -z "$VAULT_TOKEN" ] || [ "$VAULT_TOKEN" = "null" ]; then
    echo "ERROR: Vault K8s auth failed (role=ci, ns=woodpecker)" >&2
    exit 1
  fi
  export VAULT_TOKEN
  # Also persist the token to ~/.vault-token, as the apply pipeline does.
  # The subshell keeps the restrictive umask from leaking into later commands.
  (umask 077; printf '%s' "$VAULT_TOKEN" > "$HOME/.vault-token")
|
||||
|
||||
# ── Generate kubeconfig from projected SA token ──
|
||||
# See default.yml for rationale. terragrunt.hcl injects
|
||||
# `-var kube_config_path=<repo>/config` for every terraform invocation,
|
||||
# so we need a kubeconfig file at that path. The woodpecker default SA
|
||||
# is cluster-admin, so the projected token is sufficient.
|
||||
- |
|
||||
cat > config <<'EOF'
|
||||
apiVersion: v1
|
||||
kind: Config
|
||||
clusters:
|
||||
- name: kubernetes
|
||||
cluster:
|
||||
server: https://10.0.20.100:6443
|
||||
certificate-authority: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
|
||||
contexts:
|
||||
- name: ci
|
||||
context:
|
||||
cluster: kubernetes
|
||||
user: ci
|
||||
current-context: ci
|
||||
users:
|
||||
- name: ci
|
||||
user:
|
||||
tokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||
EOF
|
||||
chmod 600 config
|
||||
kubectl --kubeconfig=config get ns kube-system -o name >/dev/null
|
||||
|
||||
# ── Run terraform plan on all stacks ──
|
||||
# Emits two timestamps per drifted stack so the Pushgateway/Prometheus
|
||||
# side can compute drift-age-hours via `time() - drift_stack_first_seen`.
|
||||
- |
  # Plans every stacks/*/ directory that has a terragrunt.hcl, classifies the
  # result by `-detailed-exitcode` (0 = clean, 2 = drift, anything else =
  # error), pushes per-stack and summary gauges to the Pushgateway in one
  # request, prints a summary and posts it to Slack.
  DRIFTED=""
  CLEAN=0
  ERRORS=""
  NOW=$(date +%s)
  # Metrics accumulator — written once per stack, then pushed as a batch.
  METRICS=""

  for stack_dir in stacks/*/; do
    stack=$(basename "$stack_dir")
    [ -f "$stack_dir/terragrunt.hcl" ] || continue

    echo -n "[$stack] planning... "
    # Capture the exit code through `if` so a non-zero plan result (2 means
    # "drift", which is an expected outcome here) cannot abort the step when
    # the shell runs with errexit — the apply pipeline wraps its equivalent
    # call in `set +e` / `set -e` for the same reason.
    if OUTPUT=$(cd "$stack_dir" && terragrunt plan -detailed-exitcode -input=false 2>&1); then
      EXIT=0
    else
      EXIT=$?
    fi

    case $EXIT in
      0)
        echo "OK (no changes)"
        CLEAN=$((CLEAN + 1))
        # drift_stack_state=0 means clean; age-hours irrelevant so we
        # still push 0 so per-stack gauges don't go stale.
        METRICS="${METRICS}drift_stack_state{stack=\"$stack\"} 0\n"
        METRICS="${METRICS}drift_stack_age_hours{stack=\"$stack\"} 0\n"
        ;;
      2)
        echo "DRIFT DETECTED"
        DRIFTED="$DRIFTED $stack"
        # Fetch first-seen timestamp from Pushgateway (preserve across runs).
        # The match is on the metric name plus the stack label anywhere in
        # the label set, and the value is normalised to an integer.
        # NOTE(review): assumes the Pushgateway may expose extra grouping
        # labels (job/instance) and float-formatted values such as
        # 1.7e+09 — confirm against a live /metrics scrape.
        FIRST_SEEN=$(curl -s "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics" \
          | awk -v s="$stack" 'index($1, "drift_stack_first_seen{") == 1 && index($1, "stack=\"" s "\"") { printf "%d\n", $2; exit }')
        # Missing, zero or non-numeric value: treat this run as first sight.
        case "$FIRST_SEEN" in
          ''|0|*[!0-9]*) FIRST_SEEN="$NOW" ;;
        esac
        AGE_HOURS=$(( (NOW - FIRST_SEEN) / 3600 ))
        METRICS="${METRICS}drift_stack_state{stack=\"$stack\"} 1\n"
        METRICS="${METRICS}drift_stack_first_seen{stack=\"$stack\"} $FIRST_SEEN\n"
        METRICS="${METRICS}drift_stack_age_hours{stack=\"$stack\"} $AGE_HOURS\n"
        ;;
      *)
        # Exit 1 is a plan error; any other unexpected code is counted as an
        # error too instead of being silently dropped from the summary.
        echo "ERROR"
        echo "$OUTPUT" | tail -20
        ERRORS="$ERRORS $stack"
        METRICS="${METRICS}drift_stack_state{stack=\"$stack\"} 2\n"
        ;;
    esac
  done

  # Summary counters — single gauge per run.
  DRIFT_COUNT=$(echo "$DRIFTED" | wc -w)
  ERROR_COUNT=$(echo "$ERRORS" | wc -w)
  METRICS="${METRICS}drift_stack_count $DRIFT_COUNT\n"
  METRICS="${METRICS}drift_error_count $ERROR_COUNT\n"
  METRICS="${METRICS}drift_clean_count $CLEAN\n"
  METRICS="${METRICS}drift_detection_last_run_timestamp $NOW\n"

  # ── Push to Pushgateway ──
  # All metrics go out in a single batched request; %b expands the \n
  # separators accumulated above into real newlines.
  printf "%b" "$METRICS" | curl -s --data-binary @- \
    http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/drift-detection \
    || echo "(pushgateway unavailable, metrics lost for this run)"

  echo ""
  echo "=== Drift Detection Summary ==="
  echo "Clean: $CLEAN stacks"
  echo "Drift: ${DRIFTED:-none}"
  echo "Errors: ${ERRORS:-none}"

  # ── Slack alert if drift found ──
  if [ -n "$DRIFTED" ]; then
    curl -s -X POST -H 'Content-type: application/json' \
      --data "{\"channel\":\"general\",\"text\":\":warning: Drift detected in:${DRIFTED}\nClean: ${CLEAN} stacks. Errors:${ERRORS:-none}\"}" \
      "$SLACK_WEBHOOK" || true
  else
    curl -s -X POST -H 'Content-type: application/json' \
      --data "{\"channel\":\"general\",\"text\":\":white_check_mark: Drift detection: all ${CLEAN} stacks clean${ERRORS:+. Errors: $ERRORS}\"}" \
      "$SLACK_WEBHOOK" || true
  fi
|
||||
78
.woodpecker/issue-automation.yml
Normal file
78
.woodpecker/issue-automation.yml
Normal file
|
|
@ -0,0 +1,78 @@
|
|||
when:
|
||||
event: manual
|
||||
|
||||
clone:
|
||||
git:
|
||||
image: woodpeckerci/plugin-git
|
||||
settings:
|
||||
depth: 2
|
||||
|
||||
steps:
|
||||
- name: run-issue-responder
|
||||
image: alpine:3.20
|
||||
commands:
|
||||
- apk add --no-cache curl jq
|
||||
# Authenticate to Vault via K8s SA JWT
|
||||
- |
|
||||
SA_TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)
|
||||
VAULT_RESP=$(curl -sf -X POST http://vault-active.vault.svc.cluster.local:8200/v1/auth/kubernetes/login \
|
||||
-d "{\"role\":\"ci\",\"jwt\":\"$$SA_TOKEN\"}")
|
||||
VAULT_TOKEN=$(echo "$$VAULT_RESP" | jq -r .auth.client_token)
|
||||
if [ -z "$$VAULT_TOKEN" ] || [ "$$VAULT_TOKEN" = "null" ]; then
|
||||
echo "ERROR: Vault authentication failed"
|
||||
exit 1
|
||||
fi
|
||||
echo "Vault authenticated"
|
||||
# Fetch API token for claude-agent-service
|
||||
- |
|
||||
AGENT_TOKEN=$(curl -sf -H "X-Vault-Token: $$VAULT_TOKEN" \
|
||||
http://vault-active.vault.svc.cluster.local:8200/v1/secret/data/claude-agent-service | \
|
||||
jq -r '.data.data.api_bearer_token')
|
||||
if [ -z "$$AGENT_TOKEN" ] || [ "$$AGENT_TOKEN" = "null" ]; then
|
||||
echo "ERROR: Failed to fetch agent API token"
|
||||
exit 1
|
||||
fi
|
||||
echo "Agent token fetched"
|
||||
# Submit job to claude-agent-service
|
||||
- |
|
||||
ISSUE_NUM="${ISSUE_NUMBER:-}"
|
||||
ISSUE_TITLE="${ISSUE_TITLE:-}"
|
||||
ISSUE_LABELS="${ISSUE_LABELS:-}"
|
||||
ISSUE_URL="${ISSUE_URL:-}"
|
||||
|
||||
if [ -z "$$ISSUE_NUM" ]; then
|
||||
echo "ERROR: No issue number provided"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Processing issue #$$ISSUE_NUM: $$ISSUE_TITLE"
|
||||
|
||||
PAYLOAD=$(jq -n \
|
||||
--arg prompt "Process GitHub Issue #$$ISSUE_NUM: $$ISSUE_TITLE. Labels: $$ISSUE_LABELS. URL: $$ISSUE_URL. Read the issue body via GitHub API, investigate, and take appropriate action." \
|
||||
--arg agent ".claude/agents/issue-responder" \
|
||||
'{prompt: $prompt, agent: $agent, max_budget_usd: 10, timeout_seconds: 1800}')
|
||||
|
||||
RESP=$(curl -sf -X POST \
|
||||
-H "Authorization: Bearer $$AGENT_TOKEN" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "$$PAYLOAD" \
|
||||
http://claude-agent-service.claude-agent.svc.cluster.local:8080/execute)
|
||||
|
||||
JOB_ID=$(echo "$$RESP" | jq -r '.job_id')
|
||||
echo "Job submitted: $$JOB_ID"
|
||||
# Poll for completion (30min max)
|
||||
- |
|
||||
for i in $(seq 1 120); do
|
||||
sleep 15
|
||||
RESULT=$(curl -sf \
|
||||
-H "Authorization: Bearer $$AGENT_TOKEN" \
|
||||
http://claude-agent-service.claude-agent.svc.cluster.local:8080/jobs/$$JOB_ID)
|
||||
STATUS=$(echo "$$RESULT" | jq -r '.status')
|
||||
echo "[$$i/120] Status: $$STATUS"
|
||||
if [ "$$STATUS" != "running" ]; then
|
||||
echo "$$RESULT" | jq .
|
||||
if [ "$$STATUS" = "completed" ]; then exit 0; else exit 1; fi
|
||||
fi
|
||||
done
|
||||
echo "ERROR: Job timed out after 30 minutes"
|
||||
exit 1
|
||||
49
.woodpecker/k8s-portal.yml
Normal file
49
.woodpecker/k8s-portal.yml
Normal file
|
|
@ -0,0 +1,49 @@
|
|||
when:
|
||||
event: push
|
||||
branch: master
|
||||
path:
|
||||
include:
|
||||
- "stacks/platform/modules/k8s-portal/files/**"
|
||||
|
||||
clone:
|
||||
git:
|
||||
image: woodpeckerci/plugin-git
|
||||
settings:
|
||||
attempts: 5
|
||||
backoff: 10s
|
||||
|
||||
steps:
|
||||
- name: build-and-push
|
||||
image: woodpeckerci/plugin-docker-buildx
|
||||
settings:
|
||||
username: "viktorbarzin"
|
||||
password:
|
||||
from_secret: dockerhub-pat
|
||||
repo: viktorbarzin/k8s-portal
|
||||
dockerfile: stacks/platform/modules/k8s-portal/files/Dockerfile
|
||||
context: stacks/platform/modules/k8s-portal/files
|
||||
platforms:
|
||||
- linux/amd64
|
||||
tag: ["${CI_PIPELINE_NUMBER}", "latest"]
|
||||
cache_from: "viktorbarzin/k8s-portal:latest"
|
||||
cache_to: "type=inline"
|
||||
|
||||
- name: deploy
|
||||
image: bitnami/kubectl:latest
|
||||
commands:
|
||||
- "kubectl set image deployment/k8s-portal portal=viktorbarzin/k8s-portal:${CI_PIPELINE_NUMBER} -n k8s-portal"
|
||||
- "kubectl rollout status deployment/k8s-portal -n k8s-portal --timeout=120s"
|
||||
- "echo 'k8s-portal deployed successfully (build ${CI_PIPELINE_NUMBER})'"
|
||||
|
||||
- name: slack
|
||||
image: curlimages/curl
|
||||
commands:
|
||||
- |
|
||||
curl -s -X POST -H 'Content-type: application/json' \
|
||||
--data "{\"text\":\"K8s Portal: build #${CI_PIPELINE_NUMBER} ${CI_PIPELINE_STATUS}\"}" \
|
||||
"$SLACK_WEBHOOK" || true
|
||||
environment:
|
||||
SLACK_WEBHOOK:
|
||||
from_secret: slack_webhook
|
||||
when:
|
||||
status: [success, failure]
|
||||
32
.woodpecker/postmortem-todos.yml
Normal file
32
.woodpecker/postmortem-todos.yml
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
when:
|
||||
event: push
|
||||
branch: master
|
||||
path:
|
||||
include:
|
||||
- 'docs/post-mortems/*.md'
|
||||
exclude:
|
||||
- '.woodpecker/**'
|
||||
|
||||
clone:
|
||||
git:
|
||||
image: woodpeckerci/plugin-git
|
||||
settings:
|
||||
depth: 5
|
||||
|
||||
steps:
|
||||
- name: parse-and-implement
|
||||
image: python:3.12-alpine
|
||||
commands:
|
||||
- apk add --no-cache jq curl git
|
||||
- sh scripts/postmortem-pipeline.sh
|
||||
|
||||
# Posts a completion notice to Slack whether the pipeline succeeded or failed.
- name: notify-slack
  image: alpine
  environment:
    SLACK_WEBHOOK:
      from_secret: slack_webhook
  commands:
    - apk add --no-cache curl
    # The slack_webhook secret is posted to as a complete URL by every other
    # workflow in this repo, so it is used as-is here; prefixing
    # https://hooks.slack.com/services/ produced a malformed URL whose
    # failure was hidden by `|| true`.
    - |
      curl -sf -X POST -H 'Content-Type: application/json' \
        -d '{"text": "Post-mortem TODO pipeline completed"}' \
        "$SLACK_WEBHOOK" || true
  when:
    - status: [success, failure]
|
||||
160
.woodpecker/provision-user.yml
Normal file
160
.woodpecker/provision-user.yml
Normal file
|
|
@ -0,0 +1,160 @@
|
|||
when:
|
||||
event: manual
|
||||
|
||||
clone:
|
||||
git:
|
||||
image: woodpeckerci/plugin-git
|
||||
settings:
|
||||
attempts: 5
|
||||
backoff: 10s
|
||||
|
||||
steps:
|
||||
# Validate the USERNAME / EMAIL pipeline variables and write them to
# .provision-env, which later steps source.
- name: validate-inputs
  image: alpine
  commands:
    - |
      if [ -z "$USERNAME" ] || [ -z "$EMAIL" ]; then
        echo "ERROR: USERNAME and EMAIL variables are required"
        echo "Trigger with: POST /api/repos/1/pipelines {branch:master, variables:{USERNAME:x, EMAIL:y}}"
        exit 1
      fi
      # grep matches line by line: a multi-line value with a single valid line
      # would pass the checks below and then be written verbatim into
      # .provision-env (a file that is sourced as shell). Reject embedded
      # newlines before running the per-line patterns.
      if [ "$(printf '%s%s' "$USERNAME" "$EMAIL" | wc -l)" -ne 0 ]; then
        echo "ERROR: USERNAME and EMAIL must be single-line values"
        exit 1
      fi
      # Validate username: lowercase alphanumeric + dash/underscore, 2-63 chars
      # NOTE(review): update-vault-kv also uses the username as a namespace
      # entry ("namespaces":[$u]); Kubernetes namespace names cannot contain
      # '_' — confirm whether underscores should remain allowed here.
      if ! echo "$USERNAME" | grep -qE '^[a-z0-9][a-z0-9_-]{0,61}[a-z0-9]$'; then
        echo "ERROR: USERNAME must be 2-63 chars, lowercase alphanumeric/dash/underscore"
        exit 1
      fi
      # Validate email: basic format check
      if ! echo "$EMAIL" | grep -qE '^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'; then
        echo "ERROR: EMAIL must be a valid email address"
        exit 1
      fi
      echo "Provisioning user: $USERNAME ($EMAIL)"
      # Neither pattern admits a single quote, so the quoted values below
      # cannot break out of their quoting when the file is sourced.
      echo "export PROVISION_USERNAME='$USERNAME'" > .provision-env
      echo "export PROVISION_EMAIL='$EMAIL'" >> .provision-env
|
||||
|
||||
# Unlock the git-crypt protected files and obtain a Vault token for the
# following steps (persisted in .vault-env).
- name: prepare
  image: alpine
  commands:
    - "apk update && apk add jq curl git git-crypt"
    # git-crypt for secrets/ directory: the key is read from the
    # git-crypt-key ConfigMap using the pod's service-account token.
    - |
      curl -k https://10.0.20.100:6443/api/v1/namespaces/woodpecker/configmaps/git-crypt-key \
        -H "Authorization:Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" \
        | jq -r .data.key | base64 -d > /tmp/key
    - "git-crypt unlock /tmp/key; rm -f /tmp/key"
    # Vault: authenticate via K8s service account JWT
    - |
      SA_TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)
      VAULT_TOKEN=$(curl -s -X POST http://vault-active.vault.svc.cluster.local:8200/v1/auth/kubernetes/login \
        -d "{\"role\":\"ci\",\"jwt\":\"$SA_TOKEN\"}" | jq -r .auth.client_token)
      # `jq -r` prints the literal string "null" when the login response has
      # no .auth.client_token (e.g. login rejected). Fail here instead of
      # handing an unusable token to the steps that modify Vault.
      if [ -z "$VAULT_TOKEN" ] || [ "$VAULT_TOKEN" = "null" ]; then
        echo "ERROR: Vault Kubernetes login returned no client token"
        exit 1
      fi
      echo "export VAULT_TOKEN=$VAULT_TOKEN" > .vault-env
      echo "export VAULT_ADDR=http://vault-active.vault.svc.cluster.local:8200" >> .vault-env
|
||||
|
||||
# Add the new user to the k8s_users entry of the Vault KV secret
# secret/platform. The whole secret is read, modified and written back, so a
# failed read must never reach the write.
- name: update-vault-kv
  image: alpine
  commands:
    - "apk update && apk add jq curl"
    # Read current platform secret
    - |
      . .provision-env && . .vault-env
      CURRENT=$(curl -s -H "X-Vault-Token: $VAULT_TOKEN" \
        "$VAULT_ADDR/v1/secret/data/platform" | jq -r '.data.data')

      # If the read failed (bad token, Vault unreachable, error body),
      # .data.data is null or empty. Continuing would write back a secret
      # containing only k8s_users and wipe every other key, so abort instead.
      if ! echo "$CURRENT" | jq -e 'type == "object"' >/dev/null 2>&1; then
        echo "ERROR: could not read secret/platform from Vault — refusing to overwrite it"
        exit 1
      fi

      # Parse current k8s_users (stored as JSON string)
      CURRENT_USERS=$(echo "$CURRENT" | jq -r '.k8s_users')

      # Check if user already exists
      if echo "$CURRENT_USERS" | jq -e --arg u "$PROVISION_USERNAME" '.[$u]' >/dev/null 2>&1; then
        echo "User $PROVISION_USERNAME already exists in k8s_users — skipping Vault KV update"
        exit 0
      fi

      # Add new user with convention defaults
      UPDATED_USERS=$(echo "$CURRENT_USERS" | jq --arg u "$PROVISION_USERNAME" --arg e "$PROVISION_EMAIL" \
        '. + {($u): {"role":"namespace-owner","email":$e,"namespaces":[$u],"domains":[],"quota":{"cpu_requests":"2","memory_requests":"4Gi","memory_limits":"8Gi","pods":"20"}}}')

      # Write back full platform secret with updated k8s_users (as JSON string)
      PAYLOAD=$(echo "$CURRENT" | jq --arg users "$UPDATED_USERS" '.k8s_users = $users')

      RESPONSE=$(curl -s -X POST -H "X-Vault-Token: $VAULT_TOKEN" \
        "$VAULT_ADDR/v1/secret/data/platform" \
        -d "{\"data\": $PAYLOAD}")
      echo "$RESPONSE" | jq . || true

      # A successful KV v2 write reports the new version in .data.version;
      # anything else (error body, empty reply) means nothing was stored.
      if ! echo "$RESPONSE" | jq -e '.data.version' >/dev/null 2>&1; then
        echo "ERROR: Vault did not confirm the write to secret/platform"
        exit 1
      fi

      echo "Added $PROVISION_USERNAME to k8s_users in Vault"
|
||||
|
||||
# Ensure the Authentik group "sops-<username>" exists and, when the user
# already has an Authentik account, add that user to the group.
- name: create-authentik-groups
  image: alpine
  commands:
    - "apk update && apk add jq curl"
    - |
      # POSIX `.` instead of `source`, matching the other steps in this file.
      . .provision-env && . .vault-env

      # Get Authentik API token from Vault
      AUTHENTIK_TOKEN=$(curl -s -H "X-Vault-Token: $VAULT_TOKEN" \
        "$VAULT_ADDR/v1/secret/data/viktor" | jq -r '.data.data.authentik_api_token')
      # `jq -r` prints the literal string "null" when the field is missing;
      # every Authentik call below would then fail without a clear error.
      if [ -z "$AUTHENTIK_TOKEN" ] || [ "$AUTHENTIK_TOKEN" = "null" ]; then
        echo "ERROR: could not read authentik_api_token from Vault"
        exit 1
      fi
      AUTHENTIK_URL="https://authentik.viktorbarzin.me"

      # Create sops-USERNAME group if it doesn't exist
      SOPS_GROUP="sops-$PROVISION_USERNAME"
      EXISTING=$(curl -s -H "Authorization: Bearer $AUTHENTIK_TOKEN" \
        "$AUTHENTIK_URL/api/v3/core/groups/?name=$SOPS_GROUP" | jq -r '.results | length')

      if [ "$EXISTING" = "0" ]; then
        GROUP_PAYLOAD=$(jq -n --arg name "$SOPS_GROUP" '{"name": $name, "is_superuser": false}')
        GROUP_PK=$(curl -s -X POST -H "Authorization: Bearer $AUTHENTIK_TOKEN" \
          -H "Content-Type: application/json" \
          "$AUTHENTIK_URL/api/v3/core/groups/" \
          -d "$GROUP_PAYLOAD" | jq -r '.pk')
        echo "Created Authentik group $SOPS_GROUP (pk=$GROUP_PK)"
      else
        GROUP_PK=$(curl -s -H "Authorization: Bearer $AUTHENTIK_TOKEN" \
          "$AUTHENTIK_URL/api/v3/core/groups/?name=$SOPS_GROUP" | jq -r '.results[0].pk')
        echo "Authentik group $SOPS_GROUP already exists (pk=$GROUP_PK)"
      fi

      # Without a usable pk the membership calls below would target
      # .../groups/null/ — stop with a clear error instead.
      if [ -z "$GROUP_PK" ] || [ "$GROUP_PK" = "null" ]; then
        echo "ERROR: could not determine pk of Authentik group $SOPS_GROUP"
        exit 1
      fi

      # Find the user by username
      USER_PK=$(curl -s -H "Authorization: Bearer $AUTHENTIK_TOKEN" \
        "$AUTHENTIK_URL/api/v3/core/users/?username=$PROVISION_USERNAME" | jq -r '.results[0].pk')

      if [ "$USER_PK" = "null" ] || [ -z "$USER_PK" ]; then
        echo "WARNING: User $PROVISION_USERNAME not found in Authentik — group assignment skipped"
        echo "The user may not have signed up yet. Groups will need manual assignment."
        exit 0
      fi

      # Add user to sops group: read the current member list, append the
      # user's pk (deduplicated) and PATCH the list back.
      CURRENT_MEMBERS=$(curl -s -H "Authorization: Bearer $AUTHENTIK_TOKEN" \
        "$AUTHENTIK_URL/api/v3/core/groups/$GROUP_PK/" | jq -r '.users')
      UPDATED_MEMBERS=$(echo "$CURRENT_MEMBERS" | jq --argjson uid "$USER_PK" '. + [$uid] | unique')

      curl -s -X PATCH -H "Authorization: Bearer $AUTHENTIK_TOKEN" \
        -H "Content-Type: application/json" \
        "$AUTHENTIK_URL/api/v3/core/groups/$GROUP_PK/" \
        -d "{\"users\": $UPDATED_MEMBERS}" | jq .

      echo "Added user $PROVISION_USERNAME (pk=$USER_PK) to group $SOPS_GROUP"
|
||||
|
||||
# Print the follow-up commands an operator has to run by hand after the
# automated part of provisioning has finished.
- name: notify-apply-needed
  image: curlimages/curl
  commands:
    - |
      . .provision-env
      cat <<EOF
      User $PROVISION_USERNAME added to Vault KV and Authentik sops group.
      Manual step needed: apply vault + rbac + woodpecker stacks.
       cd stacks/vault && ../../scripts/tg apply --non-interactive
       cd stacks/rbac && ../../scripts/tg apply --non-interactive
       cd stacks/woodpecker && ../../scripts/tg apply --non-interactive
      EOF
|
||||
|
||||
# Report the provisioning outcome to Slack. This step runs after both
# successful and failed pipelines, so the message carries the pipeline status
# rather than unconditionally announcing that the user was provisioned.
- name: slack
  image: curlimages/curl
  commands:
    - |
      # .provision-env is missing when input validation failed; tolerate that.
      . .provision-env 2>/dev/null || true
      curl -s -X POST -H 'Content-type: application/json' \
        --data "{\"channel\":\"general\",\"text\":\"Woodpecker CI: user provisioning for $PROVISION_USERNAME (Vault KV + Authentik): ${CI_PIPELINE_STATUS}. After a successful run: cd stacks/vault && ../../scripts/tg apply --non-interactive && cd ../rbac && ../../scripts/tg apply --non-interactive\"}" \
        "$SLACK_WEBHOOK" || true
  environment:
    SLACK_WEBHOOK:
      from_secret: slack_webhook
  when:
    status: [success, failure]
|
||||
63
.woodpecker/pve-nfs-exports-sync.yml
Normal file
63
.woodpecker/pve-nfs-exports-sync.yml
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
# Sync infra/scripts/pve-nfs-exports → PVE host /etc/exports on change.
|
||||
#
|
||||
# Wave 6b of the state-drift consolidation plan: move the "scp + exportfs -ra"
|
||||
# deploy step out of runbook-human-hands and into CI so the Proxmox NFS export
|
||||
# table tracks git.
|
||||
#
|
||||
# Trigger: push to master that touches `scripts/pve-nfs-exports`. The file
|
||||
# header documents the deploy invocation; this pipeline codifies it.
|
||||
#
|
||||
# Credentials:
|
||||
# - pve_ssh_key: Woodpecker repo-secret (ed25519 keypair provisioned
|
||||
# 2026-04-18 as `woodpecker-pve-nfs-exports-sync`). Public key lives in
|
||||
# /root/.ssh/authorized_keys on the PVE host. Private key mirrored in
|
||||
# Vault `secret/woodpecker/pve_ssh_key` for recovery.
|
||||
|
||||
when:
|
||||
- event: push
|
||||
branch: master
|
||||
path: scripts/pve-nfs-exports
|
||||
- event: manual
|
||||
|
||||
clone:
|
||||
git:
|
||||
image: woodpeckerci/plugin-git
|
||||
settings:
|
||||
depth: 1
|
||||
attempts: 3
|
||||
|
||||
steps:
|
||||
# Copy scripts/pve-nfs-exports to the PVE host as /etc/exports and reload the
# NFS export table. Only the SSH key is injected here: this step never talks
# to Slack, so it is given neither the webhook secret nor curl.
- name: deploy
  image: alpine:3.20
  environment:
    PVE_SSH_KEY:
      from_secret: pve_ssh_key
  commands:
    - apk add --no-cache openssh-client
    - mkdir -p ~/.ssh && chmod 700 ~/.ssh
    - printf '%s\n' "$PVE_SSH_KEY" > ~/.ssh/id_ed25519
    - chmod 600 ~/.ssh/id_ed25519
    # Trust-on-first-use: CI's ~/.ssh/known_hosts is ephemeral, so the host key
    # is re-scanned and accepted on every run. This does NOT pin the key
    # against a known-good value.
    - ssh-keyscan -t ed25519 192.168.1.127 >> ~/.ssh/known_hosts 2>/dev/null
    # Diff what we'd ship, so pipeline logs show the intended change.
    # Both commands are best-effort (`|| true`); a failed diff never blocks the deploy.
    - echo '---diff---' && ssh -o BatchMode=yes root@192.168.1.127 "cat /etc/exports" > /tmp/remote.exports || true
    - diff -u /tmp/remote.exports scripts/pve-nfs-exports || true
    - echo '---applying---'
    - scp -o BatchMode=yes scripts/pve-nfs-exports root@192.168.1.127:/etc/exports
    - ssh -o BatchMode=yes root@192.168.1.127 "exportfs -ra && exportfs -s | head -5"
    - echo '---done---'
|
||||
|
||||
# Post the sync result to Slack after both successful and failed runs;
# `|| true` keeps a failed notification from failing the step.
- name: slack
  image: curlimages/curl:8.11.0
  when:
    status:
      - success
      - failure
  environment:
    SLACK_WEBHOOK:
      from_secret: slack_webhook
  commands:
    - |
      curl -s -X POST -H 'Content-type: application/json' \
        --data "{\"channel\":\"general\",\"text\":\"PVE /etc/exports sync: ${CI_PIPELINE_STATUS}\"}" \
        "$SLACK_WEBHOOK" || true
|
||||
156
.woodpecker/registry-config-sync.yml
Normal file
156
.woodpecker/registry-config-sync.yml
Normal file
|
|
@ -0,0 +1,156 @@
|
|||
# Sync modules/docker-registry/* → /opt/registry/ on docker-registry VM
|
||||
# (10.0.20.10) on change, and bounce containers + nginx when needed.
|
||||
#
|
||||
# Replaces the manual "ssh + scp + docker compose up -d" that was required
|
||||
# after the 2026-04-19 `registry:2 → registry:2.8.3` pin landed. The deploy
|
||||
# flow is now: edit a file in modules/docker-registry/ → git push → this
|
||||
# pipeline runs → registry VM picks up the change.
|
||||
#
|
||||
# Trigger: push to master that touches any managed file (see `when.path`),
|
||||
# or a manual run via Woodpecker UI / API.
|
||||
#
|
||||
# Credentials:
|
||||
# - registry_ssh_key: Woodpecker repo-secret (ed25519 keypair provisioned
|
||||
# 2026-04-19 as `woodpecker-registry-config-sync`). Public key lives in
|
||||
# /root/.ssh/authorized_keys on 10.0.20.10. Private key mirrored in
|
||||
# Vault `secret/woodpecker/registry_ssh_key` (subkeys private_key /
|
||||
# public_key / known_hosts_entry) for recovery.
|
||||
#
|
||||
# Why bounce nginx every time: nginx caches upstream DNS at startup, so if
|
||||
# any registry-* container gets recreated (new IP on the docker bridge),
|
||||
# nginx keeps forwarding to a stale address. Always restart nginx as the
|
||||
# last step — see docs/runbooks/registry-vm.md § "Bouncing registry
|
||||
# containers — the nginx DNS trap".
|
||||
|
||||
when:
|
||||
- event: push
|
||||
branch: master
|
||||
path:
|
||||
include:
|
||||
- 'modules/docker-registry/docker-compose.yml'
|
||||
- 'modules/docker-registry/fix-broken-blobs.sh'
|
||||
- 'modules/docker-registry/cleanup-tags.sh'
|
||||
- 'modules/docker-registry/nginx_registry.conf'
|
||||
- 'modules/docker-registry/config-private.yml'
|
||||
- event: manual
|
||||
|
||||
clone:
|
||||
git:
|
||||
image: woodpeckerci/plugin-git
|
||||
settings:
|
||||
depth: 1
|
||||
attempts: 3
|
||||
|
||||
steps:
|
||||
# Sync the managed files from modules/docker-registry/ to /opt/registry/ on
# 10.0.20.10, restart what the change requires, then run a short verification.
- name: deploy
  image: alpine:3.20
  environment:
    REGISTRY_SSH_KEY:
      from_secret: registry_ssh_key
  commands:
    # Only ssh/scp are used below, so openssh-client is the only package needed.
    - apk add --no-cache openssh-client
    - mkdir -p ~/.ssh && chmod 700 ~/.ssh
    - printf '%s\n' "$REGISTRY_SSH_KEY" > ~/.ssh/id_ed25519
    - chmod 600 ~/.ssh/id_ed25519
    # Trust-on-first-use: CI's ~/.ssh/known_hosts is ephemeral, so the host key
    # is re-scanned and accepted on every run. This does NOT pin the key
    # against a known-good value.
    - ssh-keyscan -t ed25519 10.0.20.10 >> ~/.ssh/known_hosts 2>/dev/null
    - echo '---detecting changed files---'
    - |
      # Mirror the remote state of each file so we can diff and decide what bounces.
      CHANGED=""
      for f in docker-compose.yml fix-broken-blobs.sh cleanup-tags.sh nginx_registry.conf config-private.yml; do
        LOCAL="modules/docker-registry/$f"
        REMOTE="/opt/registry/$f"
        if [ ! -f "$LOCAL" ]; then
          echo "skip $f (not in repo)"
          continue
        fi
        # Pull the remote copy into /tmp for a diff. ssh -n avoids stdin-hogging.
        REMOTE_CONTENT=$(ssh -n -o BatchMode=yes root@10.0.20.10 "cat $REMOTE 2>/dev/null || true")
        LOCAL_CONTENT=$(cat "$LOCAL")
        if [ "$LOCAL_CONTENT" = "$REMOTE_CONTENT" ]; then
          echo "unchanged: $f"
        else
          echo "---diff: $f ---"
          echo "$REMOTE_CONTENT" > /tmp/remote.txt
          diff -u /tmp/remote.txt "$LOCAL" | head -40 || true
          CHANGED="$CHANGED $f"
        fi
      done
      echo "CHANGED_FILES=$CHANGED"
      # Persist the list for the following command blocks.
      printf '%s' "$CHANGED" > /tmp/changed
    - echo '---applying---'
    - |
      CHANGED=$(cat /tmp/changed)
      if [ -z "$CHANGED" ]; then
        echo "No files changed — exiting cleanly (manual run with no drift)."
        exit 0
      fi
      # Ship every managed file unconditionally — scp is cheap, idempotency is safe.
      scp -o BatchMode=yes \
        modules/docker-registry/docker-compose.yml \
        modules/docker-registry/fix-broken-blobs.sh \
        modules/docker-registry/cleanup-tags.sh \
        modules/docker-registry/nginx_registry.conf \
        modules/docker-registry/config-private.yml \
        root@10.0.20.10:/opt/registry/
      ssh -n -o BatchMode=yes root@10.0.20.10 '
        chmod +x /opt/registry/fix-broken-blobs.sh /opt/registry/cleanup-tags.sh
      '
    - echo '---bouncing containers + nginx---'
    - |
      CHANGED=$(cat /tmp/changed)
      # Compose-visible files: docker-compose.yml (image tag, mounts) and
      # config-private.yml (registry config → needs registry-private reload).
      BOUNCE_COMPOSE=0
      BOUNCE_NGINX=0
      # -F: the file names are literal strings, not regular expressions.
      echo "$CHANGED" | grep -qF "docker-compose.yml" && BOUNCE_COMPOSE=1
      echo "$CHANGED" | grep -qF "config-private.yml" && BOUNCE_COMPOSE=1
      echo "$CHANGED" | grep -qF "nginx_registry.conf" && BOUNCE_NGINX=1

      if [ "$BOUNCE_COMPOSE" = "1" ]; then
        echo "compose-visible change → pull + up -d"
        ssh -n -o BatchMode=yes root@10.0.20.10 '
          cd /opt/registry
          docker compose pull 2>&1 | tail -5
          docker compose up -d 2>&1 | tail -20
        '
        # Any compose recreate requires nginx DNS refresh too.
        BOUNCE_NGINX=1
      fi

      if [ "$BOUNCE_NGINX" = "1" ]; then
        echo "bouncing nginx to flush upstream DNS cache"
        ssh -n -o BatchMode=yes root@10.0.20.10 '
          docker restart registry-nginx
          sleep 3
          docker ps --format "{{.Names}}\t{{.Image}}\t{{.Status}}" | grep -E "registry-"
        '
      fi

      if [ "$BOUNCE_COMPOSE" = "0" ] && [ "$BOUNCE_NGINX" = "0" ]; then
        echo "only script files changed (cron-picks-up semantics) — no bounce needed"
      fi
    - echo '---verify---'
    - |
      ssh -n -o BatchMode=yes root@10.0.20.10 '
        echo "=== catalog ==="
        # Prove auth + routing survived.
        curl -sk -o /dev/null -w "catalog (unauth → 401 expected): HTTP %{http_code}\n" \
          https://127.0.0.1:5050/v2/
        echo "=== integrity scan (dry-run) ==="
        python3 /opt/registry/fix-broken-blobs.sh --dry-run 2>&1 | tail -5
      '
|
||||
|
||||
# Post the sync result to Slack after both successful and failed runs;
# `|| true` keeps a failed notification from failing the step.
- name: slack
  image: curlimages/curl:8.11.0
  when:
    status:
      - success
      - failure
  environment:
    SLACK_WEBHOOK:
      from_secret: slack_webhook
  commands:
    - |
      curl -s -X POST -H 'Content-type: application/json' \
        --data "{\"channel\":\"general\",\"text\":\"Registry config sync on 10.0.20.10: ${CI_PIPELINE_STATUS}\"}" \
        "$SLACK_WEBHOOK" || true
|
||||
79
.woodpecker/renew-tls.yml
Normal file
79
.woodpecker/renew-tls.yml
Normal file
|
|
@ -0,0 +1,79 @@
|
|||
when:
|
||||
event: cron
|
||||
cron: renew-tls-certificate
|
||||
|
||||
clone:
|
||||
git:
|
||||
image: woodpeckerci/plugin-git
|
||||
settings:
|
||||
attempts: 5
|
||||
backoff: 10s
|
||||
|
||||
steps:
|
||||
# Fetch the git-crypt key from the git-crypt-key ConfigMap (authenticating
# with the pod's service-account token) and unlock the repository with it.
- name: prepare
  image: alpine
  commands:
    - 'apk update && apk add jq curl git git-crypt'
    - |
      curl -k https://10.0.20.100:6443/api/v1/namespaces/woodpecker/configmaps/git-crypt-key \
        -H "Authorization:Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" \
        | jq -r .data.key \
        | base64 -d > /tmp/key
    - 'git-crypt unlock /tmp/key && rm /tmp/key'
|
||||
|
||||
# Install certbot, curl and jq, then run the renewal script with the
# Technitium and Cloudflare credentials exposed as environment variables.
- name: renew-tls
  image: alpine
  commands:
    - 'apk update && apk add certbot curl jq'
    - ./modules/kubernetes/setup_tls_secret/renew2.sh
  environment:
    TECHNITIUM_API_KEY:
      from_secret: TECHNITIUM_API_KEY
    CLOUDFLARE_TOKEN:
      from_secret: CLOUDFLARE_TOKEN
    CLOUDFLARE_ZONE_ID:
      from_secret: CLOUDFLARE_ZONE_ID
|
||||
|
||||
# Commit whatever changed under secrets/ and state/ and push it to GitHub
# over SSH using the deploy key stored in the repository.
- name: commit-certs
  image: alpine
  commands:
    - 'apk update && apk add openssh-client git git-crypt'
    - 'mkdir -p ~/.ssh && ssh-keyscan -H github.com >> ~/.ssh/known_hosts'
    - chmod 400 secrets/deploy_key
    # Only add specific paths — never git add .
    - 'git add secrets/ state/ || true'
    - git remote set-url origin git@github.com:ViktorBarzin/infra.git
    # A failing commit is reported as "No changes" and does not stop the step.
    - git commit -m 'Woodpecker CI Update TLS Certificates Commit' || echo 'No changes'
    - >-
      GIT_SSH_COMMAND='ssh -i ./secrets/deploy_key -o IdentitiesOnly=yes'
      git pull --rebase origin master
    - >-
      GIT_SSH_COMMAND='ssh -i ./secrets/deploy_key -o IdentitiesOnly=yes'
      git push origin master
|
||||
|
||||
# Fail the step unless secrets/fullchain.pem stays valid for at least
# another 604800 seconds (7 days).
- name: verify-cert
  image: alpine
  commands:
    - 'apk update && apk add openssl'
    - openssl x509 -checkend 604800 -noout -in secrets/fullchain.pem
    - echo 'Certificate is valid for at least 7 more days'
|
||||
|
||||
# Apply the renewed certificate as the `tls-secret` TLS secret in every
# namespace whose name does not start with "kube-". Applying stays
# best-effort per namespace, but failures are now reported instead of being
# silently discarded behind an unconditional success message.
- name: update-tls-source-secret
  image: alpine
  commands:
    - "apk update && apk add curl"
    # -f: fail on an HTTP error instead of saving the error page as `kubectl`.
    - "curl -fLO https://dl.k8s.io/release/v1.31.0/bin/linux/amd64/kubectl && chmod +x kubectl && mv kubectl /usr/local/bin/"
    - |
      # Render the secret once with a placeholder namespace, then rewrite the
      # namespace field for each target.
      SECRET_YAML=$(kubectl create secret tls tls-secret \
        --cert=secrets/fullchain.pem --key=secrets/privkey.pem \
        --namespace=placeholder --dry-run=client -o yaml)
      FAILED=""
      for ns in $(kubectl get ns -o jsonpath='{.items[*].metadata.name}' | tr ' ' '\n' | grep -v '^kube-'); do
        if ! echo "$SECRET_YAML" | sed "s/namespace: placeholder/namespace: $ns/" | kubectl apply -f - 2>/dev/null; then
          # Keep going: one namespace failing must not block the others.
          FAILED="$FAILED $ns"
        fi
      done
      if [ -n "$FAILED" ]; then
        echo "WARNING: TLS secret could not be applied in:$FAILED"
      else
        echo 'TLS secret updated in all namespaces'
      fi
|
||||
|
||||
# Post the renewal result to Slack after both successful and failed runs;
# `|| true` keeps a failed notification from failing the step.
- name: slack
  image: curlimages/curl
  environment:
    SLACK_WEBHOOK:
      from_secret: slack_webhook
  when:
    status:
      - success
      - failure
  commands:
    - |
      curl -s -X POST -H 'Content-type: application/json' \
        --data "{\"channel\":\"general\",\"text\":\"Woodpecker CI: TLS certificate renewal ${CI_PIPELINE_STATUS}\"}" \
        "$SLACK_WEBHOOK" || true
|
||||
Loading…
Add table
Add a link
Reference in a new issue