From fbb41eff9db08583be86e130b974b7e7c6145f03 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Thu, 7 May 2026 16:01:20 +0000 Subject: [PATCH] [ci] Phase 1: infra-ci dual-push + break-glass tarball MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds Forgejo as a second push target on the build-ci-image pipeline and saves the just-pushed image as a gzipped tarball on the registry VM disk (/opt/registry/data/private/_breakglass/) so we can recover infra-ci with `ctr images import` if both registries are down. * Dual-push: registry.viktorbarzin.me:5050/infra-ci AND forgejo.viktorbarzin.me/viktor/infra-ci, in the same woodpeckerci/plugin-docker-buildx step. Same image bytes; the Forgejo integrity probe (every 15min) catches any divergence. * Break-glass step: SSHes to 10.0.20.10, docker pulls + saves + gzips, keeps last 5 tarballs (latest symlink). Failure-tolerant so a transient registry blip doesn't fail the build pipeline. * Runbook docs/runbooks/forgejo-registry-breakglass.md documents the recovery flow (when to use, scp+ctr import, node cordon, underlying-issue fix). Tarball mirrors to Synology automatically through the existing daily offsite-sync-backup job — no new sync wiring needed. Co-Authored-By: Claude Opus 4.7 --- .woodpecker/build-ci-image.yml | 50 +++++++- docs/runbooks/forgejo-registry-breakglass.md | 126 +++++++++++++++++++ 2 files changed, 173 insertions(+), 3 deletions(-) create mode 100644 docs/runbooks/forgejo-registry-breakglass.md diff --git a/.woodpecker/build-ci-image.yml b/.woodpecker/build-ci-image.yml index 40d8e667..b60cdf8e 100644 --- a/.woodpecker/build-ci-image.yml +++ b/.woodpecker/build-ci-image.yml @@ -14,20 +14,30 @@ steps: - name: build-and-push image: woodpeckerci/plugin-docker-buildx settings: - repo: registry.viktorbarzin.me:5050/infra-ci + # Dual-push during the Forgejo registry consolidation bake. 
infra-ci + # is the most safety-critical image — every infra pipeline pulls it, + # including the one that fixes Forgejo when it breaks. Tarball + # break-glass below covers the chicken-and-egg. + repo: + - registry.viktorbarzin.me:5050/infra-ci + - forgejo.viktorbarzin.me/viktor/infra-ci dockerfile: ci/Dockerfile context: ci/ tags: - latest - "${CI_COMMIT_SHA:0:8}" platforms: linux/amd64 - registry: registry.viktorbarzin.me:5050 logins: - registry: registry.viktorbarzin.me:5050 username: from_secret: registry_user password: from_secret: registry_password + - registry: forgejo.viktorbarzin.me + username: + from_secret: forgejo_user + password: + from_secret: forgejo_push_token # Post-push integrity check. Re-resolves the image we just pushed and HEADs # every blob it references — top-level manifest (index or single), each child @@ -106,12 +116,46 @@ steps: echo "=== All manifests + blobs verified. Push integrity intact. ===" + # Break-glass tarball: save the just-pushed infra-ci image to disk on the + # registry VM (10.0.20.10) so we can `ctr images import` it into a node + # when Forgejo is unreachable AND registry-private is gone (post-Phase 4). + # Best-effort (failure: ignore); pipefail keeps a truncated save off latest. + # Recovery procedure: docs/runbooks/forgejo-registry-breakglass.md. + - name: breakglass-tarball + image: alpine:3.20 + failure: ignore + environment: + REGISTRY_SSH_KEY: + from_secret: registry_ssh_key + commands: + - apk add --no-cache openssh-client + - mkdir -p ~/.ssh && chmod 700 ~/.ssh + - printf '%s\n' "$REGISTRY_SSH_KEY" > ~/.ssh/id_ed25519 + - chmod 600 ~/.ssh/id_ed25519 + - ssh-keyscan -t ed25519 10.0.20.10 >> ~/.ssh/known_hosts 2>/dev/null + - SHA=${CI_COMMIT_SHA:0:8} + - | + ssh -n -o BatchMode=yes root@10.0.20.10 " + set -eo pipefail + mkdir -p /opt/registry/data/private/_breakglass + IMAGE=registry.viktorbarzin.me:5050/infra-ci:$SHA + # Pull from the local registry-private — fast hop on the VM itself.
+ docker pull \$IMAGE + docker save \$IMAGE | gzip > /opt/registry/data/private/_breakglass/infra-ci-$SHA.tar.gz + ln -sfn infra-ci-$SHA.tar.gz /opt/registry/data/private/_breakglass/infra-ci-latest.tar.gz + # Retain last 5 by mtime; older versions are still recoverable from + # registry blobs until a corruption event. + ls -t /opt/registry/data/private/_breakglass/infra-ci-*.tar.gz \ + | grep -v 'latest' | tail -n +6 | xargs -r rm -v + ls -lh /opt/registry/data/private/_breakglass/ + " + - name: slack image: curlimages/curl commands: - | curl -s -X POST -H 'Content-type: application/json' \ - --data "{\"text\":\"CI image built: registry.viktorbarzin.me:5050/infra-ci:${CI_COMMIT_SHA:0:8}\"}" \ + --data "{\"text\":\"CI image built: forgejo.viktorbarzin.me/viktor/infra-ci:${CI_COMMIT_SHA:0:8} (and registry-private mirror)\"}" \ "$SLACK_WEBHOOK" || true environment: SLACK_WEBHOOK: diff --git a/docs/runbooks/forgejo-registry-breakglass.md b/docs/runbooks/forgejo-registry-breakglass.md new file mode 100644 index 00000000..664893d5 --- /dev/null +++ b/docs/runbooks/forgejo-registry-breakglass.md @@ -0,0 +1,126 @@ +# Runbook: Forgejo registry break-glass — recovering infra-ci + +Last updated: 2026-05-07 + +## When to use this runbook + +When **all** of the following are true: + +1. Forgejo (`forgejo.viktorbarzin.me`) is unreachable. +2. `registry-private` is also gone (post-Phase 4 of the consolidation), + so you can't fall back to `registry.viktorbarzin.me:5050/infra-ci`. +3. You need to run an infra Woodpecker pipeline (apply, build-cli, + drift-detection, etc.) — but those pipelines pull `infra-ci` and + crash because the registry is down. + +If only Forgejo is down but `registry-private` is still alive, the +pipelines work — `image:` references in `infra/.woodpecker/*.yml` +still hit `registry.viktorbarzin.me:5050/infra-ci` until Phase 3 +flips them. Skip this runbook entirely. 
+ +## What's available + +The `build-ci-image.yml` Woodpecker pipeline saves a tarball after +each successful push: + +| Location | Path | +|---|---| +| Registry VM disk (10.0.20.10) | `/opt/registry/data/private/_breakglass/infra-ci-<sha8>.tar.gz` | +| Registry VM disk (latest symlink) | `/opt/registry/data/private/_breakglass/infra-ci-latest.tar.gz` | +| Synology NAS (offsite copy via daily-backup sync) | `/volume1/Backup/Viki/pve-backup/_forgejo-breakglass/` | + +The registry VM keeps the last 5 tarballs. Synology mirrors them +through the existing offsite-sync-backup job +(`/usr/local/bin/offsite-sync-backup`). + +## Recovery procedure + +The goal is to get a working `infra-ci` image onto a k8s node so +Woodpecker pods can run it. Then run a Woodpecker pipeline that +restores Forgejo from PVC backup or rebuilds it. + +### Step 1 — copy the tarball to a node + +From your workstation (the registry VM is reachable but Forgejo is +not — the rest of the cluster might be in a similar partial state): + +```bash +ssh wizard@10.0.20.103 # any responsive k8s node +sudo mkdir -p /var/breakglass +sudo scp root@10.0.20.10:/opt/registry/data/private/_breakglass/infra-ci-latest.tar.gz \ + /var/breakglass/ +``` + +If the registry VM is also down, fall back to Synology: + +```bash +sudo scp 192.168.1.13:/volume1/Backup/Viki/pve-backup/_forgejo-breakglass/infra-ci-latest.tar.gz \ + /var/breakglass/ +``` + +### Step 2 — load into containerd + +`docker load` won't help on a k8s node — it loads into the docker +daemon, which kubelet/containerd doesn't see. Use `ctr`: + +```bash +sudo ctr -n k8s.io images import /var/breakglass/infra-ci-latest.tar.gz +sudo ctr -n k8s.io images list | grep infra-ci +``` + +Confirm the image is tagged with the original repository name +(`registry.viktorbarzin.me:5050/infra-ci:<sha8>` — the tarball was +saved with that tag, NOT the Forgejo name).
+ +### Step 3 — pin pods to this node + +Add a node selector or taint-toleration to whatever pipeline you +need to run. Simplest: cordon the other nodes briefly so Woodpecker +schedules onto this one. + +```bash +for n in $(kubectl get nodes -o name | grep -v $(hostname)); do + kubectl cordon ${n#node/} +done +``` + +Run the pipeline. After it completes: + +```bash +for n in $(kubectl get nodes -o name); do + kubectl uncordon ${n#node/} +done +``` + +### Step 4 — fix the underlying problem + +The pipeline you just ran was meant to restore Forgejo. Common +options: + +- **Forgejo PVC corrupt** — `docs/runbooks/forgejo-registry-rebuild-image.md` + walks through PVC restore from LVM snapshot or PVE backup. +- **Forgejo OOM-loop** — bump memory request+limit in + `infra/stacks/forgejo/main.tf` and apply. +- **Forgejo unreachable due to network** — check Traefik, MetalLB, + pfSense. + +Once Forgejo is back, run `build-ci-image.yml` manually so the +tarball regenerates with the latest commit. + +## Why this exists + +The 2026-04-19 post-mortem on the registry-orphan-index incident +showed that a single registry going corrupt could block ALL infra +pipelines (because every pipeline pulls `infra-ci` from that +registry). The dual-push to Forgejo + registry-private removes that +single-point-of-failure during the bake. After Phase 4 +decommissions registry-private, the tarball is the last line of +defense. + +## Why on the registry VM and not in-cluster + +The Forgejo pod and registry-private pod both depend on cluster +networking + storage. The registry VM is an independent +non-clustered VM with local storage. If the cluster is in a bad +state, the VM's disk is still readable from any other host on the +LAN.