diff --git a/.claude/reference/service-catalog.md b/.claude/reference/service-catalog.md index 905c9e94..3d888e93 100644 --- a/.claude/reference/service-catalog.md +++ b/.claude/reference/service-catalog.md @@ -137,3 +137,18 @@ jellyfin, jellyseerr, tdarr, affine, health, family, openclaw - `*.viktor.actualbudget` - Actualbudget factory instances - `*.freedify` - Freedify factory instances - `mailserver.*` - Mail server components (antispam, admin) + +## Key Runbooks + +Operational surfaces that aren't k8s services (VMs, pipelines, host-side +procedures) are documented in `infra/docs/runbooks/`: + +| Surface | Runbook | +|---|---| +| Private Docker registry VM (10.0.20.10) | [registry-vm.md](../../docs/runbooks/registry-vm.md) | +| Rebuild after orphan-index incident | [registry-rebuild-image.md](../../docs/runbooks/registry-rebuild-image.md) | +| PVE host operations (backups, LVM) | [proxmox-host.md](../../docs/runbooks/proxmox-host.md) | +| NFS prerequisites and CSI mount options | [nfs-prerequisites.md](../../docs/runbooks/nfs-prerequisites.md) | +| pfSense + Unbound DNS | [pfsense-unbound.md](../../docs/runbooks/pfsense-unbound.md) | +| Mailserver PROXY-protocol / HAProxy | [mailserver-pfsense-haproxy.md](../../docs/runbooks/mailserver-pfsense-haproxy.md) | +| Technitium apply flow | [technitium-apply.md](../../docs/runbooks/technitium-apply.md) | diff --git a/.woodpecker/build-ci-image.yml b/.woodpecker/build-ci-image.yml index f08c00ce..40d8e667 100644 --- a/.woodpecker/build-ci-image.yml +++ b/.woodpecker/build-ci-image.yml @@ -1,12 +1,14 @@ # Build the CI tools Docker image used by all infra pipelines. -# Triggers on changes to ci/Dockerfile only (push to master). +# Triggers on push that touches ci/Dockerfile, or manual (API/UI) so +# rebuilds after a registry incident don't need a cosmetic Dockerfile edit. when: - event: push - branch: master - path: - include: - - 'ci/Dockerfile' + - event: push + branch: master + path: + include: + - 'ci/Dockerfile' + - event: manual steps: - name: build-and-push @@ -27,6 +29,83 @@ steps: password: from_secret: registry_password + # Post-push integrity check. Re-resolves the image we just pushed and HEADs + # every blob it references — top-level manifest (index or single), each child + # platform manifest, each config blob, each layer blob. If any returns !=200 + # the pipeline fails loudly here so we never ship a broken index downstream. + # Historical context: 2026-04-13 and 2026-04-19 incidents both shipped indexes + # whose platform/attestation children had been GC-orphaned on the registry VM. + - name: verify-integrity + image: alpine:3.20 + environment: + REG_USER: + from_secret: registry_user + REG_PASS: + from_secret: registry_password + commands: + - apk add --no-cache curl jq + - REG=registry.viktorbarzin.me:5050 + - REPO=infra-ci + - SHA=${CI_COMMIT_SHA:0:8} + - AUTH="$REG_USER:$REG_PASS" + - | + set -euo pipefail + ACCEPT='Accept: application/vnd.oci.image.index.v1+json,application/vnd.oci.image.manifest.v1+json,application/vnd.docker.distribution.manifest.list.v2+json,application/vnd.docker.distribution.manifest.v2+json' + + fetch_manifest() { + # Prints the body to $2, returns the HTTP code as stdout. 
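+        # $1 may be a tag or a sha256: digest; the /manifests/ endpoint accepts
+        # both, and the combined Accept header lets the registry return either
+        # an index or a single-platform manifest, whichever was pushed.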
+        curl -sk -u "$AUTH" -H "$ACCEPT" \
+          -o "$2" -w '%{http_code}' \
+          "https://$REG/v2/$REPO/manifests/$1"
+      }
+      head_blob() {
+        curl -sk -u "$AUTH" -o /dev/null -w '%{http_code}' \
+          -I "https://$REG/v2/$REPO/blobs/$1"
+      }
+
+      verify_single_manifest() {
+        local ref="$1" tmp=/tmp/m-$$.json
+        local rc cfg
+        rc=$(fetch_manifest "$ref" "$tmp")
+        if [ "$rc" != "200" ]; then
+          echo "FAIL: manifest $ref returned HTTP $rc"; return 1
+        fi
+        cfg=$(jq -r '.config.digest // empty' "$tmp")
+        if [ -n "$cfg" ]; then
+          rc=$(head_blob "$cfg")
+          [ "$rc" = "200" ] || { echo "FAIL: config blob $cfg returned HTTP $rc"; return 1; }
+        fi
+        jq -r '.layers[]?.digest' "$tmp" > /tmp/layers-$$.txt
+        while IFS= read -r layer; do
+          [ -z "$layer" ] && continue
+          rc=$(head_blob "$layer")
+          [ "$rc" = "200" ] || { echo "FAIL: layer blob $layer returned HTTP $rc"; return 1; }
+        done < /tmp/layers-$$.txt
+        return 0
+      }
+
+      echo "=== Verifying push integrity for $REPO:$SHA ==="
+      TOP=/tmp/top-$$.json
+      rc=$(fetch_manifest "$SHA" "$TOP")
+      [ "$rc" = "200" ] || { echo "FAIL: top manifest :$SHA returned HTTP $rc"; exit 1; }
+
+      MT=$(jq -r '.mediaType // empty' "$TOP")
+      echo "Top-level media type: ${MT:-}"
+
+      if echo "$MT" | grep -Eq 'manifest\.list|image\.index'; then
+        jq -r '.manifests[].digest' "$TOP" > /tmp/children-$$.txt
+        echo "Multi-platform index: $(wc -l < /tmp/children-$$.txt) child manifest(s)"
+        while IFS= read -r child; do
+          [ -z "$child" ] && continue
+          verify_single_manifest "$child" || exit 1
+        done < /tmp/children-$$.txt
+      else
+        verify_single_manifest "$SHA" || exit 1
+      fi
+
+      echo "=== Push integrity verified for $REPO:$SHA ==="
diff --git a/docs/post-mortems/2026-04-19-registry-orphan-index.md b/docs/post-mortems/2026-04-19-registry-orphan-index.md
new file mode 100644
--- /dev/null
+++ b/docs/post-mortems/2026-04-19-registry-orphan-index.md
+# Post-Mortem: Orphan OCI Index on the Private Registry (2026-04-19)
+
+## Root Cause — the Failure Chain
+
+```
+[cron] cleanup-tags.sh prunes stale tags on the registry VM
+ │
+ ├─> [1] For each repository, keeps the last 10 tags by mtime, rmtrees the rest.
+ │      This walks `_manifests/tags/` directly, bypassing the registry API.
+ │
+ ├─> [2] registry:2 garbage-collect runs weekly (Sun 03:25 for the
+ │      private registry). Walks live manifests through refcounts, but
+ │      distribution/distribution#3324 showed this walker has historical
+ │      bugs with OCI image-index children — it can decrement a shared
+ │      child's refcount below 1 and delete the blob even while the
+ │      index that references it is still referenced.
+ │
+ └─> [3] Result: the `infra-ci:latest` index is intact
+        (`_manifests/revisions/sha256/<digest>/data` present on disk), but
+        its `.manifests[0].digest` — the `linux/amd64` child — points
+        to a `blobs/sha256/98/98f718c8…/` whose `data` file is gone.
+
+[pull] containerd resolves `infra-ci:latest`
+ │
+ ├─> GET /v2/infra-ci/manifests/latest → 200 OK, returns the index
+ │
+ └─> GET /v2/infra-ci/manifests/sha256:98f718c8… → 404 Not Found
+      └─> containerd fails the pull with "manifest unknown"
+           └─> woodpecker exit 126
+```
+
+## Why Existing Remediation Missed It
+
+1. **`fix-broken-blobs.sh` only scans layer links.** The existing cron
+   walks `_layers/sha256/` and removes link files whose blob `data` is
+   missing. It does NOT inspect `_manifests/revisions/sha256/` to see
+   whether an image-index's referenced children still exist. That's
+   exactly the class of orphan this incident represents.
+2. **`registry:2` image tag was floating.** `docker-compose.yml`
+   specified only `registry:2`, so whatever Docker Inc. last published
+   under that tag was running, with no version pin. Any regression in
+   the upstream walker would silently swap in.
+3. **No integrity monitoring.** Prometheus alerted on cache hit rate
+   and registry-down, but nothing probes "are the manifests the registry
+   advertises actually fetchable?"
+4. **CI pipeline didn't verify its own push.** `buildx --push` returns
+   success as soon as it uploads. If a child blob upload wrote zero
+   bytes or the client disconnected mid-push (distinct from the GC mode
+   but the same on-disk symptom), nothing would notice until the next
+   pull.
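+
+On disk, the orphan state is easy to confirm by hand. A sketch (the
+`registry-private` data-directory name and the digests are illustrative;
+substitute the values from the CI or probe logs):
+
+```sh
+# On the registry VM (10.0.20.10); layout is upstream distribution v2.
+cd /opt/registry/data/registry-private/docker/registry/v2
+
+# The index's own blob is still present (GET /manifests/latest returns 200)...
+ls blobs/sha256/<xx>/<index-digest>/data
+
+# ...but the amd64 child's blob data is gone (404 on the child digest):
+ls blobs/sha256/98/98f718c8<rest>/data   # => No such file or directory
+```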
+
+## Permanent Fix — Three Phases
+
+### Phase 1 — Detection (ship today)
+
+1. **Post-push integrity check** in `.woodpecker/build-ci-image.yml`.
+   After `build-and-push`, a new step walks the just-pushed manifest
+   (and every child of an image index) and HEADs every referenced blob.
+   Any non-200 fails the pipeline immediately, catching broken pushes at
+   the source rather than leaking them to consumers.
+2. **Prometheus alert `RegistryManifestIntegrityFailure`.** A new
+   CronJob (`registry-integrity-probe`, every 15m, in the `monitoring`
+   namespace) walks the private registry's catalog, HEADs every tag's
+   manifest, follows each image index's children, and pushes
+   `registry_manifest_integrity_failures` to Pushgateway. Accompanying
+   alerts: `RegistryIntegrityProbeStale`, `RegistryCatalogInaccessible`.
+3. **Post-mortem** — this document. Linked from
+   `.claude/reference/service-catalog.md` via the new Key Runbooks table.
+
+### Phase 2 — Prevention
+
+4. **Pin `registry:2` → `registry:2.8.3`** in
+   `modules/docker-registry/docker-compose.yml` (all six registry
+   services). Removes the floating-tag footgun.
+5. **Extend `fix-broken-blobs.sh`** to scan every
+   `_manifests/revisions/sha256/<digest>` revision that is an image
+   index and flag children whose blob `data` file is missing. The script
+   prints a loud WARNING per orphan; it does not auto-delete the index,
+   because deleting a published image is a conscious decision, not an
+   automated repair.
+
+### Phase 3 — Recovery tooling
+
+6. **Manual event trigger** on `build-ci-image.yml`. Rebuilds no longer
+   need a cosmetic Dockerfile edit — POST to the Woodpecker API or
+   click "Run manually" in the UI.
+7. **Runbook** `docs/runbooks/registry-rebuild-image.md` — exact
+   command sequence for the next time this happens, plus fallback paths.
+
+## Out of Scope
+
+- **Pull-through caches.** The DockerHub / GHCR mirrors on
+  `:5000` / `:5010` are healthy (74.5% cache hit rate, no 404s). The
+  orphan problem is private-registry-only. No changes to nginx or
+  containerd `hosts.toml`.
+- **Registry HA / replication.** Single-VM SPOF is a known
+  architectural choice. Harbor or a replicated registry would solve
+  more than this incident requires, at multi-day cost. Synology offsite
+  snapshots already give RPO < 1 day.
+- **Disabling `cleanup-tags.sh`.** Keeping storage bounded is still
+  necessary; the fix is detection + rebuild, not "stop cleaning up".
+
+## Lessons
+
+- **Repeat incidents deserve root-cause work, not a third hot-fix.** The
+  2026-04-13 incident was closed when CI turned green. Without a probe
+  and without a scan for orphan indexes, the next incident was
+  inevitable — and it happened six days later against a different image.
+- **"No alert fired, so it wasn't detected" is a monitoring gap, not an
+  outage feature.** The registry was serving 404s for 2+ hours before
+  anyone noticed, because our only signal was "pipeline failures" and
+  our eyes were elsewhere. The new probe closes that gap.
+- **CI pipelines should verify their own output.** The `buildx --push`
+  "success" exit code is not a guarantee of pulled-back integrity — as
+  this incident proves. A 30-second post-push HEAD walk is cheap
+  insurance.
+
+## Related
+
+- **Prior incident (same failure mode, different image)**: memory `709`
+  / `710` — 2026-04-13.
+- **Runbook**: `docs/runbooks/registry-rebuild-image.md` (new).
+- **Hot-fix commits**: `a05d63ee`, `6371e75e`, `c113be4d`.
+- **Upstream bug class**: `distribution/distribution#3324`.
diff --git a/docs/runbooks/registry-rebuild-image.md b/docs/runbooks/registry-rebuild-image.md
new file mode 100644
index 00000000..c1feb34f
--- /dev/null
+++ b/docs/runbooks/registry-rebuild-image.md
@@ -0,0 +1,170 @@
+# Runbook: Rebuild an Image After a Registry Orphan-Index Incident
+
+Last updated: 2026-04-19
+
+## When to use this
+
+Pipelines that pull from `registry.viktorbarzin.me:5050` are failing with
+messages like:
+
+- `failed to resolve reference … : not found`
+- `manifest unknown`
+- `image can't be pulled` (Woodpecker exit 126)
+- `error pulling image`: HEAD on a child manifest digest returns 404
+
+…and `skopeo inspect --tls-verify --creds "$USER:$PASS" docker://registry.viktorbarzin.me:5050/<image>:<tag>`
+returns an OCI image index whose `manifests[].digest` references are 404
+on the registry.
+
+This is the **orphan OCI-index** failure mode documented in
+`docs/post-mortems/2026-04-19-registry-orphan-index.md`. The fix is to
+rebuild the affected image from source so the registry receives a fresh,
+complete push.
+
+If the symptom is different (e.g., registry container down, TLS expiry,
+auth failure), use `docs/runbooks/registry-vm.md` instead.
+
+## Phase 1 — Confirm the diagnosis
+
+From any host with `skopeo`:
+
+```sh
+REG=registry.viktorbarzin.me:5050
+IMAGE=infra-ci
+TAG=latest
+
+# 1. Confirm the index exists.
+skopeo inspect --tls-verify --creds "$USER:$PASS" \
+  --raw "docker://$REG/$IMAGE:$TAG" | jq '.mediaType, .manifests[].digest'
+
+# 2. HEAD each child. Any non-200 = confirmed orphan.
+for d in $(skopeo inspect --tls-verify --creds "$USER:$PASS" --raw \
+    "docker://$REG/$IMAGE:$TAG" | jq -r '.manifests[].digest'); do
+  code=$(curl -sk -u "$USER:$PASS" -o /dev/null -w '%{http_code}' \
+    -I "https://$REG/v2/$IMAGE/manifests/$d")
+  echo "$d → $code"
+done
+```
+
+If every child is 200, the problem is elsewhere — stop here and check
+the registry VM, TLS, or auth.
+
+The `registry-integrity-probe` CronJob in the `monitoring` namespace
+runs this same check every 15 minutes across every tag in the catalog;
+its last run is also a fast way to see which image(s) are affected:
+
+```sh
+kubectl -n monitoring logs \
+  $(kubectl -n monitoring get pods -l job-name -o name \
+    | grep registry-integrity-probe | head -1)
+```
+
+## Phase 2 — Rebuild
+
+### Option A (preferred): rebuild via CI
+
+Find the `build-*.yml` pipeline that produces the image:
+
+| Image | Pipeline | Repo ID |
+|---|---|---|
+| `infra-ci` | `.woodpecker/build-ci-image.yml` | 1 (infra) |
+| `infra` (cli) | `.woodpecker/build-cli.yml` | 1 (infra) |
+| `k8s-portal` | `.woodpecker/k8s-portal.yml` | 1 (infra) |
+
+Trigger a manual build. The Woodpecker API expects a numeric repo ID
+(paths with `owner/name` return HTML):
+
+```sh
+WOODPECKER_TOKEN=$(vault kv get -field=woodpecker_admin_token secret/viktor)
+
+# Kick off a manual build against master.
+curl -s -X POST \
+  -H "Authorization: Bearer $WOODPECKER_TOKEN" \
+  -H "Content-Type: application/json" \
+  "https://ci.viktorbarzin.me/api/repos/1/pipelines" \
+  -d '{"branch":"master"}' | jq .number
+
+# Follow the pipeline at https://ci.viktorbarzin.me/repos/1/pipeline/<number>
+```
+
+The pipeline's `verify-integrity` step walks every blob the push
+references. If it passes, the registry now has a clean index; pull
+consumers will recover on next attempt.
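+
+If you'd rather block until the rebuild finishes than watch the UI, a
+small polling loop against the pipeline-status endpoint works (a sketch;
+`NUM` is the pipeline number returned by the POST above):
+
+```sh
+NUM=<number from the POST above>
+while :; do
+  STATUS=$(curl -s -H "Authorization: Bearer $WOODPECKER_TOKEN" \
+    "https://ci.viktorbarzin.me/api/repos/1/pipelines/$NUM" | jq -r .status)
+  echo "pipeline $NUM: $STATUS"
+  case "$STATUS" in
+    success) break ;;                 # verify-integrity passed
+    failure|error|killed) exit 1 ;;   # inspect the step logs in the UI
+  esac
+  sleep 10
+done
+```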
+
+### Option B (fallback): build on the registry VM
+
+Only use this if Woodpecker itself is broken (its own pipeline runs
+from the same `infra-ci` image, so a corrupted `infra-ci:latest` can
+prevent Option A from recovering).
+
+```sh
+ssh root@10.0.20.10 '
+  cd /tmp
+  git clone --depth 1 https://github.com/ViktorBarzin/infra
+  cd infra/ci
+  docker build -t registry.viktorbarzin.me:5050/infra-ci:manual -t registry.viktorbarzin.me:5050/infra-ci:latest .
+  docker login -u "$USER" -p "$PASS" registry.viktorbarzin.me:5050
+  docker push registry.viktorbarzin.me:5050/infra-ci:manual
+  docker push registry.viktorbarzin.me:5050/infra-ci:latest
+'
+```
+
+Then re-run any pipelines that failed — Woodpecker UI → Restart, or:
+
+```sh
+curl -s -X POST \
+  -H "Authorization: Bearer $WOODPECKER_TOKEN" \
+  "https://ci.viktorbarzin.me/api/repos/1/pipelines/<number>"
+```
+
+## Phase 3 — Verify
+
+```sh
+# 1. Inspect the image fresh from the registry (skopeo talks to the
+#    registry directly, so no containerd cache is involved) and check
+#    its index.
+REG=registry.viktorbarzin.me:5050
+skopeo inspect --tls-verify --creds "$USER:$PASS" \
+  --raw "docker://$REG/infra-ci:latest" \
+  | jq '.manifests[] | {digest, platform}'
+
+# 2. HEAD every child digest — all should be 200.
+for d in $(skopeo inspect --tls-verify --creds "$USER:$PASS" --raw \
+    "docker://$REG/infra-ci:latest" | jq -r '.manifests[].digest'); do
+  code=$(curl -sk -u "$USER:$PASS" -o /dev/null -w '%{http_code}' \
+    -I "https://$REG/v2/infra-ci/manifests/$d")
+  [ "$code" = "200" ] || echo "STILL BROKEN: $d → $code"
+done
+echo "verified"
+
+# 3. Kick off the next scheduled probe for good measure. Capture the job
+#    name once — calling $(date +%s) twice would yield two different names.
+JOB="registry-integrity-probe-verify-$(date +%s)"
+kubectl -n monitoring create job --from=cronjob/registry-integrity-probe "$JOB"
+kubectl -n monitoring logs -f -l job-name="$JOB"
+```
+
+The `RegistryManifestIntegrityFailure` alert clears automatically when
+the probe's next run returns zero failures.
+
+## Phase 4 — Investigate orphans
+
+Once the immediate fix is in, check whether any OTHER images on the
+registry have orphan children:
+
+```sh
+ssh root@10.0.20.10 'python3 /opt/registry/fix-broken-blobs.sh --dry-run 2>&1 | grep "ORPHAN INDEX"'
+```
+
+Each hit is a separate image that will eventually fail to pull. Rebuild
+them in the same way (Option A preferred). If the list is long, open a
+beads task — do NOT batch-delete the indexes; that's a destructive
+registry operation outside this runbook's scope.
+
+## Related
+
+- `docs/post-mortems/2026-04-19-registry-orphan-index.md` — why this
+  happens.
+- `docs/runbooks/registry-vm.md` — VM-level operations (DNS,
+  `docker compose` restarts).
+- `modules/docker-registry/fix-broken-blobs.sh` — the scanner cron
+  itself, runs nightly and after each GC.
+- `stacks/monitoring/modules/monitoring/main.tf` —
+  `registry_integrity_probe` CronJob definition.
diff --git a/docs/runbooks/registry-vm.md b/docs/runbooks/registry-vm.md
index 4c6fcd16..0d7518e4 100644
--- a/docs/runbooks/registry-vm.md
+++ b/docs/runbooks/registry-vm.md
@@ -145,3 +145,7 @@ ssh root@10.0.20.10 '
 - `docs/architecture/dns.md` — resolver IP assignments per subnet.
 - `.claude/CLAUDE.md` (at repo root) — notes on the private registry and
   `containerd` `hosts.toml` redirects.
+- `docs/runbooks/registry-rebuild-image.md` — rebuild an image after an
+  orphan OCI-index incident (different class of problem than DNS).
+- `docs/post-mortems/2026-04-19-registry-orphan-index.md` — root cause +
+  detection gaps behind the recurring missing-blob incidents.
diff --git a/modules/docker-registry/docker-compose.yml b/modules/docker-registry/docker-compose.yml index 3f3537b6..083e6bba 100644 --- a/modules/docker-registry/docker-compose.yml +++ b/modules/docker-registry/docker-compose.yml @@ -3,8 +3,12 @@ networks: driver: bridge services: + # registry:2 is pinned after the 2026-04-13 + 2026-04-19 orphan-index incidents. + # Floating tags were swapping to regressed versions between GC runs. Upgrade + # path: bump all six registry-* services in lockstep and bounce via + # `systemctl restart docker-compose-registry.service`. registry-dockerhub: - image: registry:2 + image: registry:2.8.3 container_name: registry-dockerhub restart: always volumes: @@ -22,7 +26,7 @@ services: start_period: 10s registry-ghcr: - image: registry:2 + image: registry:2.8.3 container_name: registry-ghcr restart: always volumes: @@ -38,7 +42,7 @@ services: start_period: 10s registry-quay: - image: registry:2 + image: registry:2.8.3 container_name: registry-quay restart: always volumes: @@ -54,7 +58,7 @@ services: start_period: 10s registry-k8s: - image: registry:2 + image: registry:2.8.3 container_name: registry-k8s restart: always volumes: @@ -70,7 +74,7 @@ services: start_period: 10s registry-kyverno: - image: registry:2 + image: registry:2.8.3 container_name: registry-kyverno restart: always volumes: @@ -86,7 +90,7 @@ services: start_period: 10s registry-private: - image: registry:2 + image: registry:2.8.3 container_name: registry-private restart: always volumes: diff --git a/modules/docker-registry/fix-broken-blobs.sh b/modules/docker-registry/fix-broken-blobs.sh index 84d35bdb..c3dfe702 100644 --- a/modules/docker-registry/fix-broken-blobs.sh +++ b/modules/docker-registry/fix-broken-blobs.sh @@ -1,25 +1,33 @@ #!/usr/bin/env python3 -"""Finds and removes layer links that point to non-existent blobs. +"""Registry integrity scanner — two classes of brokenness. -When the cleanup-tags.sh + garbage-collect cycle runs, it can delete blob data -while leaving _layers/ link files intact. The registry then returns HTTP 200 -with 0 bytes for those layers (it finds the link, trusts the blob exists, but -the data is gone). This causes containerd to fail with "unexpected EOF". +1. Orphaned layer links: the cleanup-tags.sh + garbage-collect cycle can delete + blob data while leaving _layers/ link files intact. The registry then returns + HTTP 200 with 0 bytes for those layers (it finds the link, trusts the blob + exists, but the data is gone). Containerd sees "unexpected EOF". + Action: delete the orphan link so the next pull re-fetches cleanly. -This script walks all repositories, checks each layer link against the actual -blobs directory, and removes any orphaned links. On next pull, the registry -will re-fetch the missing blobs from the upstream registry. +2. Orphaned OCI-index children: an image index (multi-platform manifest list) + references child manifests by digest. If a child's blob has been deleted — + by a cleanup-tags.sh tag rmtree followed by garbage-collect walking the + children wrong (distribution/distribution#3324 class), or by an incomplete + `buildx --push` whose partial blob was later purged by `uploadpurging` — + the index survives but pulls fail with `manifest unknown`. + Action: log loudly. Deleting an index is a conscious decision (the image + was published; removing it breaks downstream consumers), so we surface + the problem and leave repair to a human or to the rebuild runbook. -Run after garbage-collect (e.g., 3:15 AM Sunday) or daily. 
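+
+Usage:
+    fix-broken-blobs.sh [BASE] [--dry-run]
+
+BASE defaults to /opt/registry/data; --dry-run reports every finding
+without deleting anything.
+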
+Run after garbage-collect (Sunday 03:30) and daily (Mon-Sat 02:30). """ import argparse +import json import os import sys sys.stdout.reconfigure(line_buffering=True) -parser = argparse.ArgumentParser(description="Remove orphaned registry layer links") +parser = argparse.ArgumentParser(description="Scan registry for orphaned blobs and indexes") parser.add_argument("base", nargs="?", default="/opt/registry/data", help="Registry data directory") parser.add_argument("--dry-run", action="store_true", help="Report but don't delete") args = parser.parse_args() @@ -27,39 +35,101 @@ args = parser.parse_args() BASE = args.base DRY_RUN = args.dry_run -total_removed = 0 -total_checked = 0 +INDEX_MEDIA_TYPES = ( + "application/vnd.oci.image.index.v1+json", + "application/vnd.docker.distribution.manifest.list.v2+json", +) + +total_layer_removed = 0 +total_layer_checked = 0 +total_index_scanned = 0 +total_index_orphans = 0 + + +def load_manifest_blob(blobs_root, digest_hex): + blob_path = os.path.join(blobs_root, digest_hex[:2], digest_hex, "data") + if not os.path.isfile(blob_path): + return None + try: + with open(blob_path, "rb") as f: + raw = f.read(1024 * 1024) + except OSError: + return None + try: + return json.loads(raw) + except (json.JSONDecodeError, UnicodeDecodeError): + return None + for registry_name in sorted(os.listdir(BASE)): repos_dir = os.path.join(BASE, registry_name, "docker/registry/v2/repositories") - blobs_dir = os.path.join(BASE, registry_name, "docker/registry/v2/blobs") + blobs_root = os.path.join(BASE, registry_name, "docker/registry/v2/blobs/sha256") if not os.path.isdir(repos_dir): continue - for root, dirs, files in os.walk(repos_dir): - if not root.endswith("/_layers/sha256"): - continue + for root, _, _ in os.walk(repos_dir): + # --- Scan 1: orphan layer links ---------------------------------------- + if root.endswith("/_layers/sha256"): + repo = root.replace(repos_dir + "/", "").replace("/_layers/sha256", "") - repo = root.replace(repos_dir + "/", "").replace("/_layers/sha256", "") + for digest_dir in os.listdir(root): + link_file = os.path.join(root, digest_dir, "link") + if not os.path.isfile(link_file): + continue - for digest_dir in os.listdir(root): - link_file = os.path.join(root, digest_dir, "link") - if not os.path.isfile(link_file): - continue + total_layer_checked += 1 + blob_data = os.path.join(blobs_root, digest_dir[:2], digest_dir, "data") + if os.path.isfile(blob_data): + continue - total_checked += 1 - - # Check if the actual blob data exists - blob_data = os.path.join(blobs_dir, "sha256", digest_dir[:2], digest_dir, "data") - if not os.path.isfile(blob_data): prefix = "[DRY RUN] " if DRY_RUN else "" print(f"{prefix}[{registry_name}/{repo}] removing orphaned layer link: {digest_dir[:12]}...") if not DRY_RUN: - # Remove the entire digest directory (contains the link file) import shutil shutil.rmtree(os.path.join(root, digest_dir)) - total_removed += 1 + total_layer_removed += 1 + + # --- Scan 2: orphan OCI-index children -------------------------------- + elif root.endswith("/_manifests/revisions/sha256"): + repo = root.replace(repos_dir + "/", "").replace("/_manifests/revisions/sha256", "") + + for digest_dir in os.listdir(root): + # Manifest revision entry. Load the blob it points to. 
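+                # `digest_dir` is the bare hex digest of a manifest revision;
+                # the manifest JSON itself lives in the shared blob store at
+                # blobs/sha256/<first-two-hex>/<digest>/data, which is the
+                # path load_manifest_blob() reassembles.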
+                manifest = load_manifest_blob(blobs_root, digest_dir)
+                if manifest is None:
+                    continue
+
+                media_type = manifest.get("mediaType", "")
+                if media_type not in INDEX_MEDIA_TYPES:
+                    continue
+
+                total_index_scanned += 1
+
+                for child in manifest.get("manifests", []):
+                    child_digest = child.get("digest", "")
+                    if not child_digest.startswith("sha256:"):
+                        continue
+                    child_hex = child_digest[len("sha256:"):]
+                    child_blob = os.path.join(blobs_root, child_hex[:2], child_hex, "data")
+                    if os.path.isfile(child_blob):
+                        continue
+
+                    platform = child.get("platform", {})
+                    arch = platform.get("architecture", "?")
+                    os_ = platform.get("os", "?")
+                    print(
+                        f"WARNING [{registry_name}/{repo}] ORPHAN INDEX: "
+                        f"{digest_dir[:12]} references missing child {child_hex[:12]} "
+                        f"({os_}/{arch}) — rebuild required, will not auto-repair"
+                    )
+                    total_index_orphans += 1
+
 mode = "DRY RUN — " if DRY_RUN else ""
-print(f"\n{mode}Checked {total_checked} layer links, removed {total_removed} orphaned.")
+print(f"\n{mode}Layer scan: checked {total_layer_checked} links, removed {total_layer_removed} orphaned.")
+print(f"{mode}Index scan: inspected {total_index_scanned} image indexes, found {total_index_orphans} orphaned children.")
+if total_index_orphans > 0:
+    print(f"\nACTION REQUIRED: {total_index_orphans} orphan index child(ren) detected. "
+          "See docs/runbooks/registry-rebuild-image.md — the affected image must be rebuilt "
+          "(a registry DELETE on an index is a conscious decision, not an automated repair).")
diff --git a/stacks/monitoring/main.tf b/stacks/monitoring/main.tf
index ddf77022..c4961fdd 100644
--- a/stacks/monitoring/main.tf
+++ b/stacks/monitoring/main.tf
@@ -30,5 +30,7 @@ module "monitoring" {
   haos_api_token         = data.vault_kv_secret_v2.secrets.data["haos_api_token"]
   pve_password           = data.vault_kv_secret_v2.secrets.data["pve_password"]
   grafana_admin_password = data.vault_kv_secret_v2.secrets.data["grafana_admin_password"]
+  registry_user          = data.vault_kv_secret_v2.viktor.data["registry_user"]
+  registry_password      = data.vault_kv_secret_v2.viktor.data["registry_password"]
   tier                   = local.tiers.cluster
 }
diff --git a/stacks/monitoring/modules/monitoring/main.tf b/stacks/monitoring/modules/monitoring/main.tf
index 501124b1..db0c798e 100644
--- a/stacks/monitoring/modules/monitoring/main.tf
+++ b/stacks/monitoring/modules/monitoring/main.tf
@@ -29,6 +29,14 @@ variable "grafana_admin_password" {
 }
 variable "tier" { type = string }
 variable "mysql_host" { type = string }
+variable "registry_user" {
+  type      = string
+  sensitive = true
+}
+variable "registry_password" {
+  type      = string
+  sensitive = true
+}
 
 resource "kubernetes_namespace" "monitoring" {
   metadata {
@@ -225,6 +233,195 @@ resource "kubernetes_cron_job_v1" "dns_anomaly_monitor" {
   }
 }
 
+# -----------------------------------------------------------------------------
+# Registry manifest-integrity probe — HEADs every tag in the private R/W
+# registry's catalog, walks multi-platform image indexes, and reports blob
+# availability. Catches the orphan-index failure mode seen 2026-04-13 and
+# 2026-04-19 before downstream pipelines hit it.
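+# Exported metrics (via Pushgateway): registry_manifest_integrity_failures,
+# registry_manifest_integrity_catalog_accessible, and
+# registry_manifest_integrity_last_run_timestamp; the three Registry*
+# alerts further down consume them.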
+# See: docs/post-mortems/2026-04-19-registry-orphan-index.md
+# -----------------------------------------------------------------------------
+resource "kubernetes_secret" "registry_probe_credentials" {
+  metadata {
+    name      = "registry-probe-credentials"
+    namespace = kubernetes_namespace.monitoring.metadata[0].name
+  }
+  type = "Opaque"
+  data = {
+    REG_USER = var.registry_user
+    REG_PASS = var.registry_password
+  }
+}
+
+resource "kubernetes_cron_job_v1" "registry_integrity_probe" {
+  metadata {
+    name      = "registry-integrity-probe"
+    namespace = kubernetes_namespace.monitoring.metadata[0].name
+  }
+  spec {
+    concurrency_policy            = "Forbid"
+    failed_jobs_history_limit     = 3
+    successful_jobs_history_limit = 3
+    schedule                      = "*/15 * * * *"
+    job_template {
+      metadata {}
+      spec {
+        backoff_limit              = 1
+        ttl_seconds_after_finished = 600
+        template {
+          metadata {}
+          spec {
+            # Job pods must not use the provider default restart_policy
+            # ("Always"); the API rejects it for Job templates.
+            restart_policy = "Never"
+            container {
+              name  = "registry-integrity-probe"
+              image = "docker.io/library/alpine:3.20"
+              env {
+                name = "REG_USER"
+                value_from {
+                  secret_key_ref {
+                    name = kubernetes_secret.registry_probe_credentials.metadata[0].name
+                    key  = "REG_USER"
+                  }
+                }
+              }
+              env {
+                name = "REG_PASS"
+                value_from {
+                  secret_key_ref {
+                    name = kubernetes_secret.registry_probe_credentials.metadata[0].name
+                    key  = "REG_PASS"
+                  }
+                }
+              }
+              env {
+                name  = "REGISTRY_HOST"
+                value = "10.0.20.10:5050"
+              }
+              env {
+                name  = "REGISTRY_INSTANCE"
+                value = "registry.viktorbarzin.me:5050"
+              }
+              env {
+                name  = "PUSHGATEWAY"
+                value = "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/registry-integrity-probe"
+              }
+              env {
+                name  = "TAGS_PER_REPO"
+                value = "5"
+              }
+              command = ["/bin/sh", "-c", <<-EOT
+                set -eu
+                apk add --no-cache curl jq >/dev/null
+
+                REG="$REGISTRY_HOST"
+                INSTANCE="$REGISTRY_INSTANCE"
+                AUTH="$REG_USER:$REG_PASS"
+                ACCEPT='application/vnd.oci.image.index.v1+json,application/vnd.oci.image.manifest.v1+json,application/vnd.docker.distribution.manifest.list.v2+json,application/vnd.docker.distribution.manifest.v2+json'
+
+                push() {
+                  # Prometheus Pushgateway — the body must end with a trailing
+                  # newline. Ignore push errors; the staleness alert covers a
+                  # dead Pushgateway.
+                  curl -sf --max-time 10 --data-binary @- "$PUSHGATEWAY" >/dev/null 2>&1 || true
+                }
+
+                CATALOG=$(curl -sk -u "$AUTH" --max-time 30 "https://$REG/v2/_catalog?n=1000" || echo "")
+                REPOS=$(echo "$CATALOG" | jq -r '.repositories[]?' 2>/dev/null || echo "")
+
+                if [ -z "$REPOS" ]; then
+                  echo "ERROR: empty catalog or auth failure — cannot probe"
+                  NOW=$(date +%s)
+                  push <<EOF
+registry_manifest_integrity_catalog_accessible{instance="$INSTANCE"} 0
+registry_manifest_integrity_last_run_timestamp{instance="$INSTANCE"} $NOW
+EOF
+                  exit 1
+                fi
+
+                FAIL=0
+                REPOS_N=0
+                TAGS_N=0
+                INDEXES_N=0
+
+                echo "$REPOS" > /tmp/repos.txt
+                while IFS= read -r repo; do
+                  [ -z "$repo" ] && continue
+                  REPOS_N=$((REPOS_N + 1))
+
+                  TAGS_JSON=$(curl -sk -u "$AUTH" --max-time 15 "https://$REG/v2/$repo/tags/list" || echo "")
+                  echo "$TAGS_JSON" | jq -r '.tags[]?' 2>/dev/null | tail -n "$TAGS_PER_REPO" > /tmp/tags.txt || true
+
+                  while IFS= read -r tag; do
+                    [ -z "$tag" ] && continue
+                    TAGS_N=$((TAGS_N + 1))
+
+                    HTTP=$(curl -sk -u "$AUTH" -o /tmp/m.json -w '%%{http_code}' \
+                      -H "Accept: $ACCEPT" --max-time 15 \
+                      "https://$REG/v2/$repo/manifests/$tag")
+                    if [ "$HTTP" != "200" ]; then
+                      echo "FAIL: $repo:$tag manifest HTTP $HTTP"
+                      FAIL=$((FAIL + 1))
+                      continue
+                    fi
+
+                    MT=$(jq -r '.mediaType // empty' /tmp/m.json 2>/dev/null || echo "")
+                    if echo "$MT" | grep -Eq 'manifest\.list|image\.index'; then
+                      INDEXES_N=$((INDEXES_N + 1))
+                      jq -r '.manifests[].digest' /tmp/m.json > /tmp/children.txt 2>/dev/null || true
+                      while IFS= read -r d; do
+                        [ -z "$d" ] && continue
+                        CH=$(curl -sk -u "$AUTH" -o /dev/null -w '%%{http_code}' \
+                          -H "Accept: $ACCEPT" --max-time 10 -I \
+                          "https://$REG/v2/$repo/manifests/$d")
+                        if [ "$CH" != "200" ]; then
+                          echo "FAIL: $repo:$tag index child $d HTTP $CH"
+                          FAIL=$((FAIL + 1))
+                        fi
+                      done < /tmp/children.txt
+                    fi
+                  done < /tmp/tags.txt
+                done < /tmp/repos.txt
+
+                NOW=$(date +%s)
+                push <<EOF
+registry_manifest_integrity_failures{instance="$INSTANCE"} $FAIL
+registry_manifest_integrity_catalog_accessible{instance="$INSTANCE"} 1
+registry_manifest_integrity_last_run_timestamp{instance="$INSTANCE"} $NOW
+EOF
+
+                echo "Probe complete: $REPOS_N repo(s), $TAGS_N tag(s), $INDEXES_N index(es), $FAIL broken reference(s)."
+              EOT
+              ]
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+          - alert: RegistryManifestIntegrityFailure
+            expr: registry_manifest_integrity_failures > 0
+            for: 30m
+            labels:
+              severity: critical
+            annotations:
+              summary: "Registry has {{ $value }} broken manifest reference(s) — orphan index or missing blob"
+              description: "The registry-integrity-probe CronJob in the monitoring namespace found {{ $value }} manifest/blob references that return non-200 on the private registry. Almost certainly an orphan OCI-index child from the cleanup-tags.sh+GC race. Rebuild the affected image per docs/runbooks/registry-rebuild-image.md and investigate which tag(s) the probe logs flagged."
+          - alert: RegistryIntegrityProbeStale
+            expr: time() - registry_manifest_integrity_last_run_timestamp > 3600
+            for: 15m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Registry integrity probe has not reported in >1h — CronJob may be broken"
+          - alert: RegistryCatalogInaccessible
+            expr: registry_manifest_integrity_catalog_accessible == 0
+            for: 15m
+            labels:
+              severity: critical
+            annotations:
+              summary: "Registry probe cannot fetch /v2/_catalog — auth failure or registry down"
           - alert: NodeHighCPUUsage
             expr: pve_cpu_usage_ratio * 100 > 60
             for: 6h