[registry] Stop recurring orphan OCI-index incidents — detection + prevention + recovery
Second identical registry incident on 2026-04-19 (first 2026-04-13): the infra-ci:latest image index resolved to child manifests whose blobs had been garbage-collected out from under the index. Pipelines P366→P376 all exited 126 "image can't be pulled". Hot fix (a05d63e/6371e75/c113be4) restored green CI but left the underlying bug unaddressed. Root cause: cleanup-tags.sh rmtrees tag dirs on the registry VM daily at 02:00, and registry:2's GC (Sunday 03:25) walks OCI index children imperfectly (distribution/distribution#3324 class). Nothing verified pushes end-to-end; nothing probed the registry for fetchability; nothing caught orphan indexes. Phase 1 — Detection: - .woodpecker/build-ci-image.yml: after build-and-push, a verify-integrity step walks the just-pushed manifest (index + children + config + every layer blob) via HEAD and fails the pipeline on any non-200. Catches broken pushes at the source. - stacks/monitoring: new registry-integrity-probe CronJob (every 15m) and three alerts — RegistryManifestIntegrityFailure, RegistryIntegrityProbeStale, RegistryCatalogInaccessible — closing the "registry serves 404 for a tag that exists" gap that masked the incident for 2+ hours. - docs/post-mortems/2026-04-19-registry-orphan-index.md: root cause, timeline, monitoring gaps, permanent fix. Phase 2 — Prevention: - modules/docker-registry/docker-compose.yml: pin registry:2 → registry:2.8.3 across all six registry services. Removes the floating-tag footgun. - modules/docker-registry/fix-broken-blobs.sh: new scan walks every _manifests/revisions/sha256/<digest> that is an image index and logs a loud WARNING when a referenced child blob is missing. Does NOT auto-delete — deleting a published image is a conscious decision. Layer-link scan preserved. Phase 3 — Recovery: - build-ci-image.yml: accept `manual` event so Woodpecker API/UI rebuilds don't need a cosmetic Dockerfile edit (matches convention from pve-nfs-exports-sync.yml). 
- docs/runbooks/registry-rebuild-image.md: exact command sequence for diagnosing + rebuilding after an orphan-index incident, plus a fallback for building directly on the registry VM if Woodpecker itself is down. - docs/runbooks/registry-vm.md + .claude/reference/service-catalog.md: cross-references to the new runbook. Out of scope (verified healthy or intentionally deferred): - Pull-through DockerHub/GHCR mirrors (74.5% hit rate, no 404s). - Registry HA/replication (single-VM SPOF is a known architectural choice; Synology offsite covers RPO < 1 day). - Diun exclude for registry:2 — not applicable; Diun only watches k8s (DIUN_PROVIDERS_KUBERNETES=true), not the VM's docker-compose. Verified locally: - fix-broken-blobs.sh --dry-run on a synthetic registry directory correctly flags both orphan layer links and orphan OCI-index children. - terraform fmt + validate on stacks/monitoring: success (only unrelated deprecation warnings). - python3 yaml.safe_load on .woodpecker/build-ci-image.yml and modules/docker-registry/docker-compose.yml: both parse clean. Closes: code-4b8 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
df2c53db8d
commit
7cb44d7264
10 changed files with 779 additions and 41 deletions
|
|
@ -29,6 +29,14 @@ variable "grafana_admin_password" {
|
|||
}
|
||||
variable "tier" { type = string }
|
||||
variable "mysql_host" { type = string }
|
||||
# Username for the private registry. Stored as REG_USER in the
# registry-probe-credentials secret; marked sensitive so Terraform redacts it
# from plan/apply output.
variable "registry_user" {
  type      = string
  sensitive = true
}
|
||||
# Password for the private registry. Stored as REG_PASS in the
# registry-probe-credentials secret; marked sensitive so Terraform redacts it
# from plan/apply output.
variable "registry_password" {
  type      = string
  sensitive = true
}
|
||||
|
||||
resource "kubernetes_namespace" "monitoring" {
|
||||
metadata {
|
||||
|
|
@ -225,6 +233,195 @@ resource "kubernetes_cron_job_v1" "dns_anomaly_monitor" {
|
|||
}
|
||||
}
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Registry manifest-integrity probe — HEADs every tag in the private R/W
|
||||
# registry's catalog, walks multi-platform image indexes, and reports blob
|
||||
# availability. Catches the orphan-index failure mode seen 2026-04-13 and
|
||||
# 2026-04-19 before downstream pipelines hit it.
|
||||
# See: docs/post-mortems/2026-04-19-registry-orphan-index.md
|
||||
# -----------------------------------------------------------------------------
|
||||
# Registry credentials for the registry-integrity-probe CronJob. The CronJob
# reads the REG_USER / REG_PASS keys through secret_key_ref, so the key names
# here must stay in sync with its env blocks.
resource "kubernetes_secret" "registry_probe_credentials" {
  metadata {
    name      = "registry-probe-credentials"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
  }

  type = "Opaque"

  data = {
    REG_USER = var.registry_user
    REG_PASS = var.registry_password
  }
}
|
||||
|
||||
# CronJob that probes the private registry every 15 minutes.
#
# For each repository in /v2/_catalog it fetches the last TAGS_PER_REPO tags,
# GETs each tag's manifest, and — when the manifest is an image index or
# manifest list — HEADs every child manifest digest. Any non-200 response (or
# curl transport error) increments the failure counter. Results are pushed to
# the Pushgateway as registry_manifest_integrity_* gauges labelled with
# REGISTRY_INSTANCE; the job exits 1 when the catalog is unreadable or at
# least one reference is broken.
resource "kubernetes_cron_job_v1" "registry_integrity_probe" {
  metadata {
    name      = "registry-integrity-probe"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
  }
  spec {
    concurrency_policy            = "Forbid"
    failed_jobs_history_limit     = 3
    successful_jobs_history_limit = 3
    schedule                      = "*/15 * * * *"
    job_template {
      metadata {}
      spec {
        backoff_limit              = 1
        ttl_seconds_after_finished = 600
        template {
          metadata {}
          spec {
            container {
              name  = "registry-integrity-probe"
              image = "docker.io/library/alpine:3.20"
              env {
                name = "REG_USER"
                value_from {
                  secret_key_ref {
                    name = kubernetes_secret.registry_probe_credentials.metadata[0].name
                    key  = "REG_USER"
                  }
                }
              }
              env {
                name = "REG_PASS"
                value_from {
                  secret_key_ref {
                    name = kubernetes_secret.registry_probe_credentials.metadata[0].name
                    key  = "REG_PASS"
                  }
                }
              }
              # Address the probe actually connects to.
              env {
                name  = "REGISTRY_HOST"
                value = "10.0.20.10:5050"
              }
              # Value of the `instance` label on every pushed metric.
              env {
                name  = "REGISTRY_INSTANCE"
                value = "registry.viktorbarzin.me:5050"
              }
              env {
                name  = "PUSHGATEWAY"
                value = "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/registry-integrity-probe"
              }
              # Only the last N entries of each repo's tag list are probed.
              env {
                name  = "TAGS_PER_REPO"
                value = "5"
              }
              # NOTE: inside this Terraform heredoc `%%{` renders as a literal
              # `%{` (needed for curl's -w format). The bodies and terminators
              # of the nested shell heredocs (METRICS) sit at the script's base
              # indentation so they land in column 0 once `<<-EOT` strips the
              # common indent.
              command = ["/bin/sh", "-c", <<-EOT
                set -eu
                apk add --no-cache curl jq >/dev/null

                REG="$REGISTRY_HOST"
                INSTANCE="$REGISTRY_INSTANCE"
                AUTH="$REG_USER:$REG_PASS"
                ACCEPT='application/vnd.oci.image.index.v1+json,application/vnd.oci.image.manifest.v1+json,application/vnd.docker.distribution.manifest.list.v2+json,application/vnd.docker.distribution.manifest.v2+json'

                push() {
                  # Prometheus pushgateway: metrics are read from stdin. Push errors are
                  # ignored on purpose so a pushgateway outage does not mask probe output.
                  curl -sf --max-time 10 --data-binary @- "$PUSHGATEWAY" >/dev/null 2>&1 || true
                }

                CATALOG=$(curl -sk -u "$AUTH" --max-time 30 "https://$REG/v2/_catalog?n=1000" || echo "")
                REPOS=$(echo "$CATALOG" | jq -r '.repositories[]?' 2>/dev/null || echo "")

                if [ -z "$REPOS" ]; then
                  echo "ERROR: empty catalog or auth failure — cannot probe"
                  NOW=$(date +%s)
                  push <<METRICS
                # TYPE registry_manifest_integrity_catalog_accessible gauge
                registry_manifest_integrity_catalog_accessible{instance="$INSTANCE"} 0
                # TYPE registry_manifest_integrity_last_run_timestamp gauge
                registry_manifest_integrity_last_run_timestamp{instance="$INSTANCE"} $NOW
                METRICS
                  exit 1
                fi

                FAIL=0
                REPOS_N=0
                TAGS_N=0
                INDEXES_N=0

                printf '%s\n' $REPOS > /tmp/repos.txt
                while IFS= read -r repo; do
                  [ -z "$repo" ] && continue
                  REPOS_N=$((REPOS_N + 1))

                  TAGS_JSON=$(curl -sk -u "$AUTH" --max-time 15 "https://$REG/v2/$repo/tags/list" || echo "")
                  echo "$TAGS_JSON" | jq -r '.tags[]?' 2>/dev/null | tail -n "$TAGS_PER_REPO" > /tmp/tags.txt || true

                  while IFS= read -r tag; do
                    [ -z "$tag" ] && continue
                    TAGS_N=$((TAGS_N + 1))

                    # A curl transport error (timeout, reset, TLS) must be recorded as a
                    # failed check, not abort the whole run under `set -e` before any
                    # metrics are pushed — hence the explicit 000 fallback.
                    HTTP=$(curl -sk -u "$AUTH" -o /tmp/m.json -w '%%{http_code}' \
                      -H "Accept: $ACCEPT" --max-time 15 \
                      "https://$REG/v2/$repo/manifests/$tag") || HTTP="000"
                    if [ "$HTTP" != "200" ]; then
                      echo "FAIL: $repo:$tag manifest HTTP $HTTP"
                      FAIL=$((FAIL + 1))
                      continue
                    fi

                    MT=$(jq -r '.mediaType // empty' /tmp/m.json 2>/dev/null || echo "")
                    if echo "$MT" | grep -Eq 'manifest\.list|image\.index'; then
                      INDEXES_N=$((INDEXES_N + 1))
                      jq -r '.manifests[].digest' /tmp/m.json > /tmp/children.txt 2>/dev/null || true
                      while IFS= read -r d; do
                        [ -z "$d" ] && continue
                        # Same 000 fallback as above: keep probing after a transport error.
                        CH=$(curl -sk -u "$AUTH" -o /dev/null -w '%%{http_code}' \
                          -H "Accept: $ACCEPT" --max-time 10 -I \
                          "https://$REG/v2/$repo/manifests/$d") || CH="000"
                        if [ "$CH" != "200" ]; then
                          echo "FAIL: $repo:$tag index child $d HTTP $CH"
                          FAIL=$((FAIL + 1))
                        fi
                      done < /tmp/children.txt
                    fi
                  done < /tmp/tags.txt
                done < /tmp/repos.txt

                NOW=$(date +%s)
                push <<METRICS
                # TYPE registry_manifest_integrity_failures gauge
                registry_manifest_integrity_failures{instance="$INSTANCE"} $FAIL
                # TYPE registry_manifest_integrity_catalog_accessible gauge
                registry_manifest_integrity_catalog_accessible{instance="$INSTANCE"} 1
                # TYPE registry_manifest_integrity_repos_checked gauge
                registry_manifest_integrity_repos_checked{instance="$INSTANCE"} $REPOS_N
                # TYPE registry_manifest_integrity_tags_checked gauge
                registry_manifest_integrity_tags_checked{instance="$INSTANCE"} $TAGS_N
                # TYPE registry_manifest_integrity_indexes_checked gauge
                registry_manifest_integrity_indexes_checked{instance="$INSTANCE"} $INDEXES_N
                # TYPE registry_manifest_integrity_last_run_timestamp gauge
                registry_manifest_integrity_last_run_timestamp{instance="$INSTANCE"} $NOW
                METRICS

                echo "Probe complete: $FAIL failures across $REPOS_N repos / $TAGS_N tags / $INDEXES_N indexes"
                if [ "$FAIL" -gt 0 ]; then exit 1; fi
              EOT
              ]
              resources {
                requests = {
                  cpu    = "10m"
                  memory = "48Mi"
                }
                limits = {
                  memory = "96Mi"
                }
              }
            }
            restart_policy = "OnFailure"
          }
        }
      }
    }
  }
  lifecycle {
    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
  }
}
|
||||
|
||||
# Expose Pushgateway via NodePort so the PVE host can push LVM snapshot metrics
|
||||
resource "kubernetes_service" "pushgateway_nodeport" {
|
||||
metadata {
|
||||
|
|
|
|||
|
|
@ -1471,6 +1471,28 @@ serverFiles:
|
|||
severity: info
|
||||
annotations:
|
||||
summary: "Registry cache hit rate: {{ $value | printf \"%.0f\" }}% (threshold: 25%)"
|
||||
# Fires once the probe's failure gauge has been non-zero for 30 minutes.
- alert: RegistryManifestIntegrityFailure
  expr: registry_manifest_integrity_failures > 0
  for: 30m
  labels:
    severity: critical
  annotations:
    summary: 'Registry has {{ $value }} broken manifest reference(s) — orphan index or missing blob'
    description: >-
      The registry-integrity-probe CronJob in the monitoring namespace found
      {{ $value }} manifest/blob references that return non-200 on the private
      registry. Almost certainly an orphan OCI-index child from the
      cleanup-tags.sh+GC race. Rebuild the affected image per
      docs/runbooks/registry-rebuild-image.md and investigate which tag(s) the
      probe logs flagged.
|
||||
# Fires when the probe's last-run timestamp is older than one hour, or when
# the series does not exist at all. Without the absent() branch the comparison
# yields an empty result for a missing series, so a probe that never manages
# to push (its push step ignores errors) would never raise this alert.
- alert: RegistryIntegrityProbeStale
  expr: >-
    (time() - registry_manifest_integrity_last_run_timestamp > 3600)
    or absent(registry_manifest_integrity_last_run_timestamp)
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: "Registry integrity probe has not reported in >1h — CronJob may be broken"
|
||||
# Fires once the probe has reported catalog_accessible == 0 for 15 minutes,
# i.e. it could not list repositories from the registry catalog endpoint.
- alert: RegistryCatalogInaccessible
  expr: registry_manifest_integrity_catalog_accessible == 0
  for: 15m
  labels:
    severity: critical
  annotations:
    summary: 'Registry probe cannot fetch /v2/_catalog — auth failure or registry down'
|
||||
- alert: NodeHighCPUUsage
|
||||
expr: pve_cpu_usage_ratio * 100 > 60
|
||||
for: 6h
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue