add registry blob integrity checker to self-heal corrupted cache

The cleanup-tags.sh + garbage-collect cycle can delete blob data while
leaving _layers/ link files intact. The registry then returns HTTP 200
with 0 bytes for those layers, causing "unexpected EOF" on image pulls.

fix-broken-blobs.sh walks all repositories, checks each layer link
against actual blob data, and removes orphaned links so the registry
re-fetches from upstream on next pull.

Schedule: Monday–Saturday at 2:30am (after tag cleanup) and Sunday at
3:30am (after garbage collection). The first run found that 2335 of 2556
(91%) layer links were orphaned.
This commit is contained in:
Viktor Barzin 2026-03-29 22:31:39 +03:00
parent facf959ecf
commit dd461beb33
2 changed files with 66 additions and 0 deletions

View file

@ -0,0 +1,59 @@
#!/usr/bin/env python3
"""Find and remove registry layer links that point to non-existent blobs.

When the cleanup-tags.sh + garbage-collect cycle runs, it can delete blob data
while leaving _layers/ link files intact. The registry then returns HTTP 200
with 0 bytes for those layers (it finds the link, trusts the blob exists, but
the data is gone). This causes containerd to fail with "unexpected EOF".

This script walks all repositories, checks each layer link against the actual
blobs directory, and removes any orphaned links. On next pull, the registry
will re-fetch the missing blobs from the upstream registry.

Run after garbage-collect (e.g., Sunday 3:30 AM) or daily.

Usage:
    fix-broken-blobs.sh [BASE] [--dry-run]

BASE defaults to /opt/registry/data. With --dry-run, orphaned links are
reported but nothing is deleted.
"""
import os
import shutil
import sys

DEFAULT_BASE = "/opt/registry/data"
DRY_RUN_FLAG = "--dry-run"
LAYERS_SUFFIX = "/_layers/sha256"


def parse_args(argv):
    """Return ``(base, dry_run)`` parsed from *argv* (``argv[0]`` is the program).

    The ``--dry-run`` flag may appear anywhere on the command line; the first
    remaining argument, if any, is the registry data directory.
    """
    args = argv[1:]
    dry_run = DRY_RUN_FLAG in args
    # Exclude the flag itself so `script --dry-run` does not treat the literal
    # string "--dry-run" as the data directory.
    positional = [arg for arg in args if arg != DRY_RUN_FLAG]
    base = positional[0] if positional else DEFAULT_BASE
    return base, dry_run


def remove_orphaned_links(base, dry_run=False):
    """Remove layer links under *base* whose blob data file is missing.

    Each entry of *base* is treated as one registry data root containing
    ``docker/registry/v2/{repositories,blobs}``. A layer link is orphaned when
    ``blobs/sha256/<first two hex chars>/<digest>/data`` is not a regular file.

    Args:
        base: Directory holding one sub-directory per registry.
        dry_run: When true, report orphaned links without deleting them.

    Returns:
        A ``(checked, removed)`` tuple. In dry-run mode ``removed`` counts the
        links that would have been removed.

    Raises:
        OSError: If *base* cannot be listed or a link directory cannot be
            deleted.
    """
    checked = 0
    removed = 0
    prefix = "[DRY RUN] " if dry_run else ""

    for registry_name in sorted(os.listdir(base)):
        repos_dir = os.path.join(base, registry_name, "docker/registry/v2/repositories")
        blobs_dir = os.path.join(base, registry_name, "docker/registry/v2/blobs")
        if not os.path.isdir(repos_dir):
            continue

        for root, dirs, _files in os.walk(repos_dir):
            if not root.endswith(LAYERS_SUFFIX):
                continue

            digest_dirs = sorted(dirs)
            # Nothing below the digest directories is of interest, and some of
            # them are about to be deleted: stop os.walk from descending.
            dirs[:] = []

            # Repository path relative to repos_dir, without the layers suffix.
            repo = root[len(repos_dir) + 1:-len(LAYERS_SUFFIX)]

            for digest_dir in digest_dirs:
                link_dir = os.path.join(root, digest_dir)
                if not os.path.isfile(os.path.join(link_dir, "link")):
                    continue
                checked += 1

                # Check if the actual blob data exists
                blob_data = os.path.join(
                    blobs_dir, "sha256", digest_dir[:2], digest_dir, "data"
                )
                if os.path.isfile(blob_data):
                    continue

                print(f"{prefix}[{registry_name}/{repo}] removing orphaned layer link: {digest_dir[:12]}...")
                if not dry_run:
                    # Remove the entire digest directory (contains the link file)
                    shutil.rmtree(link_dir)
                removed += 1

    return checked, removed


def main(argv=None):
    """Command-line entry point; returns the process exit status."""
    if argv is None:
        argv = sys.argv
    # Flush each line immediately so progress shows up promptly in cron logs.
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(line_buffering=True)

    base, dry_run = parse_args(argv)
    total_checked, total_removed = remove_orphaned_links(base, dry_run)

    mode = "DRY RUN — " if dry_run else ""
    print(f"\n{mode}Checked {total_checked} layer links, removed {total_removed} orphaned.")
    return 0


if __name__ == "__main__":
    sys.exit(main())

View file

@ -270,6 +270,10 @@ module "docker-registry-template" {
format("echo %s | base64 -d > /opt/registry/cleanup-tags.sh && chmod +x /opt/registry/cleanup-tags.sh",
base64encode(file("${path.root}/../../modules/docker-registry/cleanup-tags.sh"))
),
# Write blob integrity checker
format("echo %s | base64 -d > /opt/registry/fix-broken-blobs.sh && chmod +x /opt/registry/fix-broken-blobs.sh",
base64encode(file("${path.root}/../../modules/docker-registry/fix-broken-blobs.sh"))
),
# Create systemd unit for docker compose
format("echo %s | base64 -d > /etc/systemd/system/docker-compose-registry.service",
base64encode(<<-UNIT
@ -304,6 +308,9 @@ UNIT
"( crontab -l 2>/dev/null; echo '25 3 * * 0 /usr/bin/docker exec registry-private registry garbage-collect -m /etc/docker/registry/config.yml >> /var/log/registry-gc.log 2>&1' ) | crontab -",
# Cron: tag cleanup (daily 2am, keep last 10 tags per image)
"( crontab -l 2>/dev/null; echo '0 2 * * * python3 /opt/registry/cleanup-tags.sh 10 >> /var/log/registry-cleanup.log 2>&1' ) | crontab -",
# Cron: blob integrity check (Sunday 3:30am after GC; Monday-Saturday 2:30am after tag cleanup)
"( crontab -l 2>/dev/null; echo '30 3 * * 0 python3 /opt/registry/fix-broken-blobs.sh >> /var/log/registry-fix-blobs.log 2>&1' ) | crontab -",
"( crontab -l 2>/dev/null; echo '30 2 * * 1-6 python3 /opt/registry/fix-broken-blobs.sh >> /var/log/registry-fix-blobs.log 2>&1' ) | crontab -",
]
}