From dd461beb33f27d7501af49bfc10f54b5628328d0 Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Sun, 29 Mar 2026 22:31:39 +0300
Subject: [PATCH] add registry blob integrity checker to self-heal corrupted cache

The cleanup-tags.sh + garbage-collect cycle can delete blob data while
leaving _layers/ link files intact. The registry then returns HTTP 200
with 0 bytes for those layers, causing "unexpected EOF" on image pulls.

fix-broken-blobs.sh walks all repositories, checks each layer link
against actual blob data, and removes orphaned links so the registry
re-fetches from upstream on next pull. Pass --dry-run (before or after
the optional data directory) to report orphaned links without deleting.

Schedule: Monday-Saturday at 2:30am (after tag cleanup) and Sunday at
3:30am (after garbage collection).

First run found 2335/2556 (91%) of layer links were orphaned.
---
 modules/docker-registry/fix-broken-blobs.sh | 70 +++++++++++++++++++++
 stacks/infra/main.tf                        |  7 +++
 2 files changed, 77 insertions(+)
 create mode 100644 modules/docker-registry/fix-broken-blobs.sh

diff --git a/modules/docker-registry/fix-broken-blobs.sh b/modules/docker-registry/fix-broken-blobs.sh
new file mode 100644
index 00000000..8cdadd1e
--- /dev/null
+++ b/modules/docker-registry/fix-broken-blobs.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+"""Finds and removes layer links that point to non-existent blobs.
+
+When the cleanup-tags.sh + garbage-collect cycle runs, it can delete blob data
+while leaving _layers/ link files intact. The registry then returns HTTP 200
+with 0 bytes for those layers (it finds the link, trusts the blob exists, but
+the data is gone). This causes containerd to fail with "unexpected EOF".
+
+This script walks all repositories, checks each layer link against the actual
+blobs directory, and removes any orphaned links. On next pull, the registry
+will re-fetch the missing blobs from the upstream registry.
+
+Usage: fix-broken-blobs.sh [BASE_DIR] [--dry-run]
+
+BASE_DIR defaults to /opt/registry/data. With --dry-run, orphaned links are
+reported but nothing is deleted.
+
+Run after garbage-collect (e.g., 3:30 AM Sunday) or daily.
+"""
+
+import os
+import shutil
+import sys
+
+sys.stdout.reconfigure(line_buffering=True)
+
+# The first non-flag argument is the data directory, so "--dry-run" may come
+# first without being mistaken for BASE.
+positional = [arg for arg in sys.argv[1:] if not arg.startswith("--")]
+BASE = positional[0] if positional else "/opt/registry/data"
+DRY_RUN = "--dry-run" in sys.argv
+
+total_removed = 0
+total_checked = 0
+
+for registry_name in sorted(os.listdir(BASE)):
+    repos_dir = os.path.join(BASE, registry_name, "docker/registry/v2/repositories")
+    blobs_dir = os.path.join(BASE, registry_name, "docker/registry/v2/blobs")
+
+    if not os.path.isdir(repos_dir):
+        continue
+
+    for root, dirs, files in os.walk(repos_dir):
+        if not root.endswith("/_layers/sha256"):
+            continue
+
+        repo = root.replace(repos_dir + "/", "").replace("/_layers/sha256", "")
+
+        for digest_dir in os.listdir(root):
+            link_file = os.path.join(root, digest_dir, "link")
+            if not os.path.isfile(link_file):
+                continue
+
+            total_checked += 1
+
+            # Check if the actual blob data exists
+            blob_data = os.path.join(blobs_dir, "sha256", digest_dir[:2], digest_dir, "data")
+            if os.path.isfile(blob_data):
+                continue
+
+            prefix = "[DRY RUN] " if DRY_RUN else ""
+            print(f"{prefix}[{registry_name}/{repo}] removing orphaned layer link: {digest_dir[:12]}...")
+            if not DRY_RUN:
+                # Remove the entire digest directory (contains the link file)
+                shutil.rmtree(os.path.join(root, digest_dir))
+            # Counted in dry-run mode too, so the summary shows what would go.
+            total_removed += 1
+
+mode = "DRY RUN — " if DRY_RUN else ""
+print(f"\n{mode}Checked {total_checked} layer links, removed {total_removed} orphaned.")
diff --git a/stacks/infra/main.tf b/stacks/infra/main.tf
index 2d75525a..dba2df4e 100644
--- a/stacks/infra/main.tf
+++ b/stacks/infra/main.tf
@@ -270,6 +270,10 @@ module "docker-registry-template" {
     format("echo %s | base64 -d > /opt/registry/cleanup-tags.sh && chmod +x /opt/registry/cleanup-tags.sh",
       base64encode(file("${path.root}/../../modules/docker-registry/cleanup-tags.sh"))
     ),
+    # Write blob integrity checker
+    format("echo %s | base64 -d > /opt/registry/fix-broken-blobs.sh && chmod +x /opt/registry/fix-broken-blobs.sh",
+      base64encode(file("${path.root}/../../modules/docker-registry/fix-broken-blobs.sh"))
+    ),
     # Create systemd unit for docker compose
     format("echo %s | base64 -d > /etc/systemd/system/docker-compose-registry.service",
       base64encode(<<-UNIT
@@ -304,6 +308,9 @@ UNIT
     "( crontab -l 2>/dev/null; echo '25 3 * * 0 /usr/bin/docker exec registry-private registry garbage-collect -m /etc/docker/registry/config.yml >> /var/log/registry-gc.log 2>&1' ) | crontab -",
     # Cron: tag cleanup (daily 2am, keep last 10 tags per image)
     "( crontab -l 2>/dev/null; echo '0 2 * * * python3 /opt/registry/cleanup-tags.sh 10 >> /var/log/registry-cleanup.log 2>&1' ) | crontab -",
+    # Cron: blob integrity check (Sunday 3:30am after GC, Mon-Sat 2:30am after tag cleanup)
+    "( crontab -l 2>/dev/null; echo '30 3 * * 0 python3 /opt/registry/fix-broken-blobs.sh >> /var/log/registry-fix-blobs.log 2>&1' ) | crontab -",
+    "( crontab -l 2>/dev/null; echo '30 2 * * 1-6 python3 /opt/registry/fix-broken-blobs.sh >> /var/log/registry-fix-blobs.log 2>&1' ) | crontab -",
   ]
 }
