add registry blob integrity checker to self-heal corrupted cache
The cleanup-tags.sh + garbage-collect cycle can delete blob data while leaving _layers/ link files intact. The registry then returns HTTP 200 with 0 bytes for those layers, causing "unexpected EOF" on image pulls. fix-broken-blobs.sh walks all repositories, checks each layer link against the actual blob data, and removes orphaned links so the registry re-fetches from upstream on the next pull. Schedule: Monday–Saturday at 2:30am (after tag cleanup) and Sunday at 3:30am (after garbage collection). The first run found that 2335 of 2556 layer links (91%) were orphaned.
This commit is contained in:
parent
facf959ecf
commit
dd461beb33
2 changed files with 66 additions and 0 deletions
59
modules/docker-registry/fix-broken-blobs.sh
Normal file
59
modules/docker-registry/fix-broken-blobs.sh
Normal file
|
|
@ -0,0 +1,59 @@
|
||||||
|
#!/usr/bin/env python3
"""Find and remove layer links that point to non-existent blobs.

When the cleanup-tags.sh + garbage-collect cycle runs, it can delete blob data
while leaving _layers/ link files intact. The registry then returns HTTP 200
with 0 bytes for those layers (it finds the link, trusts the blob exists, but
the data is gone). This causes containerd to fail with "unexpected EOF".

This script walks all repositories, checks each layer link against the actual
blobs directory, and removes any orphaned links. On next pull, the registry
will re-fetch the missing blobs from the upstream registry.

Usage:
    fix-broken-blobs.sh [BASE] [--dry-run]

BASE is the directory holding one sub-directory per registry (default:
/opt/registry/data). With --dry-run, orphaned links are reported but nothing
is deleted. The flag may appear before or after BASE.

Run it after garbage-collect has finished, and/or daily after tag cleanup.
"""

import os
import shutil
import sys

DEFAULT_BASE = "/opt/registry/data"
DRY_RUN_FLAG = "--dry-run"


def parse_args(argv):
    """Split command-line arguments into ``(base, dry_run)``.

    Args:
        argv: Arguments without the program name (i.e. ``sys.argv[1:]``).

    Returns:
        A tuple ``(base, dry_run)`` where ``base`` is the first argument that
        is not ``--dry-run`` (or ``DEFAULT_BASE`` when there is none) and
        ``dry_run`` is True when ``--dry-run`` appears anywhere in ``argv``.
    """
    dry_run = DRY_RUN_FLAG in argv
    # The flag must never be mistaken for the data directory, so filter it
    # out before picking the positional BASE argument.
    positional = [arg for arg in argv if arg != DRY_RUN_FLAG]
    base = positional[0] if positional else DEFAULT_BASE
    return base, dry_run


def fix_broken_blobs(base, dry_run=False):
    """Remove layer links under ``base`` whose blob data file is missing.

    Args:
        base: Directory containing one sub-directory per registry; each is
            expected to hold ``docker/registry/v2/{repositories,blobs}``.
            Entries without a ``repositories`` directory are skipped.
        dry_run: When True, only report orphaned links; delete nothing.

    Returns:
        A tuple ``(checked, orphaned)``: the number of layer links inspected
        and the number of orphaned links removed (or, in dry-run mode, that
        would have been removed). Links whose removal failed are reported on
        stderr and not counted.

    Raises:
        OSError: If ``base`` itself cannot be listed.
    """
    checked = 0
    orphaned = 0
    prefix = "[DRY RUN] " if dry_run else ""
    layers_suffix = os.sep + os.path.join("_layers", "sha256")

    for registry_name in sorted(os.listdir(base)):
        v2_dir = os.path.join(base, registry_name, "docker", "registry", "v2")
        repos_dir = os.path.join(v2_dir, "repositories")
        blobs_dir = os.path.join(v2_dir, "blobs", "sha256")

        if not os.path.isdir(repos_dir):
            continue

        for root, dirs, _files in os.walk(repos_dir):
            if not root.endswith(layers_suffix):
                continue

            digests = sorted(dirs)
            # Digest directories are handled right here (and may be deleted
            # below), so stop os.walk from descending into them.
            dirs[:] = []

            # <repos_dir>/<repo>/_layers/sha256 -> <repo>
            repo = os.path.relpath(
                os.path.dirname(os.path.dirname(root)), repos_dir
            )

            for digest in digests:
                digest_path = os.path.join(root, digest)
                if not os.path.isfile(os.path.join(digest_path, "link")):
                    continue

                checked += 1

                # Blob data lives at blobs/sha256/<first 2 hex chars>/<digest>/data
                blob_data = os.path.join(blobs_dir, digest[:2], digest, "data")
                if os.path.isfile(blob_data):
                    continue

                print(f"{prefix}[{registry_name}/{repo}] removing orphaned layer link: {digest[:12]}...")
                if not dry_run:
                    try:
                        # Remove the entire digest directory (contains the link file)
                        shutil.rmtree(digest_path)
                    except OSError as err:
                        # One undeletable link must not abort the whole sweep.
                        print(
                            f"[{registry_name}/{repo}] failed to remove {digest[:12]}...: {err}",
                            file=sys.stderr,
                        )
                        continue
                orphaned += 1

    return checked, orphaned


def main(argv=None):
    """Run the integrity check and print a summary.

    Args:
        argv: Arguments without the program name; defaults to ``sys.argv[1:]``.

    Returns:
        Process exit code: 0 on success, 1 when the base directory is missing.
    """
    base, dry_run = parse_args(sys.argv[1:] if argv is None else argv)

    if not os.path.isdir(base):
        print(f"error: registry data directory not found: {base}", file=sys.stderr)
        return 1

    checked, orphaned = fix_broken_blobs(base, dry_run)

    mode = "DRY RUN — " if dry_run else ""
    print(f"\n{mode}Checked {checked} layer links, removed {orphaned} orphaned.")
    return 0


if __name__ == "__main__":
    # Flush every printed line immediately instead of block-buffering stdout.
    sys.stdout.reconfigure(line_buffering=True)
    sys.exit(main())
|
||||||
|
|
@ -270,6 +270,10 @@ module "docker-registry-template" {
|
||||||
format("echo %s | base64 -d > /opt/registry/cleanup-tags.sh && chmod +x /opt/registry/cleanup-tags.sh",
|
format("echo %s | base64 -d > /opt/registry/cleanup-tags.sh && chmod +x /opt/registry/cleanup-tags.sh",
|
||||||
base64encode(file("${path.root}/../../modules/docker-registry/cleanup-tags.sh"))
|
base64encode(file("${path.root}/../../modules/docker-registry/cleanup-tags.sh"))
|
||||||
),
|
),
|
||||||
|
# Write blob integrity checker
|
||||||
|
format("echo %s | base64 -d > /opt/registry/fix-broken-blobs.sh && chmod +x /opt/registry/fix-broken-blobs.sh",
|
||||||
|
base64encode(file("${path.root}/../../modules/docker-registry/fix-broken-blobs.sh"))
|
||||||
|
),
|
||||||
# Create systemd unit for docker compose
|
# Create systemd unit for docker compose
|
||||||
format("echo %s | base64 -d > /etc/systemd/system/docker-compose-registry.service",
|
format("echo %s | base64 -d > /etc/systemd/system/docker-compose-registry.service",
|
||||||
base64encode(<<-UNIT
|
base64encode(<<-UNIT
|
||||||
|
|
@ -304,6 +308,9 @@ UNIT
|
||||||
"( crontab -l 2>/dev/null; echo '25 3 * * 0 /usr/bin/docker exec registry-private registry garbage-collect -m /etc/docker/registry/config.yml >> /var/log/registry-gc.log 2>&1' ) | crontab -",
|
"( crontab -l 2>/dev/null; echo '25 3 * * 0 /usr/bin/docker exec registry-private registry garbage-collect -m /etc/docker/registry/config.yml >> /var/log/registry-gc.log 2>&1' ) | crontab -",
|
||||||
# Cron: tag cleanup (daily 2am, keep last 10 tags per image)
|
# Cron: tag cleanup (daily 2am, keep last 10 tags per image)
|
||||||
"( crontab -l 2>/dev/null; echo '0 2 * * * python3 /opt/registry/cleanup-tags.sh 10 >> /var/log/registry-cleanup.log 2>&1' ) | crontab -",
|
"( crontab -l 2>/dev/null; echo '0 2 * * * python3 /opt/registry/cleanup-tags.sh 10 >> /var/log/registry-cleanup.log 2>&1' ) | crontab -",
|
||||||
|
# Cron: blob integrity check (Sunday 3:30am after GC; Monday-Saturday 2:30am after tag cleanup)
|
||||||
|
"( crontab -l 2>/dev/null; echo '30 3 * * 0 python3 /opt/registry/fix-broken-blobs.sh >> /var/log/registry-fix-blobs.log 2>&1' ) | crontab -",
|
||||||
|
"( crontab -l 2>/dev/null; echo '30 2 * * 1-6 python3 /opt/registry/fix-broken-blobs.sh >> /var/log/registry-fix-blobs.log 2>&1' ) | crontab -",
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue