fix: restore tree dropped by 6d224861; land stem95su gdrive-sync (10m) [ci skip]

6d224861 came from a --no-checkout worktree whose empty index made the
commit drop every file except two. This restores 05b50d2b's full tree and
correctly adds stacks/stem95su/gdrive-sync.tf + the service-catalog stem95su
entry. Forward-only (parent=6d224861, no force-push); [ci skip] since the
live infra was never applied from the broken commit.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-06-09 08:45:33 +00:00
parent 6d224861c4
commit fd0f4a0365
1166 changed files with 358546 additions and 0 deletions

View file

@ -0,0 +1,49 @@
#!/usr/bin/env python3
"""Keeps only the N most recent tags per image in pull-through cache registries.

Deletes old tag links directly from the filesystem since the API doesn't support
DELETE on proxy registries. Run garbage-collect after to reclaim blob storage.

Positional arguments (both optional):
    1. KEEP - number of newest tags (by directory mtime) to retain per
              repository; defaults to 10 and must be >= 0.
    2. BASE - directory containing one sub-directory per registry;
              defaults to /opt/registry/data.
"""
import os
import shutil
import sys

sys.stdout.reconfigure(line_buffering=True)

KEEP = int(sys.argv[1]) if len(sys.argv) > 1 else 10
BASE = sys.argv[2] if len(sys.argv) > 2 else "/opt/registry/data"

# A negative KEEP would turn the `tag_times[KEEP:]` slice below into "the last
# |KEEP| entries": only the oldest tags would be deleted while the log line
# claims "keeping -N". Refuse it instead of doing something surprising.
if KEEP < 0:
    sys.exit(f"KEEP must be >= 0, got {KEEP}")

total_deleted = 0
for registry_name in sorted(os.listdir(BASE)):
    storage = os.path.join(BASE, registry_name, "docker/registry/v2/repositories")
    if not os.path.isdir(storage):
        continue

    for root, dirs, _ in os.walk(storage):
        # Only <repo>/_manifests/tags directories hold tag links.
        manifests_dir, leaf = os.path.split(root)
        if leaf != "tags" or os.path.basename(manifests_dir) != "_manifests":
            continue

        # Every sub-directory of a tags dir is one tag. Snapshot the names,
        # then prune the walk: nothing below a tag needs visiting, and some of
        # these directories are about to be removed.
        tag_names = list(dirs)
        dirs[:] = []

        repo = os.path.relpath(os.path.dirname(manifests_dir), storage)

        tag_times = []
        for tag in tag_names:
            tag_path = os.path.join(root, tag)
            tag_times.append((os.path.getmtime(tag_path), tag, tag_path))

        if len(tag_times) <= KEEP:
            continue

        # Newest first; everything past the first KEEP entries is deleted.
        tag_times.sort(reverse=True)
        to_delete = tag_times[KEEP:]
        print(f"[{registry_name}/{repo}] {len(tag_times)} tags -> keeping {KEEP}, deleting {len(to_delete)}")
        for _, tag, tag_path in to_delete:
            shutil.rmtree(tag_path)
            total_deleted += 1
        print(" done")

print(f"\nDeleted {total_deleted} tags. Run garbage-collect to reclaim space.")

View file

@ -0,0 +1,31 @@
# Distribution (Docker registry) configuration for the instance that logs as
# `registry-private`: filesystem storage, htpasswd authentication, deletes
# enabled.
version: 0.1
log:
  fields:
    service: registry-private
storage:
  cache:
    blobdescriptor: inmemory
  filesystem:
    rootdirectory: /var/lib/registry
    # NOTE(review): `maxsize` does not appear among the documented filesystem
    # driver parameters (rootdirectory, maxthreads) - confirm it has an effect.
    maxsize: 100GiB
  delete:
    enabled: true
  maintenance:
    # Purge abandoned upload sessions older than a week, checking every 4h.
    uploadpurging:
      enabled: true
      age: 168h
      interval: 4h
      dryrun: false
auth:
  htpasswd:
    realm: "Registry Realm"
    path: /auth/htpasswd
http:
  addr: ":5000"
  headers:
    X-Content-Type-Options: [nosniff]
health:
  storagedriver:
    enabled: true
    interval: 10s
    threshold: 3

View file

@ -0,0 +1,30 @@
# Template for a pull-through cache registry. `${name}` and `${remote_url}`
# are substituted by whatever renders this file before it is deployed.
version: 0.1
log:
  fields:
    service: registry-${name}
storage:
  cache:
    blobdescriptor: inmemory
  filesystem:
    rootdirectory: /var/lib/registry
  delete:
    enabled: true
  maintenance:
    # Purge abandoned upload sessions older than a day, checking every 4h.
    uploadpurging:
      enabled: true
      age: 24h
      interval: 4h
      dryrun: false
http:
  addr: ":5000"
  draintimeout: 60s
  headers:
    X-Content-Type-Options: [nosniff]
health:
  storagedriver:
    enabled: true
    interval: 10s
    threshold: 3
proxy:
  remoteurl: ${remote_url}
  # NOTE(review): confirm the pinned registry image honours `proxy.ttl`;
  # presumably 0 is meant to disable time-based expiry of cached content.
  ttl: 0

View file

@ -0,0 +1,41 @@
# Pull-through cache configuration for Docker Hub. `${password}` is substituted
# by whatever renders this file before it is deployed.
version: 0.1
log:
  fields:
    service: registry
storage:
  cache:
    blobdescriptor: inmemory
  filesystem:
    rootdirectory: /var/lib/registry
  delete:
    enabled: true
  maintenance:
    # Purge abandoned upload sessions older than a day, checking every 4h.
    uploadpurging:
      enabled: true
      age: 24h
      interval: 4h
      dryrun: false
    readonly:
      enabled: false
http:
  addr: ":5000"
  draintimeout: 60s
  headers:
    X-Content-Type-Options: [nosniff]
  debug:
    addr: ":5001"
    # Enable proxy on nodes - https://github.com/containerd/containerd/blob/main/docs/cri/registry.md
    # https://ops.tips/gists/retrieving-docker-registry-metrics-using-prometheus/
    prometheus:
      enabled: true
      path: "/metrics"
health:
  storagedriver:
    enabled: true
    interval: 10s
    threshold: 3
proxy:
  remoteurl: https://registry-1.docker.io
  username: vbarzin@gmail.com
  # NOTE(review): the substituted value lands here unquoted, so a password that
  # is all digits, starts with a YAML indicator, or contains " #" or ": " would
  # be mis-parsed. Consider having the renderer emit a quoted/escaped string.
  password: ${password}
  # NOTE(review): confirm the pinned registry image honours `proxy.ttl`.
  ttl: 0

View file

@ -0,0 +1,158 @@
# Compose stack: five pull-through cache registries fronted by nginx, plus a
# web UI pointed at the Docker Hub cache.
networks:
  registry:
    driver: bridge

services:
  # registry:2 is pinned after the 2026-04-13 + 2026-04-19 orphan-index incidents.
  # Floating tags were swapping to regressed versions between GC runs. Upgrade
  # path: bump all five registry-* services in lockstep and bounce via
  # `systemctl restart docker-compose-registry.service`.
  registry-dockerhub:
    image: registry:2.8.3
    container_name: registry-dockerhub
    restart: always
    volumes:
      - /opt/registry/data/dockerhub:/var/lib/registry
      - /opt/registry/config-dockerhub.yml:/etc/docker/registry/config.yml:ro
    networks:
      - registry
    # 5001 is the debug/metrics listener (http.debug.addr in the Docker Hub
    # registry config); the API port 5000 is only reached through nginx.
    ports:
      - "5001:5001"
    healthcheck:
      test: ["CMD", "sh", "-c", "wget -qO- http://127.0.0.1:5000/v2/ >/dev/null 2>&1"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 10s

  registry-ghcr:
    image: registry:2.8.3
    container_name: registry-ghcr
    restart: always
    volumes:
      - /opt/registry/data/ghcr:/var/lib/registry
      - /opt/registry/config-ghcr.yml:/etc/docker/registry/config.yml:ro
    networks:
      - registry
    healthcheck:
      test: ["CMD", "sh", "-c", "wget -qO- http://127.0.0.1:5000/v2/ >/dev/null 2>&1"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 10s

  registry-quay:
    image: registry:2.8.3
    container_name: registry-quay
    restart: always
    volumes:
      - /opt/registry/data/quay:/var/lib/registry
      - /opt/registry/config-quay.yml:/etc/docker/registry/config.yml:ro
    networks:
      - registry
    healthcheck:
      test: ["CMD", "sh", "-c", "wget -qO- http://127.0.0.1:5000/v2/ >/dev/null 2>&1"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 10s

  registry-k8s:
    image: registry:2.8.3
    container_name: registry-k8s
    restart: always
    volumes:
      - /opt/registry/data/k8s:/var/lib/registry
      - /opt/registry/config-k8s.yml:/etc/docker/registry/config.yml:ro
    networks:
      - registry
    healthcheck:
      test: ["CMD", "sh", "-c", "wget -qO- http://127.0.0.1:5000/v2/ >/dev/null 2>&1"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 10s

  registry-kyverno:
    image: registry:2.8.3
    container_name: registry-kyverno
    restart: always
    volumes:
      - /opt/registry/data/kyverno:/var/lib/registry
      - /opt/registry/config-kyverno.yml:/etc/docker/registry/config.yml:ro
    networks:
      - registry
    healthcheck:
      test: ["CMD", "sh", "-c", "wget -qO- http://127.0.0.1:5000/v2/ >/dev/null 2>&1"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 10s

  # registry-private decommissioned in Phase 4 of
  # forgejo-registry-consolidation 2026-05-07 — image migration completed,
  # cluster flipped to forgejo.viktorbarzin.me/viktor/<image>. The remaining
  # five services on this VM are pull-through caches for upstream registries.
  # After 1 week of no incidents, `rm -rf /opt/registry/data/private/` on the
  # VM frees ~2.6 GB. The tarball break-glass under
  # /opt/registry/data/private/_breakglass/ stays — it's how we recover
  # infra-ci if Forgejo ever goes fully down.

  nginx:
    image: nginx:alpine
    container_name: registry-nginx
    restart: always
    # 5050 dropped Phase 4 of forgejo-registry-consolidation 2026-05-07.
    # NOTE(review): the nginx.conf added in the same change only shows
    # `listen 5000` and `listen 5010` server blocks — confirm that
    # 5020/5030/5040 are actually served.
    ports:
      - "5000:5000"
      - "5010:5010"
      - "5020:5020"
      - "5030:5030"
      - "5040:5040"
    volumes:
      - /opt/registry/nginx.conf:/etc/nginx/nginx.conf:ro
      - /opt/registry/tls:/etc/nginx/tls:ro
      - nginx-cache:/var/cache/nginx
    networks:
      - registry
    # Start only once every cache registry reports healthy.
    depends_on:
      registry-dockerhub:
        condition: service_healthy
      registry-ghcr:
        condition: service_healthy
      registry-quay:
        condition: service_healthy
      registry-k8s:
        condition: service_healthy
      registry-kyverno:
        condition: service_healthy
    healthcheck:
      test: ["CMD", "sh", "-c", "wget -qO- http://127.0.0.1:5000/v2/ >/dev/null 2>&1"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 15s

  registry-ui:
    image: joxit/docker-registry-ui:latest
    container_name: registry-ui
    restart: always
    ports:
      - "8080:80"
    environment:
      - NGINX_PROXY_PASS_URL=http://registry-dockerhub:5000
      - DELETE_IMAGES=true
      - SINGLE_REGISTRY=true
      - SHOW_CONTENT_DIGEST=true
      - SHOW_CATALOG_NB_TAGS=true
      - CATALOG_ELEMENTS_LIMIT=1000
      - TAGLIST_PAGE_SIZE=100
      - REGISTRY_TITLE=viktorbarzin.me
    networks:
      - registry
    depends_on:
      registry-dockerhub:
        condition: service_healthy

volumes:
  nginx-cache:

View file

@ -0,0 +1,158 @@
#!/usr/bin/env python3
"""Registry integrity scanner — two classes of brokenness.
1. Orphaned layer links: the cleanup-tags.sh + garbage-collect cycle can delete
blob data while leaving _layers/ link files intact. The registry then returns
HTTP 200 with 0 bytes for those layers (it finds the link, trusts the blob
exists, but the data is gone). Containerd sees "unexpected EOF".
Action: delete the orphan link so the next pull re-fetches cleanly.
2. Orphaned OCI-index children: an image index (multi-platform manifest list)
references child manifests by digest. If a child's blob has been deleted —
by a cleanup-tags.sh tag rmtree followed by garbage-collect walking the
children wrong (distribution/distribution#3324 class), or by an incomplete
`buildx --push` whose partial blob was later purged by `uploadpurging` —
the index survives but pulls fail with `manifest unknown`.
Action: log loudly. Deleting an index is a conscious decision (the image
was published; removing it breaks downstream consumers), so we surface
the problem and leave repair to a human or to the rebuild runbook.
Run after garbage-collect (Sunday 03:30) and daily (Mon-Sat 02:30).
"""
import argparse
import json
import os
import shutil
import sys
# Line-buffer stdout so each report line is flushed as soon as it is printed.
sys.stdout.reconfigure(line_buffering=True)
# Command line: optional positional data directory plus a --dry-run switch.
parser = argparse.ArgumentParser(description="Scan registry for orphaned blobs and indexes")
parser.add_argument("base", nargs="?", default="/opt/registry/data", help="Registry data directory")
parser.add_argument("--dry-run", action="store_true", help="Report but don't delete")
args = parser.parse_args()
BASE = args.base
DRY_RUN = args.dry_run
# Manifest media types that denote a multi-platform index / manifest list.
INDEX_MEDIA_TYPES = (
    "application/vnd.oci.image.index.v1+json",
    "application/vnd.docker.distribution.manifest.list.v2+json",
)
# Only the private R/W registry is authoritative for every child of every
# index it stores — we pushed those indexes ourselves, so a missing child is
# always a bug (the 2026-04-13 + 2026-04-19 failure mode).
#
# Pull-through caches (dockerhub, ghcr, quay, k8s, kyverno) are ALLOWED to
# have missing children: they only fetch what someone actually pulls.
# Uncached arm64 / arm / attestation variants of a multi-platform index are
# normal partial state, not orphans. Scanning them generates hundreds of
# false-positive warnings — noise that would mask the real signal from the
# private registry. Scan 2 is therefore private-only.
#
# NOTE(review): the compose file in this change says registry-private was
# decommissioned on 2026-05-07 — confirm a `private` data directory is still
# expected under BASE, otherwise Scan 2 never inspects anything.
INDEX_SCAN_REGISTRIES = ("private",)
# Running totals, printed in the summary at the end of the script.
total_layer_removed = 0
total_layer_checked = 0
total_index_scanned = 0
total_index_orphans = 0
def load_manifest_blob(blobs_root, digest_hex):
    """Load the JSON manifest stored for a sha256 digest.

    Args:
        blobs_root: Path of the registry's ``blobs/sha256`` directory.
        digest_hex: Hex digest (without the ``sha256:`` prefix); the blob is
            looked up at ``<blobs_root>/<first two hex chars>/<digest>/data``.

    Returns:
        The parsed manifest as a dict, or None when the blob file is missing,
        cannot be read, is not valid JSON, or parses to something other than a
        JSON object. At most 1 MiB is read, so a larger blob fails to parse and
        also yields None.
    """
    blob_path = os.path.join(blobs_root, digest_hex[:2], digest_hex, "data")
    if not os.path.isfile(blob_path):
        return None
    try:
        with open(blob_path, "rb") as f:
            raw = f.read(1024 * 1024)
    except OSError:
        return None
    try:
        manifest = json.loads(raw)
    except (json.JSONDecodeError, UnicodeDecodeError):
        return None
    # Callers immediately use `.get()` on the result; a blob that happens to be
    # valid JSON but not an object (list, number, string) must not crash them.
    return manifest if isinstance(manifest, dict) else None
# Walk every registry under BASE once, running both scans on the directories
# they care about, then print a summary.
for registry_name in sorted(os.listdir(BASE)):
    repos_dir = os.path.join(BASE, registry_name, "docker/registry/v2/repositories")
    blobs_root = os.path.join(BASE, registry_name, "docker/registry/v2/blobs/sha256")
    if not os.path.isdir(repos_dir):
        continue

    for root, _, _ in os.walk(repos_dir):
        # --- Scan 1: orphan layer links ----------------------------------------
        if root.endswith("/_layers/sha256"):
            repo = root.replace(repos_dir + "/", "").replace("/_layers/sha256", "")
            for digest_dir in os.listdir(root):
                link_file = os.path.join(root, digest_dir, "link")
                if not os.path.isfile(link_file):
                    continue
                total_layer_checked += 1
                blob_data = os.path.join(blobs_root, digest_dir[:2], digest_dir, "data")
                if os.path.isfile(blob_data):
                    continue
                prefix = "[DRY RUN] " if DRY_RUN else ""
                print(f"{prefix}[{registry_name}/{repo}] removing orphaned layer link: {digest_dir[:12]}...")
                if not DRY_RUN:
                    shutil.rmtree(os.path.join(root, digest_dir))
                # Counted in dry-run mode as well, so the summary reports how
                # many links would be removed.
                total_layer_removed += 1

        # --- Scan 2: orphan OCI-index children (private registry only) --------
        elif root.endswith("/_manifests/revisions/sha256") and registry_name in INDEX_SCAN_REGISTRIES:
            repo = root.replace(repos_dir + "/", "").replace("/_manifests/revisions/sha256", "")
            for digest_dir in os.listdir(root):
                # Manifest revision entry. Load the blob it points to.
                manifest = load_manifest_blob(blobs_root, digest_dir)
                # Anything that is not a JSON object cannot be an index; skip it
                # rather than crash on `.get()`.
                if not isinstance(manifest, dict):
                    continue
                media_type = manifest.get("mediaType", "")
                if media_type not in INDEX_MEDIA_TYPES:
                    continue
                total_index_scanned += 1
                # Per-repo revision links — serving a child manifest via the API
                # requires <repo>/_manifests/revisions/sha256/<child-digest>/link
                # to exist. The blob data alone is not enough: cleanup-tags.sh
                # rmtrees tag dirs (which on 2.8.x also orphans the per-repo
                # revision links for index children), while the upstream blob
                # data survives in /blobs/. That's exactly the 2026-04-19
                # failure mode — the probe sees 404 even though the blob file
                # is still on disk.
                revisions_root = os.path.dirname(root)  # …/_manifests/revisions
                # `or []` / `or {}` below tolerate explicit JSON nulls, which
                # `.get(key, default)` would pass through unchanged.
                children = manifest.get("manifests") or []
                if not isinstance(children, list):
                    continue
                for child in children:
                    if not isinstance(child, dict):
                        continue
                    child_digest = child.get("digest") or ""
                    if not isinstance(child_digest, str) or not child_digest.startswith("sha256:"):
                        continue
                    child_hex = child_digest[len("sha256:"):]
                    child_link = os.path.join(revisions_root, "sha256", child_hex, "link")
                    if os.path.isfile(child_link):
                        continue
                    platform = child.get("platform") or {}
                    if not isinstance(platform, dict):
                        platform = {}
                    arch = platform.get("architecture", "?")
                    os_ = platform.get("os", "?")
                    child_blob = os.path.join(blobs_root, child_hex[:2], child_hex, "data")
                    blob_state = "blob-data-present" if os.path.isfile(child_blob) else "blob-data-gone"
                    print(
                        f"WARNING [{registry_name}/{repo}] ORPHAN INDEX: "
                        f"{digest_dir[:12]} references missing child {child_hex[:12]} "
                        f"({arch}/{os_}, {blob_state}) — registry returns 404, rebuild required"
                    )
                    total_index_orphans += 1

mode = "DRY RUN — " if DRY_RUN else ""
print(f"\n{mode}Layer scan: checked {total_layer_checked} links, removed {total_layer_removed} orphaned.")
print(f"{mode}Index scan: inspected {total_index_scanned} image indexes, found {total_index_orphans} orphaned children.")
if total_index_orphans > 0:
    print(f"\nACTION REQUIRED: {total_index_orphans} orphan index child(ren) detected. "
          "See docs/runbooks/registry-rebuild-image.md — the affected image must be rebuilt "
          "(a registry DELETE on an index is a conscious decision, not an automated repair).")

View file

@ -0,0 +1,174 @@
# Caching reverse proxy in front of the pull-through registries.
# Blob requests are cached on disk; everything else under /v2/ is passed
# straight through to the matching registry container.
worker_processes auto;
error_log /var/log/nginx/error.log warn;
pid /tmp/nginx.pid;

events {
    worker_connections 1024;
}

http {
    # Shared on-disk cache used by the blob locations of every server below.
    proxy_cache_path /var/cache/nginx/registry
        levels=1:2
        keys_zone=registry:500m
        max_size=50g
        inactive=24h
        use_temp_path=off;

    log_format registry '$remote_addr [$time_local] "$request" '
                        '$status $body_bytes_sent '
                        'upstream=$upstream_addr time=$upstream_response_time '
                        'cache=$upstream_cache_status';
    access_log /var/log/nginx/access.log registry;

    # --- Upstreams ---
    upstream dockerhub {
        server registry-dockerhub:5000;
        keepalive 32;
    }

    upstream ghcr {
        server registry-ghcr:5000;
        keepalive 32;
    }

    # `upstream private` removed in Phase 4 of forgejo-registry-consolidation
    # 2026-05-07. The /v2/ private registry is now Forgejo at
    # forgejo.viktorbarzin.me/viktor/.

    # --- Docker Hub (port 5000) ---
    server {
        listen 5000;
        server_name _;
        client_max_body_size 0;
        proxy_request_buffering off;
        proxy_buffering on;

        # Blobs are content-addressed (sha256) — immutable, safe to cache aggressively
        location ~ /v2/.*/blobs/ {
            proxy_pass http://dockerhub;
            proxy_http_version 1.1;
            proxy_set_header Host $host;
            # Empty Connection header + HTTP/1.1 are required for upstream keepalive.
            proxy_set_header Connection "";

            # Replace upstream 502/503/504 responses with the fixed 502 from
            # @upstream_error instead of relaying the upstream body.
            proxy_intercept_errors on;
            error_page 502 503 504 = @upstream_error;

            proxy_cache registry;
            # Let a single request populate a missing entry while others wait.
            proxy_cache_lock on;
            proxy_cache_lock_timeout 5m;
            proxy_cache_lock_age 5m;
            proxy_cache_use_stale updating;
            # Cache only 200s; a blob must be requested twice before it is stored.
            proxy_cache_valid 200 24h;
            proxy_cache_valid any 0;
            proxy_cache_min_uses 2;
            proxy_cache_methods GET;

            proxy_read_timeout 900;
            proxy_send_timeout 900;
        }

        # Manifests are mutable (tags can change) — no cache, pass through to registry
        location /v2/ {
            proxy_pass http://dockerhub;
            proxy_http_version 1.1;
            proxy_set_header Host $host;
            proxy_set_header Connection "";
            proxy_cache off;
            proxy_read_timeout 900;
            proxy_send_timeout 900;
        }

        location @upstream_error {
            return 502 "upstream error";
        }

        # Health endpoint: succeeds only if the registry answers /v2/ quickly.
        location /healthz {
            proxy_pass http://dockerhub/v2/;
            proxy_read_timeout 5s;
            proxy_connect_timeout 3s;
            access_log off;
        }

        location / {
            # `default_type` sets the Content-Type of the `return` body;
            # `add_header Content-Type` would emit a second Content-Type header.
            default_type text/plain;
            return 200 'ok';
        }
    }

    # --- GHCR (port 5010) ---
    server {
        listen 5010;
        server_name _;
        client_max_body_size 0;
        proxy_request_buffering off;
        proxy_buffering on;

        # Blobs are content-addressed (sha256) — immutable, safe to cache aggressively
        location ~ /v2/.*/blobs/ {
            proxy_pass http://ghcr;
            proxy_http_version 1.1;
            proxy_set_header Host $host;
            # Empty Connection header + HTTP/1.1 are required for upstream keepalive.
            proxy_set_header Connection "";

            # Replace upstream 502/503/504 responses with the fixed 502 from
            # @upstream_error instead of relaying the upstream body.
            proxy_intercept_errors on;
            error_page 502 503 504 = @upstream_error;

            proxy_cache registry;
            # Let a single request populate a missing entry while others wait.
            proxy_cache_lock on;
            proxy_cache_lock_timeout 5m;
            proxy_cache_lock_age 5m;
            proxy_cache_use_stale updating;
            # Cache only 200s; a blob must be requested twice before it is stored.
            proxy_cache_valid 200 24h;
            proxy_cache_valid any 0;
            proxy_cache_min_uses 2;
            proxy_cache_methods GET;

            proxy_read_timeout 900;
            proxy_send_timeout 900;
        }

        # Manifests are mutable (tags can change) — no cache, pass through to registry
        location /v2/ {
            proxy_pass http://ghcr;
            proxy_http_version 1.1;
            proxy_set_header Host $host;
            proxy_set_header Connection "";
            proxy_cache off;
            proxy_read_timeout 900;
            proxy_send_timeout 900;
        }

        location @upstream_error {
            return 502 "upstream error";
        }

        # Health endpoint: succeeds only if the registry answers /v2/ quickly.
        location /healthz {
            proxy_pass http://ghcr/v2/;
            proxy_read_timeout 5s;
            proxy_connect_timeout 3s;
            access_log off;
        }

        location / {
            # `default_type` sets the Content-Type of the `return` body;
            # `add_header Content-Type` would emit a second Content-Type header.
            default_type text/plain;
            return 200 'ok';
        }
    }

    # --- Private R/W Registry (port 5050) decommissioned Phase 4 2026-05-07 ---
    # The TLS port 5050 server block previously fronted `registry-private`.
    # Migrated to Forgejo at forgejo.viktorbarzin.me/viktor/. Both
    # docker-compose.yml and this nginx config no longer reference port 5050.
}