From de598996f115162653bbf497bd56af26dfd0491e Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sun, 1 Mar 2026 21:46:41 +0000 Subject: [PATCH] [ci skip] remove low-traffic pull-through caches (registry.k8s.io, quay.io, reg.kyverno.io) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pull-through cache at 10.0.20.10 was serving corrupted/truncated images for low-traffic registries, breaking VPA certgen (ImagePullBackOff) and previously causing Kyverno image pull failures. Kept: docker.io (port 5000) and ghcr.io (port 5010) — high traffic, Docker Hub rate limits make caching essential. Removed from cloud-init template and all 5 live nodes: - registry.k8s.io (port 5030) — 14 system images, very low churn - quay.io (port 5020) — 11 images - reg.kyverno.io (port 5040) — 5 images The registry containers on the 10.0.20.10 VM still run but nodes no longer route to them. They can be stopped/removed from the VM later. --- .claude/CLAUDE.md | 2 +- stacks/infra/main.tf | 27 ++++++++++----------------- 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index 3ea930b3..0f717827 100755 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -122,7 +122,7 @@ terraform fmt -recursive # Format all ## Infrastructure - Proxmox hypervisor (192.168.1.127) — see `.claude/reference/proxmox-inventory.md` for full VM table - Kubernetes cluster: 5 nodes (k8s-master + k8s-node1-4, v1.34.2), GPU on node1 (Tesla T4) -- Docker registry pull-through cache at `10.0.20.10` (ports 5000/5010/5020/5030/5040) +- Docker registry pull-through cache at `10.0.20.10` — only docker.io (port 5000) and ghcr.io (port 5010) are active. quay.io/registry.k8s.io/reg.kyverno.io caches disabled (caused corrupted images). 
- GPU workloads need: `node_selector = { "gpu": "true" }` + `toleration { key = "nvidia.com/gpu", value = "true", effect = "NoSchedule" }` ### Node Rebuild Procedure diff --git a/stacks/infra/main.tf b/stacks/infra/main.tf index 8e79a0c8..bca518fb 100644 --- a/stacks/infra/main.tf +++ b/stacks/infra/main.tf @@ -66,25 +66,17 @@ module "k8s-node-template" { # Set up config_path for per-registry mirror configuration sed -i 's|config_path = ""|config_path = "/etc/containerd/certs.d"|' /etc/containerd/config.toml - # Create hosts.toml for docker.io (Docker Hub) + # Create hosts.toml for docker.io (Docker Hub) — high traffic, rate-limited mkdir -p /etc/containerd/certs.d/docker.io printf 'server = "https://registry-1.docker.io"\n\n[host."http://10.0.20.10:5000"]\n capabilities = ["pull", "resolve"]\n' > /etc/containerd/certs.d/docker.io/hosts.toml - # Create hosts.toml for ghcr.io + # Create hosts.toml for ghcr.io — medium traffic mkdir -p /etc/containerd/certs.d/ghcr.io printf 'server = "https://ghcr.io"\n\n[host."http://10.0.20.10:5010"]\n capabilities = ["pull", "resolve"]\n' > /etc/containerd/certs.d/ghcr.io/hosts.toml - # Create hosts.toml for quay.io - mkdir -p /etc/containerd/certs.d/quay.io - printf 'server = "https://quay.io"\n\n[host."http://10.0.20.10:5020"]\n capabilities = ["pull", "resolve"]\n' > /etc/containerd/certs.d/quay.io/hosts.toml - - # Create hosts.toml for registry.k8s.io - mkdir -p /etc/containerd/certs.d/registry.k8s.io - printf 'server = "https://registry.k8s.io"\n\n[host."http://10.0.20.10:5030"]\n capabilities = ["pull", "resolve"]\n' > /etc/containerd/certs.d/registry.k8s.io/hosts.toml - - # Create hosts.toml for reg.kyverno.io - mkdir -p /etc/containerd/certs.d/reg.kyverno.io - printf 'server = "https://reg.kyverno.io"\n\n[host."http://10.0.20.10:5040"]\n capabilities = ["pull", "resolve"]\n' > /etc/containerd/certs.d/reg.kyverno.io/hosts.toml + # Low-traffic registries (registry.k8s.io, quay.io, reg.kyverno.io) pull directly. 
+ # Pull-through cache removed for these: it served corrupted images (truncated downloads), + # which broke VPA certgen and Kyverno image pulls. sed -i 's/.*max_concurrent_downloads = 3/max_concurrent_downloads = 20/g' /etc/containerd/config.toml # Enable multiple concurrent downloads sudo sed -i '/serializeImagePulls:/d' /var/lib/kubelet/config.yaml && \ @@ -275,13 +267,14 @@ module "docker-registry-vm" { bridge = "vmbr1" vlan_tag = "20" ipconfig0 = "ip=10.0.20.10/24,gw=10.0.20.1" - # All ports go through nginx for request serialization (proxy_cache_lock): + # Port map (nginx-fronted ports serialize requests via proxy_cache_lock); active pull-through caches are docker.io + ghcr.io only: # 5000 -> nginx -> registry-dockerhub (docker.io proxy) # 5001 -> registry-dockerhub direct (Prometheus metrics) # 5010 -> nginx -> registry-ghcr (ghcr.io proxy) - # 5020 -> nginx -> registry-quay (quay.io proxy) - # 5030 -> nginx -> registry-k8s (registry.k8s.io proxy) - # 5040 -> nginx -> registry-kyverno (reg.kyverno.io proxy) + # Disabled caches (low-traffic, served corrupted images) — nodes no longer route to 5020/5030/5040: + # 5020 -> registry-quay (quay.io) — DISABLED + # 5030 -> registry-k8s (registry.k8s.io) — DISABLED, broke VPA certgen + # 5040 -> registry-kyverno (reg.kyverno.io) — DISABLED # 5050 -> nginx -> registry-private (R/W registry for CI build cache) # 8080 -> registry-ui (joxit/docker-registry-ui) }