diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index 98dacd41..bb1ce653 100755 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -30,7 +30,7 @@ Violations cause state drift, which causes future applies to break or silently r - **New service**: Use `setup-project` skill for full workflow - **Ingress**: `ingress_factory` module. Auth: `protected = true`. Anti-AI: on by default. **DNS**: `dns_type = "proxied"` (Cloudflare CDN) or `"non-proxied"` (direct A/AAAA). DNS records are auto-created — no need to edit `config.tfvars`. - **Docker images**: Always build for `linux/amd64`. Use 8-char git SHA tags — `:latest` causes stale pull-through cache. -- **Private registry**: `registry.viktorbarzin.me` (htpasswd auth, credentials in Vault `secret/viktor`). Use `image: registry.viktorbarzin.me/<name>:<tag>` + `imagePullSecrets: [{name: registry-credentials}]`. Kyverno auto-syncs the secret to all namespaces. Build & push from registry VM (`10.0.20.10`). Containerd `hosts.toml` redirects pulls to LAN IP directly. Web UI at `docker.viktorbarzin.me` (Authentik-protected). Engine pinned to `registry:2.8.3` (see post-mortem 2026-04-19); on-VM configs deploy via `.woodpecker/registry-config-sync.yml`; integrity probed every 15m by `registry-integrity-probe` CronJob in `monitoring` ns — the HTTP API is the authoritative integrity check, NOT `/blobs/*/data` presence (revision-link absence is the real failure mode). +- **Private registry**: `forgejo.viktorbarzin.me/viktor/` (Forgejo packages, PAT auth). Use `image: forgejo.viktorbarzin.me/viktor/<name>:<tag>` + `imagePullSecrets: [{name: registry-credentials}]`. Kyverno auto-syncs the Secret to all namespaces. Containerd `hosts.toml` on every node redirects to the in-cluster Traefik LB `10.0.20.200` to avoid hairpin NAT. Push-side: viktor PAT in Vault `secret/ci/global/forgejo_push_token` (Forgejo container packages are scoped per-user; only the package owner can push, so ci-pusher cannot write to viktor/*). Pull-side: cluster-puller PAT in Vault `secret/viktor/forgejo_pull_token`. Retention CronJob (`forgejo-cleanup` in `forgejo` ns, daily 04:00) keeps newest 10 versions + always `:latest`; integrity probed every 15m by `forgejo-integrity-probe` in `monitoring` ns (catalog walk + manifest HEAD on every tag and index child). See `docs/plans/2026-05-07-forgejo-registry-consolidation-{design,plan}.md` for the migration history. Pull-through caches for upstream registries (DockerHub, GHCR, Quay, k8s.gcr, Kyverno) stay on the registry VM at `10.0.20.10` ports 5000/5010/5020/5030/5040 — the old port-5050 R/W private registry was decommissioned 2026-05-07. - **LinuxServer.io containers**: `DOCKER_MODS` runs apt-get on every start — bake slow mods into a custom image (`RUN /docker-mods || true` then `ENV DOCKER_MODS=`). Set `NO_CHOWN=true` to skip recursive chown that hangs on NFS mounts. - **Node memory changes**: When changing VM memory on any k8s node, update kubelet `systemReserved`, `kubeReserved`, and eviction thresholds accordingly. Config: `/var/lib/kubelet/config.yaml`. Template: `stacks/infra/main.tf`. Current values: systemReserved=512Mi, kubeReserved=512Mi, evictionHard=500Mi, evictionSoft=1Gi. - **Node OS disk tuning** (in `stacks/infra/main.tf`): kubelet `imageGCHighThresholdPercent=70` (was 85), `imageGCLowThresholdPercent=60` (was 80), ext4 `commit=60` in fstab (was default 5s), journald `SystemMaxUse=200M` + `MaxRetentionSec=3day`.
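A minimal push-side sketch of the workflow the CLAUDE.md bullet above describes, assuming `forgejo_push_token` is a field under the Vault path `secret/ci/global` and using a placeholder image name `my-service`; neither of those is taken from this repo. On the pull side a workload only needs `imagePullSecrets: [{name: registry-credentials}]`, since Kyverno syncs that Secret everywhere.

```sh
#!/bin/sh
# Illustrative sketch only — the image name and the Vault field layout are assumptions.
set -eu

TAG="$(git rev-parse --short=8 HEAD)"                      # 8-char git SHA, never :latest
IMAGE="forgejo.viktorbarzin.me/viktor/my-service:${TAG}"   # my-service is a placeholder

# Push-side PAT: packages are owner-scoped, so the viktor token is the one that can push.
PUSH_TOKEN="$(vault kv get -field=forgejo_push_token secret/ci/global)"
printf '%s' "$PUSH_TOKEN" | docker login forgejo.viktorbarzin.me -u viktor --password-stdin

# Always build for linux/amd64; build and push in one step.
docker buildx build --platform linux/amd64 -t "$IMAGE" --push .
```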
diff --git a/.woodpecker/default.yml b/.woodpecker/default.yml index fa6ffc4a..05a579ea 100644 --- a/.woodpecker/default.yml +++ b/.woodpecker/default.yml @@ -25,7 +25,7 @@ clone: steps: - name: apply - image: registry.viktorbarzin.me/infra-ci:latest + image: forgejo.viktorbarzin.me/viktor/infra-ci:latest pull: true backend_options: kubernetes: diff --git a/.woodpecker/drift-detection.yml b/.woodpecker/drift-detection.yml index 77d7788f..438c408c 100644 --- a/.woodpecker/drift-detection.yml +++ b/.woodpecker/drift-detection.yml @@ -14,7 +14,7 @@ clone: steps: - name: detect-drift - image: registry.viktorbarzin.me/infra-ci:latest + image: forgejo.viktorbarzin.me/viktor/infra-ci:latest pull: true backend_options: kubernetes: diff --git a/docs/architecture/ci-cd.md b/docs/architecture/ci-cd.md index 6a7fd2f9..4c0c020b 100644 --- a/docs/architecture/ci-cd.md +++ b/docs/architecture/ci-cd.md @@ -19,7 +19,7 @@ graph LR I --> J[Pull from DockerHub
or Pull-Through Cache] K[Pull-Through Cache
10.0.20.10] -.-> J - L[registry.viktorbarzin.me
Private Registry] -.-> J + L[forgejo.viktorbarzin.me
Private Registry on Forgejo] -.-> J style B fill:#2088ff style F fill:#4c9e47 @@ -33,7 +33,7 @@ graph LR | GitHub Actions | Cloud | `.github/workflows/build-and-deploy.yml` | Build Docker images, push to DockerHub | | Woodpecker CI | Self-hosted | `ci.viktorbarzin.me` | Deploy to Kubernetes cluster | | DockerHub | Cloud | `viktorbarzin/*` | Public image registry | -| Private Registry | Custom | `registry.viktorbarzin.me` | Private images, htpasswd auth | +| Private Registry | Forgejo Packages | `forgejo.viktorbarzin.me/viktor` | Private container images (PAT auth, retention CronJob) — migrated from registry.viktorbarzin.me 2026-05-07 | | Pull-Through Cache | Custom | `10.0.20.10:5000` (docker.io)
`10.0.20.10:5010` (ghcr.io) | LAN cache for remote registries | | Kyverno | Cluster | `kyverno` namespace | Auto-sync registry credentials to all namespaces | | Vault | Cluster | `vault.viktorbarzin.me` | K8s auth for Woodpecker pipelines | @@ -102,7 +102,7 @@ Woodpecker API uses numeric IDs (not owner/name): 1. **Containerd hosts.toml** redirects pulls from docker.io and ghcr.io to pull-through cache at `10.0.20.10` 2. **Pull-through cache** serves cached images from LAN, fetches from upstream on cache miss 3. **Kyverno ClusterPolicy** auto-syncs `registry-credentials` Secret to all namespaces for private registry access -4. **Private registry** (`registry.viktorbarzin.me`) uses htpasswd auth, credentials stored in Vault. Runs `registry:2.8.3` (pinned — floating `registry:2` was the root cause of the 2026-04-13 + 2026-04-19 orphan-index incidents; see `docs/post-mortems/2026-04-19-registry-orphan-index.md`). +4. **Private registry** has been Forgejo's built-in OCI registry at `forgejo.viktorbarzin.me/viktor/` since 2026-05-07. Auth via PAT (Vault `secret/ci/global/forgejo_push_token` for push, `secret/viktor/forgejo_pull_token` for pull). The pre-migration `registry:2.8.3`-based private registry on `registry.viktorbarzin.me:5050` was the root cause of three orphan-index incidents in three weeks (2026-04-13, 2026-04-19, 2026-05-04 — see `docs/post-mortems/2026-04-19-registry-orphan-index.md` and the full migration writeup at `docs/plans/2026-05-07-forgejo-registry-consolidation-{design,plan}.md`). The five pull-through caches on `10.0.20.10` (ports 5000/5010/5020/5030/5040) stay in place for upstream registries. 5. **Integrity probe** (`registry-integrity-probe` CronJob in `monitoring` ns, every 15m) walks `/v2/_catalog` → tags → indexes → child manifests via HEAD and pushes `registry_manifest_integrity_failures` to Pushgateway; alerts `RegistryManifestIntegrityFailure` / `RegistryIntegrityProbeStale` / `RegistryCatalogInaccessible` page on broken state. Authoritative check (HTTP API, not filesystem). ### Infra Pipelines (Woodpecker-only) diff --git a/docs/architecture/monitoring.md b/docs/architecture/monitoring.md index 5fa3bbba..3b9d915d 100644 --- a/docs/architecture/monitoring.md +++ b/docs/architecture/monitoring.md @@ -63,7 +63,7 @@ graph TB | External Monitor Sync | Python 3.12 | `stacks/uptime-kuma/` | CronJob (10min) syncs `[External]` monitors from `cloudflare_proxied_names` | | dcgm-exporter | Configurable resources | `stacks/monitoring/modules/monitoring/` | NVIDIA GPU metrics collection | | Email Roundtrip Probe | Python 3.12 | `stacks/mailserver/modules/mailserver/` | E2E email delivery verification via Mailgun API + IMAP | -| Registry Integrity Probe | Alpine 3.20 + curl/jq | `stacks/monitoring/modules/monitoring/main.tf` | CronJob every 15m: walks `/v2/_catalog` on `registry.viktorbarzin.me:5050`, HEADs every tagged manifest + index child; emits `registry_manifest_integrity_*` metrics to Pushgateway. Catches orphan OCI-index state that filesystem scans miss. | +| Forgejo Registry Integrity Probe | Alpine 3.20 + curl/jq | `stacks/monitoring/modules/monitoring/main.tf` | CronJob every 15m: walks `/v2/_catalog` on `forgejo.viktorbarzin.me` (HTTP via in-cluster service), HEADs every tagged manifest + index child; emits `registry_manifest_integrity_*` metrics to Pushgateway. Replaces the legacy `registry-integrity-probe` against `registry.viktorbarzin.me:5050` decommissioned in Phase 4 of forgejo-registry-consolidation 2026-05-07. 
| ## How It Works diff --git a/docs/runbooks/registry-vm.md b/docs/runbooks/registry-vm.md index b5fed938..95f7b637 100644 --- a/docs/runbooks/registry-vm.md +++ b/docs/runbooks/registry-vm.md @@ -1,12 +1,30 @@ # Runbook: Registry VM (docker-registry, 10.0.20.10) -Last updated: 2026-04-19 +Last updated: 2026-05-07 -The registry VM hosts `registry.viktorbarzin.me` (private Docker -registry, htpasswd-auth, NGINX → registry:2). It is an Ubuntu 24.04 -VM on the cluster LAN subnet `10.0.20.0/24`, with a static netplan -config (no DHCP). Because it sits on a subnet that only has pfSense -as its gateway, its DNS must be statically configured. +The registry VM is an Ubuntu 24.04 VM on the cluster LAN subnet +`10.0.20.0/24`, with a static netplan config (no DHCP). Because it +sits on a subnet that only has pfSense as its gateway, its DNS must +be statically configured. + +**As of Phase 4 of forgejo-registry-consolidation 2026-05-07** the VM +no longer hosts the private R/W registry. It hosts pull-through +caches only: + +| Port | Upstream | +|---|---| +| 5000 | docker.io (Docker Hub) — auth via dockerhub_registry_password | +| 5010 | ghcr.io | +| 5020 | quay.io | +| 5030 | registry.k8s.io | +| 5040 | reg.kyverno.io | + +The decommissioned private registry (port 5050) is now hosted on +Forgejo at `forgejo.viktorbarzin.me/viktor/`. See +`docs/plans/2026-05-07-forgejo-registry-consolidation-plan.md` for the +migration. Break-glass tarballs of `infra-ci` are still produced on +each build to `/opt/registry/data/private/_breakglass/` — see +`docs/runbooks/forgejo-registry-breakglass.md`. ## DNS configuration diff --git a/modules/docker-registry/docker-compose.yml b/modules/docker-registry/docker-compose.yml index 083e6bba..687dab27 100644 --- a/modules/docker-registry/docker-compose.yml +++ b/modules/docker-registry/docker-compose.yml @@ -89,35 +89,25 @@ services: retries: 3 start_period: 10s - registry-private: - image: registry:2.8.3 - container_name: registry-private - restart: always - volumes: - - /opt/registry/data/private:/var/lib/registry - - /opt/registry/config-private.yml:/etc/docker/registry/config.yml:ro - - /opt/registry/htpasswd:/auth/htpasswd:ro - networks: - - registry - healthcheck: - # 401 is expected (auth required) — any HTTP response means the registry is healthy - test: ["CMD", "sh", "-c", "wget -qS -O /dev/null http://127.0.0.1:5000/v2/ 2>&1 | grep -q 'HTTP/'"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 10s + # registry-private removed in Phase 4 of forgejo-registry-consolidation + # 2026-05-07. The /v2/ private registry has migrated to Forgejo at + # forgejo.viktorbarzin.me/viktor/. Pull-through caches for upstream + # registries (dockerhub, ghcr, quay, k8s, kyverno) stay on this VM. + # Manual decommission step on the live VM: + # ssh root@10.0.20.10 'cd /opt/registry && docker compose up -d --remove-orphans' + # …and after 1 week of no incidents, `rm -rf /opt/registry/data/private/`. nginx: image: nginx:alpine container_name: registry-nginx restart: always + # 5050 dropped Phase 4 of forgejo-registry-consolidation 2026-05-07. 
ports: - "5000:5000" - "5010:5010" - "5020:5020" - "5030:5030" - "5040:5040" - - "5050:5050" volumes: - /opt/registry/nginx.conf:/etc/nginx/nginx.conf:ro - /opt/registry/tls:/etc/nginx/tls:ro @@ -135,8 +125,6 @@ services: condition: service_healthy registry-kyverno: condition: service_healthy - registry-private: - condition: service_healthy healthcheck: test: ["CMD", "sh", "-c", "wget -qO- http://127.0.0.1:5000/v2/ >/dev/null 2>&1"] interval: 30s diff --git a/modules/docker-registry/nginx_registry.conf b/modules/docker-registry/nginx_registry.conf index ec433340..e46d9f22 100644 --- a/modules/docker-registry/nginx_registry.conf +++ b/modules/docker-registry/nginx_registry.conf @@ -33,10 +33,9 @@ http { keepalive 32; } - upstream private { - server registry-private:5000; - keepalive 32; - } + # `upstream private` removed in Phase 4 of forgejo-registry-consolidation + # 2026-05-07. The /v2/ private registry is now Forgejo at + # forgejo.viktorbarzin.me/viktor/. # --- Docker Hub (port 5000) --- @@ -168,37 +167,8 @@ http { } } - # --- Private R/W Registry (port 5050, TLS) --- - - server { - listen 5050 ssl; - server_name registry.viktorbarzin.me; - - ssl_certificate /etc/nginx/tls/fullchain.pem; - ssl_certificate_key /etc/nginx/tls/privkey.pem; - ssl_protocols TLSv1.2 TLSv1.3; - - client_max_body_size 0; - proxy_request_buffering off; - proxy_buffering off; - chunked_transfer_encoding on; - - location /v2/ { - proxy_pass http://private; - proxy_http_version 1.1; - proxy_set_header Host $http_host; - proxy_set_header Connection ""; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - - proxy_read_timeout 900; - proxy_send_timeout 900; - } - - location / { - return 200 'ok'; - add_header Content-Type text/plain; - } - } + # --- Private R/W Registry (port 5050) removed Phase 4 2026-05-07 --- + # The TLS port 5050 server block previously fronted `registry-private`. + # Migrated to Forgejo at forgejo.viktorbarzin.me/viktor/. nginx no longer + # listens on 5050; docker-compose.yml drops the `5050:5050` port mapping. } diff --git a/stacks/beads-server/main.tf b/stacks/beads-server/main.tf index e11b0ac7..006b9db6 100644 --- a/stacks/beads-server/main.tf +++ b/stacks/beads-server/main.tf @@ -567,7 +567,8 @@ resource "kubernetes_deployment" "beadboard" { container { name = "beadboard" - image = "registry.viktorbarzin.me:5050/beadboard:${var.beadboard_image_tag}" + # Phase 3 cutover 2026-05-07 — Forgejo registry consolidation. + image = "forgejo.viktorbarzin.me/viktor/beadboard:${var.beadboard_image_tag}" port { name = "http" @@ -725,7 +726,8 @@ resource "kubernetes_config_map" "beads_metadata" { } locals { - claude_agent_service_image = "registry.viktorbarzin.me/claude-agent-service:${var.claude_agent_service_image_tag}" + # Phase 3 cutover 2026-05-07 — Forgejo registry consolidation. + claude_agent_service_image = "forgejo.viktorbarzin.me/viktor/claude-agent-service:${var.claude_agent_service_image_tag}" beadboard_internal_url = "http://${kubernetes_service.beadboard.metadata[0].name}.${kubernetes_namespace.beads.metadata[0].name}.svc.cluster.local" beads_script_prelude = <<-EOT diff --git a/stacks/chrome-service/main.tf b/stacks/chrome-service/main.tf index bfefcc98..aae58be8 100644 --- a/stacks/chrome-service/main.tf +++ b/stacks/chrome-service/main.tf @@ -269,8 +269,9 @@ resource "kubernetes_deployment" "chrome_service" { # ingress at chrome.viktorbarzin.me. 
WS port 3000 (the Playwright # endpoint) stays internal-only. container { - name = "novnc" - image = "registry.viktorbarzin.me/chrome-service-novnc:v4" + name = "novnc" + # Phase 3 cutover 2026-05-07 — Forgejo registry consolidation. + image = "forgejo.viktorbarzin.me/viktor/chrome-service-novnc:v4" image_pull_policy = "IfNotPresent" port { name = "http" diff --git a/stacks/claude-agent-service/main.tf b/stacks/claude-agent-service/main.tf index 598572ee..71ea0d7d 100644 --- a/stacks/claude-agent-service/main.tf +++ b/stacks/claude-agent-service/main.tf @@ -10,7 +10,8 @@ data "vault_kv_secret_v2" "viktor_secrets" { locals { namespace = "claude-agent" - image = "registry.viktorbarzin.me/claude-agent-service" + # Phase 3 cutover 2026-05-07 — see infra/docs/plans/2026-05-07-forgejo-registry-consolidation-plan.md. + image = "forgejo.viktorbarzin.me/viktor/claude-agent-service" image_tag = "2fd7670d" labels = { app = "claude-agent-service" } diff --git a/stacks/claude-memory/main.tf b/stacks/claude-memory/main.tf index 2394cd75..da62aaf0 100644 --- a/stacks/claude-memory/main.tf +++ b/stacks/claude-memory/main.tf @@ -175,8 +175,10 @@ resource "kubernetes_deployment" "claude-memory" { } } container { - name = "claude-memory" - image = "viktorbarzin/claude-memory-mcp:17" + name = "claude-memory" + # Phase 3 cutover 2026-05-07 — moved off DockerHub to Forgejo as + # part of the registry consolidation. Old: viktorbarzin/claude-memory-mcp:17 + image = "forgejo.viktorbarzin.me/viktor/claude-memory-mcp:17" port { container_port = 8000 diff --git a/stacks/fire-planner/main.tf b/stacks/fire-planner/main.tf index 09e1177b..784427de 100644 --- a/stacks/fire-planner/main.tf +++ b/stacks/fire-planner/main.tf @@ -8,7 +8,11 @@ variable "postgresql_host" { type = string } locals { namespace = "fire-planner" - image = "registry.viktorbarzin.me/fire-planner:${var.image_tag}" + # Phase 3 cutover 2026-05-07. NOTE: the registry-private repo for + # fire-planner has 0 tags — first build via Woodpecker on the new Forgejo + # repo (viktor/fire-planner, Dockerfile + .woodpecker.yml added 2026-05-07) + # must succeed BEFORE the next pod restart, otherwise pulls will 404. + image = "forgejo.viktorbarzin.me/viktor/fire-planner:${var.image_tag}" labels = { app = "fire-planner" } } diff --git a/stacks/freedify/factory/main.tf b/stacks/freedify/factory/main.tf index c66b9029..e2158d4c 100755 --- a/stacks/freedify/factory/main.tf +++ b/stacks/freedify/factory/main.tf @@ -105,7 +105,8 @@ resource "kubernetes_deployment" "freedify" { name = "registry-credentials" } container { - image = "registry.viktorbarzin.me/freedify:${var.tag}" + # Phase 3 cutover 2026-05-07 — Forgejo registry consolidation. 
+ image = "forgejo.viktorbarzin.me/viktor/freedify:${var.tag}" name = "freedify" port { diff --git a/stacks/infra/main.tf b/stacks/infra/main.tf index c08af833..5b2d8876 100644 --- a/stacks/infra/main.tf +++ b/stacks/infra/main.tf @@ -75,17 +75,11 @@ module "k8s-node-template" { mkdir -p /etc/containerd/certs.d/ghcr.io printf 'server = "https://ghcr.io"\n\n[host."http://10.0.20.10:5010"]\n capabilities = ["pull", "resolve"]\n\n[host."https://ghcr.io"]\n capabilities = ["pull", "resolve"]\n' > /etc/containerd/certs.d/ghcr.io/hosts.toml - # Create hosts.toml for private registry — both IP and hostname entries - # IP-based (10.0.20.10:5050): direct access, skip TLS verify (wildcard cert, no IP SAN) - mkdir -p /etc/containerd/certs.d/10.0.20.10:5050 - printf 'server = "https://10.0.20.10:5050"\n\n[host."https://10.0.20.10:5050"]\n capabilities = ["pull", "resolve", "push"]\n skip_verify = true\n' > /etc/containerd/certs.d/10.0.20.10:5050/hosts.toml - # Hostname-based (registry.viktorbarzin.me): redirects to LAN IP to avoid Traefik round-trip - mkdir -p /etc/containerd/certs.d/registry.viktorbarzin.me - printf 'server = "https://registry.viktorbarzin.me"\n\n[host."https://10.0.20.10:5050"]\n capabilities = ["pull", "resolve", "push"]\n skip_verify = true\n' > /etc/containerd/certs.d/registry.viktorbarzin.me/hosts.toml - # Forgejo OCI registry: redirect to in-cluster Traefik LB (10.0.20.200) so # pulls don't hairpin out through the WAN gateway. Traefik serves the # *.viktorbarzin.me wildcard so SNI verification still passes. + # registry.viktorbarzin.me / 10.0.20.10:5050 entries removed in Phase 4 of + # the forgejo-registry-consolidation 2026-05-07 — registry-private is gone. mkdir -p /etc/containerd/certs.d/forgejo.viktorbarzin.me printf 'server = "https://forgejo.viktorbarzin.me"\n\n[host."https://10.0.20.200"]\n capabilities = ["pull", "resolve"]\n' > /etc/containerd/certs.d/forgejo.viktorbarzin.me/hosts.toml diff --git a/stacks/job-hunter/main.tf b/stacks/job-hunter/main.tf index ff48e28d..8f849c33 100644 --- a/stacks/job-hunter/main.tf +++ b/stacks/job-hunter/main.tf @@ -8,7 +8,8 @@ variable "postgresql_host" { type = string } locals { namespace = "job-hunter" - image = "registry.viktorbarzin.me/job-hunter:${var.image_tag}" + # Phase 3 cutover 2026-05-07 — see infra/docs/plans/2026-05-07-forgejo-registry-consolidation-plan.md. + image = "forgejo.viktorbarzin.me/viktor/job-hunter:${var.image_tag}" labels = { app = "job-hunter" } diff --git a/stacks/kyverno/modules/kyverno/registry-credentials.tf b/stacks/kyverno/modules/kyverno/registry-credentials.tf index 18949cad..c9cae9fd 100644 --- a/stacks/kyverno/modules/kyverno/registry-credentials.tf +++ b/stacks/kyverno/modules/kyverno/registry-credentials.tf @@ -20,21 +20,12 @@ resource "kubernetes_secret" "registry_credentials" { data = { ".dockerconfigjson" = jsonencode({ auths = { - "registry.viktorbarzin.me" = { - auth = base64encode("${data.vault_kv_secret_v2.viktor.data["registry_user"]}:${data.vault_kv_secret_v2.viktor.data["registry_password"]}") - } - "registry.viktorbarzin.me:5050" = { - auth = base64encode("${data.vault_kv_secret_v2.viktor.data["registry_user"]}:${data.vault_kv_secret_v2.viktor.data["registry_password"]}") - } - "10.0.20.10:5050" = { - auth = base64encode("${data.vault_kv_secret_v2.viktor.data["registry_user"]}:${data.vault_kv_secret_v2.viktor.data["registry_password"]}") - } - # Forgejo OCI registry — read-only PAT for the cluster-puller service - # account user. 
Pushes go through ci-pusher (separate PAT in Vault - # secret/ci/global, surfaced to Woodpecker). - # try() lets the apply succeed before the Vault key is populated - # during Phase 0 bootstrap (see docs/runbooks/forgejo-registry-setup.md). - # The cluster has no consumers yet — broken creds are visible but harmless. + # Phase 4 of forgejo-registry-consolidation 2026-05-07 — registry- + # private decommissioned. Old auths entries (registry.viktorbarzin.me, + # registry.viktorbarzin.me:5050, 10.0.20.10:5050) removed to prevent + # silent fallback. If a pod somehow references the old hostname now, + # it will visibly fail with auth missing rather than silently pulling + # potentially-stale blobs. "forgejo.viktorbarzin.me" = { auth = base64encode("cluster-puller:${try(data.vault_kv_secret_v2.viktor.data["forgejo_pull_token"], "")}") } diff --git a/stacks/monitoring/modules/monitoring/main.tf b/stacks/monitoring/modules/monitoring/main.tf index ceadb1eb..481112c1 100644 --- a/stacks/monitoring/modules/monitoring/main.tf +++ b/stacks/monitoring/modules/monitoring/main.tf @@ -243,193 +243,15 @@ resource "kubernetes_cron_job_v1" "dns_anomaly_monitor" { } # ----------------------------------------------------------------------------- -# Registry manifest-integrity probe — HEADs every tag in the private R/W -# registry's catalog, walks multi-platform image indexes, and reports blob -# availability. Catches the orphan-index failure mode seen 2026-04-13 and -# 2026-04-19 before downstream pipelines hit it. +# Phase 4 of forgejo-registry-consolidation 2026-05-07: registry-private +# decommissioned. The integrity probe below caught the orphan-index failure +# mode in `registry:2.8.3` (post-mortem 2026-04-19). With that engine +# retired, the probe is replaced by `forgejo_integrity_probe` below. +# +# Resource definitions stripped wholesale — terragrunt apply destroys the +# in-cluster CronJob + Secret on the next run. 
# See: docs/post-mortems/2026-04-19-registry-orphan-index.md # ----------------------------------------------------------------------------- -resource "kubernetes_secret" "registry_probe_credentials" { - metadata { - name = "registry-probe-credentials" - namespace = kubernetes_namespace.monitoring.metadata[0].name - } - type = "Opaque" - data = { - REG_USER = var.registry_user - REG_PASS = var.registry_password - } -} - -resource "kubernetes_cron_job_v1" "registry_integrity_probe" { - metadata { - name = "registry-integrity-probe" - namespace = kubernetes_namespace.monitoring.metadata[0].name - } - spec { - concurrency_policy = "Forbid" - failed_jobs_history_limit = 3 - successful_jobs_history_limit = 3 - schedule = "*/15 * * * *" - job_template { - metadata {} - spec { - backoff_limit = 1 - ttl_seconds_after_finished = 600 - template { - metadata {} - spec { - container { - name = "registry-integrity-probe" - image = "docker.io/library/alpine:3.20" - env { - name = "REG_USER" - value_from { - secret_key_ref { - name = kubernetes_secret.registry_probe_credentials.metadata[0].name - key = "REG_USER" - } - } - } - env { - name = "REG_PASS" - value_from { - secret_key_ref { - name = kubernetes_secret.registry_probe_credentials.metadata[0].name - key = "REG_PASS" - } - } - } - env { - name = "REGISTRY_HOST" - value = "10.0.20.10:5050" - } - env { - name = "REGISTRY_INSTANCE" - value = "registry.viktorbarzin.me:5050" - } - env { - name = "PUSHGATEWAY" - value = "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/registry-integrity-probe" - } - env { - name = "TAGS_PER_REPO" - value = "5" - } - command = ["/bin/sh", "-c", <<-EOT - set -eu - apk add --no-cache curl jq >/dev/null - - REG="$REGISTRY_HOST" - INSTANCE="$REGISTRY_INSTANCE" - AUTH="$REG_USER:$REG_PASS" - ACCEPT='application/vnd.oci.image.index.v1+json,application/vnd.oci.image.manifest.v1+json,application/vnd.docker.distribution.manifest.list.v2+json,application/vnd.docker.distribution.manifest.v2+json' - - push() { - # Prometheus pushgateway — body ends with blank line. Ignore push errors. - curl -sf --max-time 10 --data-binary @- "$PUSHGATEWAY" >/dev/null 2>&1 || true - } - - CATALOG=$(curl -sk -u "$AUTH" --max-time 30 "https://$REG/v2/_catalog?n=1000" || echo "") - REPOS=$(echo "$CATALOG" | jq -r '.repositories[]?' 2>/dev/null || echo "") - - if [ -z "$REPOS" ]; then - echo "ERROR: empty catalog or auth failure — cannot probe" - NOW=$(date +%s) - push < /tmp/repos.txt - while IFS= read -r repo; do - [ -z "$repo" ] && continue - REPOS_N=$((REPOS_N + 1)) - - TAGS_JSON=$(curl -sk -u "$AUTH" --max-time 15 "https://$REG/v2/$repo/tags/list" || echo "") - echo "$TAGS_JSON" | jq -r '.tags[]?' 
2>/dev/null | tail -n "$TAGS_PER_REPO" > /tmp/tags.txt || true - - while IFS= read -r tag; do - [ -z "$tag" ] && continue - TAGS_N=$((TAGS_N + 1)) - - HTTP=$(curl -sk -u "$AUTH" -o /tmp/m.json -w '%%{http_code}' \ - -H "Accept: $ACCEPT" --max-time 15 \ - "https://$REG/v2/$repo/manifests/$tag") - if [ "$HTTP" != "200" ]; then - echo "FAIL: $repo:$tag manifest HTTP $HTTP" - FAIL=$((FAIL + 1)) - continue - fi - - MT=$(jq -r '.mediaType // empty' /tmp/m.json 2>/dev/null || echo "") - if echo "$MT" | grep -Eq 'manifest\.list|image\.index'; then - INDEXES_N=$((INDEXES_N + 1)) - jq -r '.manifests[].digest' /tmp/m.json > /tmp/children.txt 2>/dev/null || true - while IFS= read -r d; do - [ -z "$d" ] && continue - CH=$(curl -sk -u "$AUTH" -o /dev/null -w '%%{http_code}' \ - -H "Accept: $ACCEPT" --max-time 10 -I \ - "https://$REG/v2/$repo/manifests/$d") - if [ "$CH" != "200" ]; then - echo "FAIL: $repo:$tag index child $d HTTP $CH" - FAIL=$((FAIL + 1)) - fi - done < /tmp/children.txt - fi - done < /tmp/tags.txt - done < /tmp/repos.txt - - NOW=$(date +%s) - push < 3600 for: 15m diff --git a/stacks/payslip-ingest/main.tf b/stacks/payslip-ingest/main.tf index 8c313c25..82fc3543 100644 --- a/stacks/payslip-ingest/main.tf +++ b/stacks/payslip-ingest/main.tf @@ -8,7 +8,10 @@ variable "postgresql_host" { type = string } locals { namespace = "payslip-ingest" - image = "registry.viktorbarzin.me/payslip-ingest:${var.image_tag}" + # Phase 3 of forgejo-registry-consolidation — image= flipped to Forgejo + # 2026-05-07. registry-private kept image at the same path, so the new + # Forgejo URL is `viktor/` under forgejo.viktorbarzin.me. + image = "forgejo.viktorbarzin.me/viktor/payslip-ingest:${var.image_tag}" labels = { app = "payslip-ingest" }
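The hunk above strips the old `registry-integrity-probe` resource wholesale and only references its replacement (`forgejo_integrity_probe`) without showing it. Below is a condensed sketch of the walk both docs above describe — `/v2/_catalog` → tags → manifest HEAD, plus a HEAD of every child of a multi-platform index. The Pushgateway push and alert wiring are omitted, and injecting the cluster-puller PAT via a `FORGEJO_PULL_TOKEN` env var is an assumption, not necessarily how the CronJob does it.

```sh
#!/bin/sh
# Condensed sketch of the /v2 integrity walk described above — no Pushgateway push, no alerting.
set -eu

REG="forgejo.viktorbarzin.me"
AUTH="cluster-puller:${FORGEJO_PULL_TOKEN}"   # PAT injection via env var is an assumption
ACCEPT='application/vnd.oci.image.index.v1+json,application/vnd.oci.image.manifest.v1+json,application/vnd.docker.distribution.manifest.list.v2+json,application/vnd.docker.distribution.manifest.v2+json'

fail=0
for repo in $(curl -sf -u "$AUTH" "https://$REG/v2/_catalog?n=1000" | jq -r '.repositories[]?'); do
  for tag in $(curl -sf -u "$AUTH" "https://$REG/v2/$repo/tags/list" | jq -r '.tags[]?'); do
    # HEAD the tagged manifest — any non-200 is a broken tag.
    code=$(curl -s -o /dev/null -w '%{http_code}' -I -u "$AUTH" -H "Accept: $ACCEPT" \
      "https://$REG/v2/$repo/manifests/$tag")
    if [ "$code" != "200" ]; then echo "FAIL $repo:$tag -> $code"; fail=$((fail + 1)); continue; fi
    # For multi-platform indexes, HEAD every child manifest too — the orphan-index failure
    # mode is a tag whose index still resolves while its children are gone.
    for digest in $(curl -sf -u "$AUTH" -H "Accept: $ACCEPT" "https://$REG/v2/$repo/manifests/$tag" \
        | jq -r 'select((.mediaType // "") | test("image.index|manifest.list")) | .manifests[].digest'); do
      code=$(curl -s -o /dev/null -w '%{http_code}' -I -u "$AUTH" -H "Accept: $ACCEPT" \
        "https://$REG/v2/$repo/manifests/$digest")
      if [ "$code" != "200" ]; then echo "FAIL $repo:$tag child $digest -> $code"; fail=$((fail + 1)); fi
    done
  done
done
echo "manifest integrity failures: $fail"
```

The child-manifest HEADs are the part that matters: the 2026-04 incidents were exactly this shape — an index that resolved while its children were missing — which filesystem-level scans of `/blobs/*/data` did not catch.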
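Because the `hosts.toml` change in `stacks/infra/main.tf` and the trimmed Kyverno `registry-credentials` Secret only take effect on the next node rebuild / apply, a post-cutover spot check along the following lines may be useful. It is a sketch under assumptions: `ctr` being usable on the node, the `FORGEJO_PULL_TOKEN` env var, and `fire-planner` as the sample namespace are illustrative choices, not steps from any runbook in this repo.

```sh
#!/bin/sh
# Post-cutover spot checks — illustrative only.
set -eu

# 1. Node-side: containerd should point forgejo.viktorbarzin.me at the Traefik LB 10.0.20.200.
cat /etc/containerd/certs.d/forgejo.viktorbarzin.me/hosts.toml
# Expected: server = "https://forgejo.viktorbarzin.me" and a [host."https://10.0.20.200"] entry.

# 2. Node-side: pull through the same resolution path the kubelet uses.
ctr --namespace k8s.io images pull --hosts-dir /etc/containerd/certs.d \
  --user "cluster-puller:${FORGEJO_PULL_TOKEN}" \
  forgejo.viktorbarzin.me/viktor/infra-ci:latest

# 3. Cluster-side: the Kyverno-synced pull secret should only carry the Forgejo auth entry.
kubectl get secret registry-credentials -n fire-planner \
  -o jsonpath='{.data.\.dockerconfigjson}' | base64 -d | jq '.auths | keys'
```

Step 3 should list only `forgejo.viktorbarzin.me` after Phase 4; a leftover `registry.viktorbarzin.me` entry means the synced Secret predates this change.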