From 020f62555b76c3067d29deb7093f922659a2fdb7 Mon Sep 17 00:00:00 2001
From: Viktor Barzin <vbarzin@gmail.com>
Date: Sat, 16 May 2026 12:19:34 +0000
Subject: [PATCH] Phase 0: install Keel + Kyverno auto-update annotation
 injector
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Foundation for opt-out-pure auto-update model per
docs/plans/2026-05-16-auto-upgrade-apps-{design,plan}.md.

- New stack `stacks/keel/` deploys Keel via Helm (charts.keel.sh, v1.0.6).
  Polls registries hourly per design decision #8. Default schedule
  overridable per-workload via keel.sh/pollSchedule annotation.
- New Kyverno ClusterPolicy `inject-keel-annotations` mutates Deployments,
  StatefulSets, and DaemonSets in namespaces labeled `keel.sh/enrolled=true`
  with keel.sh/policy=force + trigger=poll + pollSchedule=@every 1h.
- Phase 0 enrolls no namespaces. Phase 1 (next session) labels the
  self-hosted set.
- Per-workload opt-out: label `keel.sh/policy: never` (used by rollback
  runbook and chrome-service-style deliberate pins).
- Keel namespace excluded from the mutate — supervisor self-update has
  too severe a failure mode (decision #11).
- AGENTS.md: KYVERNO_LIFECYCLE_V2 marker convention added for the
  ignore_changes block enrolled workloads need.
- .claude/CLAUDE.md: docker-images rule flagged as transitional.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .claude/CLAUDE.md                             |   2 +-
 AGENTS.md                                     |  31 ++
 .../2026-05-16-auto-upgrade-apps-design.md    | 165 +++++++++
 .../2026-05-16-auto-upgrade-apps-plan.md      | 322 ++++++++++++++++++
 stacks/keel/main.tf                           |  65 ++++
 stacks/keel/terragrunt.hcl                    |  13 +
 .../modules/kyverno/keel-annotations.tf       |  82 +++++
 7 files changed, 679 insertions(+), 1 deletion(-)
 create mode 100644 docs/plans/2026-05-16-auto-upgrade-apps-design.md
 create mode 100644 docs/plans/2026-05-16-auto-upgrade-apps-plan.md
 create mode 100644 stacks/keel/main.tf
 create mode 100644 stacks/keel/terragrunt.hcl
 create mode 100644 stacks/kyverno/modules/kyverno/keel-annotations.tf
diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md
index 7e520085..c6c024c0 100755
--- a/.claude/CLAUDE.md
+++ b/.claude/CLAUDE.md
@@ -37,7 +37,7 @@ Violations cause state drift, which causes future applies to break or silently r
   - **Anti-AI**: on by default when `auth = "none"` or `auth = "app"` (no Authentik to discourage bots); redundant on `"required"` and `"public"`.
   - **DNS**: `dns_type = "proxied"` (Cloudflare CDN) or `"non-proxied"` (direct A/AAAA). DNS records are auto-created — no need to edit `config.tfvars`. Smoke-test target: `echo.viktorbarzin.me` (auth=public, header-reflecting backend).
 - **Anubis PoW challenge** (`modules/kubernetes/anubis_instance/`): per-site reverse proxy that issues a 30-day JWT cookie after a tiny PoW solve. Use for **public, content-bearing sites without app-level auth** (blog, docs, wikis, static landing pages). Pattern: declare `module "anubis" { source = "../../modules/kubernetes/anubis_instance"; name = "X"; namespace = ...; target_url = "http://<backend>.<ns>.svc.cluster.local" }`, then in `ingress_factory` set `service_name = module.anubis.service_name`, `port = module.anubis.service_port`, `anti_ai_scraping = false`. Shared ed25519 key in Vault `secret/viktor` -> `anubis_ed25519_key`; cookie scoped to `viktorbarzin.me` so one solve covers all Anubis-fronted subdomains. **DO NOT put Anubis in front of Git/API/WebDAV/CLI endpoints** — clients without JS can't solve PoW. **Replicas default to 1** because Anubis stores in-flight challenges in process memory; a challenge issued by pod A and solved against pod B errors with `store: key not found` (HTTP 500). Bumping replicas requires wiring a shared Redis store (TODO). For path-level carve-outs (e.g. wrongmove has `/` behind Anubis but `/api` direct), declare a second `ingress_factory` with `ingress_path = ["/api"]` pointing at the bare backend service. Active on: blog, www, kms, travel, f1, cc, json, pb (privatebin), home (homepage), wrongmove (UI only). See `.claude/reference/patterns.md` "Anti-AI Scraping" for full layering.
-- **Docker images**: Always build for `linux/amd64`. Use 8-char git SHA tags — `:latest` causes stale pull-through cache.
+- **Docker images**: Always build for `linux/amd64`. SHA-tag rule is being phased out — see `docs/plans/2026-05-16-auto-upgrade-apps-{design,plan}.md`. New model: CI pushes `:latest` (optionally also `:<8-char-sha>` for traceability), Keel polls and triggers rollouts. Cache-staleness concern from the old rule is resolved at the nginx layer (URL-split — manifests pass through, blobs cached). Until Phase 1 of the migration completes (per the plan), follow the SHA-tag rule for new services to match existing pattern.
 - **Private registry**: `forgejo.viktorbarzin.me/viktor/<name>` (Forgejo packages, OAuth-style PAT auth). Use `image: forgejo.viktorbarzin.me/viktor/<name>:<tag>` + `imagePullSecrets: [{name: registry-credentials}]`. Kyverno auto-syncs the Secret to all namespaces. Containerd `hosts.toml` on every node redirects to in-cluster Traefik LB `10.0.20.200` to avoid hairpin NAT. Push-side: viktor PAT in Vault `secret/ci/global/forgejo_push_token` (Forgejo container packages are scoped per-user; only the package owner can push, ci-pusher cannot write to viktor/*). Pull-side: cluster-puller PAT in Vault `secret/viktor/forgejo_pull_token`. Retention CronJob (`forgejo-cleanup` in `forgejo` ns, daily 04:00) keeps newest 10 versions + always `:latest`; integrity probed every 15min by `forgejo-integrity-probe` in `monitoring` ns (catalog walk + manifest HEAD on every blob). See `docs/plans/2026-05-07-forgejo-registry-consolidation-{design,plan}.md` for the migration history. Pull-through caches for upstream registries (DockerHub, GHCR, Quay, k8s.gcr, Kyverno) stay on the registry VM at `10.0.20.10` ports 5000/5010/5020/5030/5040 — the old port-5050 R/W private registry was decommissioned 2026-05-07.
 - **LinuxServer.io containers**: `DOCKER_MODS` runs apt-get on every start — bake slow mods into a custom image (`RUN /docker-mods || true` then `ENV DOCKER_MODS=`). Set `NO_CHOWN=true` to skip recursive chown that hangs on NFS mounts.
 - **Node memory changes**: When changing VM memory on any k8s node, update kubelet `systemReserved`, `kubeReserved`, and eviction thresholds accordingly. Config: `/var/lib/kubelet/config.yaml`. Template: `stacks/infra/main.tf`. Current values: systemReserved=512Mi, kubeReserved=512Mi, evictionHard=500Mi, evictionSoft=1Gi.
diff --git a/AGENTS.md b/AGENTS.md
index 5f9c0839..53bcb5c2 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -154,6 +154,37 @@ lifecycle {
 
 **Audit**: `rg "KYVERNO_LIFECYCLE_V1" stacks/ | wc -l` — should grow (never shrink). Add the marker to every new pod-owning resource. The `_template/main.tf.example` stub shows the canonical form.
 
+### `# KYVERNO_LIFECYCLE_V2` — Keel auto-update annotations
+
+When a namespace is labeled `keel.sh/enrolled=true`, the `inject-keel-annotations` ClusterPolicy (`stacks/kyverno/modules/kyverno/keel-annotations.tf`) injects three annotations on every Deployment / StatefulSet / DaemonSet:
+
+```
+keel.sh/policy: force
+keel.sh/trigger: poll
+keel.sh/pollSchedule: "@every 1h"
+```
+
+To suppress the resulting Terraform drift, **enrolled workloads** must extend their `ignore_changes` block:
+
+```hcl
+lifecycle {
+  ignore_changes = [
+    spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1
+    metadata[0].annotations["keel.sh/policy"],
+    metadata[0].annotations["keel.sh/trigger"],
+    metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2
+  ]
+}
+```
+
+The V2 snippet is added **per workload** as namespaces are phase-enrolled — not as a mass sweep. Workloads in un-enrolled namespaces do not receive the annotation and don't need the V2 block.
+
+Per-workload opt-out: add the label `keel.sh/policy: never` on the Deployment metadata (not pod template); the policy's `exclude` clause respects it, no annotation gets injected, no `ignore_changes` needed.
+
+**Audit**: `rg "KYVERNO_LIFECYCLE_V2" stacks/` — count should equal the number of enrolled workloads.
+
+**Design context**: `docs/plans/2026-05-16-auto-upgrade-apps-{design,plan}.md`.
+
 ## Tier System
 `0-core` | `1-cluster` | `2-gpu` | `3-edge` | `4-aux` — Kyverno auto-generates LimitRange + ResourceQuota per namespace based on tier label.
 - Containers without explicit `resources {}` get default limits (256Mi for edge/aux — causes OOMKill for heavy apps)
diff --git a/docs/plans/2026-05-16-auto-upgrade-apps-design.md b/docs/plans/2026-05-16-auto-upgrade-apps-design.md
new file mode 100644
index 00000000..78c81870
--- /dev/null
+++ b/docs/plans/2026-05-16-auto-upgrade-apps-design.md
@@ -0,0 +1,165 @@
+# Auto-Upgrade Apps Design
+
+**Date**: 2026-05-16
+**Status**: Approved (brainstorm + grill complete; implementation pending)
+
+## Problem
+
+Three constraints in tension across the cluster's ~70 services:
+
+1. **Keep apps at latest.** Most services drift behind upstream; manual bumps don't scale.
+2. **Stay Terraform-compatible.** Image refs live in `.tf`; we want declarative source of truth.
+3. **Don't let the pull-through cache serve stale `:latest`.** Cache layer must not lie about what `:latest` means today.
+
+The previous `Diun → n8n → Service Upgrade Agent` flow handled (1) via changelog-reviewed PR bumps for third-party. Self-hosted services have inconsistent CI: 1 of 11 fully wired (CI builds + pushes + rolls out), 6 partially wired (build but no rollout trigger), 4 with no CI at all. Self-hosted services typically pull `forgejo.viktorbarzin.me/viktor/<name>:<8-char-sha>` with Terraform tracking each SHA in `var.image_tag`.
+
+The user wants to simplify by retiring the changelog-review agent and moving to a pure "latest, always" model, with the cache freshness concern handled at the cache layer (already done — see Architecture §1).
+
+## Decisions
+
+| # | Decision | Notes |
+|---|----------|-------|
+| 1 | **Auto-roll for everything** (no PR-bump gate) | Retires the Service Upgrade Agent; Diun's role narrows to notification only |
+| 2 | **Actuator: Keel** ([keel.sh](https://keel.sh)) | Annotation-driven Deployment/StatefulSet/DaemonSet auto-update operator |
+| 3 | **Tag scheme: `:latest` where it exists, `:major` where it doesn't, glob+`ignore_changes` last resort** | `keel.sh/policy: force` for `:latest` / `:major`; tag string stays in Terraform |
+| 4 | **Opt-out-pure (no skip-list)** | Every workload auto-rolls, including Vault, CNPG, operators, CNI, CSI. User accepts recoverability risk |
+| 5 | **Phased rollout (9 phases)** | Low-risk → bootstrap. Catch up to latest as we phase in. Each phase soaks ~1 week |
+| 6 | **Per-phase: single combined PR** | Switch image refs to floating tag + add to Kyverno mutate allowlist in same commit |
+| 7 | **Diun is the audit source for catch-up** | Existing 6h-poll already reports outdated images; export as worklist per phase |
+| 8 | **Polling, hourly** (`@every 1h`) | Not webhooks — single mechanism, all registries supported |
+| 9 | **Rollback: `kubectl rollout undo` → pin in Terraform → add `keel.sh/policy: never`** | (c) from grill: immediate undo, durable Terraform pin within ≤1h before next Keel poll |
+| 10 | **Implementation: Kyverno cluster-wide mutate** | One `ClusterPolicy` injects Keel annotations; phase boundary = `NamespaceSelector` allowlist |
+| 11 | **Keel exempt from its own mutate** | One-line `NamespaceSelector` exclusion. Supervisor self-update has uniquely bad failure mode |
+| 12 | **Uniform CI model for all self-hosted** | CI builds + pushes `:latest`, Keel polls and rolls. No per-repo `kubectl set image` step. Retires the GHA-migrated SHA-tag flow (memory id=388) |
+
+## Architecture
+
+### 1. Cache freshness — already correct
+
+Pull-through cache at `10.0.20.10` already splits caching by URL at the nginx layer:
+
+- `location ~ /v2/.*/blobs/` → `proxy_cache_valid 200 24h` — blobs cached (content-addressed, immutable)
+- `location /v2/` (manifests) → pass through, no cache
+
+Combined with `registry.proxy.ttl: 0` at the docker-registry layer, mutable manifests revalidate against upstream on every pull. **No cache changes needed for this design.** The CLAUDE.md note "Use 8-char git SHA tags — `:latest` causes stale pull-through cache" predates the nginx URL-split fix and should be updated as part of this work.
+
+### 2. Detection — Keel polls upstream
+
+Keel runs as a Deployment in its own namespace. It polls the registry of every annotated workload hourly (the schedule is Keel-managed and configurable per workload). On detection of a new digest under the watched tag:
+
+- `keel.sh/policy: force` (for mutable tags `:latest`, `:16`, `:7`, etc.) → trigger Deployment update (pod template hash changes → restart)
+- `keel.sh/policy: minor` / `major` / `glob` (only for images that publish neither `:latest` nor a stable floating tag) → rewrite tag string on the Deployment; requires `lifecycle { ignore_changes = [...image] }`
+
+### 3. Application — kubelet pull through the cache
+
+When Keel triggers restart:
+
+1. kubelet asks the cache (via containerd hosts.toml) for `image:tag` manifest.
+2. nginx passes the manifest request through to the docker-registry layer.
+3. docker-registry (with `proxy.ttl: 0`) passes through to upstream.
+4. Upstream returns current digest.
+5. kubelet pulls blobs (mostly cached at nginx layer; new blobs from upstream).
+6. New pod runs new image.
+
+### 4. Annotation injection — Kyverno mutate
+
+Single `ClusterPolicy` adds these annotations to every Deployment / StatefulSet / DaemonSet in opted-in namespaces:
+
+```yaml
+metadata:
+  annotations:
+    keel.sh/policy: force
+    keel.sh/trigger: poll
+    keel.sh/pollSchedule: "@every 1h"
+```
+
+Phase = the set of namespaces selected by the policy's `namespaceSelector` (label `keel.sh/enrolled=true`, per decision #10). Phase advance = label more namespaces. Keel namespace is excluded.
+
+### 5. Terraform drift handling
+
+Existing convention (`# KYVERNO_LIFECYCLE_V1` marker) handles `dns_config` injection. We extend with a new marker:
+
+```hcl
+lifecycle {
+  ignore_changes = [
+    spec[0].template[0].spec[0].dns_config,  # KYVERNO_LIFECYCLE_V1
+    metadata[0].annotations["keel.sh/policy"],
+    metadata[0].annotations["keel.sh/trigger"],
+    metadata[0].annotations["keel.sh/pollSchedule"],  # KYVERNO_LIFECYCLE_V2
+  ]
+}
+```
+
+This is added per workload as we phase in. Mechanical, grep-able.
+
+## Phase ordering
+
+| Phase | Set | Rationale |
+|-------|-----|-----------|
+| 0 | Foundation (Keel install, Kyverno ClusterPolicy with empty allowlist) | Build infra without enrolling anything |
+| 1 | Self-hosted (forgejo-hosted: ~11 services) | We own the code; failures are easy to diagnose |
+| 2 | Stateless third-party web apps (linkwarden, postiz, affine, etc.) | No migrations |
+| 3 | Exporters, sidecars, utilities | Stateless |
+| 4 | Stateful-but-tolerant (Grafana, Prometheus, etc.) | Restart-safe state |
+| 5 | State-coupled with migrations (Nextcloud, Forgejo, paperless-ngx, mailserver) | Schema-migration risk |
+| 6 | Authentik | Auth outage |
+| 7 | Operators (cnpg-operator, ESO, kured, descheduler) | Operator skew |
+| 8 | Critical infra (Calico, proxmox-csi, nfs-csi, traefik, metallb) | Node-level outage potential (memory id=390: 26h Calico cascade) |
+| 9 | Bootstrap (Vault, CNPG PG cluster, mysql-standalone) | Lose recoverability if broken |
+
+Per-phase: combined PR → apply (catch-up rolls happen) → soak 1 week → next phase. If a service breaks repeatedly, apply rollback runbook (decision #9) and proceed; re-enroll later or leave pinned.
+
+## Risk register
+
+| Risk | Likelihood | Impact | Mitigation |
+|------|-----------|--------|------------|
+| Bad upstream image rolls into prod | High | Service-level outage | Existing alerts (`KubePodCrashLooping`, `KubeletImagePullErrors`, `PodsStuckContainerCreating`); rollback runbook (decision #9) |
+| Catch-up rollout overwhelms cache | Medium | ImagePullBackOff cascade (memory id=603) | Rate-limit catch-up to ~5 rollouts/6h via `-target=` per phase; same pacing as retired Service Upgrade Agent (memory id=612) |
+| Calico / CSI auto-roll cascades (memory id=390: 26h outage) | Low-Medium | Cluster-level outage | Phase 8 is intentionally late; user opted into the risk; rollback to pinned chart version via Terraform |
+| Vault auto-rolls to broken image | Low | Loss of secrets sync; 43 ExternalSecrets stop reconciling | Phase 9 last; Tier 0 SOPS state allows manual recovery |
+| CNPG PG cluster auto-rolls to broken image | Low | Tier 1 Terraform state inaccessible; 105 stacks can't apply | Phase 9 last; Tier 0 stack `cnpg` is bootstrap-capable |
+| Helm-atomic-trap services (memory id=981) | Medium | `terraform apply` hangs in pending-rollback | Identify `helm_release` services with `atomic = true`; either remove atomic or skip from Keel |
+| Keel itself rolls to broken version | Low | Supervisor down; no auto-rolls until manual pin | Decision #11: exempt Keel from mutate |
+| Terraform drift after Kyverno injects annotation | High at first | Spurious diffs on every plan | KYVERNO_LIFECYCLE_V2 marker (Architecture §5); applied incrementally per phase |
+
+## What we give up
+
+- **Terraform no longer tracks deployed version.** Image refs in `.tf` say `:latest` or `:16`, but the running digest is whatever Keel pulled. To know what's running: `kubectl describe pod`. This is a deliberate trade — the previous SHA-pinned flow tracked version in TF but required N stack edits per deploy.
+- **No changelog review before rollout.** The Service Upgrade Agent's risk classification is gone. We rely on alerts to catch breakage post-deploy, not prevent it.
+- **CLAUDE.md SHA-tag rule is reversed for this design.** The "use 8-char git SHA tags" rule predates the nginx URL-split fix. New rule (post-rollout): "use floating tags + Keel annotation" — to be updated in both `infra/.claude/CLAUDE.md` and the repo-root `CLAUDE.md` once Phase 1 is stable.
+
+## Decisions resolved post-grill
+
+### Q1 — Uniform CI model for ALL self-hosted (resolved 2026-05-16)
+
+Every self-hosted service moves to the same shape:
+
+```
+CI (GHA or Woodpecker) → build → push :latest (optionally also :<SHA> for traceability) → done
+Keel → poll registry → detect new digest → trigger rollout
+```
+
+The 10 GHA-migrated repos (memory id=388: Website, k8s-portal, f1-stream, claude-memory-mcp, apple-health-data, audiblez-web, plotting-book, insta2spotify, audiobook-search, council-complaints) drop the `Woodpecker API → kubectl set image` step. Their `.woodpecker/deploy.yml` and `.woodpecker/build-fallback.yml` files become obsolete; remove during Phase 1.
+
+Terraform image refs for all self-hosted: `<registry>/<repo>:latest` (with `${var.image_tag}` defaulting to `"latest"` where the variable exists).
+
+### Q2 — No-CI self-hosted services (resolution: uniform participation)
+
+| Service | Action |
+|---------|--------|
+| `wealthfolio` | Switch Terraform to upstream `wealthfolio/wealthfolio:latest` (DockerHub). No CI needed. |
+| `chrome-service` | Verify whether `:v4` is a deliberate pin. If yes → tag stays, add `keel.sh/policy: never` label. If no → switch to `:latest` or `:major`. Investigate during Phase 1 prep. |
+| `beadboard` (used by `beads-server`) | Add minimal Woodpecker CI: build on push → push `:latest`. User-owned. |
+| `freedify` | Add minimal Woodpecker CI: build on push → push `:latest`. User-owned. |
+
+## Open questions (still need resolution before Phase 1)
+
+1. **`helm_release atomic = true` services**: count and identify before Phase 1. Either remove `atomic` (preferred — eliminates the memory id=981 trap), or skip from Kyverno mutate via per-namespace exclusion. Survey command: `grep -rn 'atomic.*true' infra/stacks/ infra/modules/`.
+
+## Out of scope
+
+- Cache TTL changes — current config is already correct (nginx URL-split).
+- Webhook-based Keel triggers — polling is sufficient for this cadence.
+- Replacing Diun — kept for notification visibility into new tags not yet under Keel annotation (during phase rollout).
+- Keel approval gate (`keel.sh/approvals: N`) — user wants unattended auto-roll.
+- Keel auto-rollback on health-check failure — out of scope for v1; revisit if breakage rate is high.
diff --git a/docs/plans/2026-05-16-auto-upgrade-apps-plan.md b/docs/plans/2026-05-16-auto-upgrade-apps-plan.md
new file mode 100644
index 00000000..4937b92f
--- /dev/null
+++ b/docs/plans/2026-05-16-auto-upgrade-apps-plan.md
@@ -0,0 +1,322 @@
+# Auto-Upgrade Apps Implementation Plan
+
+> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
+
+**Goal:** Move the cluster from a mix of pinned-SHA / pinned-semver / ad-hoc `:latest` references to a Keel-driven auto-update model where every workload tracks `:latest` (or a chosen `:major` floating tag) and rolls automatically when upstream advances.
+
+**Architecture:** Kyverno cluster-wide `ClusterPolicy` mutates Deployments / StatefulSets / DaemonSets in opted-in namespaces with Keel annotations (`keel.sh/policy: force`, `keel.sh/trigger: poll`, `keel.sh/pollSchedule: @every 1h`). Keel polls registries, triggers rollout on new digest. kubelet pulls fresh manifest via the nginx URL-split cache (manifests passthrough, blobs cached). Phase advance = expand the `NamespaceSelector` allowlist.
+
+**Tech Stack:** Keel, Kyverno, Terraform / Terragrunt, Helm, Diun (notification only), nginx, docker/distribution
+
+**Design doc:** `docs/plans/2026-05-16-auto-upgrade-apps-design.md`
+
+**Key context:**
+- Cache is already correctly configured (nginx URL-split + `proxy.ttl: 0`). No cache changes needed.
+- Per-stack `lifecycle.ignore_changes` is already required for the existing `dns_config` Kyverno mutation (KYVERNO_LIFECYCLE_V1 convention). This plan extends it with a V2 marker for Keel annotations.
+- Service Upgrade Agent (Diun → n8n → claude bumps tfvars) is retired by this design. n8n workflow + supporting scripts are removed once Phase 9 completes.
+- CLAUDE.md "use 8-char git SHA tags" rule is reversed by this design (see "What we give up" and Architecture §1 in the design doc).
+
+---
+
+## Phase 0 — Foundation
+
+### Task 0.1: Resolve remaining open question
+
+Q1 and Q2 from the design doc are resolved (uniform `:latest` + Keel model for all self-hosted; per-service plan for no-CI services).
+
+Remaining open question:
+
+**Helm-atomic services.** Survey:
+```bash
+grep -rn 'atomic.*true' /home/wizard/code/infra/stacks/ /home/wizard/code/infra/modules/
+```
+
+For each match: either remove `atomic = true` (preferred) or add the namespace to a Kyverno exclusion list. Document inline before Phase 1 proceeds.
+
+---
+
+### Task 0.2: Create the Keel stack
+
+**Files:**
+- Create: `stacks/keel/terragrunt.hcl`
+- Create: `stacks/keel/main.tf`
+- Create: `stacks/keel/variables.tf`
+- Create: `stacks/keel/modules/keel/main.tf`
+
+**Step 1:** Do **not** add `keel` to `terragrunt.hcl` `locals.tier0_stacks`. Keel is Tier 1 (depends on Kyverno + Keel image registry access), so it stays in Tier 1.
+
+**Step 2:** Deploy via Helm chart `keel-hq/keel` (verify current version via context7 before pinning).
+
+Key Helm values:
+- `polling.enabled: true`
+- `helmProvider.enabled: false` (we use annotations, not Helm hooks)
+- `notifications.slack.enabled: true` with channel `#deployments` (verify channel exists)
+- Registry credentials: mount Forgejo PAT from Vault via ExternalSecret (`secret/viktor/forgejo_pull_token`).
+
+**Step 3:** Verify Keel can authenticate to all six registries (Docker Hub, ghcr, quay, k8s.io, and kyverno via the local cache; Forgejo direct).
+
+**Acceptance:**
+- `kubectl -n keel get pod` shows Keel Ready.
+- `kubectl -n keel logs deploy/keel | grep registry` shows successful manifest queries.
+
+---
+
+### Task 0.3: Author the Kyverno ClusterPolicy
+
+**Files:**
+- Create: `stacks/kyverno/modules/kyverno/keel-annotations.tf` (or extend `security-policies.tf`)
+
+ClusterPolicy `inject-keel-annotations`:
+
+```yaml
+apiVersion: kyverno.io/v1
+kind: ClusterPolicy
+metadata:
+  name: inject-keel-annotations
+spec:
+  background: true
+  rules:
+    - name: add-keel-annotation
+      match:
+        any:
+          - resources:
+              kinds: [Deployment, StatefulSet, DaemonSet]
+              namespaces: []  # populated per phase
+      exclude:
+        any:
+          - resources:
+              namespaces: ["keel"]  # decision #11
+          - resources:
+              # Workloads can opt out by setting this label
+              selector:
+                matchLabels:
+                  keel.sh/policy: never
+      mutate:
+        patchStrategicMerge:
+          metadata:
+            annotations:
+              +(keel.sh/policy): force
+              +(keel.sh/trigger): poll
+              +(keel.sh/pollSchedule): "@every 1h"
+```
+
+- `+()` syntax adds only if not present (preserves per-workload overrides).
+- `exclude.selector.matchLabels[keel.sh/policy=never]` is the per-workload escape hatch (used during rollback per decision #9).
+
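+**Step 2:** Initially deploy with nothing enrolled — the policy exists but matches nothing. Caution: Kyverno treats an empty `namespaces: []` as "no namespace filter" (it matches every namespace), so the empty state must be expressed with a selector that nothing satisfies yet — e.g. a `namespaceSelector` on the `keel.sh/enrolled=true` label — not with an empty list.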
+
+**Acceptance:**
+- `kubectl get clusterpolicy inject-keel-annotations` shows Ready.
+- `kubectl get deploy -A -o yaml | grep keel.sh/policy` shows no matches yet (empty allowlist).
+
+---
+
+### Task 0.4: Define the KYVERNO_LIFECYCLE_V2 marker convention
+
+**Files:**
+- Modify: `AGENTS.md` — add the V2 snippet to the "Kyverno Drift Suppression" section
+- Modify: `.claude/CLAUDE.md` — reference the V2 marker
+
+Snippet to copy-paste:
+
+```hcl
+lifecycle {
+  ignore_changes = [
+    spec[0].template[0].spec[0].dns_config,            # KYVERNO_LIFECYCLE_V1
+    metadata[0].annotations["keel.sh/policy"],
+    metadata[0].annotations["keel.sh/trigger"],
+    metadata[0].annotations["keel.sh/pollSchedule"],   # KYVERNO_LIFECYCLE_V2
+  ]
+}
+```
+
+Backfill order: per-phase, only on workloads about to be enrolled. Not a mass sweep.
+
+---
+
+## Phase 1 — Self-hosted (uniform model)
+
+**Set:** all self-hosted services. Four sub-categories:
+
+- **Woodpecker-build-only (6):** `claude-agent-service`, `fire-planner`, `job-hunter`, `payslip-ingest`, `recruiter-responder`, `claude-memory-mcp`.
+- **GHA-migrated (10, per memory id=388):** Website, k8s-portal, f1-stream, claude-memory-mcp, apple-health-data, audiblez-web, plotting-book, insta2spotify, audiobook-search, council-complaints. (Note: claude-memory-mcp appears in both lists — verify.)
+- **No-CI (4, per design Q2):** `wealthfolio` (→ upstream), `chrome-service` (verify pin intent), `beadboard` (add CI), `freedify` (add CI).
+- **Already-uniform (1):** `kms-website` — already pushes `:latest` AND SHA; just needs Keel annotation.
+
+### Task 1.1: Audit current image refs
+
+```bash
+grep -rE 'image\s*=\s*"(forgejo\.viktorbarzin\.me|viktorbarzin)' /home/wizard/code/infra/stacks/ | sort
+```
+
+Tabulate per service: current tag, CI type (GHA / Woodpecker / none), action needed.
+
+### Task 1.2: Per-service uniform conversion
+
+For each Woodpecker-build-only service:
+1. Edit Terraform: `local.image_tag` / `var.image_tag` → `"latest"`.
+2. Add the KYVERNO_LIFECYCLE_V2 snippet (annotations ignore_changes).
+3. Verify `.woodpecker.yml` pushes `:latest` on every build (most do via `auto_tag: true`).
+
+For each GHA-migrated service:
+1. Edit Terraform: switch `image_tag` from SHA reference to `"latest"`.
+2. Add the KYVERNO_LIFECYCLE_V2 snippet.
+3. Edit `.github/workflows/build-and-deploy.yml`: push `:latest` (in addition to `:<8-char-sha>` for traceability). Remove the Woodpecker API POST step.
+4. Delete `.woodpecker/deploy.yml` and `.woodpecker/build-fallback.yml` from each repo (no longer needed).
+5. Remove the Woodpecker repo config for these repos from Terraform if applicable.
+
+For each no-CI service:
+- `wealthfolio`: change Terraform image to `wealthfolio/wealthfolio:latest` (upstream DockerHub). Validate the image starts cleanly.
+- `chrome-service`: check git blame on the `:v4` pin. If deliberate → label `keel.sh/policy: never`. If accidental → bump to upstream `:latest`.
+- `beadboard`, `freedify`: write a minimal `.woodpecker.yml` (single build step pushing to Forgejo `:latest`). Trigger an initial build to populate `:latest`.
+
+For `kms-website`: only add the Keel annotation; CI changes optional.
+
+### Task 1.3: Add Phase 1 namespaces to Kyverno allowlist
+
+Edit `stacks/kyverno/modules/kyverno/keel-annotations.tf`:
+
+```yaml
+namespaces:
+  - claude-agent-service
+  - fire-planner
+  - job-hunter
+  - payslip-ingest
+  - recruiter-responder
+  - claude-memory-mcp
+  - kms-website
+  # GHA-migrated set:
+  - website  # or whatever the namespace is named per repo
+  - k8s-portal
+  - f1-stream
+  - apple-health-data
+  - audiblez-web
+  - plotting-book
+  - insta2spotify
+  - audiobook-search
+  - council-complaints
+  # No-CI set:
+  - beads-server
+  - chrome-service
+  - freedify
+  - wealthfolio
+```
+
+Verify each namespace name from `kubectl get ns` before locking in (some may differ from the repo name).
+
+Apply. Watch `kubectl get deploy -n <ns> -o yaml | grep keel.sh` confirm annotations injected. Watch Keel logs for first poll cycle picking up the workloads.
+
+### Task 1.4: Soak
+
+1 week. Monitor:
+- Slack `#deployments` for Keel rollout notifications.
+- `KubePodCrashLooping` alerts.
+- Manual `kubectl rollout status` on each service after a Keel-triggered rollout.
+
+If any service breaks repeatedly: apply rollback runbook (decision #9), record the service in a "pin list" with reason, proceed.
+
+**Acceptance:**
+- All Phase 1 services running latest digests within 24h of Phase 1 apply.
+- No CrashLooping persisting >1h.
+- No more than 2 services pinned-out during the soak week.
+
+---
+
+## Phase 2 — Stateless third-party web apps
+
+**Set:** linkwarden, postiz, affine, isponsorblocktv, audiobookshelf, freshrss, tandoor, immich (verify it qualifies — has external DB so app-restart is safe), excalidraw, hackmd, send, jsoncrack, sparkyfitness, etc. (~15-20 services — full list from `kubectl get deploy -A` filtered against the phase-1 set + skip-bucket).
+
+### Task 2.1: Audit current tags via Diun
+
+```bash
+# Diun's REST API or UI exports a "new tags available" report
+# Use as the per-service decision source
+```
+
+For each service, pick floating tag:
+- `:latest` if upstream publishes it and it's stable.
+- `:<major>` (e.g. `:2`, `:v3`) if `:latest` is unreliable.
+- `glob` + `ignore_changes` as last resort.
+
+### Task 2.2: Catch-up PR
+
+Single combined PR:
+- Per-stack: switch image tag from pinned semver to chosen floating tag (Diun-informed).
+- Per-stack: add KYVERNO_LIFECYCLE_V2 snippet.
+- Append Phase 2 namespaces to Kyverno allowlist.
+
+Apply with `-target=` per stack to pace rollouts (≤5 per hour to avoid cache burst — memory id=603).
+
+### Task 2.3: Soak — 1 week, same monitoring as Phase 1.
+
+---
+
+## Phases 3–9 — same template
+
+For each phase, repeat:
+
+1. Define the set (precise namespace list).
+2. Audit current tags (Diun + grep).
+3. Pick floating tag per service.
+4. Combined PR: image-ref change + lifecycle snippet + Kyverno allowlist update.
+5. Apply paced (≤5/hr).
+6. Soak 1 week. Pin-out any service that breaks repeatedly.
+
+Set definitions per phase: see design doc Phase Ordering table.
+
+**Special-handling phases:**
+
+- **Phase 7 (Operators).** Restart of an operator can confuse its managed CRD reconciles. Use `imagePullPolicy: Always` + readiness check before declaring stable. Investigate cnpg-operator and ESO restart behavior in advance.
+- **Phase 8 (Critical infra).** Calico/CSI DaemonSet rollouts impact each node briefly. Verify `updateStrategy.rollingUpdate.maxUnavailable: 1` on every DaemonSet before enrollment. Memory id=390 (26h Calico-cascade outage) is the cautionary tale.
+- **Phase 9 (Bootstrap).** Vault, CNPG, mysql-standalone. Coordinate with backup window. Take a fresh snapshot of `/srv/nfs/<db>-backup/` before applying the phase enrollment.
+
+---
+
+## Cleanup tasks (after Phase 9 stable)
+
+### Task C.1: Retire Service Upgrade Agent
+
+**Files:**
+- Modify: `stacks/n8n/` — remove the Service Upgrade Agent workflow
+- Delete: any supporting scripts (`infra/scripts/service-upgrade-*.sh` if they exist)
+- Modify: `stacks/diun/` — disable webhook notification to n8n (keep Slack notification for visibility)
+
+### Task C.2: Update CLAUDE.md files
+
+- Reverse the "use 8-char git SHA tags" rule in `infra/.claude/CLAUDE.md` "Docker images" line.
+- Reverse same in root `/CLAUDE.md` if duplicated.
+- Add a new section documenting the Keel model + KYVERNO_LIFECYCLE_V2 snippet.
+- Update memory via `mcp__claude_memory__memory_update` on entries 388, 612, 604 (CI/CD architecture, Service Upgrade Agent retirement, cache TTL clarification).
+
+### Task C.3: Add a runbook
+
+**Files:**
+- Create: `docs/runbooks/keel-rollback.md`
+
+Document the rollback flow (decision #9): `kubectl rollout undo` → Terraform pin → label `keel.sh/policy: never`.
+
+### Task C.4: Tidy Diun
+
+Drop image-pin overrides for MySQL, PostgreSQL, Redis from Diun config (no longer needed since they're Keel-managed; the previous skip was for the retired changelog-agent path).
+
+---
+
+## Rollback (whole project)
+
+If the auto-roll experiment goes badly cluster-wide (multiple cascading failures, repeated outages), revert:
+
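+1. Un-enroll every namespace (remove the `keel.sh/enrolled=true` label) so Kyverno ClusterPolicy `inject-keel-annotations` matches nothing. Do not rely on an empty `namespaces: []` — Kyverno treats an empty list as "all namespaces".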
+2. Existing annotations remain on workloads, but Keel continues to act on them — so also disable Keel: scale `keel` Deployment to 0.
+3. Pin every workload's Terraform image_tag back to its current running digest (use `kubectl get deploy -A -o jsonpath='{range .items[*]}{.metadata.name}:{.spec.template.spec.containers[0].image}{"\n"}{end}'`).
+4. Document failure modes in `post-mortems/2026-XX-XX-keel-rollback.md`.
+5. Reconsider opt-in approach for next iteration.
+
+---
+
+## Success criteria
+
+- All ~70 services running latest within 8 weeks of Phase 0 completion.
+- Zero unrolled-back outages caused by Keel.
+- ≤5 services on the "pin list" (i.e. ≥92% auto-roll success rate at ~70 services).
+- `terragrunt plan` shows no spurious diffs from Kyverno-injected annotations (KYVERNO_LIFECYCLE_V2 working as intended).
+- Service Upgrade Agent + supporting infra retired.
diff --git a/stacks/keel/main.tf b/stacks/keel/main.tf
new file mode 100644
index 00000000..851dc913
--- /dev/null
+++ b/stacks/keel/main.tf
@@ -0,0 +1,65 @@
+# Keel — automated Kubernetes Deployment image updates.
+# Design: docs/plans/2026-05-16-auto-upgrade-apps-design.md
+# Plan:   docs/plans/2026-05-16-auto-upgrade-apps-plan.md
+#
+# Operation: Keel polls each watched workload's registry hourly (default
+# schedule below; overridable per-workload via keel.sh/pollSchedule).
+# Detection of a new digest under the watched tag triggers a workload
+# update (pod template change → rolling restart). Workloads opt in by
+# carrying keel.sh/policy + keel.sh/trigger annotations — those are
+# injected cluster-wide by the inject-keel-annotations ClusterPolicy
+# (stacks/kyverno/modules/kyverno/keel-annotations.tf) on namespaces
+# labeled keel.sh/enrolled=true.
+
+resource "kubernetes_namespace" "keel" {
+  metadata {
+    name = "keel" # the helm_release in this stack installs into this namespace
+    labels = {
+      tier = local.tiers.cluster
+    }
+  }
+  lifecycle {
+    # KYVERNO_LIFECYCLE_V1
+    ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]] # presumably set outside Terraform (see marker above); ignored so plans stay clean
+  }
+}
+
+resource "helm_release" "keel" {
+  name       = "keel"
+  namespace  = kubernetes_namespace.keel.metadata[0].name # reference, not a literal, so the namespace is created first
+  repository = "https://charts.keel.sh"
+  chart      = "keel"
+  version    = "1.0.6" # pinned chart version
+
+  # atomic rolls a failed install/upgrade back instead of leaving a partial
+  # deploy. Keel itself is exempt from auto-update (Kyverno mutate excludes the
+  # keel namespace), so it only rolls when this stack applies — atomic is safe.
+  atomic = true
+
+  values = [yamlencode({
+    polling = {
+      enabled = true
+      # Fallback poll cadence for workloads that don't set their own
+      # keel.sh/pollSchedule annotation. Decision #8 in the design doc.
+      defaultSchedule = "@every 1h"
+    }
+    helmProvider = {
+      enabled = false # Workloads are tracked via annotations, not via Keel's Helm provider
+    }
+    notificationLevel = "info"
+    persistence = {
+      enabled = false
+    }
+    # Keel uses each watched Deployment's own imagePullSecrets to query
+    # its registry. Forgejo creds (`registry-credentials`) are auto-synced
+    # to every namespace by Kyverno already, so Keel pods don't need a
+    # separate pull-secret for their own image (ghcr.io is public — NOTE(review): confirm the chart's default image registry).
+    rbac = {
+      enabled = true
+    }
+    resources = {
+      requests = { cpu = "50m", memory = "64Mi" }
+      limits   = { memory = "256Mi" } # memory limit only; no CPU limit is set
+    }
+  })]
+}
diff --git a/stacks/keel/terragrunt.hcl b/stacks/keel/terragrunt.hcl
new file mode 100644
index 00000000..5ac60a69
--- /dev/null
+++ b/stacks/keel/terragrunt.hcl
@@ -0,0 +1,13 @@
+include "root" {
+  path = find_in_parent_folders() # inherit the shared config found in a parent directory
+}
+
+dependency "platform" {
+  config_path  = "../platform"
+  skip_outputs = true # ordering only — no outputs of this stack are read here
+}
+
+dependency "kyverno" {
+  config_path  = "../kyverno"
+  skip_outputs = true # ordering only — no outputs of this stack are read here
+}
diff --git a/stacks/kyverno/modules/kyverno/keel-annotations.tf b/stacks/kyverno/modules/kyverno/keel-annotations.tf
new file mode 100644
index 00000000..9b833a4e
--- /dev/null
+++ b/stacks/kyverno/modules/kyverno/keel-annotations.tf
@@ -0,0 +1,82 @@
+# =============================================================================
+# Keel Auto-Update Annotation Injector
+# =============================================================================
+# Design: docs/plans/2026-05-16-auto-upgrade-apps-design.md
+# Plan:   docs/plans/2026-05-16-auto-upgrade-apps-plan.md
+#
+# Mutate policy that adds keel.sh/* annotations to Deployments,
+# StatefulSets and DaemonSets in *opted-in* namespaces. Opt-in is via a
+# label on the namespace:
+#
+#   labels = { "keel.sh/enrolled" = "true" }
+#
+# Phase rollout = label more namespaces. No edit to this file per phase.
+#
+# Workloads can individually opt out with the label keel.sh/policy=never
+# (used by the rollback runbook). The keel namespace itself is always
+# excluded (design decision #11 — supervisor must not auto-update).
+
+resource "kubernetes_manifest" "policy_inject_keel_annotations" {
+  manifest = {
+    apiVersion = "kyverno.io/v1"
+    kind       = "ClusterPolicy"
+    metadata = {
+      name = "inject-keel-annotations"
+      annotations = {
+        "policies.kyverno.io/title"       = "Inject Keel Auto-Update Annotations"
+        "policies.kyverno.io/category"    = "Automation"
+        "policies.kyverno.io/severity"    = "low"
+        "policies.kyverno.io/description" = "Adds keel.sh/policy: force + trigger: poll annotations to workloads in namespaces labeled keel.sh/enrolled=true. Phase rollout per docs/plans/2026-05-16-auto-upgrade-apps-{design,plan}.md."
+      }
+    }
+    spec = {
+      background = true # NOTE(review): no mutate.targets are defined, so presumably annotations are only injected at admission (CREATE/UPDATE) — confirm existing workloads get re-applied after a namespace is labeled
+      rules = [{
+        name = "add-keel-annotations"
+        match = {
+          any = [{
+            resources = {
+              kinds = ["Deployment", "StatefulSet", "DaemonSet"]
+              namespaceSelector = { # opt-in switch: only namespaces carrying this label are matched
+                matchLabels = {
+                  "keel.sh/enrolled" = "true"
+                }
+              }
+            }
+          }]
+        }
+        exclude = {
+          any = [ # a workload is skipped if EITHER entry below matches
+            {
+              resources = {
+                namespaces = ["keel"] # never mutate Keel's own namespace, even if it gets labeled
+              }
+            },
+            {
+              resources = {
+                selector = { # per-workload opt-out: a LABEL (not an annotation) on the workload
+                  matchLabels = {
+                    "keel.sh/policy" = "never"
+                  }
+                }
+              }
+            },
+          ]
+        }
+        mutate = {
+          patchStrategicMerge = {
+            metadata = { # top-level workload metadata, not the pod template
+              annotations = {
+                # `+(...)` is the add-if-not-present anchor: a value already set on the workload is kept.
+                "+(keel.sh/policy)"       = "force"
+                "+(keel.sh/trigger)"      = "poll"
+                "+(keel.sh/pollSchedule)" = "@every 1h"
+              }
+            }
+          }
+        }
+      }]
+    }
+  }
+  depends_on = [helm_release.kyverno] # presumably so the ClusterPolicy CRD exists before this manifest is applied
+}