diff --git a/docs/plans/2026-05-16-auto-upgrade-apps-design.md b/docs/plans/2026-05-16-auto-upgrade-apps-design.md index 78c81870..4ff24d3f 100644 --- a/docs/plans/2026-05-16-auto-upgrade-apps-design.md +++ b/docs/plans/2026-05-16-auto-upgrade-apps-design.md @@ -101,7 +101,7 @@ This is added per workload as we phase in. Mechanical, grep-able. | 2 | Stateless third-party web apps (linkwarden, postiz, affine, etc.) | No migrations | | 3 | Exporters, sidecars, utilities | Stateless | | 4 | Stateful-but-tolerant (Grafana, Prometheus, etc.) | Restart-safe state | -| 5 | State-coupled with migrations (Nextcloud, Forgejo, paperless-ngx, mailserver) | Schema-migration risk | +| 5 | State-coupled with migrations (Nextcloud, Forgejo, paperless-ngx, mailserver) | Schema-migration risk. **Nextcloud enrolled 2026-06-01** with two safeguards for the migration risk: F1 — `nextcloud-watchdog` CronJob runs `occ upgrade` when occ reports `needsDbUpgrade=true` (recovers an interrupted entrypoint upgrade); F2 — `chart_values.yaml` renders the live (Keel-bumped) image tag with a floor, so a helm re-render never downgrades below live. Scope is `patch` (Kyverno-stamped) == `minor` for Nextcloud (32.0.x only). See `stacks/nextcloud/main.tf`. | | 6 | Authentik | Auth outage | | 7 | Operators (cnpg-operator, ESO, kured, descheduler) | Operator skew | | 8 | Critical infra (Calico, proxmox-csi, nfs-csi, traefik, metallb) | Node-level outage potential (memory id=390: 26h Calico cascade) | diff --git a/stacks/nextcloud/chart_values.yaml b/stacks/nextcloud/chart_values.yaml index 760a6aec..af2f4259 100644 --- a/stacks/nextcloud/chart_values.yaml +++ b/stacks/nextcloud/chart_values.yaml @@ -1,13 +1,16 @@ -# Pin the image to 32.0.9 (apache). On 2026-05-26 Keel bumped the live -# Deployment 32.0.3 → 32.0.9-apache and the DATA migrated to 32.0.9.2; Keel -# was then disabled but chart_values was never pinned, so it kept defaulting -# to the chart's appVersion (32.0.3). A 2026-06-01 `terragrunt apply` -# reconciled that drift, rolled a 32.0.3 pod, and Nextcloud refused to -# downgrade (data 32.0.9.2 > image 32.0.3.2) → CrashLoopBackOff. Pinning here -# keeps TF the source of truth and matches the on-disk data version. +# image.tag is rendered dynamically (templatefile var `image_tag`) from the +# CURRENT live Deployment tag, falling back to var.nextcloud_image_tag_floor +# (32.0.9) on fresh install / DR — see stacks/nextcloud/main.tf +# `data.kubernetes_resource.nextcloud_live` + locals. This makes helm upgrades +# image-no-ops in steady state and means a re-render can NEVER downgrade below +# the Keel-bumped live tag (the 2026-06-01 CrashLoop: a pinned 32.0.3 lost to +# live 32.0.9 and Nextcloud refused the downgrade). Keel (keel.sh/policy=minor) +# bumps the live tag upward within major 32; the next apply just follows it. +# flavor=apache renders the bare apache-default tag (live image is +# `nextcloud:`, no -apache suffix). image: flavor: apache - tag: "32.0.9" + tag: "${image_tag}" nextcloud: host: nextcloud.viktorbarzin.me diff --git a/stacks/nextcloud/external_storage.tf b/stacks/nextcloud/external_storage.tf index 182eda5a..feddde05 100644 --- a/stacks/nextcloud/external_storage.tf +++ b/stacks/nextcloud/external_storage.tf @@ -116,6 +116,16 @@ resource "kubernetes_role_binding" "nextcloud_external_storage_bootstrap" { # ── Bootstrap Job ──────────────────────────────────────────────────────────── resource "kubernetes_job_v1" "nextcloud_external_storage_bootstrap" { + # The bootstrap script (below) waits up to 10m for the NC pod to be Ready. + # kubernetes_job_v1's default create timeout is only 1m, which spuriously + # fails the apply whenever the NC pod takes >1m to come up — e.g. now that + # Keel auto-upgrades nextcloud, a bump mid-apply runs `occ upgrade` in the + # entrypoint and delays readiness past 1m (observed 2026-06-01). Match the + # script's 10m wait plus margin. + timeouts { + create = "12m" + } + metadata { name = "nextcloud-external-storage-bootstrap" namespace = kubernetes_namespace.nextcloud.metadata[0].name diff --git a/stacks/nextcloud/main.tf b/stacks/nextcloud/main.tf index 5ced81de..066058a8 100644 --- a/stacks/nextcloud/main.tf +++ b/stacks/nextcloud/main.tf @@ -6,13 +6,60 @@ variable "nfs_server" { type = string } variable "redis_host" { type = string } variable "mysql_host" { type = string } +# FLOOR only — Keel bumps the LIVE image tag upward (minor policy); the +# data source below renders the current live tag so a helm apply never +# downgrades below what Keel installed. This floor only wins on a fresh +# install / DR (no live Deployment) or after deliberately restoring an +# OLDER DB snapshot (bump this to match — see comment on the data source). +variable "nextcloud_image_tag_floor" { + type = string + default = "32.0.9" +} + data "vault_kv_secret_v2" "secrets" { mount = "secret" name = "nextcloud" } +# Render the CURRENT live image tag so helm upgrades are image-no-ops and +# can NEVER downgrade below the Keel-bumped live tag (failure mode F2: the +# 2026-06-01 CrashLoop where a pinned 32.0.3 re-render lost to live 32.0.9). +# Helm-managed workloads can't use the raw-Deployment KEEL_IGNORE_IMAGE +# `lifecycle.ignore_changes` trick (immich/freshrss main.tf), so we feed the +# live tag back into the chart instead. +# +# Use the PLURAL `kubernetes_resources` (field-selected to name=nextcloud), NOT +# the singular `kubernetes_resource`: in kubernetes provider 3.1.0 the singular +# data source ERRORS ("Provider produced null object") when the target is +# absent, and try() can't rescue it (the failure is at the provider read, not +# the expression). The plural returns an empty `objects` list on no match, so +# objects[0] + try() cleanly falls back to var.nextcloud_image_tag_floor on +# fresh install / DR. (Verified empirically against provider 3.1.0.) +# +# namespace is the LITERAL "nextcloud", NOT +# kubernetes_namespace.nextcloud.metadata[0].name, on purpose: referencing the +# namespace resource makes Terraform defer this data read to apply time +# whenever the namespace has a pending change (e.g. the keel.sh/enrolled label +# add) — "(depends on a resource ... with changes pending)" — which leaves the +# tag unknown at plan, turning every helm plan into an unverifiable +# (known after apply) values churn. A static namespace decouples the read so it +# resolves at plan time. +data "kubernetes_resources" "nextcloud_live" { + api_version = "apps/v1" + kind = "Deployment" + namespace = "nextcloud" + field_selector = "metadata.name=nextcloud" +} + locals { homepage_credentials = jsondecode(data.vault_kv_secret_v2.secrets.data["homepage_credentials"]) + + _live_image = try(data.kubernetes_resources.nextcloud_live.objects[0].spec.template.spec.containers[0].image, "") + # Last colon-segment is the tag (handles registry:port/repo:tag); strip the + # optional `-apache` flavor suffix so it round-trips through the chart's + # `image.flavor=apache` (which renders the bare apache-default tag). + _live_tag = try(replace(element(split(":", local._live_image), length(split(":", local._live_image)) - 1), "-apache", ""), "") + nextcloud_image_tag = local._live_tag != "" ? local._live_tag : var.nextcloud_image_tag_floor } @@ -30,14 +77,26 @@ resource "kubernetes_namespace" "nextcloud" { tier = local.tiers.edge "resource-governance/custom-limitrange" = "true" "resource-governance/custom-quota" = "true" - # Keel disabled for nextcloud: the 2026-05-26 Keel-driven bump - # 32.0.3-apache → 32.0.9-apache left the pod in maintenance mode - # (needsDbUpgrade=true) for ~22h because Keel doesn't run - # `occ upgrade` after rolling the image. Defense-in-depth: - # (a) namespace not enrolled here, (b) workload carries the - # `keel.sh/policy=never` label + annotation below so even if the - # ns label gets re-added, Kyverno excludes this Deployment. - # "keel.sh/enrolled" = "true" + # Keel re-enabled 2026-06-01 (was disabled after the 2026-05-26 bump + # 32.0.3→32.0.9 stuck the pod in maintenance mode for ~22h). Two + # safeguards make auto-upgrade safe, engineered around BOTH failure modes: + # F1 — interrupted `occ upgrade` (entrypoint copies version.php before + # occ upgrade finishes, so a probe-restart mid-upgrade leaves the + # DB half-migrated → 503): the nextcloud-watchdog CronJob below + # self-heals by running `occ upgrade` when occ reports + # needsDbUpgrade=true. + # F2 — helm re-renders a tag BELOW the Keel-bumped live image → + # Nextcloud refuses the downgrade → CrashLoop (the 2026-06-01 + # incident): chart_values renders the live tag with a floor, so a + # re-render is never below live. + # Scope: the shared Kyverno `inject-keel-annotations` policy stamps + # keel.sh/policy=patch (+ trigger=poll + pollSchedule) on enrolled + # workloads. For Nextcloud patch == minor in practice — it only ships + # 32.0.x maintenance releases (never 32.1.x), and major 33 needs `major` + # policy and stays manual (the entrypoint's +1-major limit enforces that + # anyway). We deliberately do NOT override the policy per-workload — see + # the note where the old override resources used to live, below. + "keel.sh/enrolled" = "true" } } lifecycle { @@ -46,40 +105,24 @@ resource "kubernetes_namespace" "nextcloud" { } } -# Workload-level Keel opt-out (see namespace comment above). -# Keel reads the ANNOTATION `keel.sh/policy` (it's what un-tracks the -# image watcher); the LABEL exists for the Kyverno exclude rule in -# `inject-keel-annotations` (defense-in-depth in case the namespace -# label gets re-added later). Both are set via these helper resources -# because the nextcloud chart 8.8.1 doesn't expose Deployment-level -# commonLabels / commonAnnotations. -resource "kubernetes_labels" "nextcloud_keel_optout" { - api_version = "apps/v1" - kind = "Deployment" - metadata { - name = "nextcloud" - namespace = kubernetes_namespace.nextcloud.metadata[0].name - } - labels = { - "keel.sh/policy" = "never" - } - force = true - depends_on = [helm_release.nextcloud] -} - -resource "kubernetes_annotations" "nextcloud_keel_optout" { - api_version = "apps/v1" - kind = "Deployment" - metadata { - name = "nextcloud" - namespace = kubernetes_namespace.nextcloud.metadata[0].name - } - annotations = { - "keel.sh/policy" = "never" - } - force = true - depends_on = [helm_release.nextcloud] -} +# No per-workload Keel override resources here, on purpose. Nextcloud is +# enrolled via the namespace label above; the shared Kyverno +# `inject-keel-annotations` policy then stamps keel.sh/policy=patch + +# trigger=poll + pollSchedule, and Keel auto-upgrades within 32.0.x. +# +# This stack used to carry kubernetes_labels + kubernetes_annotations +# resources forcing keel.sh/policy=minor (and before that =never, for the +# opt-out). Both were removed 2026-06-01 after re-enabling Keel because each +# produced perpetual drift: +# - Kyverno's background-controller overwrites a TF-set policy back to +# `patch` despite the policy's `+(keel.sh/policy)` add-if-missing anchor +# (observed live: the annotation's field manager was background-controller +# with value patch right after a Keel-bump admission). +# - The helm release strips the deployment's keel.sh/policy LABEL on every +# roll, so TF re-added it on every apply. +# patch == minor for Nextcloud (32.0.x only; major 33 needs `major` and stays +# manual), so letting Kyverno own the keel annotations — exactly like every +# other enrolled workload (immich, freshrss) — is both correct and drift-free. resource "kubernetes_manifest" "external_secret" { manifest = { @@ -191,7 +234,7 @@ resource "helm_release" "nextcloud" { atomic = true version = "8.8.1" - values = [templatefile("${path.module}/chart_values.yaml", { tls_secret_name = var.tls_secret_name, mysql_host = var.mysql_host })] + values = [templatefile("${path.module}/chart_values.yaml", { tls_secret_name = var.tls_secret_name, mysql_host = var.mysql_host, image_tag = local.nextcloud_image_tag })] timeout = 6000 depends_on = [kubernetes_manifest.db_external_secret] } @@ -457,9 +500,12 @@ resource "kubernetes_config_map" "backup-script" { } } -# Watchdog: auto-restart Nextcloud when Apache workers go runaway -# Checks every 5 minutes if Apache has >40 active workers (normal is 5-15). -# If runaway detected, restarts the deployment to recover node CPU. +# Watchdog: runs every 5 minutes with two jobs: +# 1. Apache runaway recovery — if >40 workers (normal 5-15), rollout-restart +# to recover node CPU. +# 2. F1 Keel self-heal — if occ reports needsDbUpgrade=true (an interrupted +# `occ upgrade` after a Keel image bump left the app in maintenance mode), +# re-run `occ upgrade` and clear maintenance mode. resource "kubernetes_service_account" "nextcloud_watchdog" { metadata { name = "nextcloud-watchdog" @@ -521,7 +567,9 @@ resource "kubernetes_cron_job_v1" "nextcloud_watchdog" { job_template { metadata {} spec { - active_deadline_seconds = 120 + # 600s (was 120s) so the F1 self-heal `occ upgrade` isn't killed + # mid-migration. concurrency_policy=Forbid prevents overlap. + active_deadline_seconds = 600 template { metadata {} spec { @@ -554,6 +602,22 @@ resource "kubernetes_cron_job_v1" "nextcloud_watchdog" { else echo "Apache workers within normal range ($WORKERS <= 40)" fi + + # F1 self-heal: a Keel image bump runs `occ upgrade` in the + # entrypoint, but if that's interrupted (e.g. a probe restart + # mid-upgrade) occ reports needsDbUpgrade=true and the app sits + # in maintenance mode (503). Re-run the upgrade and clear + # maintenance mode. Gated on needsDbUpgrade only, so a + # deliberate manual maintenance window is left untouched. + ST=$(kubectl exec -n nextcloud "$POD" -c nextcloud -- php occ status --output=json 2>/dev/null || true) + if echo "$ST" | grep -q '"needsDbUpgrade":true'; then + echo "$(date): needsDbUpgrade=true → running occ upgrade" + kubectl exec -n nextcloud "$POD" -c nextcloud -- php occ upgrade --no-interaction || true + kubectl exec -n nextcloud "$POD" -c nextcloud -- php occ maintenance:mode --off || true + echo "$(date): self-heal occ upgrade complete" + else + echo "$(date): occ status healthy (no DB upgrade pending)" + fi EOF ] }