fix: restore tree dropped by 6d224861; land stem95su gdrive-sync (10m) [ci skip]

6d224861 came from a --no-checkout worktree whose empty index made the commit drop every file except two. This restores 05b50d2b's full tree and correctly adds stacks/stem95su/gdrive-sync.tf and the service-catalog stem95su entry. Forward-only (parent=6d224861, no force-push); [ci skip] since the live infra was never applied from the broken commit.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-09 08:45:33 +00:00 · 2026-06-09 08:45:33 +00:00 · fd0f4a0365
commit fd0f4a0365
parent 6d224861c4
1166 changed files with 358546 additions and 0 deletions
--- a/stacks/nfs-csi/modules/nfs-csi/main.tf
+++ b/stacks/nfs-csi/modules/nfs-csi/main.tf
@ -0,0 +1,148 @@
+variable "tier" { type = string } # Value applied as the `tier` label on the nfs-csi namespace.
+variable "nfs_server" { type = string } # NFS server address passed as the `server` parameter of the nfs-truenas StorageClass.
+
+resource "kubernetes_namespace" "nfs_csi" { # Namespace that hosts the csi-driver-nfs Helm release.
+  metadata {
+    name = "nfs-csi"
+    labels = {
+      tier = var.tier
+      "keel.sh/enrolled" = "true" # NOTE(review): the helm_release comment in this file says nfs-csi is on the Kyverno Keel exclude list — confirm this label is still intended.
+    }
+  }
+  lifecycle {
+    # KYVERNO_LIFECYCLE_V1: the goldilocks-vpa-auto-mode ClusterPolicy stamps this label on every namespace; ignore it so Terraform does not try to revert it.
+    ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]]
+  }
+}
+
+resource "helm_release" "nfs_csi_driver" { # Installs the csi-driver-nfs chart into the nfs-csi namespace.
+  namespace        = kubernetes_namespace.nfs_csi.metadata[0].name
+  create_namespace = false # The namespace is managed by kubernetes_namespace.nfs_csi.
+  name             = "csi-driver-nfs"
+  atomic           = true
+  timeout          = 300
+
+  repository = "https://raw.githubusercontent.com/kubernetes-csi/csi-driver-nfs/master/charts"
+  chart      = "csi-driver-nfs"
+  # Pinned 2026-05-17 after Keel polled and rolled csi-driver-nfs 4.13.1 → 4.13.2,
+  # which broke the cluster:
+  #   * Controller pods ended up on k8s-master because the new chart removed
+  #     control-plane exclusion from the default node selector.
+  #   * Two controller replicas on the same node fought over hostNetwork ports
+  #     19809 (node-driver-registrar) and 29653 (liveness-probe). One replica
+  #     went into CrashLoopBackOff with `bind: address already in use`.
+  #   * Rolling back live (helm rollback) left zombie containerd containers
+  #     holding the ports — only a kubelet restart cleared them.
+  # The nfs-csi namespace is on the Kyverno Keel exclude list (keel-annotations.tf)
+  # so Keel will not touch it again. This version pin is the second line of
+  # defense against accidental floating-version drift on `terraform apply`.
+  version = "4.13.1"
+
+  values = [yamlencode({
+    controller = {
+      replicas = 2
+      # Required to run the v4.13.1 chart safely on a 1-master + 4-worker
+      # cluster:
+      #   * podAntiAffinity forces the 2 controller replicas onto DIFFERENT
+      #     hosts (host network ports 19809/29653 are per-host).
+      #   * nodeAffinity excludes the control-plane node entirely so the
+      #     scheduler can't pick master when a worker is briefly NotReady.
+      # Without these, Kubernetes can schedule both replicas on the same node
+      # (port conflict) or on master itself (which already runs the DaemonSet
+      # pod and would conflict with it).
+      affinity = {
+        nodeAffinity = {
+          requiredDuringSchedulingIgnoredDuringExecution = {
+            nodeSelectorTerms = [{
+              matchExpressions = [{
+                key      = "node-role.kubernetes.io/control-plane"
+                operator = "DoesNotExist"
+              }]
+            }]
+          }
+        }
+        podAntiAffinity = {
+          requiredDuringSchedulingIgnoredDuringExecution = [{
+            labelSelector = {
+              matchLabels = {
+                app = "csi-nfs-controller"
+              }
+            }
+            topologyKey = "kubernetes.io/hostname"
+          }]
+        }
+      }
+      livenessProbe = { # NOTE(review): the upstream chart appears to name this setting `healthPort`, not `httpPort` — verify against the 4.13.1 values.yaml.
+        httpPort = 29653 # NOTE(review): if the key is unrecognized, this value is silently ignored and the chart default applies.
+      } # NOTE(review): before renaming the key, confirm 29653 would not collide with the node DaemonSet's probe port on hostNetwork.
+      resources = {
+        csiProvisioner = {
+          requests = { cpu = "10m", memory = "128Mi" }
+          limits   = { memory = "128Mi" }
+        }
+        csiResizer = {
+          requests = { cpu = "10m", memory = "128Mi" }
+          limits   = { memory = "128Mi" }
+        }
+        csiSnapshotter = {
+          requests = { cpu = "10m", memory = "128Mi" }
+          limits   = { memory = "128Mi" }
+        }
+        nfs = {
+          requests = { cpu = "10m", memory = "128Mi" }
+          limits   = { memory = "128Mi" }
+        }
+        livenessProbe = {
+          requests = { cpu = "10m", memory = "64Mi" }
+          limits   = { memory = "64Mi" }
+        }
+      }
+    }
+    node = {
+      resources = {
+        nfs = {
+          requests = { cpu = "10m", memory = "128Mi" }
+          limits   = { memory = "128Mi" }
+        }
+        livenessProbe = {
+          requests = { cpu = "10m", memory = "64Mi" }
+          limits   = { memory = "64Mi" }
+        }
+        nodeDriverRegistrar = {
+          requests = { cpu = "10m", memory = "64Mi" }
+          limits   = { memory = "64Mi" }
+        }
+      }
+    }
+    storageClass = {
+      create = false # The nfs-truenas StorageClass is declared as a Terraform resource in this file instead.
+    }
+  })]
+}
+
+# Historical name retained for PV compatibility — 48 bound PVs reference
+# storageClassName: nfs-truenas. The actual backend is the Proxmox host NFS
+# (var.nfs_server = 192.168.1.127) since TrueNAS was decommissioned on
+# 2026-04-13. storageClassName cannot be changed on existing PVs, so renaming
+# would require migrating every PV. Not worth the churn for a cosmetic change.
+resource "kubernetes_storage_class" "nfs_truenas" {
+  metadata {
+    name = "nfs-truenas"
+  }
+  storage_provisioner = "nfs.csi.k8s.io"
+  reclaim_policy      = "Retain" # Keep the PV and its data when the claim is deleted.
+  volume_binding_mode = "Immediate"
+
+  mount_options = [
+    "nfsvers=4",
+    "soft", # NOTE(review): soft mounts return I/O errors after retries instead of blocking; nfs(5) warns of possible data corruption — confirm this trade-off is intended.
+    "timeo=30", # Per nfs(5), timeo is in tenths of a second (30 = 3 s).
+    "retrans=3",
+    "actimeo=5",
+  ]
+
+  parameters = {
+    server = var.nfs_server
+    share  = "/srv/nfs"
+  }
+}