fix: restore tree dropped by 6d224861; land stem95su gdrive-sync (10m) [ci skip]

6d224861 came from a --no-checkout worktree whose empty index made the
commit drop every file except two. This restores 05b50d2b's full tree and
correctly adds stacks/stem95su/gdrive-sync.tf + the service-catalog stem95su
entry. Forward-only (parent=6d224861, no force-push); [ci skip] since the
live infra was never applied from the broken commit.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-06-09 08:45:33 +00:00
parent 6d224861c4
commit fd0f4a0365
1166 changed files with 358546 additions and 0 deletions

View file

@ -0,0 +1,148 @@
variable "tier" { type = string }
variable "nfs_server" { type = string }
resource "kubernetes_namespace" "nfs_csi" {
metadata {
name = "nfs-csi"
labels = {
tier = var.tier
"keel.sh/enrolled" = "true"
}
}
lifecycle {
# KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode ClusterPolicy stamps this label on every namespace
ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]]
}
}
resource "helm_release" "nfs_csi_driver" {
namespace = kubernetes_namespace.nfs_csi.metadata[0].name
create_namespace = false
name = "csi-driver-nfs"
atomic = true
timeout = 300
repository = "https://raw.githubusercontent.com/kubernetes-csi/csi-driver-nfs/master/charts"
chart = "csi-driver-nfs"
# Pinned 2026-05-17. Keel polled and rolled csi-driver-nfs 4.13.1 4.13.2,
# which broke the cluster:
# * Controller pods ended up on k8s-master because the new chart removed
# control-plane exclusion from the default node selector.
# * Two controller replicas on the same node fought over hostNetwork ports
# 19809 (node-driver-registrar) and 29653 (liveness-probe). One replica
# CrashLoopBackOff'd with `bind: address already in use`.
# * Rolling back live (helm rollback) left zombie containerd containers
# holding the ports only a kubelet restart cleared them.
# nfs-csi namespace is in the Kyverno keel exclude list (keel-annotations.tf)
# so Keel will not touch it again. This version pin is the second line of
# defense against accidental floating-version drift on `terraform apply`.
version = "4.13.1"
values = [yamlencode({
controller = {
replicas = 2
# Required to coexist with the v4.13.1 chart on a 1-master + 4-worker
# cluster:
# * podAntiAffinity forces the 2 controller replicas onto DIFFERENT
# hosts (host network ports 19809/29653 are per-host).
# * nodeAffinity excludes the control-plane node entirely so the
# scheduler can't pick master when a worker is briefly NotReady.
# Without these, Kubernetes can schedule both replicas on the same node
# (port conflict) or on master itself (which already runs the DaemonSet
# pod and would conflict with it).
affinity = {
nodeAffinity = {
requiredDuringSchedulingIgnoredDuringExecution = {
nodeSelectorTerms = [{
matchExpressions = [{
key = "node-role.kubernetes.io/control-plane"
operator = "DoesNotExist"
}]
}]
}
}
podAntiAffinity = {
requiredDuringSchedulingIgnoredDuringExecution = [{
labelSelector = {
matchLabels = {
app = "csi-nfs-controller"
}
}
topologyKey = "kubernetes.io/hostname"
}]
}
}
livenessProbe = {
httpPort = 29653
}
resources = {
csiProvisioner = {
requests = { cpu = "10m", memory = "128Mi" }
limits = { memory = "128Mi" }
}
csiResizer = {
requests = { cpu = "10m", memory = "128Mi" }
limits = { memory = "128Mi" }
}
csiSnapshotter = {
requests = { cpu = "10m", memory = "128Mi" }
limits = { memory = "128Mi" }
}
nfs = {
requests = { cpu = "10m", memory = "128Mi" }
limits = { memory = "128Mi" }
}
livenessProbe = {
requests = { cpu = "10m", memory = "64Mi" }
limits = { memory = "64Mi" }
}
}
}
node = {
resources = {
nfs = {
requests = { cpu = "10m", memory = "128Mi" }
limits = { memory = "128Mi" }
}
livenessProbe = {
requests = { cpu = "10m", memory = "64Mi" }
limits = { memory = "64Mi" }
}
nodeDriverRegistrar = {
requests = { cpu = "10m", memory = "64Mi" }
limits = { memory = "64Mi" }
}
}
}
storageClass = {
create = false
}
})]
}
# Historical name retained for PV compatibility 48 bound PVs reference
# storageClassName: nfs-truenas. The actual backend is the Proxmox host NFS
# (var.nfs_server = 192.168.1.127) since TrueNAS was decommissioned
# 2026-04-13. SC names are immutable on PVs, so renaming would require
# migrating every PV. Not worth the churn for a cosmetic change.
resource "kubernetes_storage_class" "nfs_truenas" {
metadata {
name = "nfs-truenas"
}
storage_provisioner = "nfs.csi.k8s.io"
reclaim_policy = "Retain"
volume_binding_mode = "Immediate"
mount_options = [
"nfsvers=4",
"soft",
"timeo=30",
"retrans=3",
"actimeo=5",
]
parameters = {
server = var.nfs_server
share = "/srv/nfs"
}
}