# =============================================================================
# Tier-Based Resource Governance
# =============================================================================
# Tiers 0-2: LimitRange default (memory limit) = defaultRequest (memory request)
# so injected memory defaults can never be overcommitted. Changed 2026-03-14
# after node2 OOM crash caused by 250% memory overcommit (61GB limits on 24GB
# node). Tiers 3-4 and the no-tier fallback deliberately keep request < limit
# (Burstable) to reduce scheduler pressure.
# NOTE: no CPU limit is ever injected (only a CPU request), so pods relying on
# these defaults are not in the Guaranteed QoS class; the guarantee here is
# about memory only.
#
# Four layers of protection against noisy neighbor issues:
#   1. PriorityClasses - critical services survive resource pressure
#   2. LimitRange defaults (Kyverno generate) - auto-inject defaults for containers without resources
#   3. ResourceQuotas (Kyverno generate) - hard ceiling on namespace resource consumption
#   4. Priority injection (Kyverno mutate) - set priorityClassName based on namespace tier label
#
# Supporting policies also defined in this file: ndots:2 injection, GPU
# workload priority override (Layer 5), failed-pod cleanup (Layer 6) and
# CPU-limit stripping.

locals {
  governance_tiers    = ["0-core", "1-cluster", "2-gpu", "3-edge", "4-aux"]
  excluded_namespaces = ["kube-system", "metallb-system", "kyverno", "calico-system", "calico-apiserver"]

  # GPU-priority injection exclude list. Adds `tts` to the base set so the
  # `inject-gpu-workload-priority` policy does NOT stamp the immich-equal
  # gpu-workload (1,200,000) priority on Chatterbox-TTS pods. Chatterbox is a
  # best-effort off-peak batch tenant on the shared T4: it must keep its
  # tier-2-gpu (600,000) priority so it is ALWAYS the pod evicted under GPU-node
  # pressure, never immich-ml/frigate/llama-swap. See the tts stack
  # (stacks/tts/) + docs/plans/2026-06-08-chatterbox-tts-infra.md §3.
  gpu_priority_excluded_namespaces = concat(local.excluded_namespaces, ["tts"])

  # LimitRange "Container" entry generated for each tier (Layer 2).
  # Keyed by the namespace `tier` label value; every key in
  # local.governance_tiers must have an entry here.
  tier_limitrange_container = {
    "0-core" = {
      type           = "Container"
      default        = { memory = "256Mi" }
      defaultRequest = { cpu = "100m", memory = "256Mi" }
      max            = { memory = "8Gi" }
    }
    "1-cluster" = {
      type           = "Container"
      default        = { memory = "256Mi" }
      defaultRequest = { cpu = "100m", memory = "256Mi" }
      max            = { memory = "8Gi" }
    }
    "2-gpu" = {
      type           = "Container"
      default        = { memory = "1Gi" }
      defaultRequest = { cpu = "200m", memory = "1Gi" }
      max            = { memory = "16Gi" }
    }
    # Tier 3-edge — Burstable QoS: request < limit to reduce scheduler pressure
    "3-edge" = {
      type           = "Container"
      default        = { memory = "256Mi" }
      defaultRequest = { cpu = "50m", memory = "128Mi" }
      max            = { memory = "8Gi" }
    }
    # Tier 4-aux — Burstable QoS: request < limit to reduce scheduler pressure
    "4-aux" = {
      type           = "Container"
      default        = { memory = "256Mi" }
      defaultRequest = { cpu = "50m", memory = "64Mi" }
      max            = { memory = "4Gi" }
    }
  }

  # ResourceQuota `spec.hard` generated for each tier (Layer 3).
  # Keyed by the namespace `tier` label value; every key in
  # local.governance_tiers must have an entry here.
  tier_quota_hard = {
    "0-core" = {
      "requests.cpu"    = "8"
      "requests.memory" = "8Gi"
      "limits.memory"   = "64Gi"
      pods              = "100"
    }
    "1-cluster" = {
      "requests.cpu"    = "4"
      "requests.memory" = "4Gi"
      "limits.memory"   = "32Gi"
      pods              = "30"
    }
    "2-gpu" = {
      "requests.cpu"    = "8"
      "requests.memory" = "12Gi"
      "limits.memory"   = "32Gi"
      pods              = "40"
    }
    "3-edge" = {
      "requests.cpu"    = "4"
      "requests.memory" = "4Gi"
      "limits.memory"   = "32Gi"
      pods              = "30"
    }
    "4-aux" = {
      "requests.cpu"    = "2"
      "requests.memory" = "3Gi"
      "limits.memory"   = "16Gi"
      pods              = "20"
    }
  }
}

# -----------------------------------------------------------------------------
# Layer 1: PriorityClasses
# -----------------------------------------------------------------------------
# Values stay well below system-cluster-critical (2,000,000,000)

resource "kubernetes_priority_class" "tier_0_core" {
  metadata {
    name = "tier-0-core"
  }
  value             = 1000000
  global_default    = false
  preemption_policy = "PreemptLowerPriority"
  description       = "Critical infrastructure: ingress, DNS, VPN, auth, monitoring"
}

resource "kubernetes_priority_class" "tier_1_cluster" {
  metadata {
    name = "tier-1-cluster"
  }
  value             = 800000
  global_default    = false
  preemption_policy = "PreemptLowerPriority"
  description       = "Cluster services: Redis, metrics, security"
}

resource "kubernetes_priority_class" "tier_2_gpu" {
  metadata {
    name = "tier-2-gpu"
  }
  value             = 600000
  global_default    = false
  preemption_policy = "PreemptLowerPriority"
  description       = "GPU workloads: Immich, Ollama, Frigate"
}

# Sits above tier-0-core on purpose: Layer 5 stamps it on pods that request
# nvidia.com/gpu. The value and preemption policy are repeated verbatim in the
# Layer 5 JSON patch — keep the two in sync.
resource "kubernetes_priority_class" "gpu_workload" {
  metadata {
    name = "gpu-workload"
  }
  value             = 1200000
  global_default    = false
  preemption_policy = "PreemptLowerPriority"
  description       = "GPU-pinned workloads. Higher than all user tiers. Auto-injected by Kyverno on pods requesting nvidia.com/gpu."
}

resource "kubernetes_priority_class" "tier_3_edge" {
  metadata {
    name = "tier-3-edge"
  }
  value             = 400000
  global_default    = false
  preemption_policy = "PreemptLowerPriority"
  description       = "User-facing services: mail, file sync, dashboards"
}

resource "kubernetes_priority_class" "tier_4_aux" {
  metadata {
    name = "tier-4-aux"
  }
  value             = 200000
  global_default    = false
  preemption_policy = "Never"
  description       = "Optional services: blogs, tools, experiments. Will not preempt other aux services."
}

# -----------------------------------------------------------------------------
# Layer 2: LimitRange Defaults (Kyverno Generate)
# -----------------------------------------------------------------------------
# Creates a LimitRange in each namespace based on its tier label.
# Only affects containers WITHOUT explicit resource requests/limits.
# Per-tier values live in local.tier_limitrange_container.

resource "kubectl_manifest" "generate_limitrange_by_tier" {
  yaml_body = yamlencode({
    apiVersion = "kyverno.io/v1"
    kind       = "ClusterPolicy"
    metadata = {
      name = "generate-limitrange-by-tier"
      annotations = {
        "policies.kyverno.io/title"       = "Generate LimitRange by Tier"
        "policies.kyverno.io/description" = "Creates tier-appropriate LimitRange defaults in namespaces based on their tier label. Only affects containers without explicit resource specifications. Excludes namespaces with resource-governance/custom-limitrange label."
      }
    }
    spec = {
      generateExisting = true
      rules = concat(
        # One rule per tier: limitrange-tier-0-core ... limitrange-tier-4-aux
        [for tier in local.governance_tiers : {
          name = "limitrange-tier-${tier}"
          match = {
            any = [
              {
                resources = {
                  kinds = ["Namespace"]
                  selector = {
                    matchLabels = {
                      tier = tier
                    }
                  }
                }
              }
            ]
          }
          # Opt-out label for namespaces that manage their own LimitRange
          exclude = {
            any = [
              {
                resources = {
                  selector = {
                    matchLabels = {
                      "resource-governance/custom-limitrange" = "true"
                    }
                  }
                }
              }
            ]
          }
          generate = {
            synchronize = true
            apiVersion  = "v1"
            kind        = "LimitRange"
            name        = "tier-defaults"
            namespace   = "{{request.object.metadata.name}}"
            data = {
              spec = {
                limits = [local.tier_limitrange_container[tier]]
              }
            }
          }
        }],
        [
          # Fallback: namespaces without a tier label. Request (128Mi) is kept
          # close to the limit (192Mi) to bound memory overcommit
          # (2026-03-14 node2 OOM incident); it is still request < limit.
          {
            name = "limitrange-no-tier-fallback"
            match = {
              any = [
                {
                  resources = {
                    kinds = ["Namespace"]
                  }
                }
              ]
            }
            exclude = {
              any = [
                {
                  resources = {
                    selector = {
                      matchExpressions = [
                        {
                          key      = "tier"
                          operator = "Exists"
                        }
                      ]
                    }
                  }
                },
                {
                  resources = {
                    namespaces = local.excluded_namespaces
                  }
                }
              ]
            }
            generate = {
              synchronize = true
              apiVersion  = "v1"
              kind        = "LimitRange"
              name        = "tier-defaults"
              namespace   = "{{request.object.metadata.name}}"
              data = {
                spec = {
                  limits = [
                    {
                      type = "Container"
                      default = {
                        memory = "192Mi"
                      }
                      defaultRequest = {
                        cpu    = "50m"
                        memory = "128Mi"
                      }
                      max = {
                        memory = "4Gi"
                      }
                    }
                  ]
                }
              }
            }
          },
        ]
      )
    }
  })
}

# -----------------------------------------------------------------------------
# Layer 3: ResourceQuotas (Kyverno Generate)
# -----------------------------------------------------------------------------
# Creates a ResourceQuota in each namespace based on its tier label.
# Sets hard ceiling on total namespace resource consumption.
# Namespaces with label resource-governance/custom-quota=true are excluded.
# Per-tier values live in local.tier_quota_hard.
#
# IMPORTANT: LimitRange (Layer 2) must exist before ResourceQuota takes effect,
# because ResourceQuota requires all pods to have resource requests set.

resource "kubectl_manifest" "generate_resourcequota_by_tier" {
  depends_on = [kubectl_manifest.generate_limitrange_by_tier]

  yaml_body = yamlencode({
    apiVersion = "kyverno.io/v1"
    kind       = "ClusterPolicy"
    metadata = {
      name = "generate-resourcequota-by-tier"
      annotations = {
        "policies.kyverno.io/title"       = "Generate ResourceQuota by Tier"
        "policies.kyverno.io/description" = "Creates tier-appropriate ResourceQuota in namespaces based on their tier label. Excludes namespaces with resource-governance/custom-quota label."
      }
    }
    spec = {
      generateExisting = true
      # One rule per tier: quota-tier-0-core ... quota-tier-4-aux
      rules = [for tier in local.governance_tiers : {
        name = "quota-tier-${tier}"
        match = {
          any = [
            {
              resources = {
                kinds = ["Namespace"]
                selector = {
                  matchLabels = {
                    tier = tier
                  }
                }
              }
            }
          ]
        }
        # Opt-out label for namespaces that manage their own ResourceQuota
        exclude = {
          any = [
            {
              resources = {
                selector = {
                  matchLabels = {
                    "resource-governance/custom-quota" = "true"
                  }
                }
              }
            }
          ]
        }
        generate = {
          synchronize = true
          apiVersion  = "v1"
          kind        = "ResourceQuota"
          name        = "tier-quota"
          namespace   = "{{request.object.metadata.name}}"
          data = {
            spec = {
              hard = local.tier_quota_hard[tier]
            }
          }
        }
      }]
    }
  })
}

# -----------------------------------------------------------------------------
# Layer 4: PriorityClassName Injection (Kyverno Mutate)
# -----------------------------------------------------------------------------
# Automatically sets priorityClassName on Pods based on their namespace's tier label.
# Skips pods that already have a priorityClassName set.
# Uses namespaceSelector instead of API calls — no round-trip to the API server.

resource "kubectl_manifest" "mutate_priority_from_tier" {
  yaml_body = yamlencode({
    apiVersion = "kyverno.io/v1"
    kind       = "ClusterPolicy"
    metadata = {
      name = "inject-priority-class-from-tier"
      annotations = {
        "policies.kyverno.io/title"       = "Inject PriorityClass from Tier"
        "policies.kyverno.io/description" = "Sets priorityClassName on Pods based on the namespace tier label. Skips pods that already have a priorityClassName."
      }
    }
    spec = {
      rules = [for tier in local.governance_tiers : {
        name = "inject-priority-${tier}"
        match = {
          any = [
            {
              resources = {
                kinds      = ["Pod"]
                operations = ["CREATE"]
                namespaceSelector = {
                  matchLabels = {
                    tier = tier
                  }
                }
              }
            }
          ]
        }
        exclude = {
          any = [
            {
              resources = {
                namespaces = local.excluded_namespaces
              }
            }
          ]
        }
        # Only act when the pod did not choose a priorityClassName itself
        preconditions = {
          all = [
            {
              key      = "{{request.object.spec.priorityClassName || ''}}"
              operator = "Equals"
              value    = ""
            }
          ]
        }
        mutate = {
          # The two `remove` ops drop the already-resolved priority and
          # preemptionPolicy fields before the class name is set.
          # NOTE(review): JSON6902 `remove` fails on a missing path, so this
          # assumes both fields are always populated by the time Kyverno sees
          # the pod — confirm against the API server admission order.
          patchesJson6902 = yamlencode([
            {
              op   = "remove"
              path = "/spec/priority"
            },
            {
              op   = "remove"
              path = "/spec/preemptionPolicy"
            },
            {
              op    = "add"
              path  = "/spec/priorityClassName"
              value = "tier-${tier}"
            }
          ])
        }
      }]
    }
  })
}

# --- ndots:2 injection ---
# Kubernetes defaults to ndots:5, which causes 4 wasted NxDomain queries per
# external DNS lookup (search domain expansion). This policy injects ndots:2
# on all pods to reduce NxDomain flood while still allowing short-name service
# resolution (e.g. "redis.redis" has 1 dot, so it still expands).

resource "kubectl_manifest" "mutate_ndots" {
  yaml_body = yamlencode({
    apiVersion = "kyverno.io/v1"
    kind       = "ClusterPolicy"
    metadata = {
      name = "inject-ndots"
      annotations = {
        "policies.kyverno.io/title"       = "Inject ndots:2 DNS Config"
        "policies.kyverno.io/description" = "Sets ndots:2 on all Pods to reduce NxDomain query flood from search domain expansion. Skips pods that already have ndots configured."
      }
    }
    spec = {
      rules = [
        {
          name = "inject-ndots-2"
          match = {
            any = [
              {
                resources = {
                  kinds = ["Pod"]
                  # CREATE only: spec.dnsConfig is immutable on an existing
                  # Pod, so mutating an UPDATE of a pod that predates this
                  # policy would make the API server reject the update.
                  operations = ["CREATE"]
                }
              }
            ]
          }
          exclude = {
            any = [
              {
                resources = {
                  namespaces = local.excluded_namespaces
                }
              }
            ]
          }
          # Skip pods that already carry an explicit ndots option
          preconditions = {
            all = [
              {
                key      = "{{ request.object.spec.dnsConfig.options || `[]` | [?name == 'ndots'] | length(@) }}"
                operator = "Equals"
                value    = "0"
              }
            ]
          }
          mutate = {
            # NOTE(review): if a pod already sets other dnsConfig.options
            # (e.g. timeout), confirm this merge appends rather than replaces
            # the list.
            patchStrategicMerge = {
              spec = {
                dnsConfig = {
                  options = [
                    {
                      name  = "ndots"
                      value = "2"
                    }
                  ]
                }
              }
            }
          }
        }
      ]
    }
  })
}

# -----------------------------------------------------------------------------
# Layer 5: GPU Workload Priority Override (Kyverno Mutate)
# -----------------------------------------------------------------------------
# Overrides the tier-based priorityClassName with gpu-workload for pods that
# actually request nvidia.com/gpu resources. This ensures GPU pods can preempt
# non-GPU pods on the GPU node, regardless of namespace tier.
# Runs after Layer 4 (tier injection), so it overrides the tier-based priority.

resource "kubectl_manifest" "mutate_gpu_priority" {
  yaml_body = yamlencode({
    apiVersion = "kyverno.io/v1"
    kind       = "ClusterPolicy"
    metadata = {
      name = "inject-gpu-workload-priority"
      annotations = {
        "policies.kyverno.io/title"       = "Inject GPU Workload Priority"
        "policies.kyverno.io/description" = "Overrides priorityClassName to gpu-workload for pods requesting nvidia.com/gpu resources. Runs after tier-based injection."
      }
    }
    spec = {
      rules = [
        {
          name = "gpu-priority-override"
          match = {
            any = [
              {
                resources = {
                  kinds      = ["Pod"]
                  operations = ["CREATE"]
                }
              }
            ]
          }
          exclude = {
            any = [
              {
                resources = {
                  # tts added so Chatterbox-TTS keeps tier-2-gpu priority (it's a
                  # best-effort off-peak batch tenant — must be evicted first,
                  # not promoted to immich-equal gpu-workload). See locals above.
                  namespaces = local.gpu_priority_excluded_namespaces
                }
              }
            ]
          }
          # Fires when any regular container requests or limits nvidia.com/gpu
          # (initContainers are not inspected).
          preconditions = {
            any = [
              {
                key      = "{{ request.object.spec.containers[].resources.requests.\"nvidia.com/gpu\" || '' }}"
                operator = "NotEquals"
                value    = ""
              },
              {
                key      = "{{ request.object.spec.containers[].resources.limits.\"nvidia.com/gpu\" || '' }}"
                operator = "NotEquals"
                value    = ""
              }
            ]
          }
          mutate = {
            # `op=add` (not replace) — incoming pods often lack the
            # `/spec/priorityClassName` key entirely; replace fails with
            # "doc is missing key" and aborts the mutation chain BEFORE
            # Layer 4 (tier injection) can fall back. add works whether
            # the path exists or not. Verified 2026-05-26 on frigate.
            # priority / preemptionPolicy must match
            # kubernetes_priority_class.gpu_workload.
            patchesJson6902 = yamlencode([
              {
                op    = "add"
                path  = "/spec/priorityClassName"
                value = "gpu-workload"
              },
              {
                op    = "add"
                path  = "/spec/priority"
                value = 1200000
              },
              {
                op    = "add"
                path  = "/spec/preemptionPolicy"
                value = "PreemptLowerPriority"
              }
            ])
          }
        }
      ]
    }
  })
}

# -----------------------------------------------------------------------------
# Layer 6: Automatic Cleanup of Failed/Evicted Pods
# -----------------------------------------------------------------------------
# Deletes pods in Failed phase every hour, cluster-wide.
# Prevents stale evicted pods and failed CronJob pods from accumulating.
# Grant Kyverno cleanup controller permission to delete Pods
resource "kubernetes_cluster_role_v1" "kyverno_cleanup_pods" {
  metadata {
    name = "kyverno:cleanup-controller:pods"
    labels = {
      "app.kubernetes.io/part-of"  = "kyverno"
      "app.kubernetes.io/instance" = "kyverno"
    }
  }

  rule {
    api_groups = [""]
    resources  = ["pods"]
    verbs      = ["list", "watch", "delete"]
  }
}

# Binds the role above to the cleanup controller's ServiceAccount.
resource "kubernetes_cluster_role_binding_v1" "kyverno_cleanup_pods" {
  metadata {
    name = "kyverno:cleanup-controller:pods"
    labels = {
      "app.kubernetes.io/part-of"  = "kyverno"
      "app.kubernetes.io/instance" = "kyverno"
    }
  }

  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "ClusterRole"
    name      = kubernetes_cluster_role_v1.kyverno_cleanup_pods.metadata[0].name
  }

  subject {
    kind      = "ServiceAccount"
    name      = "kyverno-cleanup-controller"
    namespace = "kyverno"
  }
}

# Scheduled cleanup: at minute 15 of every hour, delete every Pod whose
# status.phase is "Failed".
resource "kubectl_manifest" "cleanup_failed_pods" {
  # Kyverno checks the cleanup controller's permissions when a cleanup policy
  # is installed, so the RBAC binding has to exist before the policy does.
  depends_on = [kubernetes_cluster_role_binding_v1.kyverno_cleanup_pods]

  yaml_body = yamlencode({
    apiVersion = "kyverno.io/v2"
    kind       = "ClusterCleanupPolicy"
    metadata = {
      name = "cleanup-failed-pods"
      annotations = {
        "policies.kyverno.io/title"       = "Cleanup Failed Pods"
        "policies.kyverno.io/description" = "Automatically deletes pods in Failed phase (evicted, error, completed CronJob failures)."
      }
    }
    spec = {
      match = {
        any = [
          {
            resources = {
              kinds = ["Pod"]
            }
          }
        ]
      }
      conditions = {
        any = [
          {
            # Cleanup policies evaluate existing resources on a schedule, not
            # admission requests: the resource is exposed as `target`, and
            # `request.object` is not available in this context.
            key      = "{{ target.status.phase }}"
            operator = "Equals"
            value    = "Failed"
          }
        ]
      }
      schedule = "15 * * * *"
    }
  })
}

# -----------------------------------------------------------------------------
# Strip CPU Limits (Kyverno Mutate)
# -----------------------------------------------------------------------------
# Removes resources.limits.cpu from every container and initContainer at pod
# admission. Memory limits are preserved. Cluster policy: CFS throttling causes
# more harm than good for bursty single-threaded workloads (Node.js, Python
# apps). Upstream Helm charts (CrowdSec, descheduler, kubernetes-dashboard,
# nvidia gpu-operator) still ship CPU limits — this strips them declaratively
# so we don't have to fork values.yaml per chart.
#
# Scope: admission-time only. Existing pods keep their limits until restarted
# naturally (Helm upgrade, node drain, rollout). No mutateExistingOnPolicyUpdate.
#
# JSON6902 remove op fails on missing paths — per-element precondition gates
# the mutation so pods without CPU limits pass through untouched.

resource "kubectl_manifest" "mutate_strip_cpu_limits" {
  yaml_body = yamlencode({
    apiVersion = "kyverno.io/v1"
    kind       = "ClusterPolicy"
    metadata = {
      name = "strip-cpu-limits"
      annotations = {
        "policies.kyverno.io/title" = "Strip CPU Limits"
        "policies.kyverno.io/description" = join("", [
          "Removes resources.limits.cpu from every container and initContainer ",
          "at pod admission. Memory limits are preserved. Cluster policy: CFS ",
          "throttling causes more harm than good for bursty single-threaded ",
          "workloads (Node.js, Python apps).",
        ])
      }
    }
    spec = {
      background = false
      rules = [
        {
          name = "strip-container-cpu-limit"
          match = {
            any = [
              {
                resources = {
                  kinds      = ["Pod"]
                  operations = ["CREATE"]
                }
              }
            ]
          }
          # Rule-level gate: at least one container declares a CPU limit
          preconditions = {
            all = [
              {
                key      = "{{ request.object.spec.containers[?resources.limits.cpu != null] | length(@) }}"
                operator = "GreaterThan"
                value    = 0
              }
            ]
          }
          mutate = {
            foreach = [
              {
                list = "request.object.spec.containers"
                # Element-level gate: only patch containers that have the key,
                # since `remove` on a missing path would fail
                preconditions = {
                  all = [
                    {
                      key      = "{{ element.resources.limits.cpu || '' }}"
                      operator = "NotEquals"
                      value    = ""
                    }
                  ]
                }
                patchesJson6902 = yamlencode([
                  {
                    op   = "remove"
                    path = "/spec/containers/{{ elementIndex }}/resources/limits/cpu"
                  }
                ])
              }
            ]
          }
        },
        {
          name = "strip-initcontainer-cpu-limit"
          match = {
            any = [
              {
                resources = {
                  kinds      = ["Pod"]
                  operations = ["CREATE"]
                }
              }
            ]
          }
          # Same gate as above; the `[]` fallback covers pods that have no
          # initContainers at all
          preconditions = {
            all = [
              {
                key      = "{{ request.object.spec.initContainers[?resources.limits.cpu != null] || `[]` | length(@) }}"
                operator = "GreaterThan"
                value    = 0
              }
            ]
          }
          mutate = {
            foreach = [
              {
                list = "request.object.spec.initContainers"
                preconditions = {
                  all = [
                    {
                      key      = "{{ element.resources.limits.cpu || '' }}"
                      operator = "NotEquals"
                      value    = ""
                    }
                  ]
                }
                patchesJson6902 = yamlencode([
                  {
                    op   = "remove"
                    path = "/spec/initContainers/{{ elementIndex }}/resources/limits/cpu"
                  }
                ])
              }
            ]
          }
        },
      ]
    }
  })
}