From cf578516e9cdf5dde9a308804974035c28760b5c Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Wed, 15 Apr 2026 17:37:49 +0000 Subject: [PATCH] feat: auto-cleanup failed/evicted pods via Kyverno ClusterCleanupPolicy Add cleanup-failed-pods policy that runs hourly (at :15) to delete all pods in Failed phase cluster-wide. Prevents stale evicted and failed CronJob pods from accumulating and creating healthcheck noise. Also adds ClusterRole + ClusterRoleBinding to grant Kyverno cleanup controller permission to delete Pods (not included by default). [ci skip] Co-Authored-By: Claude Opus 4.6 (1M context) --- .../modules/kyverno/resource-governance.tf | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/stacks/kyverno/modules/kyverno/resource-governance.tf b/stacks/kyverno/modules/kyverno/resource-governance.tf index 6341da40..f6f96940 100644 --- a/stacks/kyverno/modules/kyverno/resource-governance.tf +++ b/stacks/kyverno/modules/kyverno/resource-governance.tf @@ -948,3 +948,80 @@ resource "kubernetes_manifest" "mutate_gpu_priority" { } } } + +# ----------------------------------------------------------------------------- +# Layer 5: Automatic Cleanup of Failed/Evicted Pods +# ----------------------------------------------------------------------------- +# Deletes pods in Failed phase every hour, cluster-wide. +# Prevents stale evicted pods and failed CronJob pods from accumulating. 
+
+# Grant the Kyverno cleanup controller permission to delete Pods.
+# This role only defines the permissions; it takes effect through the
+# ClusterRoleBinding declared below.
+resource "kubernetes_cluster_role_v1" "kyverno_cleanup_pods" {
+  metadata {
+    name = "kyverno:cleanup-controller:pods"
+    labels = {
+      "app.kubernetes.io/part-of"  = "kyverno"
+      "app.kubernetes.io/instance" = "kyverno"
+    }
+  }
+  rule {
+    api_groups = [""] # "" is the core API group, where Pods live
+    resources  = ["pods"]
+    verbs      = ["list", "watch", "delete"]
+  }
+}
+
+# Bind the role above to the cleanup controller's service account.
+# NOTE(review): assumes the controller runs as "kyverno-cleanup-controller"
+# in the "kyverno" namespace -- confirm against the Helm release in use.
+resource "kubernetes_cluster_role_binding_v1" "kyverno_cleanup_pods" {
+  metadata {
+    name = "kyverno:cleanup-controller:pods"
+    labels = {
+      "app.kubernetes.io/part-of"  = "kyverno"
+      "app.kubernetes.io/instance" = "kyverno"
+    }
+  }
+  role_ref {
+    api_group = "rbac.authorization.k8s.io"
+    kind      = "ClusterRole"
+    name      = kubernetes_cluster_role_v1.kyverno_cleanup_pods.metadata[0].name
+  }
+  subject {
+    kind      = "ServiceAccount"
+    name      = "kyverno-cleanup-controller"
+    namespace = "kyverno"
+  }
+}
+
+# ClusterCleanupPolicy: at minute 15 of every hour, delete every Pod whose
+# status.phase is "Failed", in all namespaces.
+resource "kubernetes_manifest" "cleanup_failed_pods" {
+  # Kyverno checks the cleanup controller's permissions when the policy is
+  # installed, so the RBAC binding must exist before this manifest is applied.
+  # The ClusterRole is ordered transitively through the binding's role_ref.
+  depends_on = [kubernetes_cluster_role_binding_v1.kyverno_cleanup_pods]
+
+  manifest = {
+    apiVersion = "kyverno.io/v2"
+    kind       = "ClusterCleanupPolicy"
+    metadata = {
+      name = "cleanup-failed-pods"
+      annotations = {
+        "policies.kyverno.io/title"       = "Cleanup Failed Pods"
+        "policies.kyverno.io/description" = "Automatically deletes pods in Failed phase (evicted, error, completed CronJob failures)."
+      }
+    }
+    spec = {
+      match = {
+        any = [
+          {
+            resources = {
+              kinds = ["Pod"]
+            }
+          }
+        ]
+      }
+      conditions = {
+        any = [
+          {
+            # Cleanup policies run on a schedule, not on admission requests,
+            # so the evaluated resource is exposed as `target` rather than
+            # `request.object`.
+            key      = "{{ target.status.phase }}"
+            operator = "Equals"
+            value    = "Failed"
+          }
+        ]
+      }
+      schedule = "15 * * * *"
+    }
+  }
+}