diff --git a/modules/kubernetes/infra-maintenance/main.tf b/modules/kubernetes/infra-maintenance/main.tf
index 4625ba92..27a92a96 100644
--- a/modules/kubernetes/infra-maintenance/main.tf
+++ b/modules/kubernetes/infra-maintenance/main.tf
@@ -141,3 +141,82 @@ resource "kubernetes_cron_job_v1" "backup-etcd" {
     }
   }
 }
+
+# Clean up evicted/failed pods cluster-wide daily.
+# NOTE(review): this deletes every pod in phase Failed, including failed Job
+# pods, so their logs are no longer available after the nightly run.
+resource "kubernetes_cron_job_v1" "cleanup-failed-pods" {
+  metadata {
+    # Runs in the same namespace as its ServiceAccount; referenced rather than
+    # repeated so the two cannot drift apart.
+    name      = "cleanup-failed-pods"
+    namespace = kubernetes_service_account.cleanup_sa.metadata[0].namespace
+  }
+  spec {
+    schedule                      = "0 2 * * *" # daily at 02:00
+    successful_jobs_history_limit = 1
+    failed_jobs_history_limit     = 1
+    concurrency_policy            = "Forbid" # never start a run while the previous one is still active
+    job_template {
+      metadata {
+        name = "cleanup-failed-pods"
+      }
+      spec {
+        template {
+          metadata {
+            name = "cleanup-failed-pods"
+          }
+          spec {
+            service_account_name = kubernetes_service_account.cleanup_sa.metadata[0].name
+            container {
+              # NOTE(review): ":latest" is unpinned, so the kubectl version may
+              # change between runs; consider pinning a tag or digest.
+              name    = "cleanup"
+              image   = "bitnami/kubectl:latest"
+              command = ["/bin/sh", "-c", "kubectl delete pods -A --field-selector=status.phase=Failed --ignore-not-found"]
+            }
+            restart_policy = "Never"
+          }
+        }
+      }
+    }
+  }
+}
+
+# Identity the cleanup job runs as.
+resource "kubernetes_service_account" "cleanup_sa" {
+  metadata {
+    name      = "failed-pod-cleanup"
+    namespace = "default"
+  }
+}
+
+# Cluster-wide permission to list and delete pods, as needed by `kubectl delete pods -A`.
+resource "kubernetes_cluster_role" "cleanup_role" {
+  metadata {
+    name = "failed-pod-cleanup"
+  }
+  rule {
+    api_groups = [""] # core API group
+    resources  = ["pods"]
+    verbs      = ["list", "delete"]
+  }
+}
+
+# Grants the cleanup role to the cleanup ServiceAccount.
+resource "kubernetes_cluster_role_binding" "cleanup_binding" {
+  metadata {
+    name = "failed-pod-cleanup"
+  }
+  role_ref {
+    api_group = "rbac.authorization.k8s.io"
+    kind      = "ClusterRole"
+    name      = kubernetes_cluster_role.cleanup_role.metadata[0].name
+  }
+  subject {
+    # Name and namespace are read from the ServiceAccount so the binding always targets it.
+    kind      = "ServiceAccount"
+    name      = kubernetes_service_account.cleanup_sa.metadata[0].name
+    namespace = kubernetes_service_account.cleanup_sa.metadata[0].namespace
+  }
+}
diff --git a/modules/kubernetes/nextcloud/chart_values.yaml b/modules/kubernetes/nextcloud/chart_values.yaml
index 861eaf1a..cda04812 100644
--- a/modules/kubernetes/nextcloud/chart_values.yaml
+++ b/modules/kubernetes/nextcloud/chart_values.yaml @@ -59,11 +59,7 @@ podAnnotations: diun.include_tags: "^[0-9]+(?:.[0-9]+)?(?:.[0-9]+)?.*" collabora: - enabled: true # Currently the app is disabled as using onlyoffice instead - - autoscaling: - # enable autocaling, please check collabora README.md first - enabled: true + enabled: false # Using onlyoffice instead cronjob: enabled: true diff --git a/modules/kubernetes/nvidia/main.tf b/modules/kubernetes/nvidia/main.tf index 8f46839c..b9356cc2 100644 --- a/modules/kubernetes/nvidia/main.tf +++ b/modules/kubernetes/nvidia/main.tf @@ -605,6 +605,7 @@ resource "kubernetes_daemonset" "gpu_pod_exporter" { } initial_delay_seconds = 30 period_seconds = 30 + timeout_seconds = 5 } } diff --git a/secrets/nfs_directories.txt b/secrets/nfs_directories.txt index d99f3aac..893c5531 100644 Binary files a/secrets/nfs_directories.txt and b/secrets/nfs_directories.txt differ