From a73f3fcb6b17deb0d58b4ea0331697e672e23c70 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sun, 15 Feb 2026 17:20:47 +0000 Subject: [PATCH] Cluster health remediation: cleanup CronJob, disable Collabora, fix GPU probe, add NFS exports [ci skip] - Add daily CronJob to auto-clean Failed/Evicted pods cluster-wide (infra-maintenance) - Disable Collabora in Nextcloud (broken HPA caused scaling storm; using OnlyOffice instead) - Increase gpu-pod-exporter liveness probe timeout from 1s to 5s - Add osm-routing NFS exports (osrm-data, otp-data) --- modules/kubernetes/infra-maintenance/main.tf | 69 ++++++++++++++++++ .../kubernetes/nextcloud/chart_values.yaml | 6 +- modules/kubernetes/nvidia/main.tf | 1 + secrets/nfs_directories.txt | Bin 1623 -> 1666 bytes 4 files changed, 71 insertions(+), 5 deletions(-) diff --git a/modules/kubernetes/infra-maintenance/main.tf b/modules/kubernetes/infra-maintenance/main.tf index 4625ba92..27a92a96 100644 --- a/modules/kubernetes/infra-maintenance/main.tf +++ b/modules/kubernetes/infra-maintenance/main.tf @@ -141,3 +141,72 @@ resource "kubernetes_cron_job_v1" "backup-etcd" { } } } + +# Clean up evicted/failed pods cluster-wide daily +resource "kubernetes_cron_job_v1" "cleanup-failed-pods" { + metadata { + name = "cleanup-failed-pods" + namespace = "default" + } + spec { + schedule = "0 2 * * *" + successful_jobs_history_limit = 1 + failed_jobs_history_limit = 1 + concurrency_policy = "Forbid" + job_template { + metadata { + name = "cleanup-failed-pods" + } + spec { + template { + metadata { + name = "cleanup-failed-pods" + } + spec { + service_account_name = kubernetes_service_account.cleanup_sa.metadata[0].name + container { + name = "cleanup" + image = "bitnami/kubectl:latest" + command = ["/bin/sh", "-c", "kubectl delete pods -A --field-selector=status.phase=Failed --ignore-not-found"] + } + restart_policy = "Never" + } + } + } + } + } +} + +resource "kubernetes_service_account" "cleanup_sa" { + metadata { + name = "failed-pod-cleanup" + namespace = "default" + } +} + +resource "kubernetes_cluster_role" "cleanup_role" { + metadata { + name = "failed-pod-cleanup" + } + rule { + api_groups = [""] + resources = ["pods"] + verbs = ["list", "delete"] + } +} + +resource "kubernetes_cluster_role_binding" "cleanup_binding" { + metadata { + name = "failed-pod-cleanup" + } + role_ref { + api_group = "rbac.authorization.k8s.io" + kind = "ClusterRole" + name = kubernetes_cluster_role.cleanup_role.metadata[0].name + } + subject { + kind = "ServiceAccount" + name = kubernetes_service_account.cleanup_sa.metadata[0].name + namespace = "default" + } +} diff --git a/modules/kubernetes/nextcloud/chart_values.yaml b/modules/kubernetes/nextcloud/chart_values.yaml index 861eaf1a..cda04812 100644 --- a/modules/kubernetes/nextcloud/chart_values.yaml +++ b/modules/kubernetes/nextcloud/chart_values.yaml @@ -59,11 +59,7 @@ podAnnotations: diun.include_tags: "^[0-9]+(?:.[0-9]+)?(?:.[0-9]+)?.*" collabora: - enabled: true # Currently the app is disabled as using onlyoffice instead - - autoscaling: - # enable autocaling, please check collabora README.md first - enabled: true + enabled: false # Using onlyoffice instead cronjob: enabled: true diff --git a/modules/kubernetes/nvidia/main.tf b/modules/kubernetes/nvidia/main.tf index 8f46839c..b9356cc2 100644 --- a/modules/kubernetes/nvidia/main.tf +++ b/modules/kubernetes/nvidia/main.tf @@ -605,6 +605,7 @@ resource "kubernetes_daemonset" "gpu_pod_exporter" { } initial_delay_seconds = 30 period_seconds = 30 + timeout_seconds = 5 } } diff --git a/secrets/nfs_directories.txt b/secrets/nfs_directories.txt index d99f3aacdafb397a9b299cf35fcff9e30844636e..893c5531d55ed4f3bdcd94e63d07e6ea7e33ff26 100644 GIT binary patch literal 1666 zcmV-|27UPeM@dveQdv+`06#9nxdyQIC5u<%Qsy?4rMO+<+`K(cmXA*YR9F1F0+n=5 zpcpEJm2AI#x{kn$QRlu9BzuKzZrL*q_yAeY`_Erg5oetcA-=Ocqn^`R@ zGRg4L%T4;09NfuK^=u$dMxsWI-&*F*v&RMVB#Z75L+U~4T@H~`AAtoLOPUp;FUt}o z0avvKcP-aw6o1v?wRUt6%6s&EOBib?_0+xVktH7_o^`R2utO!1(a6NazQ!&jL|Gag zNx-|XXF;60_L+Ld=$%wHfAmWQ{O5Q{9G%kX+tjQd<`Xt2```Ns3)$bHGKr9niwmk~ zr(5*rEQdaDcseKwQE|We{+2c1T0rJ~88SKw6EF3%9)L^z%fK=8V)|?Ia%N=Z=E645 z7@{P0JLrg$u3pIO1ZPDa@p594$M{q7u3zgAlo!;*1B~Zf(SzHw+skD2#ko#Ef@or~ zAmc65_I)4}$czE2_k%DlZR^$R)j-b+mz($jy#G4R@fu0%snBn8DWVDyUdXW7!A&AN zc4W2sPPA0-YoF_Bs5H5Lwg^$p;CHT~)|n@B;|p#%}lZl6^Y#qq@pJmEJbR+`ug zil8haEC!j`rCgJrk)!VzlsxT-ytMUQ%q_-4gy*dN{;*S9miZWxyt%%f3W;ddmHl+w2J{N zPGh(|NUu@g&J{;9Q&R8KjB`YA;S%>?%Yd*mtue6`k#5TbE|X4!Hdk{N^wI3Tx{Pi% z33}bOKCn%6?Tp3KnaFc4Bh#0VNTHYTZoBbasTWcl`F1_gYvi|%Hd59hk7*CS!|1YW zJ+10uDmL85CC6$8oO6$H!X7tbq5-@7@EB&ZiMtCiD!_|UEXN4Fmjzn2>T*=Fr1_w% zhg_8TS|56rp|%Bc48Fx!^-=z9`;TT@T?>|D13#S=E>|bwnFiD8)?)YKUWd6(?0=9K zkoK&m*l74SJHIdy#@!fSCV2*r&@ih9e9{-DVD}ii2&wI09lygCkG;y^BII5XNU5cB`~VTH$K*P z7WNJ}2j>mk$ykKd_v$&84^roOfz}X%fFHw*RJK;Nfi;RG zP|xEeVdw&?QjoKKZ%!&KwDcjQexg^vqO~7CIDD`TsV_dDB$y8siB$-E4JqwbXd}v{ zSO~w;-w7P*>8gJP$`bQ( z3oBG=7znPlym0@UpK%|JCY3^g)51aQFV#wYer|?z0{+Ho2lUU{$bRd(%;A^6X|87_QXE=rEJ2Vkkrm88%TCfq zhRKE=$h~GkIE&+@8rHS?qL<2CxTZNry{dvyUb&h)6|M?lj-+D|-fH^NhEnY33=>_v zTh5_Q%;Uy~#1mE_pb+Md?yEZlsJ_Cei-(uyDHL+A4Jr7NzmK>XRwCnx?g!RCv{bla z+)rQHtAb>=E_jN9lWDxQD<7F-7_y(`{BB|)Q9uo#*K&^4n@eJn7$Zj&aLBdU##WuX M=$6rmpUh=$3iINagh3OIMNvdv zrM&71bfq{7u~-Bw5^}wu@!T5PnLiPC(EYS2@P+|4Hz1YI_wOO#Dm}cJP;w?%{ipvq z{d-}67ZwAkJvg0(19U>UaXJsJryPp7-YVe4#(^B2oooPdNiN-SFof5o6p0}^8&Y6m zfCpTjo#(9%c{5-;?bWIgCpArLnqM&x=R>LP>+r$?BJFPE=z;q zh!F`OAnSh%A}~fe<_f1$TsSi<^nNUxGNF1!9?6<#oz)Rg6!>}j;rrkW+F#bnk>bEV z#E%39?r?r<5-(~+(9eCxKTJ@<28rC3`BwR4_`&?v_!|(2u6{l{<2LX|$J8Md5i*Wq zKOzvf^lwuaKa3bpL*~&djr#7q*cQVn+QE+Yu_h^FQR$ny+>B*^X*7hK5qZXMf*XFt z)Bnt_dwYocSgH7*JGyM@IHgBr+?q3X?8jV9IkYj5rSx48;LC8&hcobc!^Y58!9VdN znpt+s)uoQk^jsjG@83?AxuBiRJ1tVgU)+RKGAg7e%(S1*?Gah=a8cnJ~2C4SaXoje{m_^K}YkWAft4zw-U z6P3T2P=3bXvQ(SUiz`$oBv@v6T)rtQV*1x&o@Y>Fu$hysvq&>n|2FYS79(eF69pup z2)M<}{};MtVQ3ybnz$=(nuD7!_TlgP!7M2jRr@43UY+XG3-@jKyt*aQ!b&CCNh;nc zBXQzFMcBctK)V^R~!p5E9 zA;5G|k$s|$9OXQMy|JG~RogUL81y5T`rPDnpEjIM$yOZs;k!X4wYu(cFUP!hg=O&! z$jM@xlN%*C;*i9f0@1_6=w-eQKqowvIPQdBcs9wDHYt~mync)fa9j6LM+GABkqVh2 zN75jzDzl1NKYh-hh(V_XFFuW0?r%If*em;-C82*Ufs(LZJoZN4Ggx!J<3d=`X(BYi}jFx>k`EXkWeeb@3 z3b?jS(lsZv+b?f?P@d&Ep6InV|7U5urU-fH62w}MrUI(!V&i8eOi?&hzt5Cril|!w zdZO|xCMRRcute_NxzfIf#@WhO-S`ei=@>&h=fjEMdkI2<#u{Cv-jaJU4%%EYgUTmF zd)5wnaYyvKR`f(Y|BRbX$GTv+!N&@6-KH#j7UhBd@OOyzT? zj1c<1qH|}_e27ke7NdjnHvqgoD5jisc`!k5@}TVXBuh^@JcCTUYaW VVQV?B_3XpZ5bIcYR6?SDV6XPYAS?g?