From c35bef2fd8932d7655d87eab64a9ef9796e82e89 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Tue, 24 Feb 2026 22:55:58 +0000 Subject: [PATCH] [ci skip] fix cluster health: GPU tolerations, actualbudget nfs_server, AuthentikDown alert - Add missing nvidia.com/gpu toleration to ollama and yt-highlights deployments - Add node_selector gpu=true to ollama deployment - Pass nfs_server variable through to actualbudget factory modules - Fix AuthentikDown alert to match actual deployment name (goauthentik-server) --- stacks/actualbudget/main.tf | 4 ++++ stacks/ollama/main.tf | 8 ++++++++ .../modules/monitoring/prometheus_chart_values.tpl | 2 +- stacks/ytdlp/main.tf | 5 +++++ 4 files changed, 18 insertions(+), 1 deletion(-) diff --git a/stacks/actualbudget/main.tf b/stacks/actualbudget/main.tf index 40339b02..68975507 100644 --- a/stacks/actualbudget/main.tf +++ b/stacks/actualbudget/main.tf @@ -1,5 +1,6 @@ variable "tls_secret_name" { type = string } variable "actualbudget_credentials" { type = map(any) } +variable "nfs_server" { type = string } # To create a new deployment: @@ -32,6 +33,7 @@ module "viktor" { name = "viktor" tag = "edge" tls_secret_name = var.tls_secret_name + nfs_server = var.nfs_server depends_on = [kubernetes_namespace.actualbudget] tier = local.tiers.edge budget_encryption_password = lookup(var.actualbudget_credentials["viktor"], "password", null) @@ -44,6 +46,7 @@ module "anca" { name = "anca" tag = "edge" tls_secret_name = var.tls_secret_name + nfs_server = var.nfs_server depends_on = [kubernetes_namespace.actualbudget] tier = local.tiers.edge budget_encryption_password = lookup(var.actualbudget_credentials["anca"], "password", null) @@ -56,6 +59,7 @@ module "emo" { name = "emo" tag = "edge" tls_secret_name = var.tls_secret_name + nfs_server = var.nfs_server depends_on = [kubernetes_namespace.actualbudget] tier = local.tiers.edge budget_encryption_password = lookup(var.actualbudget_credentials["emo"], "password", null) diff --git a/stacks/ollama/main.tf b/stacks/ollama/main.tf index e22075fb..18dba845 100644 --- a/stacks/ollama/main.tf +++ b/stacks/ollama/main.tf @@ -89,6 +89,14 @@ resource "kubernetes_deployment" "ollama" { } } spec { + node_selector = { + "gpu" = "true" + } + toleration { + key = "nvidia.com/gpu" + value = "true" + effect = "NoSchedule" + } container { image = "ollama/ollama:latest" name = "ollama" diff --git a/stacks/platform/modules/monitoring/prometheus_chart_values.tpl b/stacks/platform/modules/monitoring/prometheus_chart_values.tpl index caba0ef3..f6deaad1 100755 --- a/stacks/platform/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/platform/modules/monitoring/prometheus_chart_values.tpl @@ -427,7 +427,7 @@ serverFiles: annotations: summary: "Headscale VPN has no available replicas" - alert: AuthentikDown - expr: (kube_deployment_status_replicas_available{namespace="authentik", deployment="authentik-server"} or on() vector(0)) < 1 + expr: (kube_deployment_status_replicas_available{namespace="authentik", deployment="goauthentik-server"} or on() vector(0)) < 1 for: 5m labels: severity: critical diff --git a/stacks/ytdlp/main.tf b/stacks/ytdlp/main.tf index e8990ecf..a59cdbd8 100644 --- a/stacks/ytdlp/main.tf +++ b/stacks/ytdlp/main.tf @@ -189,6 +189,11 @@ resource "kubernetes_deployment" "yt_highlights" { node_selector = { "gpu" : "true" } + toleration { + key = "nvidia.com/gpu" + value = "true" + effect = "NoSchedule" + } container { name = "yt-highlights" image = "viktorbarzin/yt-highlights:v20-20260127"