[ci skip] fix cluster health: GPU tolerations, actualbudget nfs_server, AuthentikDown alert

- Add missing nvidia.com/gpu toleration to ollama and yt-highlights deployments
- Add node_selector gpu=true to ollama deployment
- Pass nfs_server variable through to actualbudget factory modules
- Fix AuthentikDown alert to match actual deployment name (goauthentik-server)
This commit is contained in:
Viktor Barzin 2026-02-24 22:55:58 +00:00
parent 4fab38da1f
commit c35bef2fd8
4 changed files with 18 additions and 1 deletion

View file

@ -1,5 +1,6 @@
# Forwarded unchanged to each per-user module as its tls_secret_name input.
variable "tls_secret_name" {
  type = string
}

# Indexed by user name; each entry's "password" key (null when absent) is
# passed to that user's module as budget_encryption_password.
variable "actualbudget_credentials" {
  type = map(any)
}

# Forwarded unchanged to each per-user module as its nfs_server input.
variable "nfs_server" {
  type = string
}
# To create a new deployment:
@ -32,6 +33,7 @@ module "viktor" {
name = "viktor"
tag = "edge"
tls_secret_name = var.tls_secret_name
nfs_server = var.nfs_server
depends_on = [kubernetes_namespace.actualbudget]
tier = local.tiers.edge
budget_encryption_password = lookup(var.actualbudget_credentials["viktor"], "password", null)
@ -44,6 +46,7 @@ module "anca" {
name = "anca"
tag = "edge"
tls_secret_name = var.tls_secret_name
nfs_server = var.nfs_server
depends_on = [kubernetes_namespace.actualbudget]
tier = local.tiers.edge
budget_encryption_password = lookup(var.actualbudget_credentials["anca"], "password", null)
@ -56,6 +59,7 @@ module "emo" {
name = "emo"
tag = "edge"
tls_secret_name = var.tls_secret_name
nfs_server = var.nfs_server
depends_on = [kubernetes_namespace.actualbudget]
tier = local.tiers.edge
budget_encryption_password = lookup(var.actualbudget_credentials["emo"], "password", null)

View file

@ -89,6 +89,14 @@ resource "kubernetes_deployment" "ollama" {
}
}
spec {
node_selector = {
"gpu" = "true"
}
toleration {
key = "nvidia.com/gpu"
value = "true"
effect = "NoSchedule"
}
container {
image = "ollama/ollama:latest"
name = "ollama"

View file

@ -427,7 +427,7 @@ serverFiles:
annotations:
summary: "Headscale VPN has no available replicas"
- alert: AuthentikDown
expr: (kube_deployment_status_replicas_available{namespace="authentik", deployment="authentik-server"} or on() vector(0)) < 1
expr: (kube_deployment_status_replicas_available{namespace="authentik", deployment="goauthentik-server"} or on() vector(0)) < 1
for: 5m
labels:
severity: critical

View file

@ -189,6 +189,11 @@ resource "kubernetes_deployment" "yt_highlights" {
node_selector = {
"gpu" : "true"
}
toleration {
key = "nvidia.com/gpu"
value = "true"
effect = "NoSchedule"
}
container {
name = "yt-highlights"
image = "viktorbarzin/yt-highlights:v20-20260127"