fix cluster health: resolve 21/23 failures from healthcheck

- nvidia: change GPU taint NoSchedule -> PreferNoSchedule so pods without
  the GPU toleration can overflow onto k8s-node1 (makes ~7Gi schedulable)
- kyverno: increase reports-controller memory limit 256Mi -> 512Mi and
  request 128Mi -> 384Mi (OOMKilled)
- speedtest: add missing DB_PORT=3306 env var (nc: service "" unknown)
- realestate-crawler: increase API memory request and limit 64Mi -> 256Mi
  (OOMKilled)
- calibre: increase liveness probe timeout 1s -> 5s (false restarts)
This commit is contained in:
Viktor Barzin 2026-03-15 02:33:46 +00:00 committed by Viktor Barzin
parent dc576aa8b6
commit 6f2f4c089c
5 changed files with 10 additions and 5 deletions

View file

@ -215,6 +215,7 @@ resource "kubernetes_deployment" "calibre-web-automated" {
path = "/" path = "/"
port = 8083 port = 8083
} }
timeout_seconds = 5
period_seconds = 30 period_seconds = 30
failure_threshold = 3 failure_threshold = 3
} }

View file

@ -30,11 +30,11 @@ resource "helm_release" "kyverno" {
reportsController = { reportsController = {
resources = { resources = {
limits = { limits = {
memory = "256Mi" memory = "512Mi"
} }
requests = { requests = {
cpu = "100m" cpu = "100m"
memory = "128Mi" memory = "384Mi"
} }
} }
} }

View file

@ -37,7 +37,7 @@ resource "kubernetes_resource_quota" "nvidia_quota" {
resource "null_resource" "gpu_node_config" { resource "null_resource" "gpu_node_config" {
provisioner "local-exec" { provisioner "local-exec" {
command = <<-EOT command = <<-EOT
kubectl taint nodes k8s-node1 nvidia.com/gpu=true:NoSchedule --overwrite kubectl taint nodes k8s-node1 nvidia.com/gpu=true:PreferNoSchedule --overwrite
kubectl label nodes k8s-node1 gpu=true --overwrite kubectl label nodes k8s-node1 gpu=true --overwrite
EOT EOT
} }

View file

@ -210,10 +210,10 @@ resource "kubernetes_deployment" "realestate-crawler-api" {
resources { resources {
requests = { requests = {
cpu = "15m" cpu = "15m"
memory = "64Mi" memory = "256Mi"
} }
limits = { limits = {
memory = "64Mi" memory = "256Mi"
} }
} }
volume_mount { volume_mount {

View file

@ -111,6 +111,10 @@ resource "kubernetes_deployment" "speedtest" {
name = "DB_PASSWORD" name = "DB_PASSWORD"
value = data.vault_kv_secret_v2.secrets.data["db_password"] value = data.vault_kv_secret_v2.secrets.data["db_password"]
} }
env {
name = "DB_PORT"
value = "3306"
}
env { env {
name = "APP_TIMEZONE" name = "APP_TIMEZONE"
value = "Europe/Sofia" value = "Europe/Sofia"