From 6f2f4c089ceac42256ea38fbb2a6004651003a5a Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sun, 15 Mar 2026 02:33:46 +0000 Subject: [PATCH] fix cluster health: resolve 21/23 failures from healthcheck - nvidia: change GPU taint NoSchedule -> PreferNoSchedule to allow overflow scheduling on k8s-node1 (frees ~7Gi capacity) - kyverno: increase reports-controller memory 256Mi -> 512Mi (OOMKilled) - speedtest: add missing DB_PORT=3306 env var (nc: service "" unknown) - realestate-crawler: increase API memory 64Mi -> 256Mi (OOMKilled) - calibre: increase liveness probe timeout 1s -> 5s (false restarts) --- stacks/calibre/main.tf | 1 + stacks/platform/modules/kyverno/main.tf | 4 ++-- stacks/platform/modules/nvidia/main.tf | 2 +- stacks/real-estate-crawler/main.tf | 4 ++-- stacks/speedtest/main.tf | 4 ++++ 5 files changed, 10 insertions(+), 5 deletions(-) diff --git a/stacks/calibre/main.tf b/stacks/calibre/main.tf index 94bd713f..739fef17 100644 --- a/stacks/calibre/main.tf +++ b/stacks/calibre/main.tf @@ -215,6 +215,7 @@ resource "kubernetes_deployment" "calibre-web-automated" { path = "/" port = 8083 } + timeout_seconds = 5 period_seconds = 30 failure_threshold = 3 } diff --git a/stacks/platform/modules/kyverno/main.tf b/stacks/platform/modules/kyverno/main.tf index 256a25f7..83a752cb 100644 --- a/stacks/platform/modules/kyverno/main.tf +++ b/stacks/platform/modules/kyverno/main.tf @@ -30,11 +30,11 @@ resource "helm_release" "kyverno" { reportsController = { resources = { limits = { - memory = "256Mi" + memory = "512Mi" } requests = { cpu = "100m" - memory = "128Mi" + memory = "384Mi" } } } diff --git a/stacks/platform/modules/nvidia/main.tf b/stacks/platform/modules/nvidia/main.tf index dd09d549..7297491d 100644 --- a/stacks/platform/modules/nvidia/main.tf +++ b/stacks/platform/modules/nvidia/main.tf @@ -37,7 +37,7 @@ resource "kubernetes_resource_quota" "nvidia_quota" { resource "null_resource" "gpu_node_config" { provisioner "local-exec" { command = <<-EOT - kubectl taint nodes k8s-node1 nvidia.com/gpu=true:NoSchedule --overwrite + kubectl taint nodes k8s-node1 nvidia.com/gpu=true:PreferNoSchedule --overwrite kubectl label nodes k8s-node1 gpu=true --overwrite EOT } diff --git a/stacks/real-estate-crawler/main.tf b/stacks/real-estate-crawler/main.tf index 1865557b..806d79e5 100644 --- a/stacks/real-estate-crawler/main.tf +++ b/stacks/real-estate-crawler/main.tf @@ -210,10 +210,10 @@ resource "kubernetes_deployment" "realestate-crawler-api" { resources { requests = { cpu = "15m" - memory = "64Mi" + memory = "256Mi" } limits = { - memory = "64Mi" + memory = "256Mi" } } volume_mount { diff --git a/stacks/speedtest/main.tf b/stacks/speedtest/main.tf index a8e3038f..9a9259c9 100644 --- a/stacks/speedtest/main.tf +++ b/stacks/speedtest/main.tf @@ -111,6 +111,10 @@ resource "kubernetes_deployment" "speedtest" { name = "DB_PASSWORD" value = data.vault_kv_secret_v2.secrets.data["db_password"] } + env { + name = "DB_PORT" + value = "3306" + } env { name = "APP_TIMEZONE" value = "Europe/Sofia"