Memory right-sizing and scheduling-pressure alerts.

Summary of what this patch changes:
  * memory requests/limits for frigate, immich-machine-learning,
    onlyoffice-document-server, openclaw, the authentik worker, caretta and
    shlink;
  * resource requests/limits for further democratic-csi chart components
    (externalProvisioner, externalAttacher, externalResizer,
    externalSnapshotter, csiProxy, driverRegistrar, cleanup);
  * LimitRange defaults for tier 3-edge and tier 4-aux so that
    defaultRequest.memory is lower than default.memory;
  * three new Prometheus alert rules: ClusterMemoryRequestsHigh,
    ContainerNearOOM and PodUnschedulable.

Review notes:
  * PodUnschedulable queries kube_pod_status_scheduled{condition="false"}.
    The earlier selector, kube_pod_status_conditions{condition="PodScheduled",
    status="false"}, does not appear to be a metric exported by
    kube-state-metrics, so the rule would never have fired.
    TODO confirm against the deployed kube-state-metrics version.
  * ContainerNearOOM filters on container!="" so that pod-level cgroup series
    do not raise a second alert with an empty container label.
  * This patch text was recovered from a copy in which all line breaks and
    indentation had been collapsed. Line breaks, leading indentation and the
    position of blank context lines are reconstructed and must be checked
    against the working tree before applying. The "index" blob hashes are
    those of the original commit and do not reflect the two alert fixes.

diff --git a/stacks/frigate/main.tf b/stacks/frigate/main.tf
index f142b160..71eb8a75 100644
--- a/stacks/frigate/main.tf
+++ b/stacks/frigate/main.tf
@@ -86,10 +86,10 @@ resource "kubernetes_deployment" "frigate" {
           resources {
             requests = {
               cpu = "1500m"
-              memory = "2Gi"
+              memory = "5Gi"
             }
             limits = {
-              memory = "8Gi"
+              memory = "10Gi"
               "nvidia.com/gpu" = "1"
             }
           }
diff --git a/stacks/immich/main.tf b/stacks/immich/main.tf
index 46f09603..8752abe4 100644
--- a/stacks/immich/main.tf
+++ b/stacks/immich/main.tf
@@ -513,10 +513,10 @@ resource "kubernetes_deployment" "immich-machine-learning" {
           resources {
             requests = {
               cpu = "100m"
-              memory = "4Gi"
+              memory = "3584Mi"
             }
             limits = {
-              memory = "4Gi"
+              memory = "3584Mi"
               "nvidia.com/gpu" = "1"
             }
           }
diff --git a/stacks/onlyoffice/main.tf b/stacks/onlyoffice/main.tf
index 44f30a9a..86b16bc3 100644
--- a/stacks/onlyoffice/main.tf
+++ b/stacks/onlyoffice/main.tf
@@ -103,10 +103,10 @@ resource "kubernetes_deployment" "onlyoffice-document-server" {
           resources {
             requests = {
               cpu = "100m"
-              memory = "3Gi"
+              memory = "2304Mi"
             }
             limits = {
-              memory = "3Gi"
+              memory = "2304Mi"
             }
           }
           port {
diff --git a/stacks/openclaw/main.tf b/stacks/openclaw/main.tf
index e153fede..bffd6511 100644
--- a/stacks/openclaw/main.tf
+++ b/stacks/openclaw/main.tf
@@ -426,7 +426,7 @@ resource "kubernetes_deployment" "openclaw" {
             }
             requests = {
               cpu = "100m"
-              memory = "1280Mi"
+              memory = "2Gi"
             }
           }
         }
diff --git a/stacks/platform/modules/authentik/values.yaml b/stacks/platform/modules/authentik/values.yaml
index ddd191cd..05cc1065 100644
--- a/stacks/platform/modules/authentik/values.yaml
+++ b/stacks/platform/modules/authentik/values.yaml
@@ -48,9 +48,9 @@ worker:
   resources:
     requests:
       cpu: 100m
-      memory: 1Gi
+      memory: 896Mi
     limits:
-      memory: 1Gi
+      memory: 896Mi
   topologySpreadConstraints:
     - maxSkew: 1
       topologyKey: kubernetes.io/hostname
diff --git a/stacks/platform/modules/iscsi-csi/main.tf b/stacks/platform/modules/iscsi-csi/main.tf
index 0860b317..4c4a8d59 100644
--- a/stacks/platform/modules/iscsi-csi/main.tf
+++ b/stacks/platform/modules/iscsi-csi/main.tf
@@ -43,6 +43,38 @@ resource "helm_release" "democratic_csi" {
             limits = { memory = "192Mi" }
           }
         }
+        externalProvisioner = {
+          resources = {
+            requests = { cpu = "5m", memory = "64Mi" }
+            limits = { memory = "64Mi" }
+          }
+        }
+        externalAttacher = {
+          resources = {
+            requests = { cpu = "5m", memory = "64Mi" }
+            limits = { memory = "64Mi" }
+          }
+        }
+        externalResizer = {
+          resources = {
+            requests = { cpu = "5m", memory = "64Mi" }
+            limits = { memory = "64Mi" }
+          }
+        }
+        externalSnapshotter = {
+          resources = {
+            requests = { cpu = "5m", memory = "80Mi" }
+            limits = { memory = "80Mi" }
+          }
+        }
+      }
+
+      # csiProxy is a top-level chart key, NOT nested under controller/node
+      csiProxy = {
+        resources = {
+          requests = { cpu = "5m", memory = "32Mi" }
+          limits = { memory = "32Mi" }
+        }
       }
 
       node = {
@@ -52,6 +84,18 @@ resource "helm_release" "democratic_csi" {
             limits = { memory = "192Mi" }
           }
         }
+        driverRegistrar = {
+          resources = {
+            requests = { cpu = "5m", memory = "32Mi" }
+            limits = { memory = "32Mi" }
+          }
+        }
+        cleanup = {
+          resources = {
+            requests = { cpu = "5m", memory = "32Mi" }
+            limits = { memory = "32Mi" }
+          }
+        }
 
         hostPID = true
         hostPath = "/lib/modules"
diff --git a/stacks/platform/modules/kyverno/resource-governance.tf b/stacks/platform/modules/kyverno/resource-governance.tf
index 5b948b62..86384279 100644
--- a/stacks/platform/modules/kyverno/resource-governance.tf
+++ b/stacks/platform/modules/kyverno/resource-governance.tf
@@ -263,7 +263,7 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
             }
           }
         },
-        # Tier 3-edge
+        # Tier 3-edge — Burstable QoS: request < limit to reduce scheduler pressure
         {
           name = "limitrange-tier-3-edge"
           match = {
@@ -305,11 +305,11 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
                   {
                     type = "Container"
                     default = {
-                      memory = "128Mi"
+                      memory = "192Mi"
                     }
                     defaultRequest = {
                       cpu = "50m"
-                      memory = "128Mi"
+                      memory = "96Mi"
                     }
                     max = {
                       memory = "4Gi"
@@ -320,7 +320,7 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
             }
           }
         },
-        # Tier 4-aux
+        # Tier 4-aux — Burstable QoS: request < limit to reduce scheduler pressure
         {
           name = "limitrange-tier-4-aux"
           match = {
@@ -362,11 +362,11 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
                   {
                     type = "Container"
                     default = {
-                      memory = "128Mi"
+                      memory = "256Mi"
                     }
                     defaultRequest = {
                       cpu = "50m"
-                      memory = "128Mi"
+                      memory = "64Mi"
                     }
                     max = {
                       memory = "4Gi"
diff --git a/stacks/platform/modules/monitoring/caretta.tf b/stacks/platform/modules/monitoring/caretta.tf
index 5f3ca34a..cf8bfcae 100644
--- a/stacks/platform/modules/monitoring/caretta.tf
+++ b/stacks/platform/modules/monitoring/caretta.tf
@@ -29,10 +29,10 @@ resource "helm_release" "caretta" {
     resources = {
       requests = {
         cpu = "10m"
-        memory = "768Mi"
+        memory = "600Mi"
       }
       limits = {
-        memory = "768Mi"
+        memory = "600Mi"
       }
     }
   })]
diff --git a/stacks/platform/modules/monitoring/prometheus_chart_values.tpl b/stacks/platform/modules/monitoring/prometheus_chart_values.tpl
index 107a7199..5a1fc65d 100755
--- a/stacks/platform/modules/monitoring/prometheus_chart_values.tpl
+++ b/stacks/platform/modules/monitoring/prometheus_chart_values.tpl
@@ -396,6 +396,27 @@ serverFiles:
               severity: warning
             annotations:
               summary: "{{ $labels.namespace }}/{{ $labels.pod }}/{{ $labels.container }}: OOM killed"
+          - alert: ClusterMemoryRequestsHigh
+            expr: sum(kube_pod_container_resource_requests{resource="memory"}) / sum(kube_node_status_allocatable{resource="memory"}) > 0.85
+            for: 15m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Cluster memory requests above 85% of allocatable"
+          - alert: ContainerNearOOM
+            expr: (container_memory_working_set_bytes{container!=""} / container_spec_memory_limit_bytes > 0.85) and container_spec_memory_limit_bytes > 0
+            for: 30m
+            labels:
+              severity: warning
+            annotations:
+              summary: "{{ $labels.container }} in {{ $labels.namespace }}/{{ $labels.pod }} using >85% of memory limit"
+          - alert: PodUnschedulable
+            expr: kube_pod_status_scheduled{condition="false"} == 1
+            for: 5m
+            labels:
+              severity: critical
+            annotations:
+              summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} unschedulable"
           - alert: NodeNotReady
             expr: kube_node_status_condition{condition="Ready",status="true"} == 0
             for: 5m
diff --git a/stacks/url/main.tf b/stacks/url/main.tf
index 1950dfc0..a42c7c86 100644
--- a/stacks/url/main.tf
+++ b/stacks/url/main.tf
@@ -141,11 +141,11 @@ resource "kubernetes_deployment" "shlink" {
           # }
           resources {
             limits = {
-              memory = "768Mi"
+              memory = "960Mi"
             }
             requests = {
               cpu = "25m"
-              memory = "512Mi"
+              memory = "960Mi"
             }
           }
           port {