- nvidia: custom LimitRange (128Mi default, was 1Gi from Kyverno) to stop inflating GPU operator init containers; saves ~2.5Gi on the GPU node
- nvidia: dcgm-exporter 1536Mi → 768Mi (actual usage 489Mi)
- monitoring: prometheus server 4Gi → 3Gi (actual usage 2.6Gi)
- onlyoffice: 2304Mi → 1536Mi (actual usage 1.3Gi)
- immich: frame explicit 64Mi resources (was getting the 1Gi LimitRange default)
- dbaas: quota limits.memory 20Gi → 24Gi to fit the 3rd MySQL replica

Root cause: the Kyverno tier-2-gpu LimitRange injected 1Gi into every NVIDIA init container (none set explicit resources), wasting ~2.5Gi of scheduling overhead on the GPU node. Combined with over-requesting, frigate and immich-ml couldn't schedule.
# 43 lines
# 1.1 KiB
# YAML
# NVIDIA driver container — pins the driver version installed on GPU nodes.
driver:
  enabled: true
  # repository: nvcr.io/nvidia/driver
  # Choose a driver version compatible with your GPU + CUDA 12.x (example):
  # NVIDIA GPU driver - https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/platform-support.html#known-issue
  # https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/
  # CUDA 13.x requires driver >= 580
  # CUDA 12.x requires driver >= 525, < 580
  # CUDA 11.x requires driver >= 450, < 525
  #
  # Delete the cluster policy before each version change.
  # version: "575.57.08"  # CUDA 12.9
  version: "570.195.03"  # CUDA 12.8
  upgradePolicy:
    # Manual driver upgrades only — see the "delete the cluster policy" note above.
    autoUpgrade: false
# Device plugin — references an externally managed ConfigMap that enables
# GPU time-slicing (shares one physical GPU across multiple pods).
devicePlugin:
  config:
    name: time-slicing-config
# DCGM Exporter - reduced to 768Mi (actual usage ~489Mi, 1.5x margin).
# Requests == limits gives the pod the Guaranteed QoS class.
dcgmExporter:
  resources:
    requests:
      memory: "768Mi"
    limits:
      memory: "768Mi"
# CUDA Validator - reduced from 1024Mi to 256Mi (one-shot job, short-lived).
validator:
  resources:
    requests:
      memory: "256Mi"
    limits:
      memory: "256Mi"
# Tolerate the GPU node taint for all GPU operator components so their
# daemonsets can schedule onto tainted GPU nodes.
daemonsets:
  tolerations:
    - key: "nvidia.com/gpu"
      operator: "Equal"
      value: "true"  # taint values are strings; keep quoted to avoid bool coercion
      effect: "NoSchedule"