Add nvidia.com/gpu toleration to all GPU workloads (frigate, ollama) to support NoSchedule taint on GPU nodes. Update nvidia operator helm values with daemonset tolerations. Enhance GPU pod memory exporter with Kubernetes API integration to resolve container IDs to pod names/namespaces, adding RBAC resources for API access.
27 lines
796 B
YAML
# NVIDIA GPU Operator Helm values.
# Structure reconstructed: the pasted copy had lost all nesting indentation
# and contained `|` gutter artifacts; keys below follow the gpu-operator
# chart schema (driver, devicePlugin, daemonsets).
driver:
  enabled: true
  # repository: nvcr.io/nvidia/driver
  # choose a driver version compatible with your GPU + CUDA 12.x (example)
  # NVIDIA GPU driver - https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/platform-support.html#known-issue
  # https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/
  # 13.x >= 580
  # 12.x >= 525, <580
  # 11.x >= 450, <525
  #
  # Delete the cluster policy before each change
  # version: "575.57.08" # CUDA 12.9
  version: "570.195.03"  # CUDA 12.8
  upgradePolicy:
    # Manual driver upgrades only (see note above about deleting the
    # cluster policy before changing the pinned version).
    autoUpgrade: false

devicePlugin:
  config:
    # References a ConfigMap holding the time-slicing configuration.
    name: time-slicing-config

# Tolerate GPU node taint for all GPU operator components.
# Matches nodes tainted with nvidia.com/gpu=true:NoSchedule.
daemonsets:
  tolerations:
    - key: "nvidia.com/gpu"
      operator: "Equal"
      # Must be the string "true" — Toleration.value is a string field.
      value: "true"
      effect: "NoSchedule"