add nvidia deployment [ci skip]

This commit is contained in:
Viktor Barzin 2025-12-14 09:49:58 +00:00
parent 15ec331589
commit e17f10f9ee
3 changed files with 218 additions and 0 deletions

View file

@ -0,0 +1,170 @@
# Name of the TLS secret; forwarded to the tls_secret and ingress modules in this file.
variable "tls_secret_name" {
  description = "Name of the Kubernetes TLS secret used for the nvidia namespace."
  type        = string
}
# Instantiates the shared ../setup_tls_secret module for the nvidia namespace.
module "tls_secret" {
  source = "../setup_tls_secret"

  # Reference the namespace resource rather than repeating the literal "nvidia":
  # the value is the same, but the reference gives Terraform an implicit
  # dependency so the namespace is created before anything is placed in it.
  namespace       = kubernetes_namespace.nvidia.metadata[0].name
  tls_secret_name = var.tls_secret_name
}
# Namespace used by every NVIDIA resource declared in this file.
# It carries the label istio-injection=disabled.
resource "kubernetes_namespace" "nvidia" {
  metadata {
    name   = "nvidia"
    labels = { "istio-injection" = "disabled" }
  }
}
# GPU time-slicing configuration: advertises each nvidia.com/gpu as 10 replicas.
#
# Creating this ConfigMap does not activate it. Apply to operator with:
# kubectl patch clusterpolicies.nvidia.com/cluster-policy -n gpu-operator --type merge -p '{"spec": {"devicePlugin": {"config": {"name": "time-slicing-config", "default": "any"}}}}'
# The "any" key below is the config name selected by "default": "any" in that patch.
# NOTE(review): the operator in this file is installed into the "nvidia" namespace,
# not "gpu-operator"; ClusterPolicy is presumably cluster-scoped so -n should be
# ignored, but confirm before relying on the command as written.
resource "kubernetes_config_map" "time_slicing_config" {
  metadata {
    name = "time-slicing-config"
    # Resource reference (same value as "nvidia") so the namespace is created first.
    namespace = kubernetes_namespace.nvidia.metadata[0].name
  }

  data = {
    # NOTE(review): YAML nesting reconstructed from the NVIDIA time-slicing
    # schema; verify against the deployed ConfigMap.
    any = <<-EOF
      flags:
        migStrategy: none
      sharing:
        timeSlicing:
          renameByDefault: false
          failRequestsGreaterThanOne: false
          resources:
            - name: nvidia.com/gpu
              replicas: 10
      EOF
  }
}
# Installs the gpu-operator chart from the NVIDIA NGC Helm repository.
resource "helm_release" "nvidia-gpu-operator" {
  name = "nvidia-gpu-operator"
  # Resource reference (same value as "nvidia"): create_namespace is not set, so
  # the release needs the namespace to exist before Helm runs. The reference
  # makes Terraform order the two correctly.
  namespace  = kubernetes_namespace.nvidia.metadata[0].name
  repository = "https://helm.ngc.nvidia.com/nvidia"
  chart      = "gpu-operator"

  # NOTE(review): no chart version is pinned, so each apply may pick up whatever
  # the repository currently serves. Pin a known-good version once chosen.
  # version = "0.9.3"

  # atomic rolls the release back if the install/upgrade does not succeed within
  # the timeout (seconds).
  atomic  = true
  timeout = 6000

  # values = [templatefile("${path.module}/values.yaml", {})]
}
# Single-replica dcgm-exporter Deployment, scheduled onto nodes labelled gpu=true
# and exposing container port 9400.
resource "kubernetes_deployment" "nvidia-exporter" {
  metadata {
    name = "nvidia-exporter"
    # Resource reference (same value as "nvidia") so the namespace is created first.
    namespace = kubernetes_namespace.nvidia.metadata[0].name
    labels = {
      app = "nvidia-exporter"
    }
  }

  spec {
    replicas = 1

    selector {
      match_labels = {
        app = "nvidia-exporter"
      }
    }

    template {
      metadata {
        labels = {
          app = "nvidia-exporter"
        }
      }

      spec {
        node_selector = {
          "gpu" = "true"
        }

        container {
          name = "nvidia-exporter"
          # NOTE(review): ":latest" is a floating tag; consider pinning a specific
          # dcgm-exporter version so rollouts are reproducible.
          image = "nvidia/dcgm-exporter:latest"

          # The original declared this port block twice with the same
          # container_port; the duplicate has been removed.
          port {
            container_port = 9400
          }

          security_context {
            # NOTE(review): privileged mode already grants all capabilities, so
            # adding SYS_ADMIN is presumably redundant; kept as-is to avoid
            # changing the pod's security posture in this edit.
            privileged = true
            capabilities {
              add = ["SYS_ADMIN"]
            }
          }

          resources {
            limits = {
              "nvidia.com/gpu" = "1"
            }
          }
        }
      }
    }
  }
}
# Service in front of the pods labelled app=nvidia-exporter: service port 80
# forwards to container port 9400.
resource "kubernetes_service" "nvidia-exporter" {
  metadata {
    name = "nvidia-exporter"
    # Resource reference (same value as "nvidia") so the namespace is created first.
    namespace = kubernetes_namespace.nvidia.metadata[0].name
    labels = {
      "app" = "nvidia-exporter"
    }
  }

  spec {
    selector = {
      app = "nvidia-exporter"
    }

    port {
      name        = "http"
      port        = 80
      target_port = 9400
    }
  }
}
# Ingress for the nvidia-exporter service, built by the shared ../ingress_factory
# module. Replaces the hand-written kubernetes_ingress_v1 kept commented out below.
module "ingress" {
  source = "../ingress_factory"

  # Resource reference (same value as "nvidia") so the namespace is created first.
  namespace       = kubernetes_namespace.nvidia.metadata[0].name
  name            = "nvidia-exporter"
  root_domain     = "viktorbarzin.lan"
  tls_secret_name = var.tls_secret_name

  allow_local_access_only = true
  ssl_redirect            = false
}
# resource "kubernetes_ingress_v1" "nvidia-exporter" {
# metadata {
# name = "nvidia-exporter"
# namespace = "nvidia"
# annotations = {
# "kubernetes.io/ingress.class" = "nginx"
# "nginx.ingress.kubernetes.io/whitelist-source-range" : "192.168.1.0/24, 10.0.0.0/8"
# "nginx.ingress.kubernetes.io/ssl-redirect" : "false" # used only in LAN
# }
# }
# spec {
# tls {
# hosts = ["nvidia-exporter.viktorbarzin.lan"]
# secret_name = var.tls_secret_name
# }
# rule {
# host = "nvidia-exporter.viktorbarzin.lan"
# http {
# path {
# backend {
# service {
# name = "nvidia-exporter"
# port {
# number = 80
# }
# }
# }
# }
# }
# }
# }
# }

View file

@ -0,0 +1,48 @@
# values-cuda12.8.yaml
# Value overrides, presumably for the gpu-operator Helm chart (the keys match
# its driver / toolkit / devicePlugin / dcgmExporter components).
# NOTE(review): the helm_release in this commit has its `values = [...]` line
# commented out, so this file does not appear to be applied — confirm.
# NOTE(review): the gpu-operator chart is believed to take `repository`, `image`
# and `version` directly under each component, not a nested
# `image: {repository, tag}` map; verify every section below against the
# chart's own values.yaml before enabling this file.
driver:
  enabled: true
  repository: nvcr.io/nvidia/driver
  # choose a driver version compatible with your GPU + CUDA 12.x (example)
  version: "535.113.01"
toolkit:
  enabled: true
  image:
    # NOTE(review): this points at the CUDA base image, which looks like the
    # wrong image for the operator's container-toolkit component — confirm.
    repository: nvcr.io/nvidia/cuda
    # use a CUDA 12.8 toolkit image; adjust OS tag as needed (e.g. ubuntu22.04, ubuntu20.04)
    tag: "12.8.1-devel-ubuntu22.04"
  # (Optional) If you have containerd, you might need environment for containerd config
  # env:
  # - name: CONTAINERD_CONFIG
  # value: /etc/containerd/config.toml
  # - name: CONTAINERD_SOCKET
  # value: /run/containerd/containerd.sock
  # - name: RUNTIME_CONFIG_SOURCE
  # value: "command, file"
devicePlugin:
  enabled: true
  image:
    repository: nvcr.io/nvidia/k8s-device-plugin
    # pick a device-plugin build compatible with CUDA 12.8
    # NOTE(review): tag format unverified — confirm it exists in the registry.
    tag: "1.14.0-12.8-ubuntu22.04"
dcgmExporter:
  enabled: true
  image:
    repository: nvcr.io/nvidia/k8s/dcgm-exporter
    # NOTE(review): tag format unverified — confirm it exists in the registry.
    tag: "2.12.13-12.8-ubuntu22.04"
# Optional: if you use MIG or want MIG Manager
# migManager:
# enabled: false
# If you want GPUFeatureDiscovery (GFD) — optional
# gfd:
# enabled: false
# Other optional overrides (keep defaults or adjust as needed)
# ccManager:
# enabled: false
# cdi:
# enabled: true

Binary file not shown.