diff --git a/modules/kubernetes/nvidia/main.tf b/modules/kubernetes/nvidia/main.tf
new file mode 100644
index 00000000..1cef3968
--- /dev/null
+++ b/modules/kubernetes/nvidia/main.tf
@@ -0,0 +1,185 @@
+# Installs the NVIDIA GPU operator via Helm and runs a standalone DCGM
+# exporter (Deployment + Service + ingress) in the "nvidia" namespace.
+#
+# Every namespaced object below reads its namespace from
+# kubernetes_namespace.nvidia rather than repeating the literal "nvidia".
+# The reference gives Terraform an implicit dependency, so the namespace is
+# created before anything is placed in it.
+
+# Name of the TLS secret handed to the tls_secret and ingress modules.
+variable "tls_secret_name" {}
+
+module "tls_secret" {
+  source          = "../setup_tls_secret"
+  namespace       = kubernetes_namespace.nvidia.metadata[0].name
+  tls_secret_name = var.tls_secret_name
+}
+
+resource "kubernetes_namespace" "nvidia" {
+  metadata {
+    name = "nvidia"
+    labels = {
+      "istio-injection" : "disabled"
+    }
+  }
+}
+
+# Time-slicing configuration for the device plugin: the "any" profile exposes
+# each nvidia.com/gpu as 10 replicas.
+# Apply to operator with:
+# kubectl patch clusterpolicies.nvidia.com/cluster-policy -n gpu-operator --type merge -p '{"spec": {"devicePlugin": {"config": {"name": "time-slicing-config", "default": "any"}}}}'
+
+resource "kubernetes_config_map" "time_slicing_config" {
+  metadata {
+    name      = "time-slicing-config"
+    namespace = kubernetes_namespace.nvidia.metadata[0].name
+  }
+
+  data = {
+    any = <<-EOF
+      flags:
+        migStrategy: none
+      sharing:
+        timeSlicing:
+          renameByDefault: false
+          failRequestsGreaterThanOne: false
+          resources:
+            - name: nvidia.com/gpu
+              replicas: 10
+    EOF
+  }
+}
+
+# GPU operator chart from NVIDIA's NGC Helm repository. `atomic` purges the
+# release when the install fails; `timeout` is expressed in seconds.
+resource "helm_release" "nvidia-gpu-operator" {
+  namespace = kubernetes_namespace.nvidia.metadata[0].name
+  name      = "nvidia-gpu-operator"
+
+  repository = "https://helm.ngc.nvidia.com/nvidia"
+  chart      = "gpu-operator"
+  atomic     = true
+  # version = "0.9.3"
+  timeout = 6000
+
+  # values = [templatefile("${path.module}/values.yaml", {})]
+}
+
+# Single-replica DCGM exporter scheduled onto nodes labelled gpu=true; it
+# requests one nvidia.com/gpu and listens on container port 9400.
+resource "kubernetes_deployment" "nvidia-exporter" {
+  metadata {
+    name      = "nvidia-exporter"
+    namespace = kubernetes_namespace.nvidia.metadata[0].name
+    labels = {
+      app = "nvidia-exporter"
+    }
+  }
+  spec {
+    replicas = 1
+    selector {
+      match_labels = {
+        app = "nvidia-exporter"
+      }
+    }
+    template {
+      metadata {
+        labels = {
+          app = "nvidia-exporter"
+        }
+      }
+      spec {
+        node_selector = {
+          "gpu" : "true"
+        }
+        container {
+          # NOTE(review): ":latest" is an unpinned tag; consider a fixed version.
+          image = "nvidia/dcgm-exporter:latest"
+          name  = "nvidia-exporter"
+          # Exactly one entry for 9400; repeating the same container_port only adds a duplicate list item.
+          port {
+            container_port = 9400
+          }
+          security_context {
+            privileged = true
+            capabilities {
+              add = ["SYS_ADMIN"]
+            }
+          }
+          resources {
+            limits = {
+              "nvidia.com/gpu" = "1"
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+# Cluster-internal Service for the exporter: port 80 -> container port 9400.
+resource "kubernetes_service" "nvidia-exporter" {
+  metadata {
+    name      = "nvidia-exporter"
+    namespace = kubernetes_namespace.nvidia.metadata[0].name
+    labels = {
+      "app" = "nvidia-exporter"
+    }
+  }
+
+  spec {
+    selector = {
+      app = "nvidia-exporter"
+    }
+    port {
+      name        = "http"
+      port        = 80
+      target_port = 9400
+    }
+  }
+}
+
+
+module "ingress" {
+  source                  = "../ingress_factory"
+  namespace               = kubernetes_namespace.nvidia.metadata[0].name
+  name                    = "nvidia-exporter"
+  root_domain             = "viktorbarzin.lan"
+  tls_secret_name         = var.tls_secret_name
+  allow_local_access_only = true
+  ssl_redirect            = false
+}
+
+# resource "kubernetes_ingress_v1" "nvidia-exporter" {
+#   metadata {
+#     name      = "nvidia-exporter"
+#     namespace = "nvidia"
+#     annotations = {
+#       "kubernetes.io/ingress.class" = "nginx"
+#       "nginx.ingress.kubernetes.io/whitelist-source-range" : "192.168.1.0/24, 10.0.0.0/8"
+#       "nginx.ingress.kubernetes.io/ssl-redirect" : "false" # used only in LAN
+
+#     }
+#   }
+#   spec {
+#     tls {
+#       hosts       = ["nvidia-exporter.viktorbarzin.lan"]
+#       secret_name = var.tls_secret_name
+#     }
+#     rule {
+#       host = "nvidia-exporter.viktorbarzin.lan"
+#       http {
+#         path {
+#           backend {
+#             service {
+#               name = "nvidia-exporter"
+#               port {
+#                 number = 80
+#               }
+#             }
+#           }
+#         }
+#       }
+#     }
+#   }
+# }
diff --git a/modules/kubernetes/nvidia/values.yaml b/modules/kubernetes/nvidia/values.yaml
new file mode 100644
index 00000000..97115270
--- /dev/null
+++ b/modules/kubernetes/nvidia/values.yaml
@@ -0,0 +1,50 @@
+# values-cuda12.8.yaml
+# NOTE: currently unused - the `values` argument of the helm_release in main.tf is commented out.
+# NOTE(review): before enabling, confirm these keys and image tags against the gpu-operator chart's own values.yaml.
+driver:
+  enabled: true
+  repository: nvcr.io/nvidia/driver
+  # choose a driver version compatible with your GPU + CUDA 12.x (example)
+  version: "535.113.01"
+
+toolkit:
+  enabled: true
+  image:
+    repository: nvcr.io/nvidia/cuda
+    # use a CUDA 12.8 toolkit image; adjust OS tag as needed (e.g. ubuntu22.04, ubuntu20.04)
+    tag: "12.8.1-devel-ubuntu22.04"
+  # (Optional) If you have containerd, you might need environment for containerd config
+  # env:
+  #   - name: CONTAINERD_CONFIG
+  #     value: /etc/containerd/config.toml
+  #   - name: CONTAINERD_SOCKET
+  #     value: /run/containerd/containerd.sock
+  #   - name: RUNTIME_CONFIG_SOURCE
+  #     value: "command, file"
+
+devicePlugin:
+  enabled: true
+  image:
+    repository: nvcr.io/nvidia/k8s-device-plugin
+    # pick a device-plugin build compatible with CUDA 12.8
+    tag: "1.14.0-12.8-ubuntu22.04"
+
+dcgmExporter:
+  enabled: true
+  image:
+    repository: nvcr.io/nvidia/k8s/dcgm-exporter
+    tag: "2.12.13-12.8-ubuntu22.04"
+
+# Optional: if you use MIG or want MIG Manager
+# migManager:
+#   enabled: false
+
+# If you want GPU‑Feature‑Discovery (GFD) — optional
+# gfd:
+#   enabled: false
+
+# Other optional overrides (keep defaults or adjust as needed)
+# ccManager:
+#   enabled: false
+# cdi:
+#   enabled: true
diff --git a/terraform.tfvars b/terraform.tfvars
index 95b29e4f..21489391 100644
Binary files a/terraform.tfvars and b/terraform.tfvars differ