Add GPU node taint tolerations and enhance GPU memory exporter

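Add nvidia.com/gpu toleration to all GPU workloads (frigate, ollama) to support the NoSchedule taint on GPU nodes, and schedule ollama onto GPU nodes via a gpu node selector. Update nvidia operator helm values with daemonset tolerations. Enhance GPU pod memory exporter with Kubernetes API integration to resolve container IDs to pod names/namespaces, adding RBAC resources for API access. Also pin the Drone server image to drone/drone:2.27.0, add ancamilea to the Drone allowed users, and enable trace/pretty/text logging for the Drone server.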
2026-02-06 20:19:26 +00:00 · 2026-02-06 20:19:26 +00:00 · 1275697f2b
commit 1275697f2b
parent ffa80f0df6
5 changed files with 188 additions and 12 deletions
--- a/modules/kubernetes/drone/main.tf
+++ b/modules/kubernetes/drone/main.tf
@ -11,7 +11,7 @@ variable "rpc_host" {
 }
 variable "allowed_users" {
  # comma separated list
-  default = "viktorbarzin"
+  default = "viktorbarzin,ancamilea"
 }

 resource "kubernetes_namespace" "drone" {
@ -67,7 +67,7 @@ resource "kubernetes_deployment" "drone_server" {
      }
      spec {
        container {
-          image = "drone/drone"
+          image = "drone/drone:2.27.0"
          name  = "drone-server"
          # resources {
          #   limits = {
@ -119,6 +119,18 @@ resource "kubernetes_deployment" "drone_server" {
            name  = "DRONE_CRON_INTERVAL"
            value = "1m"
          }
+          env {
+            name  = "DRONE_LOGS_TRACE"
+            value = "true"
+          }
+          env {
+            name  = "DRONE_LOGS_PRETTY"
+            value = "true"
+          }
+          env {
+            name  = "DRONE_LOGS_TEXT"
+            value = "true"
+          }

        }
        volume {
--- a/modules/kubernetes/frigate/main.tf
+++ b/modules/kubernetes/frigate/main.tf
@ -48,6 +48,12 @@ resource "kubernetes_deployment" "frigate" {
        node_selector = {
          "gpu" : true
        }
+        toleration {
+          key      = "nvidia.com/gpu"
+          operator = "Equal"
+          value    = "true"
+          effect   = "NoSchedule"
+        }
        container {
          # image = "ghcr.io/blakeblackshear/frigate:stable"
          # image = "ghcr.io/blakeblackshear/frigate:stable-tensorrt"
--- a/modules/kubernetes/nvidia/main.tf
+++ b/modules/kubernetes/nvidia/main.tf
@ -254,11 +254,98 @@ import subprocess
 import time
 import re
 import os
+import json
+import urllib.request
+import ssl
 from http.server import HTTPServer, BaseHTTPRequestHandler

 METRICS_PORT = 9401
 SCRAPE_INTERVAL = 15

+# Kubernetes API configuration
+K8S_API = "https://kubernetes.default.svc"
+TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token"
+CA_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
+
+# Cache for container ID to pod info mapping
+container_cache = {}
+cache_refresh_time = 0
+CACHE_TTL = 60  # Refresh cache every 60 seconds
+
+def get_k8s_token():
+    """Read the Kubernetes service account token from TOKEN_PATH.
+
+    Returns the token with surrounding whitespace stripped, or None when
+    the token file cannot be read or decoded.
+    """
+    try:
+        with open(TOKEN_PATH, 'r') as f:
+            return f.read().strip()
+    except (OSError, ValueError):
+        # Only I/O and decoding failures are expected here; a bare except
+        # would also swallow KeyboardInterrupt and SystemExit.
+        return None
+
+def refresh_container_cache():
+    """Refresh the container ID to pod mapping from Kubernetes API.
+
+    Lists the pods whose spec.nodeName equals the NODE_NAME environment
+    variable and rebuilds container_cache. Each entry is keyed by the
+    first 12 characters of a container ID and holds the owning pod name,
+    namespace and container name. Regular, init and ephemeral container
+    statuses are all indexed.
+
+    Every attempt, successful or not, updates cache_refresh_time, so a
+    caller that compares it against CACHE_TTL retries at most once per
+    TTL interval. On failure the previous cache content is kept.
+    """
+    global container_cache, cache_refresh_time
+
+    # Record the attempt up front: without this, a failing API call would
+    # be retried (with its 10 second timeout) on every single lookup.
+    cache_refresh_time = time.time()
+
+    token = get_k8s_token()
+    if not token:
+        return
+
+    node_name = os.environ.get('NODE_NAME', '')
+    if not node_name:
+        # An empty selector value would match pods that are not bound to
+        # any node, which can never own a GPU process on this host.
+        print("NODE_NAME is not set; skipping container cache refresh")
+        return
+
+    try:
+        # Trust the cluster CA on top of the default trust store
+        ctx = ssl.create_default_context()
+        if os.path.exists(CA_PATH):
+            ctx.load_verify_locations(CA_PATH)
+
+        # Get all pods on this node
+        url = f"{K8S_API}/api/v1/pods?fieldSelector=spec.nodeName={node_name}"
+
+        req = urllib.request.Request(url, headers={
+            'Authorization': f'Bearer {token}',
+            'Accept': 'application/json'
+        })
+
+        with urllib.request.urlopen(req, context=ctx, timeout=10) as resp:
+            data = json.loads(resp.read().decode())
+
+        new_cache = {}
+        status_fields = (
+            'containerStatuses',
+            'initContainerStatuses',
+            'ephemeralContainerStatuses',
+        )
+        for pod in data.get('items', []):
+            pod_name = pod['metadata']['name']
+            namespace = pod['metadata']['namespace']
+            pod_status = pod.get('status') or {}
+
+            for field in status_fields:
+                for status in pod_status.get(field) or []:
+                    container_id = status.get('containerID') or ''
+                    # Extract the ID part (e.g., "containerd://abc123..." -> "abc123")
+                    container_id = container_id.split('://')[-1]
+                    if container_id:
+                        new_cache[container_id[:12]] = {
+                            'pod': pod_name,
+                            'namespace': namespace,
+                            'container': status.get('name', 'unknown')
+                        }
+
+        container_cache = new_cache
+        print(f"Refreshed container cache: {len(new_cache)} containers")
+
+    except Exception as e:
+        # Best effort: keep serving the previous cache and report the error
+        print(f"Error refreshing container cache: {e}")
+
+def get_pod_info(container_id):
+    """Return the cached pod details for a container ID.
+
+    The cache is refreshed first when more than CACHE_TTL seconds have
+    passed since cache_refresh_time. IDs missing from container_cache
+    yield a dict whose 'pod', 'namespace' and 'container' values are all
+    'unknown'.
+    """
+    cache_age = time.time() - cache_refresh_time
+    if cache_age > CACHE_TTL:
+        refresh_container_cache()
+
+    # Placeholder returned for containers the Kubernetes API did not report
+    fallback = dict.fromkeys(('pod', 'namespace', 'container'), 'unknown')
+    return container_cache.get(container_id, fallback)
+
 def get_gpu_processes():
    """Run nvidia-smi to get GPU process info."""
    try:
@ -294,11 +381,9 @@ def get_container_id(pid):
        with open(cgroup_path, 'r') as f:
            for line in f:
                # Match container ID patterns (docker, containerd, cri-o)
-                # e.g., /kubepods/pod.../containerid or /docker/containerid
                match = re.search(r'[:/]([a-f0-9]{64})', line)
                if match:
-                    return match.group(1)[:12]  # Return short container ID
-                # Also check for cri-containerd pattern
+                    return match.group(1)[:12]
                match = re.search(r'cri-containerd-([a-f0-9]{64})', line)
                if match:
                    return match.group(1)[:12]
@ -317,11 +402,15 @@ def collect_metrics():

    for proc in processes:
        container_id = get_container_id(proc['pid'])
+        pod_info = get_pod_info(container_id)
        metrics.append({
            'container_id': container_id,
            'pid': proc['pid'],
            'process_name': proc['process_name'],
-            'memory_bytes': proc['memory_bytes']
+            'memory_bytes': proc['memory_bytes'],
+            'pod': pod_info['pod'],
+            'namespace': pod_info['namespace'],
+            'container': pod_info['container']
        })

    current_metrics = metrics
@ -329,13 +418,19 @@ def collect_metrics():
 def format_metrics():
    """Format metrics in Prometheus exposition format."""
    lines = [
-        "# HELP gpu_pod_memory_used_bytes GPU memory used by container",
+        "# HELP gpu_pod_memory_used_bytes GPU memory used by pod",
        "# TYPE gpu_pod_memory_used_bytes gauge"
    ]

    for m in current_metrics:
-        labels = f'container_id="{m["container_id"]}",pid="{m["pid"]}",process_name="{m["process_name"]}"'
-        lines.append(f"gpu_pod_memory_used_bytes{{{labels}}} {m['memory_bytes']}")
+        labels = ','.join([
+            f'namespace="{m["namespace"]}"',
+            f'pod="{m["pod"]}"',
+            f'container="{m["container"]}"',
+            f'process_name="{m["process_name"]}"',
+            f'pid="{m["pid"]}"'
+        ])
+        lines.append(f'gpu_pod_memory_used_bytes{{{labels}}} {m["memory_bytes"]}')

    return '\n'.join(lines) + '\n'

@ -370,6 +465,7 @@ def background_collector():

 if __name__ == '__main__':
    print(f"Starting GPU Pod Memory Exporter on port {METRICS_PORT}")
+    refresh_container_cache()  # Initial cache load
    collect_metrics()  # Initial collection
    background_collector()

@ -379,6 +475,43 @@ EOF
  }
 }

+# Identity the gpu-pod-exporter DaemonSet runs as (referenced through
+# service_account_name); lives in the nvidia namespace.
+resource "kubernetes_service_account" "gpu_pod_exporter" {
+  metadata {
+    name      = "gpu-pod-exporter"
+    namespace = kubernetes_namespace.nvidia.metadata[0].name
+  }
+}
+
+# Read-only permission to list pods. It is a ClusterRole because the
+# exporter script queries /api/v1/pods, i.e. pods across all namespaces.
+resource "kubernetes_cluster_role" "gpu_pod_exporter" {
+  metadata {
+    name = "gpu-pod-exporter"
+  }
+
+  rule {
+    # "" is the core API group, which contains the pods resource
+    api_groups = [""]
+    resources  = ["pods"]
+    verbs      = ["list"]
+  }
+}
+
+# Grants the gpu-pod-exporter ClusterRole to the gpu-pod-exporter
+# service account in the nvidia namespace.
+resource "kubernetes_cluster_role_binding" "gpu_pod_exporter" {
+  metadata {
+    name = "gpu-pod-exporter"
+  }
+
+  # Role being granted
+  role_ref {
+    api_group = "rbac.authorization.k8s.io"
+    kind      = "ClusterRole"
+    name      = kubernetes_cluster_role.gpu_pod_exporter.metadata[0].name
+  }
+
+  # Identity receiving the role
+  subject {
+    kind      = "ServiceAccount"
+    name      = kubernetes_service_account.gpu_pod_exporter.metadata[0].name
+    namespace = kubernetes_namespace.nvidia.metadata[0].name
+  }
+}
+
 resource "kubernetes_daemonset" "gpu_pod_exporter" {
  metadata {
    name      = "gpu-pod-exporter"
@ -404,7 +537,8 @@ resource "kubernetes_daemonset" "gpu_pod_exporter" {
      }

      spec {
-        host_pid = true
+        host_pid             = true
+        service_account_name = kubernetes_service_account.gpu_pod_exporter.metadata[0].name

        node_selector = {
          "gpu" : "true"
@ -426,6 +560,15 @@ resource "kubernetes_daemonset" "gpu_pod_exporter" {
            "python3 /scripts/exporter.py"
          ]

+          env {
+            name = "NODE_NAME"
+            value_from {
+              field_ref {
+                field_path = "spec.nodeName"
+              }
+            }
+          }
+
          port {
            container_port = 9401
            name           = "metrics"
--- a/modules/kubernetes/nvidia/values.yaml
+++ b/modules/kubernetes/nvidia/values.yaml
@ -17,3 +17,11 @@ driver:
  devicePlugin:
    config:
      name: time-slicing-config
+
+# Tolerate GPU node taint for all GPU operator components
+daemonsets:
+  tolerations:
+    - key: "nvidia.com/gpu"
+      operator: "Equal"
+      value: "true"
+      effect: "NoSchedule"
--- a/modules/kubernetes/ollama/values.yaml
+++ b/modules/kubernetes/ollama/values.yaml
@ -17,5 +17,12 @@ ollama:
 persistentVolume:
  enabled: true
  existingClaim: "ollama-pvc"
-# nodeSelector:
-#   kubernetes.io/hostname: k8s-node1
+
+nodeSelector:
+  gpu: "true"
+
+tolerations:
+  - key: "nvidia.com/gpu"
+    operator: "Equal"
+    value: "true"
+    effect: "NoSchedule"