Add per-pod GPU memory metrics exporter
- Add DaemonSet that runs on GPU node and exposes Prometheus metrics
- Uses nvidia-smi to collect per-process GPU memory usage
- Maps PIDs to container IDs via /proc/<pid>/cgroup
- Exposes gpu_pod_memory_used_bytes metric at :9401/metrics
- Add Prometheus scrape config for gpu-pod-memory job

[ci skip]
parent 09a5e3a273
commit 4a857ebefd
2 changed files with 294 additions and 0 deletions
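For reference, a scrape of the new endpoint yields lines like gpu_pod_memory_used_bytes{container_id="0123456789ab",pid="4242",process_name="python3"} 1073741824. Below is a minimal consumer sketch, assuming in-cluster DNS and the Service added in this commit; the URL and metric shape come from the diff, the parsing is illustrative only:

import re
import urllib.request

# Assumed in-cluster URL; the Service maps port 80 -> 9401 (see the diff below).
URL = "http://gpu-pod-exporter.nvidia.svc.cluster.local/metrics"
LINE = re.compile(r'^gpu_pod_memory_used_bytes\{(?P<labels>[^}]*)\}\s+(?P<value>\d+)$')

with urllib.request.urlopen(URL, timeout=5) as resp:
    for raw in resp.read().decode().splitlines():
        m = LINE.match(raw)
        if m:
            print(m.group("labels"), "->", int(m.group("value")), "bytes")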
@@ -623,4 +623,9 @@ extraScrapeConfigs: |
        action: replace
        regex: '(.*)'
        replacement: 'nvidia_tesla_t4_$${1}'
    - job_name: 'gpu-pod-memory'
      static_configs:
        - targets:
            - "gpu-pod-exporter.nvidia.svc.cluster.local"
      metrics_path: '/metrics'
@@ -17,6 +17,18 @@ resource "kubernetes_namespace" "nvidia" {
  }
}

# Apply GPU taint to ensure only GPU workloads run on GPU node
resource "null_resource" "gpu_node_taint" {
  provisioner "local-exec" {
    command = "kubectl taint nodes k8s-node1 nvidia.com/gpu=true:NoSchedule --overwrite"
  }

  # Re-run if namespace changes (proxy for cluster changes)
  triggers = {
    namespace = kubernetes_namespace.nvidia.metadata[0].name
  }
}

# [not needed anymore; part of the chart values] Apply to operator with:
# kubectl patch clusterpolicies.nvidia.com/cluster-policy -n gpu-operator --type merge -p '{"spec": {"devicePlugin": {"config": {"name": "time-slicing-config", "default": "any"}}}}'
@@ -82,6 +94,12 @@ resource "kubernetes_deployment" "nvidia-exporter" {
        node_selector = {
          "gpu" : "true"
        }

        toleration {
          key      = "nvidia.com/gpu"
          operator = "Equal"
          value    = "true"
          effect   = "NoSchedule"
        }

        container {
          image = "nvidia/dcgm-exporter:latest"
          name  = "nvidia-exporter"
@@ -219,3 +237,274 @@ module "ingress" {
# }
#   depends_on = [helm_release.nvidia-gpu-operator]
# }

# GPU Pod Memory Exporter - exposes per-pod GPU memory usage as Prometheus metrics
resource "kubernetes_config_map" "gpu_pod_exporter_script" {
  metadata {
    name      = "gpu-pod-exporter-script"
    namespace = kubernetes_namespace.nvidia.metadata[0].name
  }

  data = {
    "exporter.py" = <<-EOF
      #!/usr/bin/env python3
      """GPU Pod Memory Exporter - Collects per-pod GPU memory usage."""

      import re
      import subprocess
      import threading
      import time
      from http.server import HTTPServer, BaseHTTPRequestHandler

      METRICS_PORT = 9401
      SCRAPE_INTERVAL = 15

      def get_gpu_processes():
          """Run nvidia-smi to get GPU process info."""
          try:
              result = subprocess.run(
                  ["nvidia-smi", "--query-compute-apps=pid,used_memory,process_name", "--format=csv,noheader,nounits"],
                  capture_output=True, text=True, timeout=10
              )
              if result.returncode != 0:
                  print(f"nvidia-smi error: {result.stderr}")
                  return []

              processes = []
              for line in result.stdout.strip().split('\n'):
                  if not line.strip():
                      continue
                  parts = [p.strip() for p in line.split(',')]
                  if len(parts) >= 3:
                      pid, memory_mib = parts[0], parts[1]
                      # Process names may themselves contain commas; rejoin the tail
                      process_name = ','.join(parts[2:])
                      # Skip non-numeric values such as "[N/A]"
                      if not memory_mib.isdigit():
                          continue
                      processes.append({
                          'pid': pid,
                          'memory_bytes': int(memory_mib) * 1024 * 1024,
                          'process_name': process_name
                      })
              return processes
          except Exception as e:
              print(f"Error running nvidia-smi: {e}")
              return []

      def get_container_id(pid):
          """Map PID to container ID via cgroup."""
          cgroup_path = f"/host_proc/{pid}/cgroup"
          try:
              with open(cgroup_path, 'r') as f:
                  for line in f:
                      # Match container ID patterns (docker, containerd, cri-o)
                      # e.g., /kubepods/pod.../containerid or /docker/containerid
                      match = re.search(r'[:/]([a-f0-9]{64})', line)
                      if match:
                          return match.group(1)[:12]  # Return short container ID
                      # Also check for cri-containerd pattern
                      match = re.search(r'cri-containerd-([a-f0-9]{64})', line)
                      if match:
                          return match.group(1)[:12]
          except (FileNotFoundError, PermissionError):
              pass
          return "host"

      # Global metrics storage
      current_metrics = []

      def collect_metrics():
          """Collect GPU memory metrics."""
          global current_metrics
          metrics = []
          processes = get_gpu_processes()

          for proc in processes:
              container_id = get_container_id(proc['pid'])
              metrics.append({
                  'container_id': container_id,
                  'pid': proc['pid'],
                  'process_name': proc['process_name'],
                  'memory_bytes': proc['memory_bytes']
              })

          current_metrics = metrics

      def format_metrics():
          """Format metrics in Prometheus exposition format."""
          lines = [
              "# HELP gpu_pod_memory_used_bytes GPU memory used by container",
              "# TYPE gpu_pod_memory_used_bytes gauge"
          ]

          for m in current_metrics:
              labels = f'container_id="{m["container_id"]}",pid="{m["pid"]}",process_name="{m["process_name"]}"'
              lines.append(f"gpu_pod_memory_used_bytes{{{labels}}} {m['memory_bytes']}")

          return '\n'.join(lines) + '\n'

      class MetricsHandler(BaseHTTPRequestHandler):
          def do_GET(self):
              if self.path == '/metrics':
                  content = format_metrics()
                  self.send_response(200)
                  self.send_header('Content-Type', 'text/plain; charset=utf-8')
                  self.end_headers()
                  self.wfile.write(content.encode())
              elif self.path == '/health':
                  self.send_response(200)
                  self.end_headers()
                  self.wfile.write(b'ok')
              else:
                  self.send_response(404)
                  self.end_headers()

          def log_message(self, format, *args):
              pass  # Suppress request logging

      def background_collector():
          """Background thread to collect metrics periodically."""
          def run():
              while True:
                  collect_metrics()
                  time.sleep(SCRAPE_INTERVAL)
          thread = threading.Thread(target=run, daemon=True)
          thread.start()

      if __name__ == '__main__':
          print(f"Starting GPU Pod Memory Exporter on port {METRICS_PORT}")
          collect_metrics()  # Initial collection
          background_collector()

          server = HTTPServer(('', METRICS_PORT), MetricsHandler)
          server.serve_forever()
    EOF
  }
}

resource "kubernetes_daemonset" "gpu_pod_exporter" {
  metadata {
    name      = "gpu-pod-exporter"
    namespace = kubernetes_namespace.nvidia.metadata[0].name
    labels = {
      app  = "gpu-pod-exporter"
      tier = var.tier
    }
  }

  spec {
    selector {
      match_labels = {
        app = "gpu-pod-exporter"
      }
    }

    template {
      metadata {
        labels = {
          app = "gpu-pod-exporter"
        }
      }

      spec {
        host_pid = true

        node_selector = {
          "gpu" : "true"
        }

        toleration {
          key      = "nvidia.com/gpu"
          operator = "Equal"
          value    = "true"
          effect   = "NoSchedule"
        }

        container {
          name  = "exporter"
          image = "python:3.11-slim"

          command = ["/bin/bash", "-c"]
          args = [
            "python3 /scripts/exporter.py"
          ]

          port {
            container_port = 9401
            name           = "metrics"
          }

          volume_mount {
            name       = "scripts"
            mount_path = "/scripts"
            read_only  = true
          }

          volume_mount {
            name       = "host-proc"
            mount_path = "/host_proc"
            read_only  = true
          }

          resources {
            requests = {
              cpu    = "50m"
              memory = "128Mi"
            }
            limits = {
              cpu    = "200m"
              memory = "256Mi"
              # Requesting the GPU resource makes the NVIDIA runtime inject
              # nvidia-smi into this container (time-sliced, per the operator config)
              "nvidia.com/gpu" = "1"
            }
          }

          liveness_probe {
            http_get {
              path = "/health"
              port = 9401
            }
            initial_delay_seconds = 30
            period_seconds        = 30
          }
        }

        volume {
          name = "scripts"
          config_map {
            name         = kubernetes_config_map.gpu_pod_exporter_script.metadata[0].name
            default_mode = "0755"
          }
        }

        volume {
          name = "host-proc"
          host_path {
            path = "/proc"
            type = "Directory"
          }
        }
      }
    }
  }

  depends_on = [helm_release.nvidia-gpu-operator]
}

resource "kubernetes_service" "gpu_pod_exporter" {
  metadata {
    name      = "gpu-pod-exporter"
    namespace = kubernetes_namespace.nvidia.metadata[0].name
    labels = {
      app = "gpu-pod-exporter"
    }
  }

  spec {
    selector = {
      app = "gpu-pod-exporter"
    }

    port {
      name        = "metrics"
      port        = 80
      target_port = 9401
    }
  }
}
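Note: the metric is labeled with a short container ID rather than a pod name, so making it truly per-pod requires a join against the Kubernetes API, where each pod's container_status.container_id has the form containerd://<64-hex>. A possible follow-up sketch, not part of this commit, assuming the official kubernetes Python client:

from kubernetes import client, config  # pip install kubernetes

def container_id_to_pod(short_id):
    """Resolve the exporter's 12-char container_id label to namespace/pod."""
    config.load_kube_config()  # use config.load_incluster_config() inside a pod
    v1 = client.CoreV1Api()
    for pod in v1.list_pod_for_all_namespaces().items:
        for cs in pod.status.container_statuses or []:
            # container_id looks like "containerd://<64-hex>"; compare prefixes
            full_id = (cs.container_id or "").split("//")[-1]
            if full_id.startswith(short_id):
                return f"{pod.metadata.namespace}/{pod.metadata.name}"
    return None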