From 1275697f2b00ad1fa5332eaea5b75351e0c28c8e Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Fri, 6 Feb 2026 20:19:26 +0000 Subject: [PATCH] Add GPU node taint tolerations and enhance GPU memory exporter Add nvidia.com/gpu toleration to all GPU workloads (frigate, ollama) to support NoSchedule taint on GPU nodes. Update nvidia operator helm values with daemonset tolerations. Enhance GPU pod memory exporter with Kubernetes API integration to resolve container IDs to pod names/namespaces, adding RBAC resources for API access. --- modules/kubernetes/drone/main.tf | 16 ++- modules/kubernetes/frigate/main.tf | 6 + modules/kubernetes/nvidia/main.tf | 159 ++++++++++++++++++++++++-- modules/kubernetes/nvidia/values.yaml | 8 ++ modules/kubernetes/ollama/values.yaml | 11 +- 5 files changed, 188 insertions(+), 12 deletions(-) diff --git a/modules/kubernetes/drone/main.tf b/modules/kubernetes/drone/main.tf index 4d778e3d..fa5d162b 100644 --- a/modules/kubernetes/drone/main.tf +++ b/modules/kubernetes/drone/main.tf @@ -11,7 +11,7 @@ variable "rpc_host" { } variable "allowed_users" { # comma separated list - default = "viktorbarzin" + default = "viktorbarzin,ancamilea" } resource "kubernetes_namespace" "drone" { @@ -67,7 +67,7 @@ resource "kubernetes_deployment" "drone_server" { } spec { container { - image = "drone/drone" + image = "drone/drone:2.27.0" name = "drone-server" # resources { # limits = { @@ -119,6 +119,18 @@ resource "kubernetes_deployment" "drone_server" { name = "DRONE_CRON_INTERVAL" value = "1m" } + env { + name = "DRONE_LOGS_TRACE" + value = "true" + } + env { + name = "DRONE_LOGS_PRETTY" + value = "true" + } + env { + name = "DRONE_LOGS_TEXT" + value = "true" + } } volume { diff --git a/modules/kubernetes/frigate/main.tf b/modules/kubernetes/frigate/main.tf index 215836a4..32322951 100644 --- a/modules/kubernetes/frigate/main.tf +++ b/modules/kubernetes/frigate/main.tf @@ -48,6 +48,12 @@ resource "kubernetes_deployment" "frigate" { node_selector = { "gpu" : true } + toleration { + key = "nvidia.com/gpu" + operator = "Equal" + value = "true" + effect = "NoSchedule" + } container { # image = "ghcr.io/blakeblackshear/frigate:stable" # image = "ghcr.io/blakeblackshear/frigate:stable-tensorrt" diff --git a/modules/kubernetes/nvidia/main.tf b/modules/kubernetes/nvidia/main.tf index dc05a790..6384f245 100644 --- a/modules/kubernetes/nvidia/main.tf +++ b/modules/kubernetes/nvidia/main.tf @@ -254,11 +254,98 @@ import subprocess import time import re import os +import json +import urllib.request +import ssl from http.server import HTTPServer, BaseHTTPRequestHandler METRICS_PORT = 9401 SCRAPE_INTERVAL = 15 +# Kubernetes API configuration +K8S_API = "https://kubernetes.default.svc" +TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token" +CA_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + +# Cache for container ID to pod info mapping +container_cache = {} +cache_refresh_time = 0 +CACHE_TTL = 60 # Refresh cache every 60 seconds + +def get_k8s_token(): + """Read Kubernetes service account token.""" + try: + with open(TOKEN_PATH, 'r') as f: + return f.read().strip() + except: + return None + +def refresh_container_cache(): + """Refresh the container ID to pod mapping from Kubernetes API.""" + global container_cache, cache_refresh_time + + token = get_k8s_token() + if not token: + return + + try: + # Create SSL context with K8s CA + ctx = ssl.create_default_context() + if os.path.exists(CA_PATH): + ctx.load_verify_locations(CA_PATH) + + # Get all pods on this node + node_name = os.environ.get('NODE_NAME', '') + url = f"{K8S_API}/api/v1/pods?fieldSelector=spec.nodeName={node_name}" + + req = urllib.request.Request(url, headers={ + 'Authorization': f'Bearer {token}', + 'Accept': 'application/json' + }) + + with urllib.request.urlopen(req, context=ctx, timeout=10) as resp: + data = json.loads(resp.read().decode()) + + new_cache = {} + for pod in data.get('items', []): + pod_name = pod['metadata']['name'] + namespace = pod['metadata']['namespace'] + + # Get container statuses + for status in pod.get('status', {}).get('containerStatuses', []): + container_id = status.get('containerID', '') + # Extract the ID part (e.g., "containerd://abc123..." -> "abc123") + if '://' in container_id: + container_id = container_id.split('://')[-1] + if container_id: + short_id = container_id[:12] + new_cache[short_id] = { + 'pod': pod_name, + 'namespace': namespace, + 'container': status.get('name', 'unknown') + } + + container_cache = new_cache + cache_refresh_time = time.time() + print(f"Refreshed container cache: {len(new_cache)} containers") + + except Exception as e: + print(f"Error refreshing container cache: {e}") + +def get_pod_info(container_id): + """Look up pod info for a container ID.""" + global cache_refresh_time + + # Refresh cache if stale + if time.time() - cache_refresh_time > CACHE_TTL: + refresh_container_cache() + + return container_cache.get(container_id, { + 'pod': 'unknown', + 'namespace': 'unknown', + 'container': 'unknown' + }) + def get_gpu_processes(): """Run nvidia-smi to get GPU process info.""" try: @@ -294,11 +381,9 @@ def get_container_id(pid): with open(cgroup_path, 'r') as f: for line in f: # Match container ID patterns (docker, containerd, cri-o) - # e.g., /kubepods/pod.../containerid or /docker/containerid match = re.search(r'[:/]([a-f0-9]{64})', line) if match: - return match.group(1)[:12] # Return short container ID - # Also check for cri-containerd pattern + return match.group(1)[:12] match = re.search(r'cri-containerd-([a-f0-9]{64})', line) if match: return match.group(1)[:12] @@ -317,11 +402,15 @@ def collect_metrics(): for proc in processes: container_id = get_container_id(proc['pid']) + pod_info = get_pod_info(container_id) metrics.append({ 'container_id': container_id, 'pid': proc['pid'], 'process_name': proc['process_name'], - 'memory_bytes': proc['memory_bytes'] + 'memory_bytes': proc['memory_bytes'], + 'pod': pod_info['pod'], + 'namespace': pod_info['namespace'], + 'container': pod_info['container'] }) current_metrics = metrics @@ -329,13 +418,19 @@ def collect_metrics(): def format_metrics(): """Format metrics in Prometheus exposition format.""" lines = [ - "# HELP gpu_pod_memory_used_bytes GPU memory used by container", + "# HELP gpu_pod_memory_used_bytes GPU memory used by pod", "# TYPE gpu_pod_memory_used_bytes gauge" ] for m in current_metrics: - labels = f'container_id="{m["container_id"]}",pid="{m["pid"]}",process_name="{m["process_name"]}"' - lines.append(f"gpu_pod_memory_used_bytes{{{labels}}} {m['memory_bytes']}") + labels = ','.join([ + f'namespace="{m["namespace"]}"', + f'pod="{m["pod"]}"', + f'container="{m["container"]}"', + f'process_name="{m["process_name"]}"', + f'pid="{m["pid"]}"' + ]) + lines.append(f'gpu_pod_memory_used_bytes{{{labels}}} {m["memory_bytes"]}') return '\n'.join(lines) + '\n' @@ -370,6 +465,7 @@ def background_collector(): if __name__ == '__main__': print(f"Starting GPU Pod Memory Exporter on port {METRICS_PORT}") + refresh_container_cache() # Initial cache load collect_metrics() # Initial collection background_collector() @@ -379,6 +475,43 @@ EOF } } +resource "kubernetes_service_account" "gpu_pod_exporter" { + metadata { + name = "gpu-pod-exporter" + namespace = kubernetes_namespace.nvidia.metadata[0].name + } +} + +resource "kubernetes_cluster_role" "gpu_pod_exporter" { + metadata { + name = "gpu-pod-exporter" + } + + rule { + api_groups = [""] + resources = ["pods"] + verbs = ["list"] + } +} + +resource "kubernetes_cluster_role_binding" "gpu_pod_exporter" { + metadata { + name = "gpu-pod-exporter" + } + + role_ref { + api_group = "rbac.authorization.k8s.io" + kind = "ClusterRole" + name = kubernetes_cluster_role.gpu_pod_exporter.metadata[0].name + } + + subject { + kind = "ServiceAccount" + name = kubernetes_service_account.gpu_pod_exporter.metadata[0].name + namespace = kubernetes_namespace.nvidia.metadata[0].name + } +} + resource "kubernetes_daemonset" "gpu_pod_exporter" { metadata { name = "gpu-pod-exporter" @@ -404,7 +537,8 @@ resource "kubernetes_daemonset" "gpu_pod_exporter" { } spec { - host_pid = true + host_pid = true + service_account_name = kubernetes_service_account.gpu_pod_exporter.metadata[0].name node_selector = { "gpu" : "true" @@ -426,6 +560,15 @@ resource "kubernetes_daemonset" "gpu_pod_exporter" { "python3 /scripts/exporter.py" ] + env { + name = "NODE_NAME" + value_from { + field_ref { + field_path = "spec.nodeName" + } + } + } + port { container_port = 9401 name = "metrics" diff --git a/modules/kubernetes/nvidia/values.yaml b/modules/kubernetes/nvidia/values.yaml index 9049068c..3f9d4639 100644 --- a/modules/kubernetes/nvidia/values.yaml +++ b/modules/kubernetes/nvidia/values.yaml @@ -17,3 +17,11 @@ driver: devicePlugin: config: name: time-slicing-config + +# Tolerate GPU node taint for all GPU operator components +daemonsets: + tolerations: + - key: "nvidia.com/gpu" + operator: "Equal" + value: "true" + effect: "NoSchedule" diff --git a/modules/kubernetes/ollama/values.yaml b/modules/kubernetes/ollama/values.yaml index 18f78d24..fe92927a 100644 --- a/modules/kubernetes/ollama/values.yaml +++ b/modules/kubernetes/ollama/values.yaml @@ -17,5 +17,12 @@ ollama: persistentVolume: enabled: true existingClaim: "ollama-pvc" -# nodeSelector: -# kubernetes.io/hostname: k8s-node1 + +nodeSelector: + gpu: "true" + +tolerations: + - key: "nvidia.com/gpu" + operator: "Equal" + value: "true" + effect: "NoSchedule"