[ci skip] Reduce node config drift: GPU label, OIDC idempotency, node-exporter, rebuild docs
- Add gpu=true label to Terraform (nvidia null_resource, alongside the taint)
- Improve API server OIDC config to detect value changes, not just flag presence
- Add policy_hash trigger to audit-policy so rule changes auto-reapply
- Enable prometheus-node-exporter sub-chart; delete unused Ansible playbook
- Document full node rebuild procedure in CLAUDE.md
- Save Talos Linux migration evaluation for future reference
This commit is contained in:
parent
ff66adbe9e
commit
cf67e02135
8 changed files with 369 additions and 78 deletions
|
|
@ -17,10 +17,13 @@ resource "kubernetes_namespace" "nvidia" {
|
|||
}
|
||||
}
|
||||
|
||||
# Apply GPU taint to ensure only GPU workloads run on GPU node
|
||||
resource "null_resource" "gpu_node_taint" {
|
||||
# Apply GPU taint and label to ensure only GPU workloads run on GPU node
|
||||
resource "null_resource" "gpu_node_config" {
|
||||
provisioner "local-exec" {
|
||||
command = "kubectl taint nodes k8s-node1 nvidia.com/gpu=true:NoSchedule --overwrite"
|
||||
command = <<-EOT
|
||||
kubectl taint nodes k8s-node1 nvidia.com/gpu=true:NoSchedule --overwrite
|
||||
kubectl label nodes k8s-node1 gpu=true --overwrite
|
||||
EOT
|
||||
}
|
||||
|
||||
# Re-run if namespace changes (proxy for cluster changes)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue