[ci skip] Reduce node config drift: GPU label, OIDC idempotency, node-exporter, rebuild docs

- Add gpu=true label to Terraform (nvidia null_resource alongside taint)
- Improve API server OIDC config to detect value changes, not just flag presence
- Add policy_hash trigger to audit-policy so rule changes auto-reapply
- Enable prometheus-node-exporter sub-chart, delete unused Ansible playbook
- Document full node rebuild procedure in CLAUDE.md
- Save Talos Linux migration evaluation for future reference
This commit is contained in:
Viktor Barzin 2026-02-22 22:59:38 +00:00
parent ff66adbe9e
commit cf67e02135
No known key found for this signature in database
GPG key ID: 0EB088298288D958
8 changed files with 369 additions and 78 deletions

View file

@@ -101,8 +101,8 @@ alertmanager:
# web.external-url seems to be hardcoded, edited deployment manually
# extraArgs:
# web.external-url: "https://prometheus.viktorbarzin.me"
# prometheus-node-exporter:
# enabled: true
prometheus-node-exporter:
enabled: true
server:
# Enable me to delete metrics
extraFlags:

View file

@@ -17,10 +17,13 @@ resource "kubernetes_namespace" "nvidia" {
}
}
# Apply GPU taint to ensure only GPU workloads run on GPU node
resource "null_resource" "gpu_node_taint" {
# Apply GPU taint and label to ensure only GPU workloads run on GPU node
resource "null_resource" "gpu_node_config" {
provisioner "local-exec" {
command = "kubectl taint nodes k8s-node1 nvidia.com/gpu=true:NoSchedule --overwrite"
command = <<-EOT
kubectl taint nodes k8s-node1 nvidia.com/gpu=true:NoSchedule --overwrite
kubectl label nodes k8s-node1 gpu=true --overwrite
EOT
}
# Re-run if namespace changes (proxy for cluster changes)

View file

@@ -32,8 +32,11 @@ resource "null_resource" "apiserver_oidc_config" {
provisioner "remote-exec" {
inline = [
# Check if OIDC flags already present
"if grep -q 'oidc-issuer-url' /etc/kubernetes/manifests/kube-apiserver.yaml; then echo 'OIDC flags already configured'; exit 0; fi",
# Check if OIDC flags already configured with the correct values
"if grep -q 'oidc-issuer-url=${var.oidc_issuer_url}' /etc/kubernetes/manifests/kube-apiserver.yaml && grep -q 'oidc-client-id=${var.oidc_client_id}' /etc/kubernetes/manifests/kube-apiserver.yaml; then echo 'OIDC flags already configured with correct values'; exit 0; fi",
# Remove any existing OIDC flags (in case values changed)
"sudo sed -i '/--oidc-issuer-url/d; /--oidc-client-id/d; /--oidc-username-claim/d; /--oidc-groups-claim/d' /etc/kubernetes/manifests/kube-apiserver.yaml",
# Backup the manifest
"sudo cp /etc/kubernetes/manifests/kube-apiserver.yaml /etc/kubernetes/manifests/kube-apiserver.yaml.bak",

View file

@@ -88,7 +88,44 @@ resource "null_resource" "audit_policy" {
}
triggers = {
policy_version = "v1" # Bump to re-apply
policy_version = "v1" # Bump to force re-apply of manifest flags
policy_hash = sha256(yamlencode({
apiVersion = "audit.k8s.io/v1"
kind = "Policy"
rules = [
{
level = "None"
resources = [{
group = ""
resources = ["endpoints", "services", "services/status"]
}]
users = ["system:kube-proxy"]
},
{
level = "None"
verbs = ["watch"]
},
{
level = "None"
nonResourceURLs = ["/healthz*", "/readyz*", "/livez*"]
},
{
level = "Metadata"
resources = [{
group = ""
resources = ["secrets"]
}]
},
{
level = "RequestResponse"
verbs = ["create", "update", "patch", "delete"]
},
{
level = "Metadata"
verbs = ["get", "list"]
},
]
}))
}
depends_on = [null_resource.apiserver_oidc_config]