[ci skip] Reduce node config drift: GPU label, OIDC idempotency, node-exporter, rebuild docs
- Add gpu=true label to Terraform (nvidia null_resource alongside taint)
- Improve API server OIDC config to detect value changes, not just flag presence
- Add policy_hash trigger to audit-policy so rule changes auto-reapply
- Enable prometheus-node-exporter sub-chart, delete unused Ansible playbook
- Document full node rebuild procedure in CLAUDE.md
- Save Talos Linux migration evaluation for future reference
This commit is contained in:
parent
ff66adbe9e
commit
cf67e02135
8 changed files with 369 additions and 78 deletions
|
|
@ -101,8 +101,8 @@ alertmanager:
|
|||
# web.external-url seems to be hardcoded, edited deployment manually
|
||||
# extraArgs:
|
||||
# web.external-url: "https://prometheus.viktorbarzin.me"
|
||||
# prometheus-node-exporter:
|
||||
# enabled: true
|
||||
prometheus-node-exporter:
|
||||
enabled: true
|
||||
server:
|
||||
# Enable me to delete metrics
|
||||
extraFlags:
|
||||
|
|
|
|||
|
|
@ -17,10 +17,13 @@ resource "kubernetes_namespace" "nvidia" {
|
|||
}
|
||||
}
|
||||
|
||||
# Apply GPU taint to ensure only GPU workloads run on GPU node
|
||||
resource "null_resource" "gpu_node_taint" {
|
||||
# Apply GPU taint and label to ensure only GPU workloads run on GPU node
|
||||
resource "null_resource" "gpu_node_config" {
|
||||
provisioner "local-exec" {
|
||||
command = "kubectl taint nodes k8s-node1 nvidia.com/gpu=true:NoSchedule --overwrite"
|
||||
command = <<-EOT
|
||||
kubectl taint nodes k8s-node1 nvidia.com/gpu=true:NoSchedule --overwrite
|
||||
kubectl label nodes k8s-node1 gpu=true --overwrite
|
||||
EOT
|
||||
}
|
||||
|
||||
# Re-run if namespace changes (proxy for cluster changes)
|
||||
|
|
|
|||
|
|
@ -32,8 +32,11 @@ resource "null_resource" "apiserver_oidc_config" {
|
|||
|
||||
provisioner "remote-exec" {
|
||||
inline = [
|
||||
# Check if OIDC flags already present
|
||||
"if grep -q 'oidc-issuer-url' /etc/kubernetes/manifests/kube-apiserver.yaml; then echo 'OIDC flags already configured'; exit 0; fi",
|
||||
# Check if OIDC flags already configured with the correct values
|
||||
"if grep -q 'oidc-issuer-url=${var.oidc_issuer_url}' /etc/kubernetes/manifests/kube-apiserver.yaml && grep -q 'oidc-client-id=${var.oidc_client_id}' /etc/kubernetes/manifests/kube-apiserver.yaml; then echo 'OIDC flags already configured with correct values'; exit 0; fi",
|
||||
|
||||
# Remove any existing OIDC flags (in case values changed)
|
||||
"sudo sed -i '/--oidc-issuer-url/d; /--oidc-client-id/d; /--oidc-username-claim/d; /--oidc-groups-claim/d' /etc/kubernetes/manifests/kube-apiserver.yaml",
|
||||
|
||||
# Backup the manifest
|
||||
"sudo cp /etc/kubernetes/manifests/kube-apiserver.yaml /etc/kubernetes/manifests/kube-apiserver.yaml.bak",
|
||||
|
|
|
|||
|
|
@ -88,7 +88,44 @@ resource "null_resource" "audit_policy" {
|
|||
}
|
||||
|
||||
triggers = {
|
||||
policy_version = "v1" # Bump to re-apply
|
||||
policy_version = "v1" # Bump to force re-apply of manifest flags
|
||||
policy_hash = sha256(yamlencode({
|
||||
apiVersion = "audit.k8s.io/v1"
|
||||
kind = "Policy"
|
||||
rules = [
|
||||
{
|
||||
level = "None"
|
||||
resources = [{
|
||||
group = ""
|
||||
resources = ["endpoints", "services", "services/status"]
|
||||
}]
|
||||
users = ["system:kube-proxy"]
|
||||
},
|
||||
{
|
||||
level = "None"
|
||||
verbs = ["watch"]
|
||||
},
|
||||
{
|
||||
level = "None"
|
||||
nonResourceURLs = ["/healthz*", "/readyz*", "/livez*"]
|
||||
},
|
||||
{
|
||||
level = "Metadata"
|
||||
resources = [{
|
||||
group = ""
|
||||
resources = ["secrets"]
|
||||
}]
|
||||
},
|
||||
{
|
||||
level = "RequestResponse"
|
||||
verbs = ["create", "update", "patch", "delete"]
|
||||
},
|
||||
{
|
||||
level = "Metadata"
|
||||
verbs = ["get", "list"]
|
||||
},
|
||||
]
|
||||
}))
|
||||
}
|
||||
|
||||
depends_on = [null_resource.apiserver_oidc_config]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue