infra/stacks/monitoring/modules/monitoring/loki.tf

# NFS server for this module (presumably a hostname or IP - confirm against the caller).
# Required input: no default is declared, so the calling stack must supply it.
# NOTE(review): no resource visible in this file references var.nfs_server; confirm
# it is used elsewhere in the module before considering removal.
variable "nfs_server" {
  type = string
}

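# LOKI DISABLED - Uncomment this block to re-enable centralized logging.
# Disabled following an operational-overhead-vs-benefit analysis after the node2 incident.
# The Helm values are preserved in loki.yaml for future re-enabling.
# NOTE: this release has depends_on = [kubernetes_config_map.loki_alert_rules], so that
# (also commented-out) ConfigMap further down must be uncommented at the same time.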
/*
resource "helm_release" "loki" {
  namespace        = kubernetes_namespace.monitoring.metadata[0].name
  create_namespace = true
  name             = "loki"

  repository = "https://grafana.github.io/helm-charts"
  chart      = "loki"

  values  = [templatefile("${path.module}/loki.yaml", {})]
  timeout = 600

  depends_on = [kubernetes_config_map.loki_alert_rules]
}
*/

# ALLOY DISABLED - Log collection agents (depends on Loki)
# https://grafana.com/docs/alloy/latest/configure/kubernetes/
# Configuration preserved in alloy.yaml for future re-enabling
/*
resource "helm_release" "alloy" {
  namespace        = kubernetes_namespace.monitoring.metadata[0].name
  create_namespace = true
  name             = "alloy"

  repository = "https://grafana.github.io/helm-charts"
  chart      = "alloy"

  values = [file("${path.module}/alloy.yaml")]
  atomic = true

  depends_on = [helm_release.loki]
}
*/

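# SYSCTL INOTIFY DISABLED - Existed specifically for Loki's file-watching requirements.
# The DaemonSet sets the fs.inotify.* sysctl limits from a privileged init container,
# then idles on a pause container.
# Can be re-enabled when Loki is restored.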
/*
resource "kubernetes_daemon_set_v1" "sysctl-inotify" {
  metadata {
    name      = "sysctl-inotify"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    labels = {
      app = "sysctl-inotify"
    }
  }
  spec {
    selector {
      match_labels = {
        app = "sysctl-inotify"
      }
    }
    template {
      metadata {
        labels = {
          app = "sysctl-inotify"
        }
      }
      spec {
        init_container {
          name  = "sysctl"
          image = "busybox:1.37"
          command = [
            "sh", "-c",
            "sysctl -w fs.inotify.max_user_watches=1048576 && sysctl -w fs.inotify.max_user_instances=8192 && sysctl -w fs.inotify.max_queued_events=1048576"
          ]
          security_context {
            privileged = true
          }
        }
        container {
          name  = "pause"
          image = "registry.k8s.io/pause:3.10"
          resources {
            requests = {
              cpu    = "1m"
              memory = "4Mi"
            }
            limits = {
              cpu    = "1m"
              memory = "4Mi"
            }
          }
        }
        host_pid = true
        toleration {
          operator = "Exists"
        }
        dns_config {
          option {
            name  = "ndots"
            value = "2"
          }
        }
      }
    }
  }
  lifecycle {
    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
    ignore_changes = [spec[0].template[0].spec[0].dns_config]
  }
}
*/

# K8S-MONITORING DISABLED - Grafana k8s-monitoring Helm chart (reason for disabling is not recorded here)
# resource "helm_release" "k8s-monitoring" {
#   namespace        = kubernetes_namespace.monitoring.metadata[0].name
#   create_namespace = true
#   name             = "k8s-monitoring"
#
#   repository = "https://grafana.github.io/helm-charts"
#   chart      = "k8s-monitoring"
#
#   values = [templatefile("${path.module}/k8s-monitoring-values.yaml", {})]
#   atomic = true
# }

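# LOKI ALERT RULES DISABLED - Depend on Loki log queries
# These alert on kernel events (OOM kills, panics, hung tasks, soft lockups) and on
# containerd service failures, matched in systemd journal logs (job="node-journal") via Loki.
# helm_release.loki lists this ConfigMap in depends_on, so restore both together.
# Can be re-enabled when Loki is restored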
/*
resource "kubernetes_config_map" "loki_alert_rules" {
  metadata {
    name      = "loki-alert-rules"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
  }
  data = {
    "rules.yaml" = yamlencode({
      groups = [
        {
          name = "Node Health"
          rules = [
            {
              alert = "KernelOOMKiller"
              expr  = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)Out of memory.*Killed process\" [5m])) > 0"
              for   = "0m"
              labels = {
                severity = "critical"
              }
              annotations = {
                summary = "OOM killer active on {{ $labels.node }}"
              }
            },
            {
              alert = "KernelPanic"
              expr  = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)Kernel panic\" [5m])) > 0"
              for   = "0m"
              labels = {
                severity = "critical"
              }
              annotations = {
                summary = "Kernel panic on {{ $labels.node }}"
              }
            },
            {
              alert = "KernelHungTask"
              expr  = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"blocked for more than\" [5m])) > 0"
              for   = "0m"
              labels = {
                severity = "warning"
              }
              annotations = {
                summary = "Hung task detected on {{ $labels.node }}"
              }
            },
            {
              alert = "KernelSoftLockup"
              expr  = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)soft lockup\" [5m])) > 0"
              for   = "0m"
              labels = {
                severity = "critical"
              }
              annotations = {
                summary = "Soft lockup on {{ $labels.node }}"
              }
            },
            {
              alert = "ContainerdDown"
              expr  = "sum by (node) (count_over_time({job=\"node-journal\", unit=\"containerd.service\"} |~ \"(?i)(dead|failed|deactivating)\" [5m])) > 0"
              for   = "1m"
              labels = {
                severity = "critical"
              }
              annotations = {
                summary = "containerd service unhealthy on {{ $labels.node }}"
              }
            },
          ]
        }
      ]
    })
  }
}
*/

# GRAFANA LOKI DATASOURCE DISABLED - Points to the Loki service
# (http://loki.monitoring.svc.cluster.local:3100), which does not exist while Loki is disabled.
# Can be re-enabled when Loki is restored
/*
resource "kubernetes_config_map" "grafana_loki_datasource" {
  metadata {
    name      = "grafana-loki-datasource"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    labels = {
      grafana_datasource = "1"
    }
  }
  data = {
    "loki-datasource.yaml" = yamlencode({
      apiVersion = 1
      datasources = [{
        name      = "Loki"
        type      = "loki"
        access    = "proxy"
        url       = "http://loki.monitoring.svc.cluster.local:3100"
        isDefault = false
      }]
    })
  }
}
*/
extract monitoring, nvidia, mailserver, cloudflared, kyverno from platform [ci skip] Phase 2 of platform stack split. 5 more modules extracted into independent stacks. All applied successfully with zero destroys. Cloudflared now reads k8s_users from Vault directly to compute user_domains. Woodpecker pipeline runs all 8 extracted stacks in parallel. Memory bumped to 6Gi for 9 concurrent TF processes. Platform reduced from 27 to 19 modules. 2026-03-17 21:34:11 +00:00			`variable "nfs_server" { type = string }`

			`# LOKI DISABLED - Uncomment to re-enable centralized logging`
			`# Disabled due to operational overhead vs benefit analysis after node2 incident`
			`# All configuration preserved in loki.yaml for future re-enabling`
			`/*`
			`resource "helm_release" "loki" {`
			`namespace = kubernetes_namespace.monitoring.metadata[0].name`
			`create_namespace = true`
			`name = "loki"`

			`repository = "https://grafana.github.io/helm-charts"`
			`chart = "loki"`

			`values = [templatefile("${path.module}/loki.yaml", {})]`
			`timeout = 600`

			`depends_on = [kubernetes_config_map.loki_alert_rules]`
			`}`
			`*/`

			`# ALLOY DISABLED - Log collection agents (depends on Loki)`
			`# https://grafana.com/docs/alloy/latest/configure/kubernetes/`
			`# Configuration preserved in alloy.yaml for future re-enabling`
			`/*`
			`resource "helm_release" "alloy" {`
			`namespace = kubernetes_namespace.monitoring.metadata[0].name`
			`create_namespace = true`
			`name = "alloy"`

			`repository = "https://grafana.github.io/helm-charts"`
			`chart = "alloy"`

			`values = [file("${path.module}/alloy.yaml")]`
			`atomic = true`

			`depends_on = [helm_release.loki]`
			`}`
			`*/`

			`# SYSCTL INOTIFY DISABLED - Was specifically for Loki file watching requirements`
			`# Can be re-enabled when Loki is restored`
			`/*`
			`resource "kubernetes_daemon_set_v1" "sysctl-inotify" {`
			`metadata {`
			`name = "sysctl-inotify"`
			`namespace = kubernetes_namespace.monitoring.metadata[0].name`
			`labels = {`
			`app = "sysctl-inotify"`
			`}`
			`}`
			`spec {`
			`selector {`
			`match_labels = {`
			`app = "sysctl-inotify"`
			`}`
			`}`
			`template {`
			`metadata {`
			`labels = {`
			`app = "sysctl-inotify"`
			`}`
			`}`
			`spec {`
			`init_container {`
			`name = "sysctl"`
			`image = "busybox:1.37"`
			`command = [`
			`"sh", "-c",`
			`"sysctl -w fs.inotify.max_user_watches=1048576 && sysctl -w fs.inotify.max_user_instances=8192 && sysctl -w fs.inotify.max_queued_events=1048576"`
			`]`
			`security_context {`
			`privileged = true`
			`}`
			`}`
			`container {`
			`name = "pause"`
			`image = "registry.k8s.io/pause:3.10"`
			`resources {`
			`requests = {`
			`cpu = "1m"`
			`memory = "4Mi"`
			`}`
			`limits = {`
			`cpu = "1m"`
			`memory = "4Mi"`
			`}`
			`}`
			`}`
			`host_pid = true`
			`toleration {`
			`operator = "Exists"`
			`}`
			`dns_config {`
			`option {`
			`name = "ndots"`
			`value = "2"`
			`}`
			`}`
			`}`
			`}`
			`}`
[infra] Sweep dns_config ignore_changes across all pod-owning resources [ci skip] ## Context Wave 3A (commit c9d221d5) added the `# KYVERNO_LIFECYCLE_V1` marker to the 27 pre-existing `ignore_changes = [...dns_config]` sites so they could be grepped and audited. It did NOT address pod-owning resources that were simply missing the suppression entirely. Post-Wave-3A sampling (2026-04-18) found that navidrome, f1-stream, frigate, servarr, monitoring, crowdsec, and many other stacks showed perpetual `dns_config` drift every plan because their `kubernetes_deployment` / `kubernetes_stateful_set` / `kubernetes_cron_job_v1` resources had no `lifecycle {}` block at all. Root cause (same as Wave 3A): Kyverno's admission webhook stamps `dns_config { option { name = "ndots"; value = "2" } }` on every pod's `spec.template.spec.dns_config` to prevent NxDomain search-domain flooding (see `k8s-ndots-search-domain-nxdomain-flood` skill). Without `ignore_changes` on every Terraform-managed pod-owner, Terraform repeatedly tries to strip the injected field. ## This change Extends the Wave 3A convention by sweeping EVERY `kubernetes_deployment`, `kubernetes_stateful_set`, `kubernetes_daemon_set`, `kubernetes_cron_job_v1`, `kubernetes_job_v1` (+ their `_v1` variants) in the repo and ensuring each carries the right `ignore_changes` path: - kubernetes_deployment / stateful_set / daemon_set / job_v1: `spec[0].template[0].spec[0].dns_config` - kubernetes_cron_job_v1: `spec[0].job_template[0].spec[0].template[0].spec[0].dns_config` (extra `job_template[0]` nesting — the CronJob's PodTemplateSpec is one level deeper) Each injection / extension is tagged `# KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2` inline so the suppression is discoverable via `rg 'KYVERNO_LIFECYCLE_V1' stacks/`. Two insertion paths are handled by a Python pass (`/tmp/add_dns_config_ignore.py`): 1. 
No existing `lifecycle {}`: inject a brand-new block just before the resource's closing `}`. 108 new blocks on 93 files. 2. Existing `lifecycle {}` (usually for `DRIFT_WORKAROUND: CI owns image tag` from Wave 4, commit a62b43d1): extend its `ignore_changes` list with the dns_config path. Handles both inline (`= [x]`) and multiline (`= [\n x,\n]`) forms; ensures the last pre-existing list item carries a trailing comma so the extended list is valid HCL. 34 extensions. The script skips anything already mentioning `dns_config` inside an `ignore_changes`, so re-running is a no-op. ## Scale - 142 total lifecycle injections/extensions - 93 `.tf` files touched - 108 brand-new `lifecycle {}` blocks + 34 extensions of existing ones - Every Tier 0 and Tier 1 stack with a pod-owning resource is covered - Together with Wave 3A's 27 pre-existing markers → 169 greppable `KYVERNO_LIFECYCLE_V1` dns_config sites across the repo ## What is NOT in this change - `stacks/trading-bot/main.tf` — entirely commented-out block (`/* … */`). Python script touched the file, reverted manually. - `_template/main.tf.example` skeleton — kept minimal on purpose; any future stack created from it should either inherit the Wave 3A one-line form or add its own on first `kubernetes_deployment`. - `terraform fmt` fixes to pre-existing alignment issues in meshcentral, nvidia/modules/nvidia, vault — unrelated to this commit. Left for a separate fmt-only pass. - Non-pod resources (`kubernetes_service`, `kubernetes_secret`, `kubernetes_manifest`, etc.) — they don't own pods so they don't get Kyverno dns_config mutation. ## Verification Random sample post-commit: ``` $ cd stacks/navidrome && ../../scripts/tg plan → No changes. $ cd stacks/f1-stream && ../../scripts/tg plan → No changes. $ cd stacks/frigate && ../../scripts/tg plan → No changes. $ rg -c 'KYVERNO_LIFECYCLE_V1' stacks/ --include='*.tf' --include='*.tf.example' \ | awk -F: '{s+=$2} END {print s}' 169 ``` ## Reproduce locally 1. `git pull` 2. 
`rg 'KYVERNO_LIFECYCLE_V1' stacks/ | wc -l` → 169+ 3. `cd stacks/navidrome && ../../scripts/tg plan` → expect 0 drift on the deployment's dns_config field. Refs: code-seq (Wave 3B dns_config class closed; kubernetes_manifest annotation class handled separately in 8d94688d for tls_secret) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> 2026-04-18 21:19:48 +00:00			`lifecycle {`
			`# KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2`
			`ignore_changes = [spec[0].template[0].spec[0].dns_config]`
			`}`
extract monitoring, nvidia, mailserver, cloudflared, kyverno from platform [ci skip] Phase 2 of platform stack split. 5 more modules extracted into independent stacks. All applied successfully with zero destroys. Cloudflared now reads k8s_users from Vault directly to compute user_domains. Woodpecker pipeline runs all 8 extracted stacks in parallel. Memory bumped to 6Gi for 9 concurrent TF processes. Platform reduced from 27 to 19 modules. 2026-03-17 21:34:11 +00:00			`}`
			`*/`

			`# resource "helm_release" "k8s-monitoring" {`
			`# namespace = kubernetes_namespace.monitoring.metadata[0].name`
			`# create_namespace = true`
			`# name = "k8s-monitoring"`

			`# repository = "https://grafana.github.io/helm-charts"`
			`# chart = "k8s-monitoring"`

			`# values = [templatefile("${path.module}/k8s-monitoring-values.yaml", {})]`
			`# atomic = true`
			`# }`

			`# LOKI ALERT RULES DISABLED - Depend on Loki log queries`
			`# These alert on kernel events from systemd journal logs via Loki`
			`# Can be re-enabled when Loki is restored`
			`/*`
			`resource "kubernetes_config_map" "loki_alert_rules" {`
			`metadata {`
			`name = "loki-alert-rules"`
			`namespace = kubernetes_namespace.monitoring.metadata[0].name`
			`}`
			`data = {`
			`"rules.yaml" = yamlencode({`
			`groups = [`
			`{`
			`name = "Node Health"`
			`rules = [`
			`{`
			`alert = "KernelOOMKiller"`
			`expr = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)Out of memory.*Killed process\" [5m])) > 0"`
			`for = "0m"`
			`labels = {`
			`severity = "critical"`
			`}`
			`annotations = {`
			`summary = "OOM killer active on {{ $labels.node }}"`
			`}`
			`},`
			`{`
			`alert = "KernelPanic"`
			`expr = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)Kernel panic\" [5m])) > 0"`
			`for = "0m"`
			`labels = {`
			`severity = "critical"`
			`}`
			`annotations = {`
			`summary = "Kernel panic on {{ $labels.node }}"`
			`}`
			`},`
			`{`
			`alert = "KernelHungTask"`
			`expr = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"blocked for more than\" [5m])) > 0"`
			`for = "0m"`
			`labels = {`
			`severity = "warning"`
			`}`
			`annotations = {`
			`summary = "Hung task detected on {{ $labels.node }}"`
			`}`
			`},`
			`{`
			`alert = "KernelSoftLockup"`
			`expr = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)soft lockup\" [5m])) > 0"`
			`for = "0m"`
			`labels = {`
			`severity = "critical"`
			`}`
			`annotations = {`
			`summary = "Soft lockup on {{ $labels.node }}"`
			`}`
			`},`
			`{`
			`alert = "ContainerdDown"`
			`expr = "sum by (node) (count_over_time({job=\"node-journal\", unit=\"containerd.service\"} |~ \"(?i)(dead|failed|deactivating)\" [5m])) > 0"`
			`for = "1m"`
			`labels = {`
			`severity = "critical"`
			`}`
			`annotations = {`
			`summary = "containerd service unhealthy on {{ $labels.node }}"`
			`}`
			`},`
			`]`
			`}`
			`]`
			`})`
			`}`
			`}`
			`*/`

			`# GRAFANA LOKI DATASOURCE DISABLED - Points to non-existent Loki service`
			`# Can be re-enabled when Loki is restored`
			`/*`
			`resource "kubernetes_config_map" "grafana_loki_datasource" {`
			`metadata {`
			`name = "grafana-loki-datasource"`
			`namespace = kubernetes_namespace.monitoring.metadata[0].name`
			`labels = {`
			`grafana_datasource = "1"`
			`}`
			`}`
			`data = {`
			`"loki-datasource.yaml" = yamlencode({`
			`apiVersion = 1`
			`datasources = [{`
			`name = "Loki"`
			`type = "loki"`
			`access = "proxy"`
			`url = "http://loki.monitoring.svc.cluster.local:3100"`
			`isDefault = false`
			`}]`
			`})`
			`}`
			`}`
			`*/`