# Input variable supplied by the caller of this module.
# NOTE(review): not referenced anywhere in the visible part of this file;
# presumably consumed by another file in the module - confirm before removing.
variable "nfs_server" {
  type = string
}

# -----------------------------------------------------------------------------
# Everything below is intentionally disabled (commented out).
#
# Re-enable order, derived from the depends_on references in the blocks below:
#   1. kubernetes_config_map.loki_alert_rules  (helm_release.loki depends on it)
#   2. helm_release.loki                       (helm_release.alloy depends on it)
#   3. helm_release.alloy, sysctl-inotify DaemonSet, Grafana Loki datasource
# Uncommenting a resource without the resource named in its depends_on will
# fail with an undeclared-resource reference.
# -----------------------------------------------------------------------------

# LOKI DISABLED - Uncomment to re-enable centralized logging
# Disabled due to operational overhead vs benefit analysis after node2 incident
# All configuration preserved in loki.yaml for future re-enabling
/*
resource "helm_release" "loki" {
  namespace        = kubernetes_namespace.monitoring.metadata[0].name
  create_namespace = true

  name       = "loki"
  repository = "https://grafana.github.io/helm-charts"
  chart      = "loki"
  values     = [templatefile("${path.module}/loki.yaml", {})]
  timeout    = 600
  depends_on = [kubernetes_config_map.loki_alert_rules]
}
*/

# ALLOY DISABLED - Log collection agents (depends on Loki)
# https://grafana.com/docs/alloy/latest/configure/kubernetes/
# Configuration preserved in alloy.yaml for future re-enabling
/*
resource "helm_release" "alloy" {
  namespace        = kubernetes_namespace.monitoring.metadata[0].name
  create_namespace = true

  name       = "alloy"
  repository = "https://grafana.github.io/helm-charts"
  chart      = "alloy"
  values     = [file("${path.module}/alloy.yaml")]
  atomic     = true
  depends_on = [helm_release.loki]
}
*/

# SYSCTL INOTIFY DISABLED - Was specifically for Loki file watching requirements
# Can be re-enabled when Loki is restored
/*
resource "kubernetes_daemon_set_v1" "sysctl-inotify" {
  metadata {
    name      = "sysctl-inotify"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    labels = {
      app = "sysctl-inotify"
    }
  }

  spec {
    selector {
      match_labels = {
        app = "sysctl-inotify"
      }
    }

    template {
      metadata {
        labels = {
          app = "sysctl-inotify"
        }
      }

      spec {
        # Privileged init container raises the inotify limits via sysctl,
        # then exits; the pause container below only keeps the pod alive.
        init_container {
          name  = "sysctl"
          image = "busybox:1.37"
          command = [
            "sh",
            "-c",
            "sysctl -w fs.inotify.max_user_watches=1048576 && sysctl -w fs.inotify.max_user_instances=8192 && sysctl -w fs.inotify.max_queued_events=1048576"
          ]
          security_context {
            privileged = true
          }
        }

        container {
          name  = "pause"
          image = "registry.k8s.io/pause:3.10"
          resources {
            requests = {
              cpu    = "1m"
              memory = "4Mi"
            }
            limits = {
              cpu    = "1m"
              memory = "4Mi"
            }
          }
        }

        host_pid = true

        # Tolerate every taint so the pod is scheduled on all nodes.
        toleration {
          operator = "Exists"
        }

        dns_config {
          option {
            name  = "ndots"
            value = "2"
          }
        }
      }
    }
  }
}
*/

# NOTE(review): the k8s-monitoring release below is commented out with no
# recorded reason in this file - confirm whether it should be kept or deleted.
# resource "helm_release" "k8s-monitoring" {
#   namespace        = kubernetes_namespace.monitoring.metadata[0].name
#   create_namespace = true
#   name             = "k8s-monitoring"
#   repository       = "https://grafana.github.io/helm-charts"
#   chart            = "k8s-monitoring"
#   values           = [templatefile("${path.module}/k8s-monitoring-values.yaml", {})]
#   atomic           = true
# }

# LOKI ALERT RULES DISABLED - Depend on Loki log queries
# These alert on kernel events from systemd journal logs via Loki
# Can be re-enabled when Loki is restored
/*
resource "kubernetes_config_map" "loki_alert_rules" {
  metadata {
    name      = "loki-alert-rules"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
  }

  data = {
    "rules.yaml" = yamlencode({
      groups = [
        {
          name = "Node Health"
          rules = [
            {
              alert = "KernelOOMKiller"
              expr  = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)Out of memory.*Killed process\" [5m])) > 0"
              for   = "0m"
              labels = {
                severity = "critical"
              }
              annotations = {
                summary = "OOM killer active on {{ $labels.node }}"
              }
            },
            {
              alert = "KernelPanic"
              expr  = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)Kernel panic\" [5m])) > 0"
              for   = "0m"
              labels = {
                severity = "critical"
              }
              annotations = {
                summary = "Kernel panic on {{ $labels.node }}"
              }
            },
            {
              alert = "KernelHungTask"
              expr  = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"blocked for more than\" [5m])) > 0"
              for   = "0m"
              labels = {
                severity = "warning"
              }
              annotations = {
                summary = "Hung task detected on {{ $labels.node }}"
              }
            },
            {
              alert = "KernelSoftLockup"
              expr  = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)soft lockup\" [5m])) > 0"
              for   = "0m"
              labels = {
                severity = "critical"
              }
              annotations = {
                summary = "Soft lockup on {{ $labels.node }}"
              }
            },
            {
              alert = "ContainerdDown"
              expr  = "sum by (node) (count_over_time({job=\"node-journal\", unit=\"containerd.service\"} |~ \"(?i)(dead|failed|deactivating)\" [5m])) > 0"
              for   = "1m"
              labels = {
                severity = "critical"
              }
              annotations = {
                summary = "containerd service unhealthy on {{ $labels.node }}"
              }
            },
          ]
        }
      ]
    })
  }
}
*/

# GRAFANA LOKI DATASOURCE DISABLED - Points to non-existent Loki service
# Can be re-enabled when Loki is restored
/*
resource "kubernetes_config_map" "grafana_loki_datasource" {
  metadata {
    name      = "grafana-loki-datasource"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    labels = {
      grafana_datasource = "1"
    }
  }

  data = {
    "loki-datasource.yaml" = yamlencode({
      apiVersion = 1
      datasources = [{
        name      = "Loki"
        type      = "loki"
        access    = "proxy"
        url       = "http://loki.monitoring.svc.cluster.local:3100"
        isDefault = false
      }]
    })
  }
}
*/