extract monitoring, nvidia, mailserver, cloudflared, kyverno from platform [ci skip]
Phase 2 of platform stack split. 5 more modules extracted into independent stacks. All applied successfully with zero destroys. Cloudflared now reads k8s_users from Vault directly to compute user_domains. Woodpecker pipeline runs all 8 extracted stacks in parallel. Memory bumped to 6Gi for 9 concurrent TF processes. Platform reduced from 27 to 19 modules.
This commit is contained in:
parent
3c804aedf8
commit
ae36dc253b
73 changed files with 166093 additions and 96 deletions
220
stacks/monitoring/modules/monitoring/loki.tf
Normal file
220
stacks/monitoring/modules/monitoring/loki.tf
Normal file
|
|
@ -0,0 +1,220 @@
|
|||
# Required module input (string, no default).
# NOTE(review): not referenced by any resource visible in this file — presumably
# consumed by another file in this module; confirm before removing.
variable "nfs_server" {
  type = string
}
|
||||
|
||||
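# LOKI DISABLED - To re-enable centralized logging, uncomment this block AND the kubernetes_config_map.loki_alert_rules block further down (this release depends_on it)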
|
||||
# Disabled after the node2 incident: the operational overhead was judged to outweigh the benefit
|
||||
# All configuration preserved in loki.yaml for future re-enabling
|
||||
/*
|
||||
resource "helm_release" "loki" {
|
||||
namespace = kubernetes_namespace.monitoring.metadata[0].name
|
||||
create_namespace = true
|
||||
name = "loki"
|
||||
|
||||
repository = "https://grafana.github.io/helm-charts"
|
||||
chart = "loki"
|
||||
|
||||
values = [templatefile("${path.module}/loki.yaml", {})]
|
||||
timeout = 600
|
||||
|
||||
depends_on = [kubernetes_config_map.loki_alert_rules]
|
||||
}
|
||||
*/
|
||||
|
||||
# ALLOY DISABLED - Log collection agents (depends on Loki)
|
||||
# https://grafana.com/docs/alloy/latest/configure/kubernetes/
|
||||
# Configuration preserved in alloy.yaml for future re-enabling
|
||||
/*
|
||||
resource "helm_release" "alloy" {
|
||||
namespace = kubernetes_namespace.monitoring.metadata[0].name
|
||||
create_namespace = true
|
||||
name = "alloy"
|
||||
|
||||
repository = "https://grafana.github.io/helm-charts"
|
||||
chart = "alloy"
|
||||
|
||||
values = [file("${path.module}/alloy.yaml")]
|
||||
atomic = true
|
||||
|
||||
depends_on = [helm_release.loki]
|
||||
}
|
||||
*/
|
||||
|
||||
# SYSCTL INOTIFY DISABLED - Was specifically for Loki file watching requirements
|
||||
# Can be re-enabled when Loki is restored
|
||||
/*
|
||||
resource "kubernetes_daemon_set_v1" "sysctl-inotify" {
|
||||
metadata {
|
||||
name = "sysctl-inotify"
|
||||
namespace = kubernetes_namespace.monitoring.metadata[0].name
|
||||
labels = {
|
||||
app = "sysctl-inotify"
|
||||
}
|
||||
}
|
||||
spec {
|
||||
selector {
|
||||
match_labels = {
|
||||
app = "sysctl-inotify"
|
||||
}
|
||||
}
|
||||
template {
|
||||
metadata {
|
||||
labels = {
|
||||
app = "sysctl-inotify"
|
||||
}
|
||||
}
|
||||
spec {
|
||||
init_container {
|
||||
name = "sysctl"
|
||||
image = "busybox:1.37"
|
||||
command = [
|
||||
"sh", "-c",
|
||||
"sysctl -w fs.inotify.max_user_watches=1048576 && sysctl -w fs.inotify.max_user_instances=8192 && sysctl -w fs.inotify.max_queued_events=1048576"
|
||||
]
|
||||
security_context {
|
||||
privileged = true
|
||||
}
|
||||
}
|
||||
container {
|
||||
name = "pause"
|
||||
image = "registry.k8s.io/pause:3.10"
|
||||
resources {
|
||||
requests = {
|
||||
cpu = "1m"
|
||||
memory = "4Mi"
|
||||
}
|
||||
limits = {
|
||||
cpu = "1m"
|
||||
memory = "4Mi"
|
||||
}
|
||||
}
|
||||
}
|
||||
host_pid = true
|
||||
toleration {
|
||||
operator = "Exists"
|
||||
}
|
||||
dns_config {
|
||||
option {
|
||||
name = "ndots"
|
||||
value = "2"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
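# K8S-MONITORING DISABLED - no reason recorded here; TODO(review): document why this is commented out, or remove it
# resource "helm_release" "k8s-monitoring" {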
|
||||
# namespace = kubernetes_namespace.monitoring.metadata[0].name
|
||||
# create_namespace = true
|
||||
# name = "k8s-monitoring"
|
||||
|
||||
# repository = "https://grafana.github.io/helm-charts"
|
||||
# chart = "k8s-monitoring"
|
||||
|
||||
# values = [templatefile("${path.module}/k8s-monitoring-values.yaml", {})]
|
||||
# atomic = true
|
||||
# }
|
||||
|
||||
# LOKI ALERT RULES DISABLED - Depend on Loki log queries
|
||||
# These alert on kernel events from systemd journal logs via Loki
|
||||
# Can be re-enabled when Loki is restored
|
||||
/*
|
||||
resource "kubernetes_config_map" "loki_alert_rules" {
|
||||
metadata {
|
||||
name = "loki-alert-rules"
|
||||
namespace = kubernetes_namespace.monitoring.metadata[0].name
|
||||
}
|
||||
data = {
|
||||
"rules.yaml" = yamlencode({
|
||||
groups = [
|
||||
{
|
||||
name = "Node Health"
|
||||
rules = [
|
||||
{
|
||||
alert = "KernelOOMKiller"
|
||||
expr = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)Out of memory.*Killed process\" [5m])) > 0"
|
||||
for = "0m"
|
||||
labels = {
|
||||
severity = "critical"
|
||||
}
|
||||
annotations = {
|
||||
summary = "OOM killer active on {{ $labels.node }}"
|
||||
}
|
||||
},
|
||||
{
|
||||
alert = "KernelPanic"
|
||||
expr = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)Kernel panic\" [5m])) > 0"
|
||||
for = "0m"
|
||||
labels = {
|
||||
severity = "critical"
|
||||
}
|
||||
annotations = {
|
||||
summary = "Kernel panic on {{ $labels.node }}"
|
||||
}
|
||||
},
|
||||
{
|
||||
alert = "KernelHungTask"
|
||||
expr = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"blocked for more than\" [5m])) > 0"
|
||||
for = "0m"
|
||||
labels = {
|
||||
severity = "warning"
|
||||
}
|
||||
annotations = {
|
||||
summary = "Hung task detected on {{ $labels.node }}"
|
||||
}
|
||||
},
|
||||
{
|
||||
alert = "KernelSoftLockup"
|
||||
expr = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)soft lockup\" [5m])) > 0"
|
||||
for = "0m"
|
||||
labels = {
|
||||
severity = "critical"
|
||||
}
|
||||
annotations = {
|
||||
summary = "Soft lockup on {{ $labels.node }}"
|
||||
}
|
||||
},
|
||||
{
|
||||
alert = "ContainerdDown"
|
||||
expr = "sum by (node) (count_over_time({job=\"node-journal\", unit=\"containerd.service\"} |~ \"(?i)(dead|failed|deactivating)\" [5m])) > 0"
|
||||
for = "1m"
|
||||
labels = {
|
||||
severity = "critical"
|
||||
}
|
||||
annotations = {
|
||||
summary = "containerd service unhealthy on {{ $labels.node }}"
|
||||
}
|
||||
},
|
||||
]
|
||||
}
|
||||
]
|
||||
})
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
# GRAFANA LOKI DATASOURCE DISABLED - Points to non-existent Loki service
|
||||
# Can be re-enabled when Loki is restored
|
||||
/*
|
||||
resource "kubernetes_config_map" "grafana_loki_datasource" {
|
||||
metadata {
|
||||
name = "grafana-loki-datasource"
|
||||
namespace = kubernetes_namespace.monitoring.metadata[0].name
|
||||
labels = {
|
||||
grafana_datasource = "1"
|
||||
}
|
||||
}
|
||||
data = {
|
||||
"loki-datasource.yaml" = yamlencode({
|
||||
apiVersion = 1
|
||||
datasources = [{
|
||||
name = "Loki"
|
||||
type = "loki"
|
||||
access = "proxy"
|
||||
url = "http://loki.monitoring.svc.cluster.local:3100"
|
||||
isDefault = false
|
||||
}]
|
||||
})
|
||||
}
|
||||
}
|
||||
*/
|
||||
Loading…
Add table
Add a link
Reference in a new issue