diff --git a/stacks/monitoring/modules/monitoring/alloy.yaml b/stacks/monitoring/modules/monitoring/alloy.yaml index 900cbb25..59b84fdb 100644 --- a/stacks/monitoring/modules/monitoring/alloy.yaml +++ b/stacks/monitoring/modules/monitoring/alloy.yaml @@ -1,4 +1,18 @@ alloy: + # Resource limits for the alloy container itself. + # Must be under `alloy.resources` (NOT `controller.resources`) — the chart + # only maps THIS key onto the alloy container. Without it, the container gets + # `resources: {}` and inherits Kyverno LimitRange `tier-defaults` (256Mi), + # which is below Alloy's 400-450Mi steady state and caused page-cache + # thrashing → 185 MB/s sdc reads → host IO saturation (2026-05-26). + # Burstable QoS (request < limit) — workers are at 97-99% memory-request + # saturation; a 1Gi request blocks scheduling on node2/node3. + resources: + requests: + cpu: 50m + memory: 512Mi + limits: + memory: 1Gi configMap: content: |- // Write your Alloy config here: @@ -183,6 +197,14 @@ alloy: readOnly: true controller: + # Bump maxUnavailable above the DaemonSet default (1) so a 5-node DS finishes its + # rolling update inside the helm_release timeout. Log shipper tolerates the + # brief gap. + updateStrategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 50% + volumes: extra: - name: journal-run @@ -206,13 +228,3 @@ controller: operator: "Exists" effect: "NoSchedule" - # Resource limits for DaemonSet pods - # Alloy tails logs from all containers on the node via K8s API and batches - # them to Loki. Memory scales with number of active log streams (~30-50 per node). - # 128Mi was OOMKilled; steady-state usage is ~400-450Mi per pod. 
- resources: - requests: - cpu: 50m - memory: 512Mi - limits: - memory: 1Gi diff --git a/stacks/monitoring/modules/monitoring/loki.tf b/stacks/monitoring/modules/monitoring/loki.tf index 72aa4da2..0b5f3d45 100644 --- a/stacks/monitoring/modules/monitoring/loki.tf +++ b/stacks/monitoring/modules/monitoring/loki.tf @@ -28,8 +28,9 @@ resource "helm_release" "alloy" { repository = "https://grafana.github.io/helm-charts" chart = "alloy" - values = [file("${path.module}/alloy.yaml")] - atomic = true + values = [file("${path.module}/alloy.yaml")] + atomic = true + timeout = 900 # 5-pod DS rolling update + occasional runc-stuck-Terminating on k8s-master needs more than the 300s default depends_on = [helm_release.loki] }