The Alloy Helm chart maps `alloy.resources`, NOT `controller.resources`, onto
the alloy container. The block under `controller:` was silently dropped, so
the container ran with `resources: {}` and inherited the Kyverno LimitRange
`tier-defaults` 256Mi limit — well below Alloy's 400-450Mi steady state. The
cgroup ran at 255.8/256Mi with ~50M memory-reclaim events; the resulting
page-cache thrashing drove ~185 MB/s of reads on sdc (12.18 TB in 24h),
saturating the Proxmox host and rippling out to all VMs and NFS.
Fix:
- Move resources to `alloy.resources` (correct chart key).
- Burstable QoS: request 512Mi, limit 1Gi. Workers are at 97-99%
memory-request saturation cluster-wide, so a 1Gi request (Guaranteed QoS)
would block scheduling on node2/node3.
- Bump controller.updateStrategy.maxUnavailable to 50% so a 5-pod DS
rolling update fits inside the helm timeout.
- Bump helm_release.alloy.timeout to 900s (default 300s was too short
with occasional runc-stuck-Terminating on k8s-master).
Verified: all 4 alloy pods now show 1Gi/512Mi at the container level;
helm rev=8 deployed; per-pod memory 99-108Mi at steady state (well
under the new limit).
Memory ID 2726.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
230 lines
7.9 KiB
YAML
230 lines
7.9 KiB
YAML
alloy:
  # Resource requests/limits for the alloy container itself.
  # They must live under `alloy.resources` (NOT `controller.resources`): the
  # chart only maps THIS key onto the alloy container. Without it the container
  # gets `resources: {}` and inherits the Kyverno LimitRange `tier-defaults`
  # (256Mi), which is below Alloy's 400-450Mi steady state and caused
  # page-cache thrashing → 185 MB/s sdc reads → host IO saturation (2026-05-26).
  # Burstable QoS (request < limit): workers are at 97-99% memory-request
  # saturation, so a 1Gi request blocks scheduling on node2/node3.
  resources:
    requests:
      cpu: 50m
      memory: 512Mi
    limits:
      memory: 1Gi
  configMap:
    content: |-
      // Write your Alloy config here:
      logging {
        level = "info"
        format = "logfmt"
      }
      loki.write "default" {
        endpoint {
          url = "http://loki.monitoring.svc.cluster.local:3100/loki/api/v1/push"
        }
      }

      // discovery.kubernetes allows you to find scrape targets from Kubernetes resources.
      // It watches cluster state and ensures targets are continually synced with what is currently running in your cluster.
      discovery.kubernetes "pod" {
        role = "pod"
      }

      // discovery.relabel rewrites the label set of the input targets by applying one or more relabeling rules.
      // If no rules are defined, then the input targets are exported as-is.
      discovery.relabel "pod_logs" {
        targets = discovery.kubernetes.pod.targets

        // Label creation - "namespace" field from "__meta_kubernetes_namespace"
        rule {
          source_labels = ["__meta_kubernetes_namespace"]
          action = "replace"
          target_label = "namespace"
        }

        // Label creation - "pod" field from "__meta_kubernetes_pod_name"
        rule {
          source_labels = ["__meta_kubernetes_pod_name"]
          action = "replace"
          target_label = "pod"
        }

        // Label creation - "container" field from "__meta_kubernetes_pod_container_name"
        rule {
          source_labels = ["__meta_kubernetes_pod_container_name"]
          action = "replace"
          target_label = "container"
        }

        // Label creation - "app" field from "__meta_kubernetes_pod_label_app_kubernetes_io_name"
        rule {
          source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"]
          action = "replace"
          target_label = "app"
        }

        // Label creation - "job" field from "__meta_kubernetes_namespace" and "__meta_kubernetes_pod_container_name"
        // Concatenate values __meta_kubernetes_namespace/__meta_kubernetes_pod_container_name
        rule {
          source_labels = ["__meta_kubernetes_namespace", "__meta_kubernetes_pod_container_name"]
          action = "replace"
          target_label = "job"
          separator = "/"
          replacement = "$1"
        }

        // Label creation - "__path__" field from "__meta_kubernetes_pod_uid" and "__meta_kubernetes_pod_container_name"
        // Builds the glob /var/log/pods/*<pod_uid>/<container_name>/*.log
        rule {
          source_labels = ["__meta_kubernetes_pod_uid", "__meta_kubernetes_pod_container_name"]
          action = "replace"
          target_label = "__path__"
          separator = "/"
          replacement = "/var/log/pods/*$1/*.log"
        }

        // Label creation - "container_runtime" field from "__meta_kubernetes_pod_container_id"
        rule {
          source_labels = ["__meta_kubernetes_pod_container_id"]
          action = "replace"
          target_label = "container_runtime"
          regex = "^(\\S+):\\/\\/.+$"
          replacement = "$1"
        }
      }

      // local.file_match expands the glob in __path__ into the concrete log files
      // present on this node. loki.source.file performs no file discovery itself,
      // so the relabeled targets must pass through here first.
      // NOTE(review): discovery is cluster-wide; globs for pods running on other
      // nodes simply match no files on this host.
      local.file_match "pod_logs" {
        path_targets = discovery.relabel.pod_logs.output
      }

      // loki.source.file tails pod logs from /var/log/pods/* on the host filesystem.
      // Previously used loki.source.kubernetes (apiserver streaming) which drove
      // kube-apiserver `CONNECT pods/log` to ~13 req/s + ~2200 sec/s of streams.
      // discovery.relabel.pod_logs sets __path__ to the kubelet log path glob;
      // local.file_match.pod_logs resolves it to real files.
      loki.source.file "pod_logs" {
        targets = local.file_match.pod_logs.targets
        forward_to = [loki.process.pod_logs.receiver]
      }

      // loki.process receives log entries from other Loki components, applies one or more processing stages,
      // and forwards the results to the list of receivers in the component's arguments.
      loki.process "pod_logs" {
        stage.static_labels {
          values = {
            cluster = "default",
          }
        }

        forward_to = [loki.write.default.receiver]
      }

      // Node-level journal log collection for kernel panics, OOMs, hung tasks, etc.
      // Ships system logs off-node so they survive hard resets.
      loki.source.journal "node_journal" {
        forward_to = [loki.process.journal.receiver]
        relabel_rules = loki.relabel.journal.rules
        labels = {
          job = "node-journal",
        }
        max_age = "12h"
      }

      // Rule container only: forward_to is empty because the rules are consumed
      // via loki.source.journal.node_journal's relabel_rules argument.
      loki.relabel "journal" {
        forward_to = []

        rule {
          source_labels = ["__journal__hostname"]
          target_label = "node"
        }
        rule {
          source_labels = ["__journal__systemd_unit"]
          target_label = "unit"
        }
        rule {
          source_labels = ["__journal_priority_keyword"]
          target_label = "level"
        }
        rule {
          source_labels = ["__journal__transport"]
          target_label = "transport"
        }
      }

      // Forward warning+ journal entries (priority 0-4: emerg, alert, crit, err, warning)
      // Also forwards kernel transport entries regardless of priority for OOM/panic detection.
      loki.process "journal" {
        stage.static_labels {
          values = {
            cluster = "default",
          }
        }

        // Drop info/debug/notice entries that aren't from the kernel transport
        stage.match {
          selector = "{job=\"node-journal\", level=~\"info|notice|debug\", transport!=\"kernel\"}"
          action = "drop"
        }

        forward_to = [loki.write.default.receiver]
      }

      // Kubernetes audit log collection from /var/log/kubernetes/audit.log
      // Requires alloy.mounts.varlog=true to mount /var/log from the host
      local.file_match "audit_logs" {
        path_targets = [{
          __path__ = "/var/log/kubernetes/audit.log",
          job = "kubernetes-audit",
          node = env("HOSTNAME"),
        }]
      }

      loki.source.file "audit_logs" {
        targets = local.file_match.audit_logs.targets
        forward_to = [loki.write.default.receiver]
      }

  # Mount /var/log from the host for file-based log collection (pod logs under
  # /var/log/pods and the Kubernetes audit log).
  mounts:
    varlog: true
    # Mount journal directories for loki.source.journal
    extra:
      - name: journal-run
        mountPath: /run/log/journal
        readOnly: true
      - name: journal-var
        mountPath: /var/log/journal
        readOnly: true
      - name: machine-id
        mountPath: /etc/machine-id
        readOnly: true
|
|
controller:
  # The chart default for maxUnavailable is 1; raise it so a 5-node DaemonSet
  # completes its rolling update within the helm_release timeout. A short gap
  # in log shipping during the rollout is acceptable.
  updateStrategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: "50%"

  # Host paths backing the extra mounts declared under alloy.mounts.extra
  # (journal directories and machine-id, used by loki.source.journal).
  volumes:
    extra:
      - name: journal-run
        hostPath:
          path: /run/log/journal
          type: DirectoryOrCreate
      - name: journal-var
        hostPath:
          path: /var/log/journal
          type: DirectoryOrCreate
      - name: machine-id
        hostPath:
          path: /etc/machine-id
          type: File

  # Tolerate the standard control-plane NoSchedule taint so a pod also runs on
  # the control-plane node and can tail /var/log/kubernetes/audit.log written
  # by kube-apiserver. Without it, K8s audit log shipping (wave 1 K2-K9 alert
  # rules) has no source.
  tolerations:
    - key: 'node-role.kubernetes.io/control-plane'
      operator: 'Exists'
      effect: 'NoSchedule'