# infra/stacks/monitoring/modules/monitoring/alloy.yaml

alloy:
  # Resource limits for the alloy container itself.
  # Must be under `alloy.resources` (NOT `controller.resources`) — the chart
  # only maps THIS key onto the alloy container. Without it, the container gets
  # `resources: {}` and inherits Kyverno LimitRange `tier-defaults` (256Mi),
  # which is below Alloy's 400-450Mi steady state and caused page-cache
  # thrashing → 185 MB/s sdc reads → host IO saturation (2026-05-26).
  # Burstable QoS (request < limit) — workers are at 97-99% memory-request
  # saturation; a 1Gi request blocks scheduling on node2/node3.
  # Only a memory limit is set; no CPU limit is declared in this block.
  resources:
    requests:
      cpu: 50m
      # Scheduler reservation; deliberately kept below the 1Gi limit (see above).
      memory: 512Mi
    limits:
      memory: 1Gi
  configMap:
    content: |-
      // Alloy's own logging: "info" level, emitted in logfmt format.
      logging {
        level = "info"
        format = "logfmt"
      }
      // Shared Loki sink. The pod-log, journal, and audit-log pipelines below all
      // forward to loki.write.default.receiver, which pushes to this in-cluster
      // Loki push endpoint.
      loki.write "default" {
        endpoint {
          url = "http://loki.monitoring.svc.cluster.local:3100/loki/api/v1/push"
        }
      }

      // discovery.kubernetes allows you to find scrape targets from Kubernetes resources.
      // It watches cluster state and ensures targets are continually synced with what is currently running in your cluster.
      //
      // Alloy runs as a DaemonSet and tails pod logs from this node's
      // /var/log/pods, so only pods scheduled on this node are useful targets.
      // The field selector stops every replica from watching every pod in the
      // cluster. HOSTNAME carries the node name (the audit-log pipeline already
      // relies on it for its "node" label); constants.hostname is the fallback
      // if the variable is unset.
      discovery.kubernetes "pod" {
        role = "pod"

        selectors {
          role  = "pod"
          field = "spec.nodeName=" + coalesce(env("HOSTNAME"), constants.hostname)
        }
      }

      // discovery.relabel rewrites the label set of the input targets by applying one or more relabeling rules.
      // If no rules are defined, then the input targets are exported as-is.
      // Here it turns Kubernetes pod metadata into the Loki stream labels
      // (namespace, pod, container, app, job, container_runtime) plus the
      // __path__ glob that locates each container's log files on the node.
      discovery.relabel "pod_logs" {
        targets = discovery.kubernetes.pod.targets

        // Label creation - "namespace" field from "__meta_kubernetes_namespace"
        rule {
          source_labels = ["__meta_kubernetes_namespace"]
          action = "replace"
          target_label = "namespace"
        }

        // Label creation - "pod" field from "__meta_kubernetes_pod_name"
        rule {
          source_labels = ["__meta_kubernetes_pod_name"]
          action = "replace"
          target_label = "pod"
        }

        // Label creation - "container" field from "__meta_kubernetes_pod_container_name"
        rule {
          source_labels = ["__meta_kubernetes_pod_container_name"]
          action = "replace"
          target_label = "container"
        }

        // Label creation - "app" field from "__meta_kubernetes_pod_label_app_kubernetes_io_name"
        // (the pod's app.kubernetes.io/name label).
        rule {
          source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"]
          action = "replace"
          target_label = "app"
        }

        // Label creation - "job" field from "__meta_kubernetes_namespace" and "__meta_kubernetes_pod_container_name"
        // The two values are joined with "/" to give <namespace>/<container_name>.
        rule {
          source_labels = ["__meta_kubernetes_namespace", "__meta_kubernetes_pod_container_name"]
          action = "replace"
          target_label = "job"
          separator = "/"
          replacement = "$1"
        }

        // Label creation - "__path__" field from "__meta_kubernetes_pod_uid" and "__meta_kubernetes_pod_container_name"
        // The two values are joined as <pod_uid>/<container_name> and wrapped in a glob:
        //   /var/log/pods/*<pod_uid>/<container_name>/*.log
        // The leading "*" absorbs whatever precedes the UID in the pod's log directory name.
        rule {
          source_labels = ["__meta_kubernetes_pod_uid", "__meta_kubernetes_pod_container_name"]
          action = "replace"
          target_label = "__path__"
          separator = "/"
          replacement = "/var/log/pods/*$1/*.log"
        }

        // Label creation - "container_runtime" field from "__meta_kubernetes_pod_container_id"
        // The regex captures the scheme in front of "://" in the container ID
        // (e.g. "containerd" from "containerd://<id>").
        rule {
          source_labels = ["__meta_kubernetes_pod_container_id"]
          action = "replace"
          target_label = "container_runtime"
          regex = "^(\\S+):\\/\\/.+$"
          replacement = "$1"
        }
      }

      // Pod logs are tailed from /var/log/pods/* on the host filesystem.
      // Previously used loki.source.kubernetes (apiserver streaming) which drove
      // kube-apiserver `CONNECT pods/log` to ~13 req/s + ~2200 sec/s of streams.
      //
      // discovery.relabel.pod_logs sets __path__ to a glob
      // (/var/log/pods/*<pod_uid>/<container_name>/*.log). loki.source.file does
      // not perform file discovery itself, so local.file_match expands the glob
      // into concrete files first and keeps re-evaluating it as pods come and go.
      local.file_match "pod_logs" {
        path_targets = discovery.relabel.pod_logs.output
      }

      // Tails the concrete files resolved above and hands each line to the
      // pod-log processing pipeline.
      loki.source.file "pod_logs" {
        targets    = local.file_match.pod_logs.targets
        forward_to = [loki.process.pod_logs.receiver]
      }

      // loki.process receives log entries from other Loki components, applies one or more processing stages,
      // and forwards the results to the list of receivers in the component's arguments.
      loki.process "pod_logs" {
        // Files under /var/log/pods are written in the container runtime's
        // on-disk format, not as bare application log lines. Unwrap that
        // envelope so Loki stores the application's own message and timestamp.
        // The selectors key off the container_runtime label that
        // discovery.relabel.pod_logs derives from the container ID.
        // NOTE(review): these stages may also attach a `stream` (stdout/stderr)
        // label — confirm that is acceptable for existing queries.
        stage.match {
          selector = "{container_runtime=~\"containerd|cri-o\"}"

          // CRI format: "<timestamp> <stream> <flags> <message>".
          stage.cri {}
        }

        stage.match {
          selector = "{container_runtime=\"docker\"}"

          // Docker json-file format: {"log": ..., "stream": ..., "time": ...}.
          stage.docker {}
        }

        // Tag every pod log line with the cluster it came from.
        stage.static_labels {
          values = {
            cluster = "default",
          }
        }

        forward_to = [loki.write.default.receiver]
      }

      // Node-level journal log collection for kernel panics, OOMs, hung tasks, etc.
      // Ships system logs off-node so they survive hard resets.
      // The journal directories and machine-id are made available through the
      // host mounts declared under alloy.mounts.extra.
      loki.source.journal "node_journal" {
        forward_to = [loki.process.journal.receiver]
        // Maps __journal_* fields to stream labels; rules live in loki.relabel.journal.
        relabel_rules = loki.relabel.journal.rules
        // Static label used by the drop selector in loki.process.journal.
        labels = {
          job = "node-journal",
        }
        // Bounds how far back journal entries are read.
        max_age = "12h"
      }

      // Rule container for loki.source.journal.node_journal, which consumes the
      // exported `rules`. forward_to is intentionally empty: no log entries are
      // routed through this component in this config.
      loki.relabel "journal" {
        forward_to = []

        // Journal host name -> "node".
        rule {
          source_labels = ["__journal__hostname"]
          target_label  = "node"
        }
        // systemd unit that produced the entry -> "unit".
        rule {
          source_labels = ["__journal__systemd_unit"]
          target_label  = "unit"
        }
        // Priority keyword -> "level"; loki.process.journal filters on this label.
        rule {
          source_labels = ["__journal_priority_keyword"]
          target_label  = "level"
        }
        // Journal transport -> "transport"; "kernel" entries are exempt from the
        // priority filter in loki.process.journal.
        rule {
          source_labels = ["__journal__transport"]
          target_label  = "transport"
        }
      }

      // Forward warning+ journal entries (priority 0-4: emerg, alert, crit, err, warning)
      // Also forwards kernel transport entries regardless of priority for OOM/panic detection.
      loki.process "journal" {
        // Tag every journal entry with the cluster it came from.
        stage.static_labels {
          values = {
            cluster = "default",
          }
        }

        // Drop info/debug/notice entries that aren't from the kernel transport.
        // The selector relies on the "level" and "transport" labels produced by
        // loki.relabel.journal and the static "job" label set on the source.
        stage.match {
          selector = "{job=\"node-journal\", level=~\"info|notice|debug\", transport!=\"kernel\"}"
          action   = "drop"
        }

        forward_to = [loki.write.default.receiver]
      }

      // Kubernetes audit log collection from /var/log/kubernetes/audit.log
      // Requires alloy.mounts.varlog=true to mount /var/log from the host
      // Defines a single fixed target; the "job" and "node" entries become labels
      // on the resulting stream, with "node" taken from the HOSTNAME env var.
      local.file_match "audit_logs" {
        path_targets = [{
          __path__ = "/var/log/kubernetes/audit.log",
          job      = "kubernetes-audit",
          node     = env("HOSTNAME"),
        }]
      }

      // Tails the audit log target resolved by local.file_match.audit_logs and
      // sends entries straight to the shared Loki writer.
      // NOTE(review): unlike the pod and journal pipelines, this path does not go
      // through a loki.process stage, so audit entries do not get the
      // cluster = "default" static label — confirm this is intentional.
      loki.source.file "audit_logs" {
        targets    = local.file_match.audit_logs.targets
        forward_to = [loki.write.default.receiver]
      }

  # Mount /var/log from the host for file-based log collection
  # (pod logs under /var/log/pods and the Kubernetes audit log).
  mounts:
    varlog: true
    # Mount journal directories for loki.source.journal.
    # Each `name` must match a volume of the same name under
    # controller.volumes.extra. All mounts are read-only.
    extra:
      - name: journal-run
        mountPath: /run/log/journal
        readOnly: true
      - name: journal-var
        mountPath: /var/log/journal
        readOnly: true
      - name: machine-id
        mountPath: /etc/machine-id
        readOnly: true

# Workload-level settings: rollout strategy, host volumes, and scheduling.
# Container resources are NOT set here — see `alloy.resources`.
controller:
  # Bump maxUnavailable above the chart default (1) so a 5-node DS finishes its
  # rolling update inside the helm_release timeout. Log shipper tolerates the
  # brief gap.
  updateStrategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 50%

  # Host paths backing the mounts declared under alloy.mounts.extra; the
  # volume names here must match the mount names there.
  volumes:
    extra:
      - name: journal-run
        hostPath:
          path: /run/log/journal
          type: DirectoryOrCreate
      - name: journal-var
        hostPath:
          path: /var/log/journal
          type: DirectoryOrCreate
      - name: machine-id
        hostPath:
          path: /etc/machine-id
          type: File

  # Schedule on control-plane node too so we can tail /var/log/kubernetes/audit.log
  # from kube-apiserver. Without this, K8s audit log shipping (wave 1 K2-K9 alert
  # rules) has no source. control-plane has the standard NoSchedule taint.
  tolerations:
    - key: "node-role.kubernetes.io/control-plane"
      operator: "Exists"
      effect: "NoSchedule"
extract monitoring, nvidia, mailserver, cloudflared, kyverno from platform [ci skip] Phase 2 of platform stack split. 5 more modules extracted into independent stacks. All applied successfully with zero destroys. Cloudflared now reads k8s_users from Vault directly to compute user_domains. Woodpecker pipeline runs all 8 extracted stacks in parallel. Memory bumped to 6Gi for 9 concurrent TF processes. Platform reduced from 27 to 19 modules. 2026-03-17 21:34:11 +00:00			`alloy:`
alloy: move resources to alloy.* (chart key bug); 1Gi limit fixes IO storm The Alloy Helm chart maps `alloy.resources`, NOT `controller.resources`, onto the alloy container. The block under `controller:` was silently dropped, so the container ran with `resources: {}` and inherited the Kyverno LimitRange `tier-defaults` 256Mi — well below Alloy's 400-450Mi steady state. The cgroup ran at 255.8/256MB with ~50M memory-reclaim events, page-cache thrashing drove ~185 MB/s sdc reads (12.18 TB in 24h), saturating the Proxmox host and rippling out to all VMs + NFS. Fix: - Move resources to `alloy.resources` (correct chart key). - Burstable QoS: request 512Mi, limit 1Gi. Workers are at 97-99% memory-request saturation cluster-wide; a 1Gi request blocks scheduling on node2/node3. - Bump controller.updateStrategy.maxUnavailable to 50% so a 5-pod DS rolling update fits inside the helm timeout. - Bump helm_release.alloy.timeout to 900s (default 300s was too short with occasional runc-stuck-Terminating on k8s-master). Verified: all 4 alloy pods now show 1Gi/512Mi at the container level; helm rev=8 deployed; per-pod memory 99-108Mi at steady state (well under the new limit). Memory ID 2726. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> 2026-05-26 02:08:35 +00:00			`# Resource limits for the alloy container itself.`
			# Must be under `alloy.resources` (NOT `controller.resources`) — the chart
			`# only maps THIS key onto the alloy container. Without it, the container gets`
			# `resources: {}` and inherits Kyverno LimitRange `tier-defaults` (256Mi),
			`# which is below Alloy's 400-450Mi steady state and caused page-cache`
			`# thrashing → 185 MB/s sdc reads → host IO saturation (2026-05-26).`
			`# Burstable QoS (request < limit) — workers are at 97-99% memory-request`
			`# saturation; a 1Gi request blocks scheduling on node2/node3.`
			`resources:`
			`requests:`
			`cpu: 50m`
			`memory: 512Mi`
			`limits:`
			`memory: 1Gi`
extract monitoring, nvidia, mailserver, cloudflared, kyverno from platform [ci skip] Phase 2 of platform stack split. 5 more modules extracted into independent stacks. All applied successfully with zero destroys. Cloudflared now reads k8s_users from Vault directly to compute user_domains. Woodpecker pipeline runs all 8 extracted stacks in parallel. Memory bumped to 6Gi for 9 concurrent TF processes. Platform reduced from 27 to 19 modules. 2026-03-17 21:34:11 +00:00			`configMap:`
			`content: \|-`
			`// Write your Alloy config here:`
			`logging {`
			`level = "info"`
			`format = "logfmt"`
			`}`
			`loki.write "default" {`
			`endpoint {`
			`url = "http://loki.monitoring.svc.cluster.local:3100/loki/api/v1/push"`
			`}`
			`}`

			`// discovery.kubernetes allows you to find scrape targets from Kubernetes resources.`
			`// It watches cluster state and ensures targets are continually synced with what is currently running in your cluster.`
			`discovery.kubernetes "pod" {`
			`role = "pod"`
			`}`

			`// discovery.relabel rewrites the label set of the input targets by applying one or more relabeling rules.`
			`// If no rules are defined, then the input targets are exported as-is.`
			`discovery.relabel "pod_logs" {`
			`targets = discovery.kubernetes.pod.targets`

			`// Label creation - "namespace" field from "__meta_kubernetes_namespace"`
			`rule {`
			`source_labels = ["__meta_kubernetes_namespace"]`
			`action = "replace"`
			`target_label = "namespace"`
			`}`

			`// Label creation - "pod" field from "__meta_kubernetes_pod_name"`
			`rule {`
			`source_labels = ["__meta_kubernetes_pod_name"]`
			`action = "replace"`
			`target_label = "pod"`
			`}`

			`// Label creation - "container" field from "__meta_kubernetes_pod_container_name"`
			`rule {`
			`source_labels = ["__meta_kubernetes_pod_container_name"]`
			`action = "replace"`
			`target_label = "container"`
			`}`

			`// Label creation - "app" field from "__meta_kubernetes_pod_label_app_kubernetes_io_name"`
			`rule {`
			`source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"]`
			`action = "replace"`
			`target_label = "app"`
			`}`

			`// Label creation - "job" field from "__meta_kubernetes_namespace" and "__meta_kubernetes_pod_container_name"`
			`// Concatenate values __meta_kubernetes_namespace/__meta_kubernetes_pod_container_name`
			`rule {`
			`source_labels = ["__meta_kubernetes_namespace", "__meta_kubernetes_pod_container_name"]`
			`action = "replace"`
			`target_label = "job"`
			`separator = "/"`
			`replacement = "$1"`
			`}`

			`// Label creation - "container" field from "__meta_kubernetes_pod_uid" and "__meta_kubernetes_pod_container_name"`
			`// Concatenate values __meta_kubernetes_pod_uid/__meta_kubernetes_pod_container_name.log`
			`rule {`
			`source_labels = ["__meta_kubernetes_pod_uid", "__meta_kubernetes_pod_container_name"]`
			`action = "replace"`
			`target_label = "__path__"`
			`separator = "/"`
			`replacement = "/var/log/pods/$1/.log"`
			`}`

			`// Label creation - "container_runtime" field from "__meta_kubernetes_pod_container_id"`
			`rule {`
			`source_labels = ["__meta_kubernetes_pod_container_id"]`
			`action = "replace"`
			`target_label = "container_runtime"`
			`regex = "^(\\S+):\\/\\/.+$"`
			`replacement = "$1"`
			`}`
			`}`

alloy: switch pod log shipping from apiserver to file-tail Replaced 'loki.source.kubernetes' with 'loki.source.file' in alloy DS config. discovery.relabel.pod_logs already sets __path__ to the kubelet log path (/var/log/pods/<uid>/<container>/.log) and varlog host-mount was already present, so this is a one-line swap. Why: apiserver was burning ~700m sustained on 'CONNECT pods/log' streams (13 req/s, ~2200 sec/s of long-lived TCP connections). Streaming pod logs through the apiserver instead of tailing kubelet's log files was the dominant residual cost after the recent Loki/Alloy onboarding. Measured before/after: - Alloy DS: ~620m total (5 x ~125m) -> ~92m total (5 x ~18m) - kube-apiserver: peak 1959m midnight burst, settled 632m (Stuck-pod recovery: alloy-7zg7t on k8s-master needed --force delete during rollout — FailedKillPod 'unable to signal init: permission denied' on runc, transient runtime issue, unrelated to this change.) 2026-05-21 08:27:34 +00:00			`// loki.source.file tails pod logs from /var/log/pods/* on the host filesystem.`
			`// Previously used loki.source.kubernetes (apiserver streaming) which drove`
			// kube-apiserver `CONNECT pods/log` to ~13 req/s + ~2200 sec/s of streams.
			`// discovery.relabel.pod_logs already sets __path__ to the kubelet log path.`
			`loki.source.file "pod_logs" {`
extract monitoring, nvidia, mailserver, cloudflared, kyverno from platform [ci skip] Phase 2 of platform stack split. 5 more modules extracted into independent stacks. All applied successfully with zero destroys. Cloudflared now reads k8s_users from Vault directly to compute user_domains. Woodpecker pipeline runs all 8 extracted stacks in parallel. Memory bumped to 6Gi for 9 concurrent TF processes. Platform reduced from 27 to 19 modules. 2026-03-17 21:34:11 +00:00			`targets = discovery.relabel.pod_logs.output`
			`forward_to = [loki.process.pod_logs.receiver]`
			`}`

			`// loki.process receives log entries from other Loki components, applies one or more processing stages,`
			`// and forwards the results to the list of receivers in the component's arguments.`
			`loki.process "pod_logs" {`
			`stage.static_labels {`
			`values = {`
			`cluster = "default",`
			`}`
			`}`

			`forward_to = [loki.write.default.receiver]`
			`}`

			`// Node-level journal log collection for kernel panics, OOMs, hung tasks, etc.`
			`// Ships system logs off-node so they survive hard resets.`
			`loki.source.journal "node_journal" {`
			`forward_to = [loki.process.journal.receiver]`
			`relabel_rules = loki.relabel.journal.rules`
			`labels = {`
			`job = "node-journal",`
			`}`
			`max_age = "12h"`
			`}`

			`loki.relabel "journal" {`
			`forward_to = []`

			`rule {`
			`source_labels = ["__journal__hostname"]`
			`target_label = "node"`
			`}`
			`rule {`
			`source_labels = ["__journal__systemd_unit"]`
			`target_label = "unit"`
			`}`
			`rule {`
			`source_labels = ["__journal_priority_keyword"]`
			`target_label = "level"`
			`}`
			`rule {`
			`source_labels = ["__journal__transport"]`
			`target_label = "transport"`
			`}`
			`}`

			`// Forward warning+ journal entries (priority 0-4: emerg, alert, crit, err, warning)`
			`// Also forwards kernel transport entries regardless of priority for OOM/panic detection.`
			`loki.process "journal" {`
			`stage.static_labels {`
			`values = {`
			`cluster = "default",`
			`}`
			`}`

			`// Drop info/debug/notice entries that aren't from the kernel transport`
			`stage.match {`
			`selector = "{job=\"node-journal\", level=~\"info\|notice\|debug\", transport!=\"kernel\"}"`
			`action = "drop"`
			`}`

			`forward_to = [loki.write.default.receiver]`
			`}`

			`// Kubernetes audit log collection from /var/log/kubernetes/audit.log`
			`// Requires alloy.mounts.varlog=true to mount /var/log from the host`
			`local.file_match "audit_logs" {`
			`path_targets = [{`
			`__path__ = "/var/log/kubernetes/audit.log",`
			`job = "kubernetes-audit",`
			`node = env("HOSTNAME"),`
			`}]`
			`}`

			`loki.source.file "audit_logs" {`
			`targets = local.file_match.audit_logs.targets`
			`forward_to = [loki.write.default.receiver]`
			`}`

			`# Mount /var/log from the host for file-based log collection (audit logs)`
			`mounts:`
			`varlog: true`
			`# Mount journal directories for loki.source.journal`
			`extra:`
			`- name: journal-run`
			`mountPath: /run/log/journal`
			`readOnly: true`
			`- name: journal-var`
			`mountPath: /var/log/journal`
			`readOnly: true`
			`- name: machine-id`
			`mountPath: /etc/machine-id`
			`readOnly: true`

			`controller:`
alloy: move resources to alloy.* (chart key bug); 1Gi limit fixes IO storm The Alloy Helm chart maps `alloy.resources`, NOT `controller.resources`, onto the alloy container. The block under `controller:` was silently dropped, so the container ran with `resources: {}` and inherited the Kyverno LimitRange `tier-defaults` 256Mi — well below Alloy's 400-450Mi steady state. The cgroup ran at 255.8/256MB with ~50M memory-reclaim events, page-cache thrashing drove ~185 MB/s sdc reads (12.18 TB in 24h), saturating the Proxmox host and rippling out to all VMs + NFS. Fix: - Move resources to `alloy.resources` (correct chart key). - Burstable QoS: request 512Mi, limit 1Gi. Workers are at 97-99% memory-request saturation cluster-wide; a 1Gi request blocks scheduling on node2/node3. - Bump controller.updateStrategy.maxUnavailable to 50% so a 5-pod DS rolling update fits inside the helm timeout. - Bump helm_release.alloy.timeout to 900s (default 300s was too short with occasional runc-stuck-Terminating on k8s-master). Verified: all 4 alloy pods now show 1Gi/512Mi at the container level; helm rev=8 deployed; per-pod memory 99-108Mi at steady state (well under the new limit). Memory ID 2726. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> 2026-05-26 02:08:35 +00:00			`# Bump maxUnavailable above the chart default (1) so a 5-node DS finishes its`
			`# rolling update inside the helm_release timeout. Log shipper tolerates the`
			`# brief gap.`
			`updateStrategy:`
			`type: RollingUpdate`
			`rollingUpdate:`
			`maxUnavailable: 50%`

extract monitoring, nvidia, mailserver, cloudflared, kyverno from platform [ci skip] Phase 2 of platform stack split. 5 more modules extracted into independent stacks. All applied successfully with zero destroys. Cloudflared now reads k8s_users from Vault directly to compute user_domains. Woodpecker pipeline runs all 8 extracted stacks in parallel. Memory bumped to 6Gi for 9 concurrent TF processes. Platform reduced from 27 to 19 modules. 2026-03-17 21:34:11 +00:00			`volumes:`
			`extra:`
			`- name: journal-run`
			`hostPath:`
			`path: /run/log/journal`
			`type: DirectoryOrCreate`
			`- name: journal-var`
			`hostPath:`
			`path: /var/log/journal`
			`type: DirectoryOrCreate`
			`- name: machine-id`
			`hostPath:`
			`path: /etc/machine-id`
			`type: File`

security(wave1): W1.1 audit-log shipping LIVE + W1.5 trusted-registries Enforce LIVE ## W1.1 — K8s API audit log shipping (LIVE) - alloy.yaml: added control-plane toleration so Alloy DaemonSet runs on k8s-master node. Verified alloy-7zg7t scheduled on master, tailing /var/log/kubernetes/audit.log - loki.tf "Security Wave 1" rule group: added K2-K9 alert rules (skipped K1 per Q7 decision): - K2 K8sSATokenFromUnexpectedIP - K3 K8sSensitiveSecretReadByUnexpectedActor - K4 K8sExecIntoSensitiveNamespace - K5 K8sMassDelete (>5 Pod/Secret/CM in 60s by single user) - K6 K8sAuditPolicyModified (kubeadm-config CM change) - K7 K8sClusterRoleWildcardCreated (verbs=* + resources=) - K8 K8sAnonymousBindingGranted - K9 K8sViktorFromUnexpectedIP - All rules use source-IP regex matching the wave-1 allowlist (10.0.20.0/22, 192.168.1.0/24, 10.10.0.0/16 pod, 10.96.0.0/12 svc, 100.64-127 tailnet) and `lane = "security"` → #security Slack route. - Verified: kubectl-audit logs flowing in Loki query {job="kubernetes-audit"} returns events with node=k8s-master. - Verified: /loki/api/v1/rules lists all K2-K9 + V1-V7 + S1. ## W1.5 — require-trusted-registries Enforce (LIVE) - security-policies.tf: flipped Audit→Enforce with explicit allowlist built by `kubectl get pods -A -o jsonpath='{..image}'` enumeration. - Removed `/` catch-all (which made Audit→Enforce a no-op). - Pattern includes 15 explicit registries, 6 DockerHub library bare names, 56 DockerHub user repos. 
- Verified by admission dry-run: - evilcorp.example/malware:v1 → BLOCKED with custom message - alpine:3.20 → ALLOWED (matches `alpine`) - docker.io/library/alpine:3.20 → ALLOWED (matches `docker.io/*`) ## W1.6 — Calico flow logs (BLOCKED — Calico OSS limitation) - Tried adding FelixConfiguration with flowLogsFileEnabled=true via kubectl_manifest in stacks/calico/main.tf - Calico OSS rejected with "strict decoding error: unknown field spec.flowLogsFileEnabled" — these fields are Calico Enterprise/Tigera-only - Removed the failed resource. Documented alternative paths in main.tf comment block: GNP with action=Log (iptables NFLOG → journal), Cilium migration, eBPF tooling, or Tigera Operator adoption. ## Docs updates - security.md status table refreshed: W1.1/W1.2/W1.3/W1.4/W1.5 LIVE, W1.6/W1.7 blocked - monitoring.md: Loki marked DEPLOYED (was incorrectly NOT-DEPLOYED in prior session before today's apply) ## Cleanup - Removed stacks/kyverno/imports.tf (TF 1.5+ import blocks completed their job in the 2026-05-18 apply; should not stay in tree per TF docs) Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> 2026-05-19 06:37:54 +00:00			`# Schedule on control-plane node too so we can tail /var/log/kubernetes/audit.log`
			`# from kube-apiserver. Without this, K8s audit log shipping (wave 1 K2-K9 alert`
			`# rules) has no source. control-plane has the standard NoSchedule taint.`
			`tolerations:`
			`- key: "node-role.kubernetes.io/control-plane"`
			`operator: "Exists"`
			`effect: "NoSchedule"`