From c9289192c728e3517fdc8a7a212ff78bbf211fee Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Mon, 18 May 2026 19:37:36 +0000 Subject: [PATCH] security(wave1): Vault audit-tail sidecar (live) + doc reality-check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Vault audit-tail sidecar (APPLIED + VERIFIED) - Added `audit-tail` extraContainer to vault helm chart values: busybox:1.37 with `tail -F /vault/audit/vault-audit.log`. Reads the audit PVC (`audit` volume from the chart's auditStorage), emits JSON audit events to stdout. kubelet captures the stdout; once Loki+Alloy are deployed (blocked on code-146x), these logs flow automatically to Loki with `container="audit-tail"`. - Resources: requests 5m CPU / 16Mi memory; memory limit 32Mi (no CPU limit). PVC mount is readOnly. - Applied via `tg apply -target=helm_release.vault`. All 3 vault pods rolled cleanly (OnDelete strategy, manual one-at-a-time, auto-unseal each ~10s). - Verified: `kubectl logs -n vault vault-2 -c audit-tail` shows live JSON audit lines from ESO token issuance, KV reads, etc. ## Doc reality-check While verifying logs reached Loki, discovered Loki is NOT actually deployed. `stacks/monitoring/modules/monitoring/loki.tf` defines `helm_release.loki` but has a self-referencing `depends_on = [helm_release.loki]` that prevented apply. No `loki` Helm release in the cluster, no Loki pods, no Loki Service. The monitoring.md "Loki: deployed" claim was aspirational. - security.md W1.2 row: PENDING → PARTIAL (sidecar live, shipping blocked on code-146x) - security.md W1.3 row: now marked as gated on code-146x - monitoring.md Loki row: marked NOT DEPLOYED with cross-ref to code-146x ## New beads task - code-146x P1 — Loki + log shipper missing. Lists the helm_release self-depends_on bug, investigation paths, and revised wave 1 sequencing (Loki/Alloy is prereq 0).
## Wave 1 status update - W1.2: Vault audit device + XFF + audit-tail sidecar all LIVE; Loki shipping blocked on code-146x - W1.1, W1.3, W1.6, W1.7: still not started (W1.6 also blocked on code-3ad Calico Installation CR) - W1.4, W1.5: code committed, blocked on code-e2dp (Kyverno provider crash) Co-Authored-By: Claude Opus 4.7 --- docs/architecture/monitoring.md | 2 +- docs/architecture/security.md | 18 +++++++- stacks/vault/main.tf | 77 +++++++++++++++++++++++---------- 3 files changed, 70 insertions(+), 27 deletions(-) diff --git a/docs/architecture/monitoring.md b/docs/architecture/monitoring.md index 721d993d..c27946c4 100644 --- a/docs/architecture/monitoring.md +++ b/docs/architecture/monitoring.md @@ -57,7 +57,7 @@ graph TB |-----------|---------|----------|---------| | Prometheus | Latest (Diun monitored) | `stacks/monitoring/modules/monitoring/` | Metrics collection and storage, scrape configs for all services | | Grafana | Latest (Diun monitored) | `stacks/monitoring/modules/monitoring/` | Visualization, 14+ dashboards (API server, CoreDNS, GPU, UPS, etc.) | -| Loki | Latest (Diun monitored) | `stacks/monitoring/modules/monitoring/` | Log aggregation and querying | +| Loki | **NOT DEPLOYED as of 2026-05-18** — TF code exists in `stacks/monitoring/modules/monitoring/loki.tf` but `helm_release.loki` has a self-referencing `depends_on` that prevented apply. No `loki` Helm release in cluster, no Loki pods or Service. All "log aggregation" claims below are aspirational. Tracked under beads `code-146x`. 
| `stacks/monitoring/modules/monitoring/` | Log aggregation and querying (planned) | | Alertmanager | Latest (Diun monitored) | `stacks/monitoring/modules/monitoring/` | Alert routing with cascade inhibitions | | Uptime Kuma | Latest (Diun monitored) | `stacks/uptime-kuma/` | Internal + external HTTP monitors, status page | | External Monitor Sync | Python 3.12 | `stacks/uptime-kuma/` | CronJob (10min) syncs `[External]` monitors from `cloudflare_proxied_names` | diff --git a/docs/architecture/security.md b/docs/architecture/security.md index 1162eb2b..a1733060 100644 --- a/docs/architecture/security.md +++ b/docs/architecture/security.md @@ -167,9 +167,23 @@ Removed April 2026. The rewrite-body Traefik plugin used to inject hidden trap l **Implementation**: See `stacks/poison-fountain/` and `stacks/platform/modules/traefik/middleware.tf` -### Audit Logging & Anomaly Detection (Wave 1 — planned 2026-05-18) +### Audit Logging & Anomaly Detection (Wave 1) -Beads epic: `code-8ywc`. **Status: planned, not yet implemented.** The block below documents the locked design so future sessions don't re-grill. +Beads epic: `code-8ywc`. **Status: partially live as of 2026-05-18.** + +| Item | State | +|---|---| +| W1.2 Vault `file` audit device | **LIVE** — resource `vault_audit.file` in `stacks/vault/main.tf`, writing to `/vault/audit/vault-audit.log` on `proxmox-lvm-encrypted` PVC | +| W1.2 Vault `x_forwarded_for_authorized_addrs = 10.10.0.0/16` | **LIVE** — applied via `tg apply -target=helm_release.vault` on 2026-05-18; all 3 vault pods restarted cleanly | +| W1.2 Vault audit log shipping to Loki | **PARTIAL** — `audit-tail` sidecar live in vault pods (emits JSON audit lines to stdout, viewable via `kubectl logs -n vault vault-<N> -c audit-tail`, e.g. `vault-2`). Actual shipping to Loki BLOCKED on `code-146x` (Loki not deployed in the cluster despite TF code existing).
| +| W1.1 K8s API audit policy | **PENDING** — needs `stacks/infra` kubeadm-config templating | +| W1.3 Source-IP anomaly rules (K9, V7, S1) | **PENDING** — gated on `code-146x` (Loki + Alloy not deployed) and W1.1 audit-policy codification | +| W1.4 Kyverno security policies → Enforce | **CODE READY, APPLY BLOCKED** by `code-e2dp` (terraform-provider-kubernetes v3.1.0 crash on `kubernetes_manifest` plan) | +| W1.5 Kyverno trusted-registries enforce | **CODE PARTIAL** (exclude list added; allowlist tightening + enforce-flip deferred until `code-e2dp` resolved) | +| W1.6 Calico flow logs + log-only GNP | **BLOCKED** on `code-3ad` (Calico stack adopts only namespaces today; `Installation` CR + Felix config not under TF) | +| W1.7 NetworkPolicy phased enforce | **BLOCKED** on W1.6 observation window | + +The block below documents the locked design. Response model: **(I) Slack-only, daily skim.** All security alerts land in a new `#security` Slack channel via Alertmanager. No paging. Mean detection time accepted as ~12-24h; the design weight sits on prevention (Kyverno enforce, NetworkPolicy default-deny egress) rather than runtime detection. 
diff --git a/stacks/vault/main.tf b/stacks/vault/main.tf index ad0c8d93..559fcc98 100644 --- a/stacks/vault/main.tf +++ b/stacks/vault/main.tf @@ -142,31 +142,60 @@ resource "helm_release" "vault" { name = "vault-unseal-key" }] - # Auto-unseal sidecar — polls every 10s, unseals if sealed - extraContainers = [{ - name = "auto-unseal" - image = "hashicorp/vault:1.18.1" - command = ["/bin/sh", "-c"] - args = [join("", [ - "while true; do ", - "sealed=$(VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json 2>/dev/null | grep '\"sealed\"' | grep -o 'true\\|false'); ", - "if [ \"$sealed\" = \"true\" ]; then ", - "echo \"$(date): Vault is sealed, unsealing...\"; ", - "VAULT_ADDR=http://127.0.0.1:8200 vault operator unseal $(cat /vault/unseal-key/unseal-key); ", - "fi; ", - "sleep 10; ", - "done" - ])] - volumeMounts = [{ - name = "userconfig-vault-unseal-key" # Helm chart prefixes extraVolumes with "userconfig-" - mountPath = "/vault/unseal-key" - readOnly = true - }] - resources = { - requests = { cpu = "10m", memory = "128Mi" } - limits = { memory = "128Mi" } + # Sidecars: + # 1. auto-unseal — polls every 10s, unseals if sealed + # 2. audit-tail — tails /vault/audit/vault-audit.log to stdout so Alloy + # (planned DaemonSet log shipper; NOT deployed yet — beads code-146x) can + # pick it up via the kubelet log path and forward it to Loki with + # job=vault-audit. Without this sidecar the audit log sits on the audit + # PVC and can never reach Loki, so V1-V7 alert rules can't fire. See + # docs/architecture/security.md "Audit Logging & Anomaly Detection".
+ extraContainers = [ + { + name = "auto-unseal" + image = "hashicorp/vault:1.18.1" + command = ["/bin/sh", "-c"] + args = [join("", [ + "while true; do ", + "sealed=$(VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json 2>/dev/null | grep '\"sealed\"' | grep -o 'true\\|false'); ", + "if [ \"$sealed\" = \"true\" ]; then ", + "echo \"$(date): Vault is sealed, unsealing...\"; ", + "VAULT_ADDR=http://127.0.0.1:8200 vault operator unseal $(cat /vault/unseal-key/unseal-key); ", + "fi; ", + "sleep 10; ", + "done" + ])] + volumeMounts = [{ + name = "userconfig-vault-unseal-key" # Helm chart prefixes extraVolumes with "userconfig-" + mountPath = "/vault/unseal-key" + readOnly = true + }] + resources = { + requests = { cpu = "10m", memory = "128Mi" } + limits = { memory = "128Mi" } + } + }, + { + name = "audit-tail" + image = "busybox:1.37" + command = ["/bin/sh", "-c"] + # `tail -F` follows the file across rotations. Until the file exists + # (first audit write) we wait without spinning. Output flushed line + # by line so Alloy gets timely log lines. + args = [ + "set -e; mkdir -p /vault/audit; while [ ! -f /vault/audit/vault-audit.log ]; do sleep 5; done; exec tail -F /vault/audit/vault-audit.log" + ] + volumeMounts = [{ + name = "audit" + mountPath = "/vault/audit" + readOnly = true + }] + resources = { + requests = { cpu = "5m", memory = "16Mi" } + limits = { memory = "32Mi" } + } } - }] + ] } ui = { enabled = true }