diff --git a/docs/architecture/security.md b/docs/architecture/security.md index 0bca127d..f2e31f49 100644 --- a/docs/architecture/security.md +++ b/docs/architecture/security.md @@ -180,8 +180,8 @@ Beads epic: `code-8ywc`. **Status: partially live as of 2026-05-18.** | W1.3 Source-IP anomaly rules (K9, V7, S1) | **LIVE** (K9, V7); **S1 PENDING** — fires once promtail/Alloy on PVE host ships sshd journal with `job=sshd-pve`. | | W1.4 Kyverno security policies → Enforce | **LIVE** — 3 policies in Enforce mode with 35-namespace exclude list. | | W1.5 Kyverno trusted-registries → Enforce | **LIVE** — explicit allowlist (15 registries + 6 DockerHub library bare names + 56 DockerHub user repos). Verified by admission dry-run: `evilcorp.example/malware:v1` BLOCKED, `alpine:3.20` and `docker.io/library/alpine:3.20` ALLOWED. | -| W1.6 Calico flow logs + log-only GNP | **BLOCKED** — Calico OSS doesn't support `FelixConfiguration.flowLogsFileEnabled` (Calico Enterprise/Tigera-only, rejected 2026-05-19 with "strict decoding error"). Alternative paths: Calico GlobalNetworkPolicy `action: Log` → iptables NFLOG → node journal, OR Cilium migration, OR Tigera Operator adoption. See stacks/calico/main.tf comment block. | -| W1.7 NetworkPolicy phased enforce | **BLOCKED** on W1.6 observation-method decision | +| W1.6 Calico observe-phase (pilot: recruiter-responder) | **LIVE** (2026-05-19) — GlobalNetworkPolicy `wave1-egress-observe-recruiter-responder` with rules `[action:Log, action:Allow]`. `FelixConfiguration.flowLogsFileEnabled` approach abandoned (Calico Enterprise-only field, rejected by OSS v3.26). Log action emits iptables LOG with prefix `calico-packet: ` → kernel → journald → Alloy → Loki. Verified: `{job="node-journal"} \|~ "calico-packet"` returns real packet metadata (SRC/DST/PROTO). Expand to more namespaces by adding to `namespaceSelector`. |
+| W1.7 NetworkPolicy phased enforce | **PENDING** — needs ~1 week of W1.6 observation, then build empirical allowlist from Loki queries, flip GNP rules from `[Log, Allow]` to `[Allow specific dests, Deny rest]`. | The block below documents the locked design. diff --git a/stacks/calico/main.tf b/stacks/calico/main.tf index 737a4486..677cde60 100644 --- a/stacks/calico/main.tf +++ b/stacks/calico/main.tf @@ -69,21 +69,51 @@ resource "kubernetes_namespace" "tigera_operator" { } } -# Wave 1 W1.6 (beads code-8ywc): Calico OSS does NOT support flow-log-to-file -# export via FelixConfiguration — `flowLogsFileEnabled` and related fields are -# Calico Enterprise / Tigera Cloud features and are rejected by the OSS API -# (verified 2026-05-19: "strict decoding error: unknown field spec.flowLogsFileEnabled"). +# Wave 1 W1.6 (beads code-8ywc): observation phase via Calico GlobalNetworkPolicy +# `action: Log`. This is the supported primitive on Calico OSS v3.26 — the +# Calico-Enterprise FelixConfiguration.flowLogsFileEnabled approach is NOT +# accepted by the OSS CRD (verified 2026-05-19: "strict decoding error"). # -# Alternative observe-then-enforce paths for W1.6/W1.7: -# 1. Calico GlobalNetworkPolicy with `action: Log` on tier 3+4 — Log action -# writes to iptables NFLOG which lands in node syslog. Alloy already -# scrapes journal, but the format needs parsing. -# 2. Cilium replacement with Hubble flow observability (large migration). -# 3. Tigera Operator + Calico Enterprise (commercial). -# 4. eBPF-based flow capture (e.g. inspektor-gadget, retina) sidecar approach. 
+# How it works:
+#   - selector all() + namespaceSelector scope the GNP to the pilot namespace's pods
+#   - egress rule action=Log emits an iptables LOG entry (not NFLOG) that lands in the
+#     kernel log / journald with prefix "calico-packet: " on each node
+#   - Alloy DaemonSet already ships node-journal to Loki (job=node-journal)
+#   - LogQL query: {job="node-journal"} |= "calico-packet" surfaces egress flows
+#   - After ~1 week of observation, build the empirical per-namespace egress
+#     allowlist; then flip the same GNP to [Allow specific dests, Deny rest]
 #
-# Wave 1 stops at this fork. The observe phase requires further design choice
-# tracked under code-8ywc as a separate W1.6/W1.7 follow-up.
+# Starting with `recruiter-responder` as the W1.7 pilot per the locked plan
+# (smallest egress footprint, local llama-cpp). Expand by widening the
+# namespaceSelector expression (a Calico selector string) over time.
+resource "kubectl_manifest" "wave1_egress_observe_recruiter_responder" {
+  yaml_body = yamlencode({
+    apiVersion = "projectcalico.org/v3"
+    kind       = "GlobalNetworkPolicy"
+    metadata = {
+      name = "wave1-egress-observe-recruiter-responder"
+      annotations = {
+        "security.viktorbarzin.me/wave"    = "1"
+        "security.viktorbarzin.me/purpose" = "observe-then-enforce egress; observation phase only"
+      }
+    }
+    spec = {
+      # Calico evaluates policies lowest `order` first, so 2000 runs this one late.
+      # NOTE(review): a later default-deny must use order > 2000 to stay behind it.
+      order             = 2000
+      selector          = "all()"
+      namespaceSelector = "kubernetes.io/metadata.name == 'recruiter-responder'"
+      types             = ["Egress"]
+      egress = [
+        # Rule 1: log every egress packet (non-terminating; evaluation continues to rule 2)
+        { action = "Log" },
+        # Rule 2: allow everything (so observation does NOT break the namespace)
+        { action = "Allow" },
+      ]
+    }
+  })
+  apply_only = true # NOTE(review): presumably the provider then never deletes this GNP on destroy — confirm intended
+}
 
 # CI retrigger 2026-05-16T13:42:57+00:00 — bulk enrollment apply (pipeline #689 killed)
 # CI retrigger v2 2026-05-16T13:46:35+00:00