infra/stacks/calico/main.tf
Viktor Barzin f5917f0eb3 security(wave1): W1.6 expand observation from recruiter-responder pilot → tier 3+4 (82 namespaces)
## Change
- Replaced kubectl_manifest.wave1_egress_observe_recruiter_responder with
  kubectl_manifest.wave1_egress_observe_tier34
- namespaceSelector changed from `kubernetes.io/metadata.name == 'recruiter-responder'`
  to `tier in {"3-edge", "4-aux"}` — covers 82 namespaces (17 tier-3-edge + 65 tier-4-aux)
- Legacy pilot GNP wave1-egress-observe-recruiter-responder kubectl-deleted
  (apply_only=true means TF rename does NOT destroy the live old resource;
  cleanup done manually)
- Tier 0/1/2 namespaces explicitly out of wave 1 observation per locked plan
  (cluster infra + GPU workloads, deferred)

## Verification (live cluster, 2026-05-19)
- 82 namespaces match `tier in (3-edge,4-aux)`
- Felix translated the new policy into iptables LOG rule in cali-po-* chain
- LogQL `{job="node-journal"} |~ "calico-packet"` returns real packet metadata
  from multiple namespaces with distinct destinations:
  - east-west pod-to-pod (10.10.108.48, 10.10.122.131)
  - in-cluster service VIP (10.96.0.10 — kube-dns)
  - external (149.154.166.110 — Telegram API from recruiter-responder)

## W1.7 next step (calendar-bound, ~1 week)
- Let observation run for ~1 week
- Aggregate distinct destinations per namespace via LogQL
- Build per-namespace egress allowlist module `tier3_egress_baseline`
- Flip GNP rules from `[Log, Allow]` to `[Allow <specific dests>, Deny]`
- Phased per-namespace as originally planned

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-19 22:14:16 +00:00

133 lines
5.2 KiB
HCL

# Calico CNI
#
# Calico has underpinned this cluster's pod networking since 2024-07-30, installed
# as raw kubectl manifests (tigera-operator Deployment + CRDs + Installation CR).
# Bringing the full stack under Terraform is high-blast — the operator and its
# Deployment must never flap during node pressure or during any apply, because
# new pod scheduling breaks within ~seconds of a CNI outage.
#
# Of the existing install, this stack (created 2026-04-18 Wave 5b) adopts the
# three namespaces only: calico-system, calico-apiserver, tigera-operator. (The
# wave-1 GlobalNetworkPolicy later in this file is a new object, not an
# adoption.) The `tigera-operator` Deployment, the 20+ CRDs it manages, and the
# `Installation` CR itself are intentionally *not* adopted yet — they require a
# low-traffic window and a careful ignore_changes set to cover
# operator-generated defaults on the Installation CR. Follow-up tracked in
# beads code-3ad.
#
# The namespaces are safe to adopt (no networking impact — they're just label
# containers) and give TF an audit trail entry for the labels/tier Kyverno
# cares about.
resource "kubernetes_namespace" "calico_system" {
  metadata {
    name = "calico-system"

    labels = {
      name = "calico-system"

      # Keel enrollment is deliberately left commented out: tigera-operator
      # manages this namespace and reverts the DaemonSet image from its
      # Installation CR, so auto-update is incompatible here.
      # "keel.sh/enrolled" = "true"
    }
  }

  lifecycle {
    # Labels written by other controllers; Terraform must not fight them.
    #   - pod-security.kubernetes.io/*: applied by the tigera-operator
    #     reconciler on calico-system + calico-apiserver (PSA 'privileged').
    #   - KYVERNO_LIFECYCLE_V1: the goldilocks-vpa-auto-mode label that is
    #     stamped on every namespace.
    ignore_changes = [
      metadata[0].labels["pod-security.kubernetes.io/enforce"],
      metadata[0].labels["pod-security.kubernetes.io/enforce-version"],
      metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"],
    ]
  }
}
resource "kubernetes_namespace" "calico_apiserver" {
  metadata {
    name = "calico-apiserver"

    labels = {
      name = "calico-apiserver"
    }
  }

  lifecycle {
    # Externally-managed labels that Terraform leaves alone:
    #   - PSA enforce labels, applied by tigera-operator (same as on the
    #     calico-system namespace).
    #   - KYVERNO_LIFECYCLE_V1: the goldilocks VPA update-mode label.
    ignore_changes = [
      metadata[0].labels["pod-security.kubernetes.io/enforce"],
      metadata[0].labels["pod-security.kubernetes.io/enforce-version"],
      metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"],
    ]
  }
}
resource "kubernetes_namespace" "tigera_operator" {
  metadata {
    name = "tigera-operator"

    labels = {
      name = "tigera-operator"
    }
  }

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: the goldilocks-vpa-auto-mode ClusterPolicy stamps
    # this label on every namespace, so drift on it is expected and ignored.
    ignore_changes = [
      metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"],
    ]
  }
}
# Wave 1 W1.6 (beads code-8ywc): observation phase via Calico GlobalNetworkPolicy
# `action: Log`. This is the supported primitive on Calico OSS v3.26 — the
# Calico-Enterprise FelixConfiguration.flowLogsFileEnabled approach is NOT
# accepted by the OSS CRD (verified 2026-05-19: "strict decoding error").
#
# How it works:
# - GNP selects pods by namespaceSelector
# - egress rule action=Log is rendered as an iptables LOG rule whose output
#   lands in the kernel log / journald with prefix "calico-packet:" on each node
# - Alloy DaemonSet already ships node-journal to Loki (job=node-journal)
# - LogQL query: {job="node-journal"} |= "calico-packet" surfaces egress flows
# - After ~1 week of observation, build the empirical per-namespace egress
# allowlist; then flip the same GNP to [Allow specific dests, Deny rest]
#
# Started with `recruiter-responder` as the pilot on 2026-05-19; expanded
# 2026-05-19 to all tier 3+4 namespaces (per locked plan — tier 3-edge has
# 17 ns, tier 4-aux has 65 ns, all use Calico's WorkloadEndpoint policy
# path). Tier 0/1/2 stay out of observation in wave 1 (cluster infra +
# GPU workloads, deferred per the plan).
#
# `apply_only = true` on the kubectl_manifest means renaming the TF resource
# does NOT destroy the old GNP via TF — we kubectl delete the legacy pilot
# GNP after this applies to clean it up. (Tracked manually.)
resource "kubectl_manifest" "wave1_egress_observe_tier34" {
  # With apply_only set, Terraform applies this manifest but does not destroy
  # the live GlobalNetworkPolicy (e.g. when this resource is renamed); any
  # cleanup of the live object is a manual kubectl delete.
  apply_only = true

  yaml_body = yamlencode({
    apiVersion = "projectcalico.org/v3"
    kind       = "GlobalNetworkPolicy"

    metadata = {
      name = "wave1-egress-observe-tier34"

      annotations = {
        "security.viktorbarzin.me/wave"    = "1"
        "security.viktorbarzin.me/purpose" = "observe-then-enforce egress for tier 3-edge + 4-aux"
      }
    }

    spec = {
      order = 2000

      # Every pod (`all()`) in namespaces labelled tier=3-edge or tier=4-aux.
      selector          = "all()"
      namespaceSelector = "tier in {\"3-edge\", \"4-aux\"}"

      types = ["Egress"]

      # Rule order matters: Log first, then Allow.
      egress = [
        # 1) Log egress traffic. The LOG target writes to the kernel log /
        #    journal, and alloy ships that to Loki with
        #    job=node-journal,transport=kernel.
        #    LogQL: {job="node-journal"} |~ "calico-packet"
        {
          action = "Log"
        },

        # 2) Allow everything, so the observation phase cannot break
        #    workloads.
        #    NOTE(review): this Allow is unconditional; presumably it can
        #    also admit egress that a Kubernetes NetworkPolicy in these
        #    namespaces would otherwise implicitly deny — TODO confirm no
        #    tier 3/4 namespace relies on that.
        {
          action = "Allow"
        },
      ]
    }
  })
}
# CI retrigger 2026-05-16T13:42:57+00:00 — bulk enrollment apply (pipeline #689 killed)
# CI retrigger v2 2026-05-16T13:46:35+00:00
# CI retrigger v3 2026-05-16T14:06:39Z
# CI retrigger v4 2026-05-16T14:13:59Z
# CI retrigger v5 2026-05-16T23:10:38Z
# CI retrigger v6 2026-05-16T23:18:58Z