# infra/stacks/keel/main.tf

# Keel — automated Kubernetes Deployment image updates.
# Design: docs/plans/2026-05-16-auto-upgrade-apps-design.md
# Plan:   docs/plans/2026-05-16-auto-upgrade-apps-plan.md
#
# Operation: Keel polls each watched workload's registry hourly (default
# schedule below; overridable per-workload via keel.sh/pollSchedule).
# Detecting a new digest under the watched tag triggers a Deployment
# update (pod template hash bump → rolling restart). Workloads opt in by
# carrying keel.sh/policy + keel.sh/trigger annotations — those are
# injected cluster-wide by the inject-keel-annotations ClusterPolicy
# (stacks/kyverno/modules/kyverno/keel-annotations.tf) on namespaces
# labeled keel.sh/enrolled=true.
#
# Status: currently DISABLED — the Helm release below sets replicaCount = 0
# (emergency stop, 2026-05-26). Read the comment on replicaCount in
# helm_release.keel before re-enabling.

# Vault KV v2 read of secret/viktor. In this file the only key consumed is
# slack_bot_token, which helm_release.keel hands to the Keel chart so it can
# post upgrade notifications. The token already exists in Vault and is the
# same one used elsewhere — it is not created by this stack.
data "vault_kv_secret_v2" "viktor" {
  name  = "viktor"
  mount = "secret"
}

# Namespace that hosts the Keel release; helm_release.keel takes its
# namespace from this resource's metadata.
resource "kubernetes_namespace" "keel" {
  metadata {
    name   = "keel"
    labels = { tier = local.tiers.cluster }
  }

  lifecycle {
    # The label below is not declared in this resource, so it is presumably
    # applied in-cluster by another controller (TODO confirm which). Ignoring
    # it keeps Terraform from trying to strip it on every plan.
    # KYVERNO_LIFECYCLE_V1
    ignore_changes = [
      metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"],
    ]
  }
}

# Keel Helm release. Chart values are supplied as two YAML documents that
# Helm deep-merges in order:
#   1. all non-secret configuration (diffable in `terraform plan`), and
#   2. the Slack bot token only.
# The token comes from a sensitive Vault data source; keeping it in the same
# yamlencode() call as everything else would mark the entire values string
# sensitive and redact every change to this release (including the
# replicaCount emergency stop) from plan output.
resource "helm_release" "keel" {
  name       = "keel"
  namespace  = kubernetes_namespace.keel.metadata[0].name
  repository = "https://charts.keel.sh"
  chart      = "keel"
  # Latest stable per `helm search repo keel/keel -l` 2026-05-16
  # (app version 0.21.1). 1.0.6 doesn't exist — verify before bumping.
  version    = "1.2.0"

  # Atomic mitigates partial-deploy state. Keel itself is exempt from
  # auto-update (Kyverno mutate excludes the keel namespace), so it only
  # rolls when this stack applies — making atomic safe here.
  atomic = true

  values = [
    # Document 1: non-secret configuration.
    yamlencode({
      # EMERGENCY STOP — scaled to 0 on 2026-05-26 16:42 UTC. Keel was actively
      # rewriting tag strings (not just digests) despite the
      # `keel.sh/match-tag=true` annotation injected by Kyverno that's supposed
      # to constrain it to digest-only watches. Known casualties this round:
      # uptime-kuma (2 → 1, 4h CrashLoopBackOff), n8n (1.80.5 → 0.1.2, silent
      # degradation), beads-server/dolt-workbench (0.3.73 → 0.1.0), and ~10
      # other deployments with downgrade-flavored change-cause annotations.
      # Re-enable only after root-causing why match-tag isn't being enforced,
      # OR after migrating each app to a content-addressed (SHA) tag pin.
      replicaCount = 0
      # Prometheus pod-annotation scrape — picks up Keel-specific metrics
      # (pending_approvals, poll_trigger_tracked_images, registries_scanned_total{image,registry})
      # on container port 9300 /metrics. The cluster's `kubernetes-pods`
      # Prometheus job keys on these annotations. Used by
      # infra/scripts/upgrade_state.sh (the /upgrade-state skill).
      podAnnotations = {
        "prometheus.io/scrape" = "true"
        "prometheus.io/port"   = "9300"
        "prometheus.io/path"   = "/metrics"
      }
      polling = {
        enabled = true
        # Default poll cadence for workloads that don't override per-Deployment
        # via keel.sh/pollSchedule. Decision #8 in the design doc.
        defaultSchedule = "@every 1h"
      }
      helmProvider = {
        enabled = false # We use annotations, not Helm hooks
      }
      notificationLevel = "info"
      persistence = {
        enabled = false
      }
      # Slack notifications: post every rollout to the configured channel.
      # The Keel chart sets SLACK_BOT_TOKEN, SLACK_CHANNELS, etc. on the
      # deployment from these values. slack.botToken is supplied by the
      # second values document below and merged into this map by Helm.
      slack = {
        enabled = true
        channel = "general"
        # No approval flow — opt-out-pure means everything auto-rolls.
        # If we ever introduce gated rollouts, set approvalsChannel here.
      }
      # Keel uses each watched Deployment's own imagePullSecrets to query
      # its registry. Forgejo creds (`registry-credentials`) are auto-synced
      # to every namespace by Kyverno already, so Keel pods don't need a
      # separate pull-secret for their own image (ghcr.io is public).
      rbac = {
        enabled = true
      }
      resources = {
        requests = { cpu = "50m", memory = "64Mi" }
        limits   = { memory = "256Mi" }
      }
    }),
    # Document 2: secrets only. Bot token from Vault
    # (secret/viktor -> slack_bot_token). Kept separate so that only this
    # list element is redacted in plan output.
    yamlencode({
      slack = {
        botToken = data.vault_kv_secret_v2.viktor.data["slack_bot_token"]
      }
    }),
  ]
}