infra/stacks/openclaw/main.tf

variable "tls_secret_name" {
  type      = string
  sensitive = true
}
variable "nfs_server" { type = string }

data "vault_kv_secret_v2" "secrets" {
  mount = "secret"
  name  = "openclaw"
}

locals {
  skill_secrets = jsondecode(data.vault_kv_secret_v2.secrets.data["skill_secrets"])
}


resource "kubernetes_namespace" "openclaw" {
  metadata {
    name = "openclaw"
    labels = {
      tier                                    = local.tiers.aux
      "resource-governance/custom-limitrange" = "true"
      "resource-governance/custom-quota"      = "true"
    }
  }
  lifecycle {
    # KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode ClusterPolicy stamps this label on every namespace
    ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]]
  }
}

module "tls_secret" {
  source          = "../../modules/kubernetes/setup_tls_secret"
  namespace       = kubernetes_namespace.openclaw.metadata[0].name
  tls_secret_name = var.tls_secret_name
}

resource "kubernetes_manifest" "external_secret" {
  manifest = {
    apiVersion = "external-secrets.io/v1beta1"
    kind       = "ExternalSecret"
    metadata = {
      name      = "openclaw-secrets"
      namespace = "openclaw"
    }
    spec = {
      refreshInterval = "15m"
      secretStoreRef = {
        name = "vault-kv"
        kind = "ClusterSecretStore"
      }
      target = {
        name = "openclaw-secrets"
      }
      dataFrom = [{
        extract = {
          key = "openclaw"
        }
      }]
    }
  }
  depends_on = [kubernetes_namespace.openclaw]
}

resource "kubernetes_service_account" "openclaw" {
  metadata {
    name      = "openclaw"
    namespace = kubernetes_namespace.openclaw.metadata[0].name
  }
}

resource "kubernetes_cluster_role_binding" "openclaw" {
  metadata {
    name = "openclaw-cluster-admin"
  }
  subject {
    kind      = "ServiceAccount"
    name      = kubernetes_service_account.openclaw.metadata[0].name
    namespace = kubernetes_namespace.openclaw.metadata[0].name
  }
  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "ClusterRole"
    name      = "cluster-admin"
  }
}

resource "kubernetes_secret" "ssh_key" {
  metadata {
    name      = "ssh-key"
    namespace = kubernetes_namespace.openclaw.metadata[0].name
  }
  data = {
    "id_rsa" = data.vault_kv_secret_v2.secrets.data["ssh_key"]
  }
  type = "generic"
}

resource "kubernetes_config_map" "git_crypt_key" {
  metadata {
    name      = "git-crypt-key"
    namespace = kubernetes_namespace.openclaw.metadata[0].name
  }
  data = {
    "key" = filebase64("${path.root}/../../.git/git-crypt/keys/default")
  }
}

resource "kubernetes_config_map" "openclaw_config" {
  metadata {
    name      = "openclaw-config"
    namespace = kubernetes_namespace.openclaw.metadata[0].name
  }
  data = {
    "openclaw.json" = jsonencode({
      gateway = {
        mode           = "local"
        bind           = "lan"
        trustedProxies = ["10.0.0.0/8"]
        controlUi = {
          dangerouslyDisableDeviceAuth             = true
          dangerouslyAllowHostHeaderOriginFallback = true
        }
      }
      agents = {
        defaults = {
          contextTokens     = 1000000
          bootstrapMaxChars = 30000
          workspace         = "/workspace"
          sandbox = {
            mode = "off"
          }
          model = {
            # ChatGPT Plus OAuth via openai-codex plugin (account: ancaelena98@gmail.com).
            # gpt-5.4-mini is the only mini variant the Codex backend accepts for Plus tier;
            # gpt-5-mini / gpt-5.1-codex-mini return model_not_found / "not supported with
            # ChatGPT account". Plus rate-card: 1,200–7,000 local msgs / 5h on gpt-5.4-mini.
            primary   = "openai-codex/gpt-5.4-mini"
            fallbacks = ["openai-codex/gpt-5.5", "nim/qwen/qwen3-coder-480b-a35b-instruct", "modelrelay/auto-fastest"]
          }
          models = {
            "modelrelay/auto-fastest"                                = {}
            "nim/deepseek-ai/deepseek-v3.2"                          = {}
            "nim/qwen/qwen3.5-397b-a17b"                             = {}
            "nim/mistralai/mistral-large-3-675b-instruct-2512"       = {}
            "nim/qwen/qwen3-coder-480b-a35b-instruct"                = {}
            "nim/nvidia/llama-3.1-nemotron-ultra-253b-v1"            = {}
            "nim/z-ai/glm5"                                          = {}
            "llama-as-openai/Llama-4-Maverick-17B-128E-Instruct-FP8" = {}
            "llama-as-openai/Llama-4-Scout-17B-16E-Instruct-FP8"     = {}
            "openrouter/stepfun/step-3.5-flash:free"                 = {}
            "openrouter/arcee-ai/trinity-large-preview:free"         = {}
            "openai-codex/gpt-5.4-mini"                              = {}
            "openai-codex/gpt-5.5"                                   = {}
          }
        }
      }
      tools = {
        profile = "full"
        deny    = []
        elevated = {
          enabled = true
        }
        exec = {
          host        = "gateway"
          security    = "full"
          ask         = "off"
          pathPrepend = ["/tools", "/workspace"]
        }
        web = {
          search = {
            enabled    = true
            provider   = "brave"
            apiKey     = data.vault_kv_secret_v2.secrets.data["brave_api_key"]
            maxResults = 5
          }
          fetch = {
            enabled        = true
            maxChars       = 50000
            timeoutSeconds = 30
          }
        }
      }
      plugins = {
        allow = ["memory-core"]
        slots = { memory = "memory-core" }
        load = {
          paths = ["/home/node/.openclaw/extensions", "/app/extensions"]
        }
      }
      commands = {
        native       = true
        nativeSkills = true
      }
      channels = {
        telegram = {
          enabled     = true
          botToken    = data.vault_kv_secret_v2.secrets.data["telegram_bot_token"]
          dmPolicy    = "allowlist"
          allowFrom   = ["tg:8281953845"]
          groupPolicy = "allowlist"
          streamMode  = "partial"
        }
      }
      models = {
        mode = "merge"
        providers = {
          modelrelay = {
            baseUrl = "http://127.0.0.1:7352/v1"
            api     = "openai-completions"
            apiKey  = "modelrelay"
            models = [
              { id = "auto-fastest", name = "Auto (Fastest)", reasoning = false, input = ["text"], contextWindow = 200000, maxTokens = 16384, cost = { input = 0, output = 0, cacheRead = 0, cacheWrite = 0 } },
            ]
          }
          nim = {
            baseUrl = "https://integrate.api.nvidia.com/v1"
            api     = "openai-completions"
            apiKey  = data.vault_kv_secret_v2.secrets.data["nvidia_api_key"]
            models = [
              { id = "deepseek-ai/deepseek-v3.2", name = "DeepSeek V3.2", reasoning = false, input = ["text"], contextWindow = 164000, maxTokens = 16384, cost = { input = 0, output = 0, cacheRead = 0, cacheWrite = 0 } },
              { id = "qwen/qwen3.5-397b-a17b", name = "Qwen 3.5", reasoning = true, input = ["text"], contextWindow = 262000, maxTokens = 16384, cost = { input = 0, output = 0, cacheRead = 0, cacheWrite = 0 } },
              { id = "mistralai/mistral-large-3-675b-instruct-2512", name = "Mistral Large 3", reasoning = false, input = ["text"], contextWindow = 262000, maxTokens = 16384, cost = { input = 0, output = 0, cacheRead = 0, cacheWrite = 0 } },
              { id = "qwen/qwen3-coder-480b-a35b-instruct", name = "Qwen 3 Coder", reasoning = false, input = ["text"], contextWindow = 262000, maxTokens = 16384, cost = { input = 0, output = 0, cacheRead = 0, cacheWrite = 0 } },
              { id = "nvidia/llama-3.1-nemotron-ultra-253b-v1", name = "Nemotron Ultra 253B", reasoning = true, input = ["text"], contextWindow = 128000, maxTokens = 16384, cost = { input = 0, output = 0, cacheRead = 0, cacheWrite = 0 } },
              { id = "z-ai/glm5", name = "GLM-5", reasoning = false, input = ["text"], contextWindow = 128000, maxTokens = 16384, cost = { input = 0, output = 0, cacheRead = 0, cacheWrite = 0 } },
            ]
          }
          openrouter = {
            baseUrl = "https://openrouter.ai/api/v1"
            api     = "openai-completions"
            apiKey  = data.vault_kv_secret_v2.secrets.data["openrouter_api_key"]
            models = [
              { id = "stepfun/step-3.5-flash:free", name = "Step 3.5 Flash", reasoning = true, input = ["text"], contextWindow = 256000, maxTokens = 16384, cost = { input = 0, output = 0, cacheRead = 0, cacheWrite = 0 } },
              { id = "arcee-ai/trinity-large-preview:free", name = "Trinity Large", reasoning = false, input = ["text"], contextWindow = 131000, maxTokens = 16384, cost = { input = 0, output = 0, cacheRead = 0, cacheWrite = 0 } },
            ]
          }
          llama-as-openai = {
            baseUrl = "https://api.llama.com/compat/v1"
            apiKey  = data.vault_kv_secret_v2.secrets.data["llama_api_key"]
            api     = "openai-completions"
            models = [
              { id = "Llama-4-Maverick-17B-128E-Instruct-FP8", name = "Llama 4 Maverick", reasoning = false, input = ["text"], contextWindow = 200000, maxTokens = 16384, cost = { input = 0, output = 0, cacheRead = 0, cacheWrite = 0 } },
              { id = "Llama-4-Scout-17B-16E-Instruct-FP8", name = "Llama 4 Scout", reasoning = false, input = ["text"], contextWindow = 200000, maxTokens = 16384, cost = { input = 0, output = 0, cacheRead = 0, cacheWrite = 0 } },
            ]
          }
        }
      }
      wizard = {
        lastRunAt      = "2026-03-01T15:11:54.176Z"
        lastRunVersion = "2026.2.9"
        lastRunCommand = "configure"
        lastRunMode    = "local"
      }
    })
  }
}

resource "random_password" "gateway_token" {
  length  = 32
  special = false
}

# Prometheus exporter script — read by the openclaw-exporter sidecar.
# Stdlib-only Python so no pip install at startup. Reads sessions JSONL +
# auth-profiles.json from the NFS-backed openclaw home volume (mounted ro).
resource "kubernetes_config_map" "openclaw_exporter" {
  metadata {
    name      = "openclaw-exporter"
    namespace = kubernetes_namespace.openclaw.metadata[0].name
  }
  data = {
    "exporter.py" = file("${path.module}/files/exporter.py")
  }
}

module "nfs_tools_host" {
  source     = "../../modules/kubernetes/nfs_volume"
  name       = "openclaw-tools-host"
  namespace  = kubernetes_namespace.openclaw.metadata[0].name
  nfs_server = "192.168.1.127"
  nfs_path   = "/srv/nfs/openclaw/tools"
}

resource "kubernetes_persistent_volume_claim" "home_proxmox" {
  wait_until_bound = false
  metadata {
    name      = "openclaw-home-proxmox"
    namespace = kubernetes_namespace.openclaw.metadata[0].name
    annotations = {
      "resize.topolvm.io/threshold"     = "80%"
      "resize.topolvm.io/increase"      = "100%"
      "resize.topolvm.io/storage_limit" = "5Gi"
    }
  }
  spec {
    access_modes       = ["ReadWriteOnce"]
    storage_class_name = "proxmox-lvm"
    resources {
      requests = {
        storage = "1Gi"
      }
    }
  }
}

module "nfs_workspace_host" {
  source     = "../../modules/kubernetes/nfs_volume"
  name       = "openclaw-workspace-host"
  namespace  = kubernetes_namespace.openclaw.metadata[0].name
  nfs_server = "192.168.1.127"
  nfs_path   = "/srv/nfs/openclaw/workspace"
}

resource "kubernetes_persistent_volume_claim" "data_proxmox" {
  wait_until_bound = false
  metadata {
    name      = "openclaw-data-proxmox"
    namespace = kubernetes_namespace.openclaw.metadata[0].name
    annotations = {
      "resize.topolvm.io/threshold"     = "80%"
      "resize.topolvm.io/increase"      = "100%"
      "resize.topolvm.io/storage_limit" = "5Gi"
    }
  }
  spec {
    access_modes       = ["ReadWriteOnce"]
    storage_class_name = "proxmox-lvm"
    resources {
      requests = {
        storage = "1Gi"
      }
    }
  }
}

## cc-config NFS volume removed — replaced by dotfiles repo clone in init container
## See init_container "install-dotfiles" in the deployment

resource "kubernetes_deployment" "openclaw" {
  metadata {
    name      = "openclaw"
    namespace = kubernetes_namespace.openclaw.metadata[0].name
    labels = {
      app  = "openclaw"
      tier = local.tiers.aux
    }
  }
  spec {
    strategy {
      type = "Recreate"
    }
    replicas = 1
    selector {
      match_labels = {
        app = "openclaw"
      }
    }
    template {
      metadata {
        labels = {
          app = "openclaw"
        }
        annotations = {
          "reloader.stakater.com/search" = "true"
          # Prometheus auto-discovers pods with these annotations.
          # Scraped by the openclaw-exporter sidecar — exposes /metrics on :9099.
          "prometheus.io/scrape" = "true"
          "prometheus.io/port"   = "9099"
          "prometheus.io/path"   = "/metrics"
        }
      }
      spec {
        service_account_name = kubernetes_service_account.openclaw.metadata[0].name

        # Init 0: fix /workspace ownership so node user can write
        init_container {
          name    = "fix-workspace-perms"
          image   = "busybox:1.37"
          command = ["sh", "-c", "chown 1000:1000 /workspace"]
          volume_mount {
            name       = "workspace"
            mount_path = "/workspace"
          }
        }

        # Init 1: copy openclaw.json from ConfigMap into writable NFS home
        init_container {
          name    = "copy-config"
          image   = "busybox:1.37"
          command = ["sh", "-c", "cp /config/openclaw.json /home/node/.openclaw/openclaw.json && chown 1000:1000 /home/node/.openclaw/openclaw.json"]
          volume_mount {
            name       = "openclaw-config"
            mount_path = "/config"
          }
          volume_mount {
            name       = "openclaw-home"
            mount_path = "/home/node/.openclaw"
          }
        }

        # Init 1b: regenerate kubeconfig pointing at the projected SA tokenFile
        # so kubectl always reads the fresh, kubelet-rotated token. Without
        # this the previously-baked kubeconfig retains a SA token bound to a
        # long-dead pod and kubectl returns "must be logged in to the server".
        init_container {
          name    = "setup-kubeconfig"
          image   = "busybox:1.37"
          command = ["sh", "-c", <<-EOT
            cat > /home/node/.openclaw/kubeconfig <<'KUBECONFIG_EOF'
            apiVersion: v1
            kind: Config
            clusters:
            - cluster:
                certificate-authority: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
                server: https://kubernetes.default.svc
              name: in-cluster
            contexts:
            - context:
                cluster: in-cluster
                user: openclaw
                namespace: openclaw
              name: in-cluster
            current-context: in-cluster
            users:
            - name: openclaw
              user:
                tokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
            KUBECONFIG_EOF
            chown 1000:1000 /home/node/.openclaw/kubeconfig
            chmod 0644 /home/node/.openclaw/kubeconfig
          EOT
          ]
          volume_mount {
            name       = "openclaw-home"
            mount_path = "/home/node/.openclaw"
          }
        }

        # Init 2 removed: install-dotfiles init container was cloning dotfiles
        # repo via git on every pod start, causing 200+ small NFS writes.
        # Dotfiles already exist on NFS at /home/node/.openclaw/dotfiles from
        # a previous clone. To update, run git pull manually or via CronJob.

        # Main container: OpenClaw
        container {
          name    = "openclaw"
          image   = "ghcr.io/openclaw/openclaw:2026.5.4"
          # Doctor --fix auto-promotes the highest-tier codex model (gpt-5-pro) after
          # auth-profile-based model discovery; pin gpt-5.4-mini back to default after it.
          command = ["sh", "-c", "node openclaw.mjs doctor --fix 2>/dev/null; node openclaw.mjs models set openai-codex/gpt-5.4-mini 2>/dev/null; exec node openclaw.mjs gateway --allow-unconfigured --bind lan"]
          port {
            container_port = 18789
          }
          readiness_probe {
            tcp_socket {
              port = 18789
            }
            initial_delay_seconds = 30
            period_seconds        = 10
          }
          env {
            name  = "NODE_OPTIONS"
            value = "--max-old-space-size=1536"
          }
          env {
            name  = "OPENCLAW_GATEWAY_TOKEN"
            value = random_password.gateway_token.result
          }
          env {
            name  = "PATH"
            value = "/tools:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
          }
          env {
            name  = "TF_VAR_prod"
            value = "true"
          }
          env {
            name  = "KUBECONFIG"
            value = "/home/node/.openclaw/kubeconfig"
          }
          env {
            name  = "GIT_CONFIG_GLOBAL"
            value = "/home/node/.openclaw/.gitconfig"
          }
          # Skill secrets - Home Assistant
          env {
            name  = "HOME_ASSISTANT_URL"
            value = "https://ha-london.viktorbarzin.me"
          }
          env {
            name  = "HOME_ASSISTANT_TOKEN"
            value = local.skill_secrets["home_assistant_token"]
          }
          env {
            name  = "HOME_ASSISTANT_SOFIA_URL"
            value = "https://ha-sofia.viktorbarzin.me"
          }
          env {
            name  = "HOME_ASSISTANT_SOFIA_TOKEN"
            value = local.skill_secrets["home_assistant_sofia_token"]
          }
          # Skill secrets - Uptime Kuma
          env {
            name  = "UPTIME_KUMA_PASSWORD"
            value = local.skill_secrets["uptime_kuma_password"]
          }
          # Memory API
          env {
            name  = "MEMORY_API_URL"
            value = "http://claude-memory.claude-memory.svc.cluster.local"
          }
          env {
            name = "MEMORY_API_KEY"
            value_from {
              secret_key_ref {
                name = "openclaw-secrets"
                key  = "claude_memory_api_key"
              }
            }
          }
          # Python packages path for skills
          env {
            name  = "PYTHONPATH"
            value = "/tools/python-libs"
          }
          volume_mount {
            name       = "tools"
            mount_path = "/tools"
          }
          volume_mount {
            name       = "workspace"
            mount_path = "/workspace"
          }
          volume_mount {
            name       = "data"
            mount_path = "/data"
          }
          volume_mount {
            name       = "ssh-key"
            mount_path = "/ssh"
          }
          volume_mount {
            name       = "openclaw-home"
            mount_path = "/home/node/.openclaw"
          }
          resources {
            limits = {
              memory = "2Gi"
            }
            requests = {
              cpu    = "100m"
              memory = "2Gi"
            }
          }
        }

        # Sidecar: playwright-mcp — headless browser for agents
        container {
          name  = "playwright-mcp"
          image = "docker.io/viktorbarzin/playwright-mcp:v1"
          args  = ["--headless", "--browser", "chromium", "--no-sandbox", "--port", "3000", "--host", "0.0.0.0"]
          port {
            container_port = 3000
          }
          resources {
            requests = {
              cpu    = "50m"
              memory = "256Mi"
            }
            limits = {
              memory = "512Mi"
            }
          }
        }

        # Sidecar: openclaw-exporter — Prometheus exporter for Codex/OAuth usage.
        # Reads sessions JSONL files + auth-profiles.json, exposes /metrics on :9099.
        # Stdlib-only Python; no pip install at startup.
        container {
          name    = "openclaw-exporter"
          image   = "docker.io/library/python:3.12-slim"
          command = ["python3", "/scripts/exporter.py"]
          port {
            container_port = 9099
            name           = "metrics"
          }
          env {
            name  = "OPENCLAW_HOME"
            value = "/home/node/.openclaw"
          }
          env {
            name  = "METRICS_PORT"
            value = "9099"
          }
          volume_mount {
            name       = "openclaw-exporter-script"
            mount_path = "/scripts"
            read_only  = true
          }
          volume_mount {
            name       = "openclaw-home"
            mount_path = "/home/node/.openclaw"
            read_only  = true
          }
          readiness_probe {
            http_get {
              path = "/healthz"
              port = 9099
            }
            initial_delay_seconds = 5
            period_seconds        = 30
          }
          resources {
            requests = {
              cpu    = "10m"
              memory = "64Mi"
            }
            limits = {
              memory = "128Mi"
            }
          }
        }

        # Sidecar: modelrelay — auto-routes to fastest healthy free model
        container {
          name  = "modelrelay"
          image = "docker.io/library/node:22-alpine"
          command = ["sh", "-c", <<-EOF
            if [ ! -f /tools/modelrelay/node_modules/.package-lock.json ]; then
              mkdir -p /tools/modelrelay
              cd /tools/modelrelay
              npm init -y > /dev/null 2>&1
              npm install modelrelay > /dev/null 2>&1
            fi
            cd /tools/modelrelay
            exec npx modelrelay --port 7352
          EOF
          ]
          port {
            container_port = 7352
          }
          env {
            name = "NVIDIA_API_KEY"
            value_from {
              secret_key_ref {
                name = "openclaw-secrets"
                key  = "nvidia_api_key"
              }
            }
          }
          env {
            name = "OPENROUTER_API_KEY"
            value_from {
              secret_key_ref {
                name = "openclaw-secrets"
                key  = "openrouter_api_key"
              }
            }
          }
          volume_mount {
            name       = "tools"
            mount_path = "/tools"
          }
          resources {
            limits = {
              memory = "256Mi"
            }
            requests = {
              cpu    = "25m"
              memory = "128Mi"
            }
          }
        }

        volume {
          name = "tools"
          persistent_volume_claim {
            claim_name = module.nfs_tools_host.claim_name
          }
        }
        volume {
          name = "openclaw-home"
          persistent_volume_claim {
            claim_name = kubernetes_persistent_volume_claim.home_proxmox.metadata[0].name
          }
        }
        volume {
          name = "workspace"
          persistent_volume_claim {
            claim_name = module.nfs_workspace_host.claim_name
          }
        }
        volume {
          name = "data"
          persistent_volume_claim {
            claim_name = kubernetes_persistent_volume_claim.data_proxmox.metadata[0].name
          }
        }
        volume {
          name = "ssh-key"
          secret {
            secret_name  = kubernetes_secret.ssh_key.metadata[0].name
            default_mode = "0600"
          }
        }
        volume {
          name = "openclaw-config"
          config_map {
            name = kubernetes_config_map.openclaw_config.metadata[0].name
          }
        }
        volume {
          name = "openclaw-exporter-script"
          config_map {
            name         = kubernetes_config_map.openclaw_exporter.metadata[0].name
            default_mode = "0555"
          }
        }
      }
    }
  }
  lifecycle {
    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
    ignore_changes = [spec[0].template[0].spec[0].dns_config]
  }
}

resource "kubernetes_service" "openclaw" {
  metadata {
    name      = "openclaw"
    namespace = kubernetes_namespace.openclaw.metadata[0].name
    labels = {
      app = "openclaw"
    }
  }
  spec {
    selector = {
      app = "openclaw"
    }
    port {
      port        = 80
      target_port = 18789
    }
  }
}

module "ingress" {
  source          = "../../modules/kubernetes/ingress_factory"
  dns_type        = "non-proxied"
  namespace       = kubernetes_namespace.openclaw.metadata[0].name
  name            = "openclaw"
  tls_secret_name = var.tls_secret_name
  port            = 80
  protected       = true
  extra_annotations = {
    "gethomepage.dev/enabled"      = "true"
    "gethomepage.dev/name"         = "OpenClaw"
    "gethomepage.dev/description"  = "AI assistant"
    "gethomepage.dev/icon"         = "openai.png"
    "gethomepage.dev/group"        = "AI & Data"
    "gethomepage.dev/pod-selector" = ""
  }
}

# --- Webhook receiver: triggers task-processor Job on Forgejo issue events ---

resource "kubernetes_config_map" "task_webhook" {
  metadata {
    name      = "task-webhook"
    namespace = kubernetes_namespace.openclaw.metadata[0].name
  }
  data = {
    "server.py" = <<-PYEOF
      from http.server import HTTPServer, BaseHTTPRequestHandler
      import subprocess, time, json, os

      BOT_USER = os.environ.get('FORGEJO_BOT_USER', 'viktor')

      class Handler(BaseHTTPRequestHandler):
          def do_POST(self):
              try:
                  body = self.rfile.read(int(self.headers.get('Content-Length', 0)))
                  data = json.loads(body)
                  action = data.get('action', '')

                  # Trigger on: new issue, reopened issue, or new comment
                  trigger = False
                  if action in ('opened', 'reopened'):
                      issue = data.get('issue', {})
                      print(f"Issue #{issue.get('number','?')} {action}: {issue.get('title','?')}")
                      trigger = True
                  elif action == 'created' and 'comment' in data:
                      comment = data.get('comment', {})
                      commenter = comment.get('user', {}).get('login', '')
                      # Skip comments from the bot itself to avoid loops
                      if commenter != BOT_USER:
                          issue = data.get('issue', {})
                          print(f"Comment on #{issue.get('number','?')} by {commenter}")
                          trigger = True
                      else:
                          print(f"Skipping own comment on #{data.get('issue',{}).get('number','?')}")

                  if trigger:
                      job_name = f"task-processor-{int(time.time())}"
                      subprocess.run([
                          'kubectl', 'create', 'job', job_name,
                          '--from=cronjob/task-processor',
                          '-n', 'openclaw'
                      ], check=True)
                      self.send_response(200)
                      self.end_headers()
                      self.wfile.write(b'{"ok":true}')
                  else:
                      self.send_response(200)
                      self.end_headers()
                      self.wfile.write(b'{"ok":true,"skipped":true}')
              except Exception as e:
                  print(f"Error: {e}")
                  self.send_response(500)
                  self.end_headers()
                  self.wfile.write(f'{{"error":"{e}"}}'.encode())

          def do_GET(self):
              self.send_response(200)
              self.end_headers()
              self.wfile.write(b'{"status":"ok"}')

          def log_message(self, fmt, *args):
              print(f"[webhook] {args[0]} {args[1]} {args[2]}")

      print("Task webhook receiver listening on :8080")
      HTTPServer(('', 8080), Handler).serve_forever()
    PYEOF
  }
}

resource "kubernetes_service_account" "task_webhook" {
  metadata {
    name      = "task-webhook"
    namespace = kubernetes_namespace.openclaw.metadata[0].name
  }
}

resource "kubernetes_role" "task_webhook" {
  metadata {
    name      = "task-webhook-job-creator"
    namespace = kubernetes_namespace.openclaw.metadata[0].name
  }
  rule {
    api_groups = ["batch"]
    resources  = ["jobs", "cronjobs"]
    verbs      = ["get", "list", "create"]
  }
}

resource "kubernetes_role_binding" "task_webhook" {
  metadata {
    name      = "task-webhook-job-creator"
    namespace = kubernetes_namespace.openclaw.metadata[0].name
  }
  subject {
    kind      = "ServiceAccount"
    name      = kubernetes_service_account.task_webhook.metadata[0].name
    namespace = kubernetes_namespace.openclaw.metadata[0].name
  }
  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "Role"
    name      = kubernetes_role.task_webhook.metadata[0].name
  }
}

resource "kubernetes_deployment" "task_webhook" {
  metadata {
    name      = "task-webhook"
    namespace = kubernetes_namespace.openclaw.metadata[0].name
    labels = {
      app  = "task-webhook"
      tier = local.tiers.aux
    }
  }
  spec {
    replicas = 1
    selector {
      match_labels = {
        app = "task-webhook"
      }
    }
    template {
      metadata {
        labels = {
          app = "task-webhook"
        }
      }
      spec {
        service_account_name = kubernetes_service_account.task_webhook.metadata[0].name
        container {
          name    = "webhook"
          image   = "python:3-alpine"
          command = ["sh", "-c", "apk add --no-cache curl > /dev/null 2>&1 && curl -sfL https://dl.k8s.io/release/v1.34.2/bin/linux/amd64/kubectl -o /usr/local/bin/kubectl && chmod +x /usr/local/bin/kubectl && exec python3 -u /app/server.py"]
          port {
            container_port = 8080
          }
          volume_mount {
            name       = "app"
            mount_path = "/app"
          }
          resources {
            requests = {
              cpu    = "5m"
              memory = "64Mi"
            }
            limits = {
              memory = "64Mi"
            }
          }
        }
        volume {
          name = "app"
          config_map {
            name = kubernetes_config_map.task_webhook.metadata[0].name
          }
        }
      }
    }
  }
  lifecycle {
    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
    ignore_changes = [spec[0].template[0].spec[0].dns_config]
  }
}

resource "kubernetes_service" "task_webhook" {
  metadata {
    name      = "task-webhook"
    namespace = kubernetes_namespace.openclaw.metadata[0].name
    labels = {
      app = "task-webhook"
    }
  }
  spec {
    selector = {
      app = "task-webhook"
    }
    port {
      port        = 80
      target_port = 8080
    }
  }
}

module "task_webhook_ingress" {
  source           = "../../modules/kubernetes/ingress_factory"
  namespace        = kubernetes_namespace.openclaw.metadata[0].name
  name             = "task-webhook"
  tls_secret_name  = var.tls_secret_name
  host             = "task-webhook"
  port             = 80
  external_monitor = false
}

# --- Shared ServiceAccount: grants pod-exec into the openclaw pod ---
# Used by the task_processor CronJob (below). Previously also used by the
# cluster_healthcheck CronJob, which has been decommissioned — the local
# `scripts/cluster_healthcheck.sh` is now the single authoritative runner.

resource "kubernetes_service_account" "healthcheck" {
  metadata {
    name      = "cluster-healthcheck"
    namespace = kubernetes_namespace.openclaw.metadata[0].name
  }
}

resource "kubernetes_role" "healthcheck_exec" {
  metadata {
    name      = "healthcheck-pod-exec"
    namespace = kubernetes_namespace.openclaw.metadata[0].name
  }
  rule {
    api_groups = [""]
    resources  = ["pods"]
    verbs      = ["get", "list"]
  }
  rule {
    api_groups = [""]
    resources  = ["pods/exec"]
    verbs      = ["create"]
  }
}

resource "kubernetes_role_binding" "healthcheck_exec" {
  metadata {
    name      = "healthcheck-pod-exec"
    namespace = kubernetes_namespace.openclaw.metadata[0].name
  }
  subject {
    kind      = "ServiceAccount"
    name      = kubernetes_service_account.healthcheck.metadata[0].name
    namespace = kubernetes_namespace.openclaw.metadata[0].name
  }
  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "Role"
    name      = kubernetes_role.healthcheck_exec.metadata[0].name
  }
}

# --- CronJob: Task processor — polls Forgejo issues and triggers OpenClaw ---

resource "kubernetes_cron_job_v1" "task_processor" {
  metadata {
    name      = "task-processor"
    namespace = kubernetes_namespace.openclaw.metadata[0].name
    labels = {
      app  = "task-processor"
      tier = local.tiers.aux
    }
  }
  spec {
    schedule                      = "*/5 * * * *"
    concurrency_policy            = "Forbid"
    failed_jobs_history_limit     = 3
    successful_jobs_history_limit = 3

    job_template {
      metadata {
        labels = {
          app = "task-processor"
        }
      }
      spec {
        active_deadline_seconds    = 600
        backoff_limit              = 0
        ttl_seconds_after_finished = 86400
        template {
          metadata {
            labels = {
              app = "task-processor"
            }
          }
          spec {
            service_account_name = kubernetes_service_account.healthcheck.metadata[0].name
            restart_policy       = "Never"

            container {
              name  = "task-processor"
              image = "bitnami/kubectl:latest"
              command = ["bash", "-c", <<-EOF
                # Find the openclaw pod
                POD=$(kubectl get pods -n openclaw -l app=openclaw -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)
                if [ -z "$POD" ]; then
                  echo "ERROR: OpenClaw pod not found"
                  exit 1
                fi
                echo "Executing task processor in pod $POD..."
                kubectl exec -n openclaw "$POD" -c openclaw -- \
                  env FORGEJO_TOKEN="$FORGEJO_TOKEN" \
                      FORGEJO_URL="http://forgejo.forgejo.svc.cluster.local" \
                      OPENCLAW_TOKEN="$OPENCLAW_TOKEN" \
                      OPENCLAW_URL="https://integrate.api.nvidia.com" \
                  bash /workspace/infra/scripts/task-processor.sh
              EOF
              ]

              env {
                name = "FORGEJO_TOKEN"
                value_from {
                  secret_key_ref {
                    name = "openclaw-secrets"
                    key  = "forgejo_api_token"
                  }
                }
              }
              env {
                name = "OPENCLAW_TOKEN"
                value_from {
                  secret_key_ref {
                    name = "openclaw-secrets"
                    key  = "nvidia_api_key"
                  }
                }
              }

              resources {
                requests = {
                  cpu    = "50m"
                  memory = "64Mi"
                }
                limits = {
                  memory = "64Mi"
                }
              }
            }
          }
        }
      }
    }
  }
  lifecycle {
    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
  }
}

# --- OpenLobster: Multi-user Telegram AI assistant (trial) ---

resource "kubernetes_persistent_volume_claim" "openlobster_data_proxmox" {
  wait_until_bound = false
  metadata {
    name      = "openlobster-data-proxmox"
    namespace = kubernetes_namespace.openclaw.metadata[0].name
    annotations = {
      "resize.topolvm.io/threshold"     = "80%"
      "resize.topolvm.io/increase"      = "100%"
      "resize.topolvm.io/storage_limit" = "5Gi"
    }
  }
  spec {
    access_modes       = ["ReadWriteOnce"]
    storage_class_name = "proxmox-lvm"
    resources {
      requests = {
        storage = "1Gi"
      }
    }
  }
}

resource "random_password" "openlobster_graphql_token" {
  length  = 32
  special = false
}

resource "kubernetes_deployment" "openlobster" {
  metadata {
    name      = "openlobster"
    namespace = kubernetes_namespace.openclaw.metadata[0].name
    labels = {
      app  = "openlobster"
      tier = local.tiers.aux
    }
  }
  spec {
    strategy {
      type = "Recreate"
    }
    replicas = 0
    selector {
      match_labels = {
        app = "openlobster"
      }
    }
    template {
      metadata {
        labels = {
          app = "openlobster"
        }
      }
      spec {
        # node4 has corrupted containerd content store — avoid it
        affinity {
          node_affinity {
            required_during_scheduling_ignored_during_execution {
              node_selector_term {
                match_expressions {
                  key      = "kubernetes.io/hostname"
                  operator = "NotIn"
                  values   = ["k8s-node4"]
                }
              }
            }
          }
        }
        container {
          name  = "openlobster"
          image = "ghcr.io/neirth/openlobster/openlobster:latest"
          port {
            container_port = 8080
          }
          env {
            name  = "OPENLOBSTER_GRAPHQL_AUTH_TOKEN"
            value = random_password.openlobster_graphql_token.result
          }
          env {
            name = "OPENLOBSTER_PROVIDERS_ANTHROPIC_API_KEY"
            value_from {
              secret_key_ref {
                name = "openclaw-secrets"
                key  = "anthropic_api_key"
              }
            }
          }
          env {
            name  = "OPENLOBSTER_PROVIDERS_ANTHROPIC_MODEL"
            value = "claude-sonnet-4-20250514"
          }
          env {
            name = "OPENLOBSTER_CHANNELS_TELEGRAM_TOKEN"
            value_from {
              secret_key_ref {
                name = "openclaw-secrets"
                key  = "telegram_bot_token"
              }
            }
          }
          env {
            name  = "OPENLOBSTER_DATABASE_DRIVER"
            value = "sqlite"
          }
          env {
            name  = "OPENLOBSTER_DATABASE_DSN"
            value = "/app/data/openlobster.db"
          }
          env {
            name  = "OPENLOBSTER_AGENT_NAME"
            value = "Lobster"
          }
          env {
            name  = "OPENLOBSTER_MEMORY_BACKEND"
            value = "file"
          }
          volume_mount {
            name       = "openlobster-data"
            mount_path = "/app/data"
          }
          resources {
            requests = {
              cpu    = "10m"
              memory = "64Mi"
            }
            limits = {
              memory = "256Mi"
            }
          }
        }
        volume {
          name = "openlobster-data"
          persistent_volume_claim {
            claim_name = kubernetes_persistent_volume_claim.openlobster_data_proxmox.metadata[0].name
          }
        }
      }
    }
  }
  lifecycle {
    ignore_changes = [spec[0].template[0].spec[0].dns_config] # KYVERNO_LIFECYCLE_V1
  }
}

resource "kubernetes_service" "openlobster" {
  metadata {
    name      = "openlobster"
    namespace = kubernetes_namespace.openclaw.metadata[0].name
    labels = {
      app = "openlobster"
    }
  }
  spec {
    selector = {
      app = "openlobster"
    }
    port {
      port        = 80
      target_port = 8080
    }
  }
}

module "openlobster_ingress" {
  source          = "../../modules/kubernetes/ingress_factory"
  dns_type        = "proxied"
  namespace       = kubernetes_namespace.openclaw.metadata[0].name
  name            = "openlobster"
  tls_secret_name = var.tls_secret_name
  host            = "openlobster"
  port            = 80
  protected       = true
}
-												[ci skip] phase 5+6: update CI pipelines for SOPS, add sensitive=true to secret vars

Phase 5 — CI pipelines:
- default.yml: add SOPS decrypt in prepare step, change git add . to
  specific paths (stacks/ state/ .woodpecker/), cleanup on success+failure
- renew-tls.yml: change git add . to git add secrets/ state/

Phase 6 — sensitive=true:
- Add sensitive = true to 256 variable declarations across 149 stack files
- Prevents secret values from appearing in terraform plan output
- Does NOT modify shared modules (ingress_factory, nfs_volume) to avoid
  breaking module interface contracts

Note: CI pipeline SOPS decryption requires sops_age_key Woodpecker secret
to be created before the pipeline will work with SOPS. Until then, the old
terraform.tfvars path continues to function.

											
										
										
											2026-03-07 14:30:36 +00:00
+								variable "tls_secret_name" {
-												Remove all CPU limits cluster-wide to eliminate CFS throttling

CPU limits cause CFS throttling even when nodes have idle capacity.
Move to a request-only CPU model: keep CPU requests for scheduling
fairness but remove all CPU limits. Memory limits stay (incompressible).

Changes across 108 files:
- Kyverno LimitRange policy: remove cpu from default/max in all 6 tiers
- Kyverno ResourceQuota policy: remove limits.cpu from all 5 tiers
- Custom ResourceQuotas: remove limits.cpu from 8 namespace quotas
- Custom LimitRanges: remove cpu from default/max (nextcloud, onlyoffice)
- RBAC module: remove cpu_limits variable and quota reference
- Freedify factory: remove cpu_limit variable and limits reference
- 86 deployment files: remove cpu from all limits blocks
- 6 Helm values files: remove cpu under limits sections

											
										
										
											2026-03-14 08:51:45 +00:00
+								  type      = string
-												[ci skip] phase 5+6: update CI pipelines for SOPS, add sensitive=true to secret vars

Phase 5 — CI pipelines:
- default.yml: add SOPS decrypt in prepare step, change git add . to
  specific paths (stacks/ state/ .woodpecker/), cleanup on success+failure
- renew-tls.yml: change git add . to git add secrets/ state/

Phase 6 — sensitive=true:
- Add sensitive = true to 256 variable declarations across 149 stack files
- Prevents secret values from appearing in terraform plan output
- Does NOT modify shared modules (ingress_factory, nfs_volume) to avoid
  breaking module interface contracts

Note: CI pipeline SOPS decryption requires sops_age_key Woodpecker secret
to be created before the pipeline will work with SOPS. Until then, the old
terraform.tfvars path continues to function.

											
										
										
											2026-03-07 14:30:36 +00:00
+								  sensitive = true
 								}
-												migrate all secrets from SOPS to Vault KV

- Add vault provider to root terragrunt.hcl (generated providers.tf)
- Delete stacks/vault/vault_provider.tf (now in generated providers.tf)
- Add 124 variable declarations + 43 vault_kv_secret_v2 resources to
  vault/main.tf to populate Vault KV at secret/<stack-name>
- Migrate 43 consuming stacks to read secrets from Vault KV via
  data "vault_kv_secret_v2" instead of SOPS var-file
- Add dependency "vault" to all migrated stacks' terragrunt.hcl
- Complex types (maps/lists) stored as JSON strings, decoded with
  jsondecode() in locals blocks

Bootstrap secrets (vault_root_token, vault_authentik_client_id,
vault_authentik_client_secret) remain in SOPS permanently.

Apply order: vault stack first (populates KV), then all others.

											
										
										
											2026-03-14 17:15:48 +00:00
+								variable "nfs_server" { type = string }
 								data "vault_kv_secret_v2" "secrets" {
 								  mount = "secret"
 								  name  = "openclaw"
-												[ci skip] add Forgejo task pipeline for OpenClaw AI agent

Forgejo issues as a task queue for OpenClaw:
- Forgejo OAuth2 with Authentik SSO, self-registration disabled
- Webhook-triggered task processing (instant) + CronJob backup (5min poll)
- Tasks processed via Mistral Large 3 (NVIDIA NIM API)
- Results posted as issue comments, auto-labeled and closed
- Comment follow-ups and reopened issues supported
- n8n RBAC for OpenClaw pod exec (future workflow integration)

											
										
										
											2026-03-07 21:09:31 +00:00
+								}
-												migrate all secrets from SOPS to Vault KV

- Add vault provider to root terragrunt.hcl (generated providers.tf)
- Delete stacks/vault/vault_provider.tf (now in generated providers.tf)
- Add 124 variable declarations + 43 vault_kv_secret_v2 resources to
  vault/main.tf to populate Vault KV at secret/<stack-name>
- Migrate 43 consuming stacks to read secrets from Vault KV via
  data "vault_kv_secret_v2" instead of SOPS var-file
- Add dependency "vault" to all migrated stacks' terragrunt.hcl
- Complex types (maps/lists) stored as JSON strings, decoded with
  jsondecode() in locals blocks

Bootstrap secrets (vault_root_token, vault_authentik_client_id,
vault_authentik_client_secret) remain in SOPS permanently.

Apply order: vault stack first (populates KV), then all others.

											
										
										
											2026-03-14 17:15:48 +00:00
 								locals {
 								  skill_secrets = jsondecode(data.vault_kv_secret_v2.secrets.data["skill_secrets"])
-												fix: eliminate memory overcommit to prevent node OOM crashes

Set requests = limits (Guaranteed QoS) across LimitRange defaults and
explicit pod resources. Node2 crashed 2026-03-14 from 250% memory
overcommit (61GB limits on 24GB node).

Changes:
- LimitRange: default = defaultRequest for all 6 tiers
- Grafana: 3 → 2 replicas
- Grampsweb: document why replicas=0
- Prometheus: 1Gi/4Gi → 3Gi/3Gi
- OpenClaw: 512Mi/2Gi → 768Mi/768Mi
- Immich server: 256Mi/2Gi → 512Mi/512Mi
- Immich postgresql: 256Mi/1Gi → 512Mi/512Mi
- Calibre: 256Mi/1536Mi → 256Mi/256Mi
- Linkwarden: 256Mi/1536Mi → 768Mi/768Mi
- N8N: 256Mi/1Gi → 512Mi/512Mi
- MySQL cluster: 1Gi/3-4Gi → 2Gi/2Gi
- pg-cluster (CNPG): 512Mi/4Gi → 512Mi/512Mi
- DBaaS ResourceQuota limits.memory: 64Gi → 12Gi

[ci skip]

											
										
										
											2026-03-14 16:01:41 +00:00
+								}
-												[ci skip] Phase 3: Create 66 service stacks and migrate state

Generated individual stack directories for all 66 services under stacks/.
Each stack has terragrunt.hcl (depends on platform) and main.tf (thin
wrapper calling existing module). Migrated all 64 active service states
from root terraform.tfstate to individual state files. Root state is now
empty. Verified with terragrunt plan on multiple stacks (no changes).

											
										
										
											2026-02-22 13:56:34 +00:00
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								resource "kubernetes_namespace" "openclaw" {
 								  metadata {
 								    name = "openclaw"
 								    labels = {
-												openclaw: replace cc-config NFS with dotfiles repo clone [ci skip]

- Add init container "install-dotfiles" that clones the dotfiles repo
  and installs skills/agents/hooks to OpenClaw's home directory
- Remove nfs_cc_config module and its volume mount
- Skills/agents now come from the same chezmoi-managed dotfiles repo
  that manages the Mac config, eliminating the dual-sync problem

											
										
										
											2026-03-15 16:04:02 +00:00
+								      tier                                    = local.tiers.aux
 								      "resource-governance/custom-limitrange" = "true"
 								      "resource-governance/custom-quota"      = "true"
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								    }
 								  }
-												[infra] Suppress Goldilocks vpa-update-mode label drift on all namespaces [ci skip]

## Context

Wave 3B-continued: the Goldilocks VPA dashboard (stacks/vpa) runs a Kyverno
ClusterPolicy `goldilocks-vpa-auto-mode` that mutates every namespace with
`metadata.labels["goldilocks.fairwinds.com/vpa-update-mode"] = "off"`. This
is intentional — Terraform owns container resource limits, and Goldilocks
should only provide recommendations, never auto-update. The label is how
Goldilocks decides per-namespace whether to run its VPA in `off` mode.

Effect on Terraform: every `kubernetes_namespace` resource shows the label
as pending-removal (`-> null`) on every `scripts/tg plan`. Dawarich survey
2026-04-18 confirmed the drift. Cluster-side count: 88 namespaces carry the
label (`kubectl get ns -o json | jq ... | wc -l`). Every TF-managed namespace
is affected.

This commit brings the intentional admission drift under the same
`# KYVERNO_LIFECYCLE_V1` discoverability marker introduced in c9d221d5 for
the ndots dns_config pattern. The marker now stands generically for any
Kyverno admission-webhook drift suppression; the inline comment records
which specific policy stamps which specific field so future grep audits
show why each suppression exists.

## This change

107 `.tf` files touched — every stack's `resource "kubernetes_namespace"`
resource gets:

```hcl
lifecycle {
  # KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode ClusterPolicy stamps this label on every namespace
  ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]]
}
```

Injection was done with a brace-depth-tracking Python pass (`/tmp/add_goldilocks_ignore.py`):
match `^resource "kubernetes_namespace" ` → track `{` / `}` until the
outermost closing brace → insert the lifecycle block before the closing
brace. The script is idempotent (skips any file that already mentions
`goldilocks.fairwinds.com/vpa-update-mode`) so re-running is safe.

Vault stack picked up 2 namespaces in the same file (k8s-users produces
one, plus a second explicit ns) — confirmed via file diff (+8 lines).

## What is NOT in this change

- `stacks/trading-bot/main.tf` — entire file is `/* … */` commented out
  (paused 2026-04-06 per user decision). Reverted after the script ran.
- `stacks/_template/main.tf.example` — per-stack skeleton, intentionally
  minimal. User keeps it that way. Not touched by the script (file
  has no real `resource "kubernetes_namespace"` — only a placeholder
  comment).
- `.terraform/` copies (e.g. `stacks/metallb/.terraform/modules/...`) —
  gitignored, won't commit; the live path was edited.
- `terraform fmt` cleanup of adjacent pre-existing alignment issues in
  authentik, freedify, hermes-agent, nvidia, vault, meshcentral. Reverted
  to keep the commit scoped to the Goldilocks sweep. Those files will
  need a separate fmt-only commit or will be cleaned up on next real
  apply to that stack.

## Verification

Dawarich (one of the hundred-plus touched stacks) showed the pattern
before and after:

```
$ cd stacks/dawarich && ../../scripts/tg plan

Before:
  Plan: 0 to add, 2 to change, 0 to destroy.
   # kubernetes_namespace.dawarich will be updated in-place
     (goldilocks.fairwinds.com/vpa-update-mode -> null)
   # module.tls_secret.kubernetes_secret.tls_secret will be updated in-place
     (Kyverno generate.* labels — fixed in 8d94688d)

After:
  No changes. Your infrastructure matches the configuration.
```

Injection count check:
```
$ rg -c 'KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode' stacks/ | awk -F: '{s+=$2} END {print s}'
108
```

## Reproduce locally
1. `git pull`
2. Pick any stack: `cd stacks/<name> && ../../scripts/tg plan`
3. Expect: no drift on the namespace's goldilocks.fairwinds.com/vpa-update-mode label.

Closes: code-dwx

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-18 21:15:27 +00:00
+								  lifecycle {
 								    # KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode ClusterPolicy stamps this label on every namespace
 								    ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]]
 								  }
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								}
 								module "tls_secret" {
 								  source          = "../../modules/kubernetes/setup_tls_secret"
 								  namespace       = kubernetes_namespace.openclaw.metadata[0].name
 								  tls_secret_name = var.tls_secret_name
 								}
-												migrate consuming stacks to ESO + remove k8s-dashboard static token

Phase 9: ExternalSecret migration across 26 stacks:

Fully migrated (vault data source removed, ESO delivers secrets):
- speedtest, shadowsocks, wealthfolio, plotting-book, f1-stream, tandoor
- n8n, dawarich, diun, netbox, onlyoffice, tuya-bridge
- hackmd (ESO template for DB URL), health (ESO template for DB URL)
- trading-bot (ESO template for DATABASE_URL + 7 secret env vars)
- forgejo (removed unused vault data source)

Partially migrated (vault kept for plan-time, ESO added for runtime):
- immich, linkwarden, nextcloud, paperless-ngx (jsondecode for homepage)
- claude-memory, rybbit, url, webhook_handler (plan-time in locals/jobs)
- woodpecker, openclaw, resume (plan-time in helm values/jobs/modules)

17 stacks unchanged (all plan-time: homepage annotations, configmaps,
module inputs) — vault data source works with OIDC auth.

Phase 17a: Remove k8s-dashboard static admin token secret.
Users now get tokens via: vault write kubernetes/creds/dashboard-admin

											
										
										
											2026-03-15 19:05:04 +00:00
+								resource "kubernetes_manifest" "external_secret" {
 								  manifest = {
 								    apiVersion = "external-secrets.io/v1beta1"
 								    kind       = "ExternalSecret"
 								    metadata = {
 								      name      = "openclaw-secrets"
 								      namespace = "openclaw"
 								    }
 								    spec = {
 								      refreshInterval = "15m"
 								      secretStoreRef = {
 								        name = "vault-kv"
 								        kind = "ClusterSecretStore"
 								      }
 								      target = {
 								        name = "openclaw-secrets"
 								      }
 								      dataFrom = [{
 								        extract = {
 								          key = "openclaw"
 								        }
 								      }]
 								    }
 								  }
 								  depends_on = [kubernetes_namespace.openclaw]
 								}
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								resource "kubernetes_service_account" "openclaw" {
 								  metadata {
 								    name      = "openclaw"
 								    namespace = kubernetes_namespace.openclaw.metadata[0].name
 								  }
 								}
 								resource "kubernetes_cluster_role_binding" "openclaw" {
 								  metadata {
 								    name = "openclaw-cluster-admin"
 								  }
 								  subject {
 								    kind      = "ServiceAccount"
 								    name      = kubernetes_service_account.openclaw.metadata[0].name
 								    namespace = kubernetes_namespace.openclaw.metadata[0].name
 								  }
 								  role_ref {
 								    api_group = "rbac.authorization.k8s.io"
 								    kind      = "ClusterRole"
 								    name      = "cluster-admin"
 								  }
 								}
 								resource "kubernetes_secret" "ssh_key" {
 								  metadata {
 								    name      = "ssh-key"
 								    namespace = kubernetes_namespace.openclaw.metadata[0].name
 								  }
 								  data = {
-												migrate all secrets from SOPS to Vault KV

- Add vault provider to root terragrunt.hcl (generated providers.tf)
- Delete stacks/vault/vault_provider.tf (now in generated providers.tf)
- Add 124 variable declarations + 43 vault_kv_secret_v2 resources to
  vault/main.tf to populate Vault KV at secret/<stack-name>
- Migrate 43 consuming stacks to read secrets from Vault KV via
  data "vault_kv_secret_v2" instead of SOPS var-file
- Add dependency "vault" to all migrated stacks' terragrunt.hcl
- Complex types (maps/lists) stored as JSON strings, decoded with
  jsondecode() in locals blocks

Bootstrap secrets (vault_root_token, vault_authentik_client_id,
vault_authentik_client_secret) remain in SOPS permanently.

Apply order: vault stack first (populates KV), then all others.

											
										
										
											2026-03-14 17:15:48 +00:00
+								    "id_rsa" = data.vault_kv_secret_v2.secrets.data["ssh_key"]
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								  }
 								  type = "generic"
 								}
 								resource "kubernetes_config_map" "git_crypt_key" {
 								  metadata {
 								    name      = "git-crypt-key"
 								    namespace = kubernetes_namespace.openclaw.metadata[0].name
 								  }
 								  data = {
 								    "key" = filebase64("${path.root}/../../.git/git-crypt/keys/default")
 								  }
 								}
 								resource "kubernetes_config_map" "openclaw_config" {
 								  metadata {
 								    name      = "openclaw-config"
 								    namespace = kubernetes_namespace.openclaw.metadata[0].name
 								  }
 								  data = {
 								    "openclaw.json" = jsonencode({
 								      gateway = {
-												[ci skip] openclaw: fix Telegram, update to v2026.2.26, fix startup issues

- Update OpenClaw from v2026.2.9 to v2026.2.26 (fixes Telegram channel)
- Add gateway.mode=local + wizard block (required for channel startup)
- Add dangerouslyAllowHostHeaderOriginFallback (v2026.2.26 requirement)
- Run doctor --fix at container startup to auto-enable Telegram
- Create required dirs (canvas, devices, cron, sessions, credentials)
- Fix permissions: chown -R 1000:1000 for node user
- Telegram: DM allowlist, user 8281953845 only

											
										
										
											2026-03-01 15:47:54 +00:00
+								        mode           = "local"
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								        bind           = "lan"
 								        trustedProxies = ["10.0.0.0/8"]
 								        controlUi = {
-												[ci skip] openclaw: fix Telegram, update to v2026.2.26, fix startup issues

- Update OpenClaw from v2026.2.9 to v2026.2.26 (fixes Telegram channel)
- Add gateway.mode=local + wizard block (required for channel startup)
- Add dangerouslyAllowHostHeaderOriginFallback (v2026.2.26 requirement)
- Run doctor --fix at container startup to auto-enable Telegram
- Create required dirs (canvas, devices, cron, sessions, credentials)
- Fix permissions: chown -R 1000:1000 for node user
- Telegram: DM allowlist, user 8281953845 only

											
										
										
											2026-03-01 15:47:54 +00:00
+								          dangerouslyDisableDeviceAuth             = true
 								          dangerouslyAllowHostHeaderOriginFallback = true
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								        }
 								      }
 								      agents = {
 								        defaults = {
 								          contextTokens     = 1000000
 								          bootstrapMaxChars = 30000
-												fix(openclaw): change agent workspace from /workspace/infra to /workspace

Keeps infra repo as a subdirectory, allows OpenClaw to write to /workspace directly.

											
										
										
											2026-03-19 23:32:28 +00:00
+								          workspace         = "/workspace"
-												[ci skip] openclaw: disable sandbox mode for unrestricted execution

- Set agents.defaults.sandbox.mode = off
- Combined with exec.host=gateway and exec.security=full,
  OpenClaw can now run any command on the container host

											
										
										
											2026-03-01 16:51:35 +00:00
+								          sandbox = {
 								            mode = "off"
 								          }
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								          model = {
-												openclaw: switch primary to ChatGPT Plus OAuth (openai-codex/gpt-5.4-mini)

Bumps image 2026.2.26 → 2026.5.4 (openai-codex provider plugin landed in
2026.4.21+). Auth profile is OAuth via the device-pairing flow against the
Codex backend (account ancaelena98@gmail.com); token persists in
/home/node/.openclaw/agents/main/agent/auth-state.json on NFS so it survives
pod restarts. Plus tier accepts gpt-5.4-mini (1,200–7,000 local msgs/5h);
gpt-5-mini and gpt-5.1-codex-mini both return errors on Plus, so we pin
gpt-5.4-mini explicitly. doctor --fix auto-promotes the highest-tier model
(gpt-5-pro) after model discovery, so the container command pins the mini
back as default after doctor runs but before gateway start.

											
										
										
											2026-05-06 22:06:32 +00:00
+								            # ChatGPT Plus OAuth via openai-codex plugin (account: ancaelena98@gmail.com).
 								            # gpt-5.4-mini is the only mini variant the Codex backend accepts for Plus tier;
 								            # gpt-5-mini / gpt-5.1-codex-mini return model_not_found / "not supported with
 								            # ChatGPT account". Plus rate-card: 1,200–7,000 local msgs / 5h on gpt-5.4-mini.
 								            primary   = "openai-codex/gpt-5.4-mini"
 								            fallbacks = ["openai-codex/gpt-5.5", "nim/qwen/qwen3-coder-480b-a35b-instruct", "modelrelay/auto-fastest"]
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								          }
 								          models = {
-												[ci skip] openclaw: add modelrelay sidecar as fallback model router

- Deploy modelrelay as sidecar container (auto-routes to fastest free model)
- Configured with NVIDIA NIM + OpenRouter API keys
- Primary: Mistral Large 3 (NIM), Fallback 1: Nemotron Ultra (NIM),
  Fallback 2: modelrelay/auto-fastest (80+ free models)
- Modelrelay web UI available at pod:7352

											
										
										
											2026-03-01 15:57:31 +00:00
+								            "modelrelay/auto-fastest"                                = {}
-												[ci skip] openclaw: switch to free agentic models via NVIDIA NIM, OpenRouter, Llama API

- Primary: Mistral Large 3 (675B) on NIM - always warm, excellent tool calling
- Fallback 1: Nemotron Ultra 253B on NIM
- Fallback 2: Llama 4 Maverick on Llama API (different provider for resilience)
- 10 models total across 3 providers, all free
- Removed: Modal (GLM-5), Gemini, Ollama providers
- Added: NVIDIA NIM provider with DeepSeek V3.2, Qwen 3.5, Qwen 3 Coder, GLM-5
- Bumped maxTokens from 8192 to 16384 for agentic output room

											
										
										
											2026-03-01 13:22:47 +00:00
+								            "nim/deepseek-ai/deepseek-v3.2"                          = {}
 								            "nim/qwen/qwen3.5-397b-a17b"                             = {}
 								            "nim/mistralai/mistral-large-3-675b-instruct-2512"       = {}
 								            "nim/qwen/qwen3-coder-480b-a35b-instruct"                = {}
 								            "nim/nvidia/llama-3.1-nemotron-ultra-253b-v1"            = {}
 								            "nim/z-ai/glm5"                                          = {}
 								            "llama-as-openai/Llama-4-Maverick-17B-128E-Instruct-FP8" = {}
 								            "llama-as-openai/Llama-4-Scout-17B-16E-Instruct-FP8"     = {}
 								            "openrouter/stepfun/step-3.5-flash:free"                 = {}
 								            "openrouter/arcee-ai/trinity-large-preview:free"         = {}
-												openclaw: switch primary to ChatGPT Plus OAuth (openai-codex/gpt-5.4-mini)

Bumps image 2026.2.26 → 2026.5.4 (openai-codex provider plugin landed in
2026.4.21+). Auth profile is OAuth via the device-pairing flow against the
Codex backend (account ancaelena98@gmail.com); token persists in
/home/node/.openclaw/agents/main/agent/auth-state.json on NFS so it survives
pod restarts. Plus tier accepts gpt-5.4-mini (1,200–7,000 local msgs/5h);
gpt-5-mini and gpt-5.1-codex-mini both return errors on Plus, so we pin
gpt-5.4-mini explicitly. doctor --fix auto-promotes the highest-tier model
(gpt-5-pro) after model discovery, so the container command pins the mini
back as default after doctor runs but before gateway start.

											
										
										
											2026-05-06 22:06:32 +00:00
+								            "openai-codex/gpt-5.4-mini"                              = {}
 								            "openai-codex/gpt-5.5"                                   = {}
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								          }
 								        }
 								      }
 								      tools = {
 								        profile = "full"
-												[ci skip] openclaw: remove all tool/command restrictions

- Set tools.deny = [] (was blocking sessions, subagents, browser)
- All tools now available: sessions, subagents, browser, etc.

											
										
										
											2026-03-01 15:58:12 +00:00
+								        deny    = []
-												[ci skip] openclaw: set workspace + enable elevated + native commands

- Set workspace to /workspace/infra (was defaulting to ~/.openclaw/workspace)
- Enable tools.elevated for unrestricted access
- Enable commands.native and commands.nativeSkills
- All tools, commands, and skills now fully accessible

											
										
										
											2026-03-01 17:12:03 +00:00
+								        elevated = {
 								          enabled = true
 								        }
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								        exec = {
-												[ci skip] openclaw: fix exec host — use gateway instead of node

host=node requires a companion app (not available in container).
host=gateway runs commands directly on the gateway process host.

											
										
										
											2026-03-01 16:47:14 +00:00
+								          host        = "gateway"
-												[ci skip] openclaw: fix exec config — use host=node, security=full

Valid options: host=sandbox|gateway|node, security=deny|allowlist|full.
Using node (run on container host) with full (no command restrictions).

											
										
										
											2026-03-01 16:42:22 +00:00
+								          security    = "full"
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								          ask         = "off"
-												fix(openclaw): change agent workspace from /workspace/infra to /workspace

Keeps infra repo as a subdirectory, allows OpenClaw to write to /workspace directly.

											
										
										
											2026-03-19 23:32:28 +00:00
+								          pathPrepend = ["/tools", "/workspace"]
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								        }
 								        web = {
 								          search = {
 								            enabled    = true
 								            provider   = "brave"
-												migrate all secrets from SOPS to Vault KV

- Add vault provider to root terragrunt.hcl (generated providers.tf)
- Delete stacks/vault/vault_provider.tf (now in generated providers.tf)
- Add 124 variable declarations + 43 vault_kv_secret_v2 resources to
  vault/main.tf to populate Vault KV at secret/<stack-name>
- Migrate 43 consuming stacks to read secrets from Vault KV via
  data "vault_kv_secret_v2" instead of SOPS var-file
- Add dependency "vault" to all migrated stacks' terragrunt.hcl
- Complex types (maps/lists) stored as JSON strings, decoded with
  jsondecode() in locals blocks

Bootstrap secrets (vault_root_token, vault_authentik_client_id,
vault_authentik_client_secret) remain in SOPS permanently.

Apply order: vault stack first (populates KV), then all others.

											
										
										
											2026-03-14 17:15:48 +00:00
+								            apiKey     = data.vault_kv_secret_v2.secrets.data["brave_api_key"]
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								            maxResults = 5
 								          }
 								          fetch = {
 								            enabled        = true
 								            maxChars       = 50000
 								            timeoutSeconds = 30
 								          }
 								        }
 								      }
-												fix: eliminate memory overcommit to prevent node OOM crashes

Set requests = limits (Guaranteed QoS) across LimitRange defaults and
explicit pod resources. Node2 crashed 2026-03-14 from 250% memory
overcommit (61GB limits on 24GB node).

Changes:
- LimitRange: default = defaultRequest for all 6 tiers
- Grafana: 3 → 2 replicas
- Grampsweb: document why replicas=0
- Prometheus: 1Gi/4Gi → 3Gi/3Gi
- OpenClaw: 512Mi/2Gi → 768Mi/768Mi
- Immich server: 256Mi/2Gi → 512Mi/512Mi
- Immich postgresql: 256Mi/1Gi → 512Mi/512Mi
- Calibre: 256Mi/1536Mi → 256Mi/256Mi
- Linkwarden: 256Mi/1536Mi → 768Mi/768Mi
- N8N: 256Mi/1Gi → 512Mi/512Mi
- MySQL cluster: 1Gi/3-4Gi → 2Gi/2Gi
- pg-cluster (CNPG): 512Mi/4Gi → 512Mi/512Mi
- DBaaS ResourceQuota limits.memory: 64Gi → 12Gi

[ci skip]

											
										
										
											2026-03-14 16:01:41 +00:00
+								      plugins = {
-												enable memory-core plugin for OpenClaw [ci skip]

- Add memory-core to plugins.allow and plugins.slots.memory
- Add /app/extensions to plugin load paths
- Update CLAUDE.md memory instructions to reference native tools

											
										
										
											2026-03-15 02:39:14 +00:00
+								        allow = ["memory-core"]
 								        slots = { memory = "memory-core" }
-												fix: eliminate memory overcommit to prevent node OOM crashes

Set requests = limits (Guaranteed QoS) across LimitRange defaults and
explicit pod resources. Node2 crashed 2026-03-14 from 250% memory
overcommit (61GB limits on 24GB node).

Changes:
- LimitRange: default = defaultRequest for all 6 tiers
- Grafana: 3 → 2 replicas
- Grampsweb: document why replicas=0
- Prometheus: 1Gi/4Gi → 3Gi/3Gi
- OpenClaw: 512Mi/2Gi → 768Mi/768Mi
- Immich server: 256Mi/2Gi → 512Mi/512Mi
- Immich postgresql: 256Mi/1Gi → 512Mi/512Mi
- Calibre: 256Mi/1536Mi → 256Mi/256Mi
- Linkwarden: 256Mi/1536Mi → 768Mi/768Mi
- N8N: 256Mi/1Gi → 512Mi/512Mi
- MySQL cluster: 1Gi/3-4Gi → 2Gi/2Gi
- pg-cluster (CNPG): 512Mi/4Gi → 512Mi/512Mi
- DBaaS ResourceQuota limits.memory: 64Gi → 12Gi

[ci skip]

											
										
										
											2026-03-14 16:01:41 +00:00
+								        load = {
-												enable memory-core plugin for OpenClaw [ci skip]

- Add memory-core to plugins.allow and plugins.slots.memory
- Add /app/extensions to plugin load paths
- Update CLAUDE.md memory instructions to reference native tools

											
										
										
											2026-03-15 02:39:14 +00:00
+								          paths = ["/home/node/.openclaw/extensions", "/app/extensions"]
-												fix: eliminate memory overcommit to prevent node OOM crashes

Set requests = limits (Guaranteed QoS) across LimitRange defaults and
explicit pod resources. Node2 crashed 2026-03-14 from 250% memory
overcommit (61GB limits on 24GB node).

Changes:
- LimitRange: default = defaultRequest for all 6 tiers
- Grafana: 3 → 2 replicas
- Grampsweb: document why replicas=0
- Prometheus: 1Gi/4Gi → 3Gi/3Gi
- OpenClaw: 512Mi/2Gi → 768Mi/768Mi
- Immich server: 256Mi/2Gi → 512Mi/512Mi
- Immich postgresql: 256Mi/1Gi → 512Mi/512Mi
- Calibre: 256Mi/1536Mi → 256Mi/256Mi
- Linkwarden: 256Mi/1536Mi → 768Mi/768Mi
- N8N: 256Mi/1Gi → 512Mi/512Mi
- MySQL cluster: 1Gi/3-4Gi → 2Gi/2Gi
- pg-cluster (CNPG): 512Mi/4Gi → 512Mi/512Mi
- DBaaS ResourceQuota limits.memory: 64Gi → 12Gi

[ci skip]

											
										
										
											2026-03-14 16:01:41 +00:00
+								        }
 								      }
-												[ci skip] openclaw: set workspace + enable elevated + native commands

- Set workspace to /workspace/infra (was defaulting to ~/.openclaw/workspace)
- Enable tools.elevated for unrestricted access
- Enable commands.native and commands.nativeSkills
- All tools, commands, and skills now fully accessible

											
										
										
											2026-03-01 17:12:03 +00:00
+								      commands = {
 								        native       = true
 								        nativeSkills = true
 								      }
-												[ci skip] openclaw: add Telegram channel + install terragrunt in init container

- Add Telegram bot integration (DM allowlist, user 8281953845 only)
- Install terragrunt v0.99.4 in init container alongside terraform
- Remove terraform init from init (terragrunt handles this per-stack)
- Add openclaw_telegram_bot_token variable

											
										
										
											2026-03-01 13:44:58 +00:00
+								      channels = {
 								        telegram = {
-												[ci skip] openclaw: fix Telegram, update to v2026.2.26, fix startup issues

- Update OpenClaw from v2026.2.9 to v2026.2.26 (fixes Telegram channel)
- Add gateway.mode=local + wizard block (required for channel startup)
- Add dangerouslyAllowHostHeaderOriginFallback (v2026.2.26 requirement)
- Run doctor --fix at container startup to auto-enable Telegram
- Create required dirs (canvas, devices, cron, sessions, credentials)
- Fix permissions: chown -R 1000:1000 for node user
- Telegram: DM allowlist, user 8281953845 only

											
										
										
											2026-03-01 15:47:54 +00:00
+								          enabled     = true
-												migrate all secrets from SOPS to Vault KV

- Add vault provider to root terragrunt.hcl (generated providers.tf)
- Delete stacks/vault/vault_provider.tf (now in generated providers.tf)
- Add 124 variable declarations + 43 vault_kv_secret_v2 resources to
  vault/main.tf to populate Vault KV at secret/<stack-name>
- Migrate 43 consuming stacks to read secrets from Vault KV via
  data "vault_kv_secret_v2" instead of SOPS var-file
- Add dependency "vault" to all migrated stacks' terragrunt.hcl
- Complex types (maps/lists) stored as JSON strings, decoded with
  jsondecode() in locals blocks

Bootstrap secrets (vault_root_token, vault_authentik_client_id,
vault_authentik_client_secret) remain in SOPS permanently.

Apply order: vault stack first (populates KV), then all others.

											
										
										
											2026-03-14 17:15:48 +00:00
+								          botToken    = data.vault_kv_secret_v2.secrets.data["telegram_bot_token"]
-												[ci skip] openclaw: fix Telegram, update to v2026.2.26, fix startup issues

- Update OpenClaw from v2026.2.9 to v2026.2.26 (fixes Telegram channel)
- Add gateway.mode=local + wizard block (required for channel startup)
- Add dangerouslyAllowHostHeaderOriginFallback (v2026.2.26 requirement)
- Run doctor --fix at container startup to auto-enable Telegram
- Create required dirs (canvas, devices, cron, sessions, credentials)
- Fix permissions: chown -R 1000:1000 for node user
- Telegram: DM allowlist, user 8281953845 only

											
										
										
											2026-03-01 15:47:54 +00:00
+								          dmPolicy    = "allowlist"
 								          allowFrom   = ["tg:8281953845"]
 								          groupPolicy = "allowlist"
 								          streamMode  = "partial"
-												[ci skip] openclaw: add Telegram channel + install terragrunt in init container

- Add Telegram bot integration (DM allowlist, user 8281953845 only)
- Install terragrunt v0.99.4 in init container alongside terraform
- Remove terraform init from init (terragrunt handles this per-stack)
- Add openclaw_telegram_bot_token variable

											
										
										
											2026-03-01 13:44:58 +00:00
+								        }
 								      }
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								      models = {
 								        mode = "merge"
 								        providers = {
-												[ci skip] openclaw: add modelrelay sidecar as fallback model router

- Deploy modelrelay as sidecar container (auto-routes to fastest free model)
- Configured with NVIDIA NIM + OpenRouter API keys
- Primary: Mistral Large 3 (NIM), Fallback 1: Nemotron Ultra (NIM),
  Fallback 2: modelrelay/auto-fastest (80+ free models)
- Modelrelay web UI available at pod:7352

											
										
										
											2026-03-01 15:57:31 +00:00
+								          modelrelay = {
 								            baseUrl = "http://127.0.0.1:7352/v1"
 								            api     = "openai-completions"
 								            apiKey  = "modelrelay"
 								            models = [
 								              { id = "auto-fastest", name = "Auto (Fastest)", reasoning = false, input = ["text"], contextWindow = 200000, maxTokens = 16384, cost = { input = 0, output = 0, cacheRead = 0, cacheWrite = 0 } },
 								            ]
 								          }
-												[ci skip] openclaw: switch to free agentic models via NVIDIA NIM, OpenRouter, Llama API

- Primary: Mistral Large 3 (675B) on NIM - always warm, excellent tool calling
- Fallback 1: Nemotron Ultra 253B on NIM
- Fallback 2: Llama 4 Maverick on Llama API (different provider for resilience)
- 10 models total across 3 providers, all free
- Removed: Modal (GLM-5), Gemini, Ollama providers
- Added: NVIDIA NIM provider with DeepSeek V3.2, Qwen 3.5, Qwen 3 Coder, GLM-5
- Bumped maxTokens from 8192 to 16384 for agentic output room

											
										
										
											2026-03-01 13:22:47 +00:00
+								          nim = {
 								            baseUrl = "https://integrate.api.nvidia.com/v1"
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								            api     = "openai-completions"
-												migrate all secrets from SOPS to Vault KV

- Add vault provider to root terragrunt.hcl (generated providers.tf)
- Delete stacks/vault/vault_provider.tf (now in generated providers.tf)
- Add 124 variable declarations + 43 vault_kv_secret_v2 resources to
  vault/main.tf to populate Vault KV at secret/<stack-name>
- Migrate 43 consuming stacks to read secrets from Vault KV via
  data "vault_kv_secret_v2" instead of SOPS var-file
- Add dependency "vault" to all migrated stacks' terragrunt.hcl
- Complex types (maps/lists) stored as JSON strings, decoded with
  jsondecode() in locals blocks

Bootstrap secrets (vault_root_token, vault_authentik_client_id,
vault_authentik_client_secret) remain in SOPS permanently.

Apply order: vault stack first (populates KV), then all others.

											
										
										
											2026-03-14 17:15:48 +00:00
+								            apiKey  = data.vault_kv_secret_v2.secrets.data["nvidia_api_key"]
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								            models = [
-												[ci skip] openclaw: switch to free agentic models via NVIDIA NIM, OpenRouter, Llama API

- Primary: Mistral Large 3 (675B) on NIM - always warm, excellent tool calling
- Fallback 1: Nemotron Ultra 253B on NIM
- Fallback 2: Llama 4 Maverick on Llama API (different provider for resilience)
- 10 models total across 3 providers, all free
- Removed: Modal (GLM-5), Gemini, Ollama providers
- Added: NVIDIA NIM provider with DeepSeek V3.2, Qwen 3.5, Qwen 3 Coder, GLM-5
- Bumped maxTokens from 8192 to 16384 for agentic output room

											
										
										
											2026-03-01 13:22:47 +00:00
+								              { id = "deepseek-ai/deepseek-v3.2", name = "DeepSeek V3.2", reasoning = false, input = ["text"], contextWindow = 164000, maxTokens = 16384, cost = { input = 0, output = 0, cacheRead = 0, cacheWrite = 0 } },
 								              { id = "qwen/qwen3.5-397b-a17b", name = "Qwen 3.5", reasoning = true, input = ["text"], contextWindow = 262000, maxTokens = 16384, cost = { input = 0, output = 0, cacheRead = 0, cacheWrite = 0 } },
 								              { id = "mistralai/mistral-large-3-675b-instruct-2512", name = "Mistral Large 3", reasoning = false, input = ["text"], contextWindow = 262000, maxTokens = 16384, cost = { input = 0, output = 0, cacheRead = 0, cacheWrite = 0 } },
 								              { id = "qwen/qwen3-coder-480b-a35b-instruct", name = "Qwen 3 Coder", reasoning = false, input = ["text"], contextWindow = 262000, maxTokens = 16384, cost = { input = 0, output = 0, cacheRead = 0, cacheWrite = 0 } },
 								              { id = "nvidia/llama-3.1-nemotron-ultra-253b-v1", name = "Nemotron Ultra 253B", reasoning = true, input = ["text"], contextWindow = 128000, maxTokens = 16384, cost = { input = 0, output = 0, cacheRead = 0, cacheWrite = 0 } },
 								              { id = "z-ai/glm5", name = "GLM-5", reasoning = false, input = ["text"], contextWindow = 128000, maxTokens = 16384, cost = { input = 0, output = 0, cacheRead = 0, cacheWrite = 0 } },
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								            ]
 								          }
-												[ci skip] openclaw: switch to free agentic models via NVIDIA NIM, OpenRouter, Llama API

- Primary: Mistral Large 3 (675B) on NIM - always warm, excellent tool calling
- Fallback 1: Nemotron Ultra 253B on NIM
- Fallback 2: Llama 4 Maverick on Llama API (different provider for resilience)
- 10 models total across 3 providers, all free
- Removed: Modal (GLM-5), Gemini, Ollama providers
- Added: NVIDIA NIM provider with DeepSeek V3.2, Qwen 3.5, Qwen 3 Coder, GLM-5
- Bumped maxTokens from 8192 to 16384 for agentic output room

											
										
										
											2026-03-01 13:22:47 +00:00
+								          openrouter = {
 								            baseUrl = "https://openrouter.ai/api/v1"
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								            api     = "openai-completions"
-												migrate all secrets from SOPS to Vault KV

- Add vault provider to root terragrunt.hcl (generated providers.tf)
- Delete stacks/vault/vault_provider.tf (now in generated providers.tf)
- Add 124 variable declarations + 43 vault_kv_secret_v2 resources to
  vault/main.tf to populate Vault KV at secret/<stack-name>
- Migrate 43 consuming stacks to read secrets from Vault KV via
  data "vault_kv_secret_v2" instead of SOPS var-file
- Add dependency "vault" to all migrated stacks' terragrunt.hcl
- Complex types (maps/lists) stored as JSON strings, decoded with
  jsondecode() in locals blocks

Bootstrap secrets (vault_root_token, vault_authentik_client_id,
vault_authentik_client_secret) remain in SOPS permanently.

Apply order: vault stack first (populates KV), then all others.

											
										
										
											2026-03-14 17:15:48 +00:00
+								            apiKey  = data.vault_kv_secret_v2.secrets.data["openrouter_api_key"]
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								            models = [
-												[ci skip] openclaw: switch to free agentic models via NVIDIA NIM, OpenRouter, Llama API

- Primary: Mistral Large 3 (675B) on NIM - always warm, excellent tool calling
- Fallback 1: Nemotron Ultra 253B on NIM
- Fallback 2: Llama 4 Maverick on Llama API (different provider for resilience)
- 10 models total across 3 providers, all free
- Removed: Modal (GLM-5), Gemini, Ollama providers
- Added: NVIDIA NIM provider with DeepSeek V3.2, Qwen 3.5, Qwen 3 Coder, GLM-5
- Bumped maxTokens from 8192 to 16384 for agentic output room

											
										
										
											2026-03-01 13:22:47 +00:00
+								              { id = "stepfun/step-3.5-flash:free", name = "Step 3.5 Flash", reasoning = true, input = ["text"], contextWindow = 256000, maxTokens = 16384, cost = { input = 0, output = 0, cacheRead = 0, cacheWrite = 0 } },
 								              { id = "arcee-ai/trinity-large-preview:free", name = "Trinity Large", reasoning = false, input = ["text"], contextWindow = 131000, maxTokens = 16384, cost = { input = 0, output = 0, cacheRead = 0, cacheWrite = 0 } },
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								            ]
 								          }
 								          llama-as-openai = {
 								            baseUrl = "https://api.llama.com/compat/v1"
-												migrate all secrets from SOPS to Vault KV

- Add vault provider to root terragrunt.hcl (generated providers.tf)
- Delete stacks/vault/vault_provider.tf (now in generated providers.tf)
- Add 124 variable declarations + 43 vault_kv_secret_v2 resources to
  vault/main.tf to populate Vault KV at secret/<stack-name>
- Migrate 43 consuming stacks to read secrets from Vault KV via
  data "vault_kv_secret_v2" instead of SOPS var-file
- Add dependency "vault" to all migrated stacks' terragrunt.hcl
- Complex types (maps/lists) stored as JSON strings, decoded with
  jsondecode() in locals blocks

Bootstrap secrets (vault_root_token, vault_authentik_client_id,
vault_authentik_client_secret) remain in SOPS permanently.

Apply order: vault stack first (populates KV), then all others.

											
										
										
											2026-03-14 17:15:48 +00:00
+								            apiKey  = data.vault_kv_secret_v2.secrets.data["llama_api_key"]
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								            api     = "openai-completions"
 								            models = [
-												[ci skip] openclaw: switch to free agentic models via NVIDIA NIM, OpenRouter, Llama API

- Primary: Mistral Large 3 (675B) on NIM - always warm, excellent tool calling
- Fallback 1: Nemotron Ultra 253B on NIM
- Fallback 2: Llama 4 Maverick on Llama API (different provider for resilience)
- 10 models total across 3 providers, all free
- Removed: Modal (GLM-5), Gemini, Ollama providers
- Added: NVIDIA NIM provider with DeepSeek V3.2, Qwen 3.5, Qwen 3 Coder, GLM-5
- Bumped maxTokens from 8192 to 16384 for agentic output room

											
										
										
											2026-03-01 13:22:47 +00:00
+								              { id = "Llama-4-Maverick-17B-128E-Instruct-FP8", name = "Llama 4 Maverick", reasoning = false, input = ["text"], contextWindow = 200000, maxTokens = 16384, cost = { input = 0, output = 0, cacheRead = 0, cacheWrite = 0 } },
 								              { id = "Llama-4-Scout-17B-16E-Instruct-FP8", name = "Llama 4 Scout", reasoning = false, input = ["text"], contextWindow = 200000, maxTokens = 16384, cost = { input = 0, output = 0, cacheRead = 0, cacheWrite = 0 } },
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								            ]
 								          }
 								        }
 								      }
-												[ci skip] openclaw: fix Telegram, update to v2026.2.26, fix startup issues

- Update OpenClaw from v2026.2.9 to v2026.2.26 (fixes Telegram channel)
- Add gateway.mode=local + wizard block (required for channel startup)
- Add dangerouslyAllowHostHeaderOriginFallback (v2026.2.26 requirement)
- Run doctor --fix at container startup to auto-enable Telegram
- Create required dirs (canvas, devices, cron, sessions, credentials)
- Fix permissions: chown -R 1000:1000 for node user
- Telegram: DM allowlist, user 8281953845 only

											
										
										
											2026-03-01 15:47:54 +00:00
+								      wizard = {
 								        lastRunAt      = "2026-03-01T15:11:54.176Z"
 								        lastRunVersion = "2026.2.9"
 								        lastRunCommand = "configure"
 								        lastRunMode    = "local"
 								      }
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								    })
 								  }
 								}
 								resource "random_password" "gateway_token" {
 								  length  = 32
 								  special = false
 								}
-												openclaw: realtime usage dashboard via Prometheus exporter sidecar

Stdlib-only Python exporter ($1) reads ~/.openclaw/agents/*/sessions/*.jsonl
(assistant messages with usage) plus auth-profiles.json (OAuth expiry,
Plus-tier label) and exposes Prometheus text format on :9099/metrics.
Container is python:3.12-slim; pod template gets prometheus.io/scrape
annotations so the existing kubernetes-pods job picks it up — no
ServiceMonitor needed.

Metrics exported:
  openclaw_codex_messages_total{provider,model,session_kind}    counter
  openclaw_codex_input/output/cache_read/cache_write_tokens_total
  openclaw_codex_message_errors_total{reason}
  openclaw_codex_active_sessions{kind}                          gauge
  openclaw_codex_oauth_expiry_seconds{provider,account,plan}    gauge
  openclaw_codex_last_run_timestamp                             gauge

Grafana dashboard "OpenClaw — Codex Usage" (Applications folder, 30s
refresh): messages/5h vs Plus rate-card, % of 1,200 floor, tokens/5h,
cache hit %, OAuth expiry days, active sessions, last-turn age, errors,
plus per-model timeseries + bar gauge + error table.

Plus rate-card thresholds in the gauge are conservative (1,200/5h floor;
real cap is dynamic 1,200–7,000). Re-baseline if throttling shows up
below 80%.

											
										
										
											2026-05-07 09:04:25 +00:00
+								# Prometheus exporter script — read by the openclaw-exporter sidecar.
 								# Stdlib-only Python so no pip install at startup. Reads sessions JSONL +
 								# auth-profiles.json from the NFS-backed openclaw home volume (mounted ro).
 								resource "kubernetes_config_map" "openclaw_exporter" {
 								  metadata {
 								    name      = "openclaw-exporter"
 								    namespace = kubernetes_namespace.openclaw.metadata[0].name
 								  }
 								  data = {
 								    "exporter.py" = file("${path.module}/files/exporter.py")
 								  }
 								}
-												truenas deprecation: migrate all non-immich storage to proxmox NFS

- Migrate 7 backup CronJobs to Proxmox host NFS (192.168.1.127)
  (etcd, mysql, postgresql, nextcloud, redis, vaultwarden, plotting-book)
- Migrate headscale backup, ebook2audiobook, osm_routing to Proxmox NFS
- Migrate servarr (lidarr, readarr, soulseek) NFS refs to Proxmox
- Remove 79 orphaned TrueNAS NFS module declarations from 49 stacks
- Delete stacks/platform/modules/ (27 dead module copies, 65MB)
- Update nfs-truenas StorageClass to point to Proxmox (192.168.1.127)
- Remove iscsi DNS record from config.tfvars
- Fix woodpecker persistence config and alertmanager PV

Only Immich (8 PVCs, ~1.4TB) remains on TrueNAS.

											
										
										
											2026-04-12 14:35:39 +01:00
+								module "nfs_tools_host" {
-												[ci skip] complete NFS CSI migration: complex stacks + platform modules

Migrate remaining multi-volume stacks and all platform modules from
inline NFS volumes to CSI-backed PV/PVC with nfs-truenas StorageClass
(soft,timeo=30,retrans=3 mount options).

Complex stacks: openclaw (4 vols), immich (8 vols), frigate (2 vols),
nextcloud (2 vols + old PV replaced), rybbit (1 vol)

Remaining stacks: affine, ebook2audiobook, f1-stream, osm_routing,
real-estate-crawler

Platform modules: monitoring (prometheus, loki, alertmanager PVs
converted from native NFS to CSI), redis, dbaas, technitium,
headscale, vaultwarden, uptime-kuma, mailserver, infra-maintenance

											
										
										
											2026-03-02 01:24:07 +00:00
+								  source     = "../../modules/kubernetes/nfs_volume"
-												truenas deprecation: migrate all non-immich storage to proxmox NFS

- Migrate 7 backup CronJobs to Proxmox host NFS (192.168.1.127)
  (etcd, mysql, postgresql, nextcloud, redis, vaultwarden, plotting-book)
- Migrate headscale backup, ebook2audiobook, osm_routing to Proxmox NFS
- Migrate servarr (lidarr, readarr, soulseek) NFS refs to Proxmox
- Remove 79 orphaned TrueNAS NFS module declarations from 49 stacks
- Delete stacks/platform/modules/ (27 dead module copies, 65MB)
- Update nfs-truenas StorageClass to point to Proxmox (192.168.1.127)
- Remove iscsi DNS record from config.tfvars
- Fix woodpecker persistence config and alertmanager PV

Only Immich (8 PVCs, ~1.4TB) remains on TrueNAS.

											
										
										
											2026-04-12 14:35:39 +01:00
+								  name       = "openclaw-tools-host"
-												[ci skip] complete NFS CSI migration: complex stacks + platform modules

Migrate remaining multi-volume stacks and all platform modules from
inline NFS volumes to CSI-backed PV/PVC with nfs-truenas StorageClass
(soft,timeo=30,retrans=3 mount options).

Complex stacks: openclaw (4 vols), immich (8 vols), frigate (2 vols),
nextcloud (2 vols + old PV replaced), rybbit (1 vol)

Remaining stacks: affine, ebook2audiobook, f1-stream, osm_routing,
real-estate-crawler

Platform modules: monitoring (prometheus, loki, alertmanager PVs
converted from native NFS to CSI), redis, dbaas, technitium,
headscale, vaultwarden, uptime-kuma, mailserver, infra-maintenance

											
										
										
											2026-03-02 01:24:07 +00:00
+								  namespace  = kubernetes_namespace.openclaw.metadata[0].name
-												truenas deprecation: migrate all non-immich storage to proxmox NFS

- Migrate 7 backup CronJobs to Proxmox host NFS (192.168.1.127)
  (etcd, mysql, postgresql, nextcloud, redis, vaultwarden, plotting-book)
- Migrate headscale backup, ebook2audiobook, osm_routing to Proxmox NFS
- Migrate servarr (lidarr, readarr, soulseek) NFS refs to Proxmox
- Remove 79 orphaned TrueNAS NFS module declarations from 49 stacks
- Delete stacks/platform/modules/ (27 dead module copies, 65MB)
- Update nfs-truenas StorageClass to point to Proxmox (192.168.1.127)
- Remove iscsi DNS record from config.tfvars
- Fix woodpecker persistence config and alertmanager PV

Only Immich (8 PVCs, ~1.4TB) remains on TrueNAS.

											
										
										
											2026-04-12 14:35:39 +01:00
+								  nfs_server = "192.168.1.127"
 								  nfs_path   = "/srv/nfs/openclaw/tools"
-												[ci skip] complete NFS CSI migration: complex stacks + platform modules

Migrate remaining multi-volume stacks and all platform modules from
inline NFS volumes to CSI-backed PV/PVC with nfs-truenas StorageClass
(soft,timeo=30,retrans=3 mount options).

Complex stacks: openclaw (4 vols), immich (8 vols), frigate (2 vols),
nextcloud (2 vols + old PV replaced), rybbit (1 vol)

Remaining stacks: affine, ebook2audiobook, f1-stream, osm_routing,
real-estate-crawler

Platform modules: monitoring (prometheus, loki, alertmanager PVs
converted from native NFS to CSI), redis, dbaas, technitium,
headscale, vaultwarden, uptime-kuma, mailserver, infra-maintenance

											
										
										
											2026-03-02 01:24:07 +00:00
+								}
-												feat(storage): migrate 38 NFS PVCs to proxmox-lvm (Wave 2)

Add proxmox-lvm PVCs with pvc-autoresizer annotations for all
remaining single-pod app data services. Deployments updated to
use new block storage PVCs. Old NFS modules retained for rollback.

Services: affine, changedetection, diun, excalidraw, f1-stream,
hackmd, isponsorblocktv, matrix, n8n, send, grampsweb, health,
onlyoffice, owntracks, paperless-ngx, privatebin, resume,
speedtest, stirling-pdf, tandoor, rybbit (clickhouse), tor-proxy
(torrserver), whisper+piper, frigate (config), ollama (ui),
servarr (prowlarr/listenarr/qbittorrent), aiostreams, freshrss
(extensions), meshcentral (data+files), openclaw (data+home+
openlobster), technitium, mailserver (data+roundcube html+enigma),
dbaas (pgadmin).

Strategy set to Recreate where needed for RWO volumes.

											
										
										
											2026-04-04 19:25:12 +03:00
+								resource "kubernetes_persistent_volume_claim" "home_proxmox" {
 								  wait_until_bound = false
 								  metadata {
 								    name      = "openclaw-home-proxmox"
 								    namespace = kubernetes_namespace.openclaw.metadata[0].name
 								    annotations = {
 								      "resize.topolvm.io/threshold"     = "80%"
 								      "resize.topolvm.io/increase"      = "100%"
 								      "resize.topolvm.io/storage_limit" = "5Gi"
 								    }
 								  }
 								  spec {
 								    access_modes       = ["ReadWriteOnce"]
 								    storage_class_name = "proxmox-lvm"
 								    resources {
 								      requests = {
 								        storage = "1Gi"
 								      }
 								    }
 								  }
 								}
-												truenas deprecation: migrate all non-immich storage to proxmox NFS

- Migrate 7 backup CronJobs to Proxmox host NFS (192.168.1.127)
  (etcd, mysql, postgresql, nextcloud, redis, vaultwarden, plotting-book)
- Migrate headscale backup, ebook2audiobook, osm_routing to Proxmox NFS
- Migrate servarr (lidarr, readarr, soulseek) NFS refs to Proxmox
- Remove 79 orphaned TrueNAS NFS module declarations from 49 stacks
- Delete stacks/platform/modules/ (27 dead module copies, 65MB)
- Update nfs-truenas StorageClass to point to Proxmox (192.168.1.127)
- Remove iscsi DNS record from config.tfvars
- Fix woodpecker persistence config and alertmanager PV

Only Immich (8 PVCs, ~1.4TB) remains on TrueNAS.

											
										
										
											2026-04-12 14:35:39 +01:00
+								module "nfs_workspace_host" {
-												[ci skip] complete NFS CSI migration: complex stacks + platform modules

Migrate remaining multi-volume stacks and all platform modules from
inline NFS volumes to CSI-backed PV/PVC with nfs-truenas StorageClass
(soft,timeo=30,retrans=3 mount options).

Complex stacks: openclaw (4 vols), immich (8 vols), frigate (2 vols),
nextcloud (2 vols + old PV replaced), rybbit (1 vol)

Remaining stacks: affine, ebook2audiobook, f1-stream, osm_routing,
real-estate-crawler

Platform modules: monitoring (prometheus, loki, alertmanager PVs
converted from native NFS to CSI), redis, dbaas, technitium,
headscale, vaultwarden, uptime-kuma, mailserver, infra-maintenance

											
										
										
											2026-03-02 01:24:07 +00:00
+								  source     = "../../modules/kubernetes/nfs_volume"
-												truenas deprecation: migrate all non-immich storage to proxmox NFS

- Migrate 7 backup CronJobs to Proxmox host NFS (192.168.1.127)
  (etcd, mysql, postgresql, nextcloud, redis, vaultwarden, plotting-book)
- Migrate headscale backup, ebook2audiobook, osm_routing to Proxmox NFS
- Migrate servarr (lidarr, readarr, soulseek) NFS refs to Proxmox
- Remove 79 orphaned TrueNAS NFS module declarations from 49 stacks
- Delete stacks/platform/modules/ (27 dead module copies, 65MB)
- Update nfs-truenas StorageClass to point to Proxmox (192.168.1.127)
- Remove iscsi DNS record from config.tfvars
- Fix woodpecker persistence config and alertmanager PV

Only Immich (8 PVCs, ~1.4TB) remains on TrueNAS.

											
										
										
											2026-04-12 14:35:39 +01:00
+								  name       = "openclaw-workspace-host"
-												[ci skip] complete NFS CSI migration: complex stacks + platform modules

Migrate remaining multi-volume stacks and all platform modules from
inline NFS volumes to CSI-backed PV/PVC with nfs-truenas StorageClass
(soft,timeo=30,retrans=3 mount options).

Complex stacks: openclaw (4 vols), immich (8 vols), frigate (2 vols),
nextcloud (2 vols + old PV replaced), rybbit (1 vol)

Remaining stacks: affine, ebook2audiobook, f1-stream, osm_routing,
real-estate-crawler

Platform modules: monitoring (prometheus, loki, alertmanager PVs
converted from native NFS to CSI), redis, dbaas, technitium,
headscale, vaultwarden, uptime-kuma, mailserver, infra-maintenance

											
										
										
											2026-03-02 01:24:07 +00:00
+								  namespace  = kubernetes_namespace.openclaw.metadata[0].name
-												truenas deprecation: migrate all non-immich storage to proxmox NFS

- Migrate 7 backup CronJobs to Proxmox host NFS (192.168.1.127)
  (etcd, mysql, postgresql, nextcloud, redis, vaultwarden, plotting-book)
- Migrate headscale backup, ebook2audiobook, osm_routing to Proxmox NFS
- Migrate servarr (lidarr, readarr, soulseek) NFS refs to Proxmox
- Remove 79 orphaned TrueNAS NFS module declarations from 49 stacks
- Delete stacks/platform/modules/ (27 dead module copies, 65MB)
- Update nfs-truenas StorageClass to point to Proxmox (192.168.1.127)
- Remove iscsi DNS record from config.tfvars
- Fix woodpecker persistence config and alertmanager PV

Only Immich (8 PVCs, ~1.4TB) remains on TrueNAS.

											
										
										
											2026-04-12 14:35:39 +01:00
+								  nfs_server = "192.168.1.127"
 								  nfs_path   = "/srv/nfs/openclaw/workspace"
-												[ci skip] complete NFS CSI migration: complex stacks + platform modules

Migrate remaining multi-volume stacks and all platform modules from
inline NFS volumes to CSI-backed PV/PVC with nfs-truenas StorageClass
(soft,timeo=30,retrans=3 mount options).

Complex stacks: openclaw (4 vols), immich (8 vols), frigate (2 vols),
nextcloud (2 vols + old PV replaced), rybbit (1 vol)

Remaining stacks: affine, ebook2audiobook, f1-stream, osm_routing,
real-estate-crawler

Platform modules: monitoring (prometheus, loki, alertmanager PVs
converted from native NFS to CSI), redis, dbaas, technitium,
headscale, vaultwarden, uptime-kuma, mailserver, infra-maintenance

											
										
										
											2026-03-02 01:24:07 +00:00
+								}
-												feat(storage): migrate 38 NFS PVCs to proxmox-lvm (Wave 2)

Add proxmox-lvm PVCs with pvc-autoresizer annotations for all
remaining single-pod app data services. Deployments updated to
use new block storage PVCs. Old NFS modules retained for rollback.

Services: affine, changedetection, diun, excalidraw, f1-stream,
hackmd, isponsorblocktv, matrix, n8n, send, grampsweb, health,
onlyoffice, owntracks, paperless-ngx, privatebin, resume,
speedtest, stirling-pdf, tandoor, rybbit (clickhouse), tor-proxy
(torrserver), whisper+piper, frigate (config), ollama (ui),
servarr (prowlarr/listenarr/qbittorrent), aiostreams, freshrss
(extensions), meshcentral (data+files), openclaw (data+home+
openlobster), technitium, mailserver (data+roundcube html+enigma),
dbaas (pgadmin).

Strategy set to Recreate where needed for RWO volumes.

											
										
										
											2026-04-04 19:25:12 +03:00
+								resource "kubernetes_persistent_volume_claim" "data_proxmox" {
 								  wait_until_bound = false
 								  metadata {
 								    name      = "openclaw-data-proxmox"
 								    namespace = kubernetes_namespace.openclaw.metadata[0].name
 								    annotations = {
 								      "resize.topolvm.io/threshold"     = "80%"
 								      "resize.topolvm.io/increase"      = "100%"
 								      "resize.topolvm.io/storage_limit" = "5Gi"
 								    }
 								  }
 								  spec {
 								    access_modes       = ["ReadWriteOnce"]
 								    storage_class_name = "proxmox-lvm"
 								    resources {
 								      requests = {
 								        storage = "1Gi"
 								      }
 								    }
 								  }
 								}
-												openclaw: replace cc-config NFS with dotfiles repo clone [ci skip]

- Add init container "install-dotfiles" that clones the dotfiles repo
  and installs skills/agents/hooks to OpenClaw's home directory
- Remove nfs_cc_config module and its volume mount
- Skills/agents now come from the same chezmoi-managed dotfiles repo
  that manages the Mac config, eliminating the dual-sync problem

											
										
										
											2026-03-15 16:04:02 +00:00
+								## cc-config NFS volume removed — replaced by dotfiles repo clone in init container
 								## See init_container "install-dotfiles" in the deployment
-												Remove all CPU limits cluster-wide to eliminate CFS throttling

CPU limits cause CFS throttling even when nodes have idle capacity.
Move to a request-only CPU model: keep CPU requests for scheduling
fairness but remove all CPU limits. Memory limits stay (incompressible).

Changes across 108 files:
- Kyverno LimitRange policy: remove cpu from default/max in all 6 tiers
- Kyverno ResourceQuota policy: remove limits.cpu from all 5 tiers
- Custom ResourceQuotas: remove limits.cpu from 8 namespace quotas
- Custom LimitRanges: remove cpu from default/max (nextcloud, onlyoffice)
- RBAC module: remove cpu_limits variable and quota reference
- Freedify factory: remove cpu_limit variable and limits reference
- 86 deployment files: remove cpu from all limits blocks
- 6 Helm values files: remove cpu under limits sections

											
										
										
											2026-03-14 08:51:45 +00:00
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								resource "kubernetes_deployment" "openclaw" {
 								  metadata {
 								    name      = "openclaw"
 								    namespace = kubernetes_namespace.openclaw.metadata[0].name
 								    labels = {
 								      app  = "openclaw"
 								      tier = local.tiers.aux
 								    }
 								  }
 								  spec {
 								    strategy {
 								      type = "Recreate"
 								    }
 								    replicas = 1
 								    selector {
 								      match_labels = {
 								        app = "openclaw"
 								      }
 								    }
 								    template {
 								      metadata {
 								        labels = {
 								          app = "openclaw"
 								        }
-												migrate consuming stacks to ESO + remove k8s-dashboard static token

Phase 9: ExternalSecret migration across 26 stacks:

Fully migrated (vault data source removed, ESO delivers secrets):
- speedtest, shadowsocks, wealthfolio, plotting-book, f1-stream, tandoor
- n8n, dawarich, diun, netbox, onlyoffice, tuya-bridge
- hackmd (ESO template for DB URL), health (ESO template for DB URL)
- trading-bot (ESO template for DATABASE_URL + 7 secret env vars)
- forgejo (removed unused vault data source)

Partially migrated (vault kept for plan-time, ESO added for runtime):
- immich, linkwarden, nextcloud, paperless-ngx (jsondecode for homepage)
- claude-memory, rybbit, url, webhook_handler (plan-time in locals/jobs)
- woodpecker, openclaw, resume (plan-time in helm values/jobs/modules)

17 stacks unchanged (all plan-time: homepage annotations, configmaps,
module inputs) — vault data source works with OIDC auth.

Phase 17a: Remove k8s-dashboard static admin token secret.
Users now get tokens via: vault write kubernetes/creds/dashboard-admin

											
										
										
											2026-03-15 19:05:04 +00:00
+								        annotations = {
 								          "reloader.stakater.com/search" = "true"
-												openclaw: realtime usage dashboard via Prometheus exporter sidecar

Stdlib-only Python exporter ($1) reads ~/.openclaw/agents/*/sessions/*.jsonl
(assistant messages with usage) plus auth-profiles.json (OAuth expiry,
Plus-tier label) and exposes Prometheus text format on :9099/metrics.
Container is python:3.12-slim; pod template gets prometheus.io/scrape
annotations so the existing kubernetes-pods job picks it up — no
ServiceMonitor needed.

Metrics exported:
  openclaw_codex_messages_total{provider,model,session_kind}    counter
  openclaw_codex_input/output/cache_read/cache_write_tokens_total
  openclaw_codex_message_errors_total{reason}
  openclaw_codex_active_sessions{kind}                          gauge
  openclaw_codex_oauth_expiry_seconds{provider,account,plan}    gauge
  openclaw_codex_last_run_timestamp                             gauge

Grafana dashboard "OpenClaw — Codex Usage" (Applications folder, 30s
refresh): messages/5h vs Plus rate-card, % of 1,200 floor, tokens/5h,
cache hit %, OAuth expiry days, active sessions, last-turn age, errors,
plus per-model timeseries + bar gauge + error table.

Plus rate-card thresholds in the gauge are conservative (1,200/5h floor;
real cap is dynamic 1,200–7,000). Re-baseline if throttling shows up
below 80%.

											
										
										
											2026-05-07 09:04:25 +00:00
+								          # Prometheus auto-discovers pods with these annotations.
 								          # Scraped by the openclaw-exporter sidecar — exposes /metrics on :9099.
 								          "prometheus.io/scrape" = "true"
 								          "prometheus.io/port"   = "9099"
 								          "prometheus.io/path"   = "/metrics"
-												migrate consuming stacks to ESO + remove k8s-dashboard static token

Phase 9: ExternalSecret migration across 26 stacks:

Fully migrated (vault data source removed, ESO delivers secrets):
- speedtest, shadowsocks, wealthfolio, plotting-book, f1-stream, tandoor
- n8n, dawarich, diun, netbox, onlyoffice, tuya-bridge
- hackmd (ESO template for DB URL), health (ESO template for DB URL)
- trading-bot (ESO template for DATABASE_URL + 7 secret env vars)
- forgejo (removed unused vault data source)

Partially migrated (vault kept for plan-time, ESO added for runtime):
- immich, linkwarden, nextcloud, paperless-ngx (jsondecode for homepage)
- claude-memory, rybbit, url, webhook_handler (plan-time in locals/jobs)
- woodpecker, openclaw, resume (plan-time in helm values/jobs/modules)

17 stacks unchanged (all plan-time: homepage annotations, configmaps,
module inputs) — vault data source works with OIDC auth.

Phase 17a: Remove k8s-dashboard static admin token secret.
Users now get tokens via: vault write kubernetes/creds/dashboard-admin

											
										
										
											2026-03-15 19:05:04 +00:00
+								        }
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								      }
 								      spec {
 								        service_account_name = kubernetes_service_account.openclaw.metadata[0].name
-												sync regenerated providers.tf + upstream changes

- Terragrunt-regenerated providers.tf across stacks (vault_root_token
  variable removed from root generate block)
- Upstream monitoring/openclaw/CLAUDE.md changes from rebase

											
										
										
											2026-03-22 02:56:04 +02:00
+								        # Init 0: fix /workspace ownership so node user can write
 								        init_container {
 								          name    = "fix-workspace-perms"
 								          image   = "busybox:1.37"
 								          command = ["sh", "-c", "chown 1000:1000 /workspace"]
 								          volume_mount {
 								            name       = "workspace"
 								            mount_path = "/workspace"
 								          }
 								        }
-												openclaw: replace cc-config NFS with dotfiles repo clone [ci skip]

- Add init container "install-dotfiles" that clones the dotfiles repo
  and installs skills/agents/hooks to OpenClaw's home directory
- Remove nfs_cc_config module and its volume mount
- Skills/agents now come from the same chezmoi-managed dotfiles repo
  that manages the Mac config, eliminating the dual-sync problem

											
										
										
											2026-03-15 16:04:02 +00:00
+								        # Init 1: copy openclaw.json from ConfigMap into writable NFS home
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								        init_container {
-												fix openclaw config mount and OOM: use init container, increase memory to 2Gi

- Replace subPath ConfigMap mount with init container that copies openclaw.json
  to writable NFS home (OpenClaw writes back to the file at runtime)
- Remove invalid memory-api plugin references causing "Config invalid"
- Increase memory to 2Gi (req+limit) with NODE_OPTIONS=--max-old-space-size=1536
- Fix tg wrapper to inject -auto-approve when apply --non-interactive is used

											
										
										
											2026-03-14 23:42:08 +00:00
+								          name    = "copy-config"
 								          image   = "busybox:1.37"
 								          command = ["sh", "-c", "cp /config/openclaw.json /home/node/.openclaw/openclaw.json && chown 1000:1000 /home/node/.openclaw/openclaw.json"]
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								          volume_mount {
 								            name       = "openclaw-config"
-												fix openclaw config mount and OOM: use init container, increase memory to 2Gi

- Replace subPath ConfigMap mount with init container that copies openclaw.json
  to writable NFS home (OpenClaw writes back to the file at runtime)
- Remove invalid memory-api plugin references causing "Config invalid"
- Increase memory to 2Gi (req+limit) with NODE_OPTIONS=--max-old-space-size=1536
- Fix tg wrapper to inject -auto-approve when apply --non-interactive is used

											
										
										
											2026-03-14 23:42:08 +00:00
+								            mount_path = "/config"
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								          }
-												Remove all CPU limits cluster-wide to eliminate CFS throttling

CPU limits cause CFS throttling even when nodes have idle capacity.
Move to a request-only CPU model: keep CPU requests for scheduling
fairness but remove all CPU limits. Memory limits stay (incompressible).

Changes across 108 files:
- Kyverno LimitRange policy: remove cpu from default/max in all 6 tiers
- Kyverno ResourceQuota policy: remove limits.cpu from all 5 tiers
- Custom ResourceQuotas: remove limits.cpu from 8 namespace quotas
- Custom LimitRanges: remove cpu from default/max (nextcloud, onlyoffice)
- RBAC module: remove cpu_limits variable and quota reference
- Freedify factory: remove cpu_limit variable and limits reference
- 86 deployment files: remove cpu from all limits blocks
- 6 Helm values files: remove cpu under limits sections

											
										
										
											2026-03-14 08:51:45 +00:00
+								          volume_mount {
-												fix openclaw config mount and OOM: use init container, increase memory to 2Gi

- Replace subPath ConfigMap mount with init container that copies openclaw.json
  to writable NFS home (OpenClaw writes back to the file at runtime)
- Remove invalid memory-api plugin references causing "Config invalid"
- Increase memory to 2Gi (req+limit) with NODE_OPTIONS=--max-old-space-size=1536
- Fix tg wrapper to inject -auto-approve when apply --non-interactive is used

											
										
										
											2026-03-14 23:42:08 +00:00
+								            name       = "openclaw-home"
 								            mount_path = "/home/node/.openclaw"
-												Remove all CPU limits cluster-wide to eliminate CFS throttling

CPU limits cause CFS throttling even when nodes have idle capacity.
Move to a request-only CPU model: keep CPU requests for scheduling
fairness but remove all CPU limits. Memory limits stay (incompressible).

Changes across 108 files:
- Kyverno LimitRange policy: remove cpu from default/max in all 6 tiers
- Kyverno ResourceQuota policy: remove limits.cpu from all 5 tiers
- Custom ResourceQuotas: remove limits.cpu from 8 namespace quotas
- Custom LimitRanges: remove cpu from default/max (nextcloud, onlyoffice)
- RBAC module: remove cpu_limits variable and quota reference
- Freedify factory: remove cpu_limit variable and limits reference
- 86 deployment files: remove cpu from all limits blocks
- 6 Helm values files: remove cpu under limits sections

											
										
										
											2026-03-14 08:51:45 +00:00
+								          }
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								        }
-												openclaw: regenerate kubeconfig at pod start using projected SA tokenFile

The previously-baked kubeconfig at /home/node/.openclaw/kubeconfig retained
a service-account token bound to the original (long-dead) pod, so kubectl
calls from inside the openclaw container failed with "the server has asked
for the client to provide credentials" even though the openclaw SA has
cluster-admin and kubelet projects a fresh token at
/var/run/secrets/kubernetes.io/serviceaccount/token.

Add init-container "setup-kubeconfig" that writes a kubeconfig with
tokenFile + certificate-authority paths pointing at the projected
SA volume — kubelet auto-rotates the token, kubectl always reads
fresh creds, no Vault K8s-creds-engine refresh needed.

Verified end-to-end: agent ran `kubectl get nodes -o wide` inside the
pod and delivered a correct one-line summary to Telegram via
openai-codex/gpt-5.4-mini.

											
										
										
											2026-05-08 08:07:38 +00:00
+								        # Init 1b: regenerate kubeconfig pointing at the projected SA tokenFile
 								        # so kubectl always reads the fresh, kubelet-rotated token. Without
 								        # this the previously-baked kubeconfig retains a SA token bound to a
 								        # long-dead pod and kubectl returns "must be logged in to the server".
 								        init_container {
 								          name    = "setup-kubeconfig"
 								          image   = "busybox:1.37"
 								          command = ["sh", "-c", <<-EOT
 								            cat > /home/node/.openclaw/kubeconfig <<'KUBECONFIG_EOF'
 								            apiVersion: v1
 								            kind: Config
 								            clusters:
 								            - cluster:
 								                certificate-authority: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
 								                server: https://kubernetes.default.svc
 								              name: in-cluster
 								            contexts:
 								            - context:
 								                cluster: in-cluster
 								                user: openclaw
 								                namespace: openclaw
 								              name: in-cluster
 								            current-context: in-cluster
 								            users:
 								            - name: openclaw
 								              user:
 								                tokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
 								            KUBECONFIG_EOF
 								            chown 1000:1000 /home/node/.openclaw/kubeconfig
 								            chmod 0644 /home/node/.openclaw/kubeconfig
 								          EOT
 								          ]
 								          volume_mount {
 								            name       = "openclaw-home"
 								            mount_path = "/home/node/.openclaw"
 								          }
 								        }
-												openclaw: remove install-dotfiles init container to reduce NFS writes

The init container was cloning the dotfiles repo via git on every pod
start, causing 200+ small NFS writes that amplified through ZFS.
Dotfiles already exist on NFS from a previous clone — no need to
re-clone on every restart. To update dotfiles, run git pull manually.

Also cleaned up stale Uptime Kuma files (1.6GB old SQLite DB + 289MB
error log left over from migration to MariaDB).

											
										
										
											2026-03-29 01:11:33 +02:00
+								        # Init 2 removed: install-dotfiles init container was cloning dotfiles
 								        # repo via git on every pod start, causing 200+ small NFS writes.
 								        # Dotfiles already exist on NFS at /home/node/.openclaw/dotfiles from
 								        # a previous clone. To update, run git pull manually or via CronJob.
-												openclaw: replace cc-config NFS with dotfiles repo clone [ci skip]

- Add init container "install-dotfiles" that clones the dotfiles repo
  and installs skills/agents/hooks to OpenClaw's home directory
- Remove nfs_cc_config module and its volume mount
- Skills/agents now come from the same chezmoi-managed dotfiles repo
  that manages the Mac config, eliminating the dual-sync problem

											
										
										
											2026-03-15 16:04:02 +00:00
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								        # Main container: OpenClaw
 								        container {
 								          name    = "openclaw"
-												openclaw: switch primary to ChatGPT Plus OAuth (openai-codex/gpt-5.4-mini)

Bumps image 2026.2.26 → 2026.5.4 (openai-codex provider plugin landed in
2026.4.21+). Auth profile is OAuth via the device-pairing flow against the
Codex backend (account ancaelena98@gmail.com); token persists in
/home/node/.openclaw/agents/main/agent/auth-state.json on NFS so it survives
pod restarts. Plus tier accepts gpt-5.4-mini (1,200–7,000 local msgs/5h);
gpt-5-mini and gpt-5.1-codex-mini both return errors on Plus, so we pin
gpt-5.4-mini explicitly. doctor --fix auto-promotes the highest-tier model
(gpt-5-pro) after model discovery, so the container command pins the mini
back as default after doctor runs but before gateway start.

											
										
										
											2026-05-06 22:06:32 +00:00
+								          image   = "ghcr.io/openclaw/openclaw:2026.5.4"
 								          # Doctor --fix auto-promotes the highest-tier codex model (gpt-5-pro) after
 								          # auth-profile-based model discovery; pin gpt-5.4-mini back to default after it.
 								          command = ["sh", "-c", "node openclaw.mjs doctor --fix 2>/dev/null; node openclaw.mjs models set openai-codex/gpt-5.4-mini 2>/dev/null; exec node openclaw.mjs gateway --allow-unconfigured --bind lan"]
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								          port {
 								            container_port = 18789
 								          }
-												[ci skip] openclaw: fix slow startup — proper resources + readiness probe + VPA off

- Set explicit CPU (2 cores) and memory (2Gi) limits
  Root cause: Goldilocks VPA was throttling to 300m CPU, causing gateway
  to take 5+ minutes to start, and 1Gi memory caused OOM crashes
- Add TCP readiness probe on port 18789 to prevent 502 Bad Gateway
  during startup (Traefik was routing before gateway was listening)
- Disable Goldilocks VPA via namespace label (vpa-update-mode: off)

											
										
										
											2026-03-01 14:44:22 +00:00
+								          readiness_probe {
 								            tcp_socket {
 								              port = 18789
 								            }
 								            initial_delay_seconds = 30
 								            period_seconds        = 10
 								          }
-												fix openclaw config mount and OOM: use init container, increase memory to 2Gi

- Replace subPath ConfigMap mount with init container that copies openclaw.json
  to writable NFS home (OpenClaw writes back to the file at runtime)
- Remove invalid memory-api plugin references causing "Config invalid"
- Increase memory to 2Gi (req+limit) with NODE_OPTIONS=--max-old-space-size=1536
- Fix tg wrapper to inject -auto-approve when apply --non-interactive is used

											
										
										
											2026-03-14 23:42:08 +00:00
+								          env {
 								            name  = "NODE_OPTIONS"
 								            value = "--max-old-space-size=1536"
 								          }
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								          env {
 								            name  = "OPENCLAW_GATEWAY_TOKEN"
 								            value = random_password.gateway_token.result
 								          }
 								          env {
 								            name  = "PATH"
 								            value = "/tools:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
 								          }
 								          env {
 								            name  = "TF_VAR_prod"
 								            value = "true"
 								          }
 								          env {
 								            name  = "KUBECONFIG"
 								            value = "/home/node/.openclaw/kubeconfig"
 								          }
 								          env {
 								            name  = "GIT_CONFIG_GLOBAL"
 								            value = "/home/node/.openclaw/.gitconfig"
 								          }
 								          # Skill secrets - Home Assistant
 								          env {
 								            name  = "HOME_ASSISTANT_URL"
 								            value = "https://ha-london.viktorbarzin.me"
 								          }
 								          env {
 								            name  = "HOME_ASSISTANT_TOKEN"
-												migrate all secrets from SOPS to Vault KV

- Add vault provider to root terragrunt.hcl (generated providers.tf)
- Delete stacks/vault/vault_provider.tf (now in generated providers.tf)
- Add 124 variable declarations + 43 vault_kv_secret_v2 resources to
  vault/main.tf to populate Vault KV at secret/<stack-name>
- Migrate 43 consuming stacks to read secrets from Vault KV via
  data "vault_kv_secret_v2" instead of SOPS var-file
- Add dependency "vault" to all migrated stacks' terragrunt.hcl
- Complex types (maps/lists) stored as JSON strings, decoded with
  jsondecode() in locals blocks

Bootstrap secrets (vault_root_token, vault_authentik_client_id,
vault_authentik_client_secret) remain in SOPS permanently.

Apply order: vault stack first (populates KV), then all others.

											
										
										
											2026-03-14 17:15:48 +00:00
+								            value = local.skill_secrets["home_assistant_token"]
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								          }
 								          env {
 								            name  = "HOME_ASSISTANT_SOFIA_URL"
 								            value = "https://ha-sofia.viktorbarzin.me"
 								          }
 								          env {
 								            name  = "HOME_ASSISTANT_SOFIA_TOKEN"
-												migrate all secrets from SOPS to Vault KV

- Add vault provider to root terragrunt.hcl (generated providers.tf)
- Delete stacks/vault/vault_provider.tf (now in generated providers.tf)
- Add 124 variable declarations + 43 vault_kv_secret_v2 resources to
  vault/main.tf to populate Vault KV at secret/<stack-name>
- Migrate 43 consuming stacks to read secrets from Vault KV via
  data "vault_kv_secret_v2" instead of SOPS var-file
- Add dependency "vault" to all migrated stacks' terragrunt.hcl
- Complex types (maps/lists) stored as JSON strings, decoded with
  jsondecode() in locals blocks

Bootstrap secrets (vault_root_token, vault_authentik_client_id,
vault_authentik_client_secret) remain in SOPS permanently.

Apply order: vault stack first (populates KV), then all others.

											
										
										
											2026-03-14 17:15:48 +00:00
+								            value = local.skill_secrets["home_assistant_sofia_token"]
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								          }
 								          # Skill secrets - Uptime Kuma
 								          env {
 								            name  = "UPTIME_KUMA_PASSWORD"
-												migrate all secrets from SOPS to Vault KV

- Add vault provider to root terragrunt.hcl (generated providers.tf)
- Delete stacks/vault/vault_provider.tf (now in generated providers.tf)
- Add 124 variable declarations + 43 vault_kv_secret_v2 resources to
  vault/main.tf to populate Vault KV at secret/<stack-name>
- Migrate 43 consuming stacks to read secrets from Vault KV via
  data "vault_kv_secret_v2" instead of SOPS var-file
- Add dependency "vault" to all migrated stacks' terragrunt.hcl
- Complex types (maps/lists) stored as JSON strings, decoded with
  jsondecode() in locals blocks

Bootstrap secrets (vault_root_token, vault_authentik_client_id,
vault_authentik_client_secret) remain in SOPS permanently.

Apply order: vault stack first (populates KV), then all others.

											
										
										
											2026-03-14 17:15:48 +00:00
+								            value = local.skill_secrets["uptime_kuma_password"]
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								          }
-												fix: eliminate memory overcommit to prevent node OOM crashes

Set requests = limits (Guaranteed QoS) across LimitRange defaults and
explicit pod resources. Node2 crashed 2026-03-14 from 250% memory
overcommit (61GB limits on 24GB node).

Changes:
- LimitRange: default = defaultRequest for all 6 tiers
- Grafana: 3 → 2 replicas
- Grampsweb: document why replicas=0
- Prometheus: 1Gi/4Gi → 3Gi/3Gi
- OpenClaw: 512Mi/2Gi → 768Mi/768Mi
- Immich server: 256Mi/2Gi → 512Mi/512Mi
- Immich postgresql: 256Mi/1Gi → 512Mi/512Mi
- Calibre: 256Mi/1536Mi → 256Mi/256Mi
- Linkwarden: 256Mi/1536Mi → 768Mi/768Mi
- N8N: 256Mi/1Gi → 512Mi/512Mi
- MySQL cluster: 1Gi/3-4Gi → 2Gi/2Gi
- pg-cluster (CNPG): 512Mi/4Gi → 512Mi/512Mi
- DBaaS ResourceQuota limits.memory: 64Gi → 12Gi

[ci skip]

											
										
										
											2026-03-14 16:01:41 +00:00
+								          # Memory API
 								          env {
 								            name  = "MEMORY_API_URL"
 								            value = "http://claude-memory.claude-memory.svc.cluster.local"
 								          }
 								          env {
-												migrate consuming stacks to ESO + remove k8s-dashboard static token

Phase 9: ExternalSecret migration across 26 stacks:

Fully migrated (vault data source removed, ESO delivers secrets):
- speedtest, shadowsocks, wealthfolio, plotting-book, f1-stream, tandoor
- n8n, dawarich, diun, netbox, onlyoffice, tuya-bridge
- hackmd (ESO template for DB URL), health (ESO template for DB URL)
- trading-bot (ESO template for DATABASE_URL + 7 secret env vars)
- forgejo (removed unused vault data source)

Partially migrated (vault kept for plan-time, ESO added for runtime):
- immich, linkwarden, nextcloud, paperless-ngx (jsondecode for homepage)
- claude-memory, rybbit, url, webhook_handler (plan-time in locals/jobs)
- woodpecker, openclaw, resume (plan-time in helm values/jobs/modules)

17 stacks unchanged (all plan-time: homepage annotations, configmaps,
module inputs) — vault data source works with OIDC auth.

Phase 17a: Remove k8s-dashboard static admin token secret.
Users now get tokens via: vault write kubernetes/creds/dashboard-admin

											
										
										
											2026-03-15 19:05:04 +00:00
+								            name = "MEMORY_API_KEY"
 								            value_from {
 								              secret_key_ref {
 								                name = "openclaw-secrets"
 								                key  = "claude_memory_api_key"
 								              }
 								            }
-												fix: eliminate memory overcommit to prevent node OOM crashes

Set requests = limits (Guaranteed QoS) across LimitRange defaults and
explicit pod resources. Node2 crashed 2026-03-14 from 250% memory
overcommit (61GB limits on 24GB node).

Changes:
- LimitRange: default = defaultRequest for all 6 tiers
- Grafana: 3 → 2 replicas
- Grampsweb: document why replicas=0
- Prometheus: 1Gi/4Gi → 3Gi/3Gi
- OpenClaw: 512Mi/2Gi → 768Mi/768Mi
- Immich server: 256Mi/2Gi → 512Mi/512Mi
- Immich postgresql: 256Mi/1Gi → 512Mi/512Mi
- Calibre: 256Mi/1536Mi → 256Mi/256Mi
- Linkwarden: 256Mi/1536Mi → 768Mi/768Mi
- N8N: 256Mi/1Gi → 512Mi/512Mi
- MySQL cluster: 1Gi/3-4Gi → 2Gi/2Gi
- pg-cluster (CNPG): 512Mi/4Gi → 512Mi/512Mi
- DBaaS ResourceQuota limits.memory: 64Gi → 12Gi

[ci skip]

											
										
										
											2026-03-14 16:01:41 +00:00
+								          }
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								          # Python packages path for skills
 								          env {
 								            name  = "PYTHONPATH"
 								            value = "/tools/python-libs"
 								          }
 								          volume_mount {
 								            name       = "tools"
 								            mount_path = "/tools"
 								          }
 								          volume_mount {
 								            name       = "workspace"
 								            mount_path = "/workspace"
 								          }
 								          volume_mount {
 								            name       = "data"
 								            mount_path = "/data"
 								          }
 								          volume_mount {
 								            name       = "ssh-key"
 								            mount_path = "/ssh"
 								          }
 								          volume_mount {
 								            name       = "openclaw-home"
 								            mount_path = "/home/node/.openclaw"
 								          }
 								          resources {
 								            limits = {
-												fix openclaw config mount and OOM: use init container, increase memory to 2Gi

- Replace subPath ConfigMap mount with init container that copies openclaw.json
  to writable NFS home (OpenClaw writes back to the file at runtime)
- Remove invalid memory-api plugin references causing "Config invalid"
- Increase memory to 2Gi (req+limit) with NODE_OPTIONS=--max-old-space-size=1536
- Fix tg wrapper to inject -auto-approve when apply --non-interactive is used

											
										
										
											2026-03-14 23:42:08 +00:00
+								              memory = "2Gi"
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								            }
 								            requests = {
-												[ci skip] openclaw: fix slow startup — proper resources + readiness probe + VPA off

- Set explicit CPU (2 cores) and memory (2Gi) limits
  Root cause: Goldilocks VPA was throttling to 300m CPU, causing gateway
  to take 5+ minutes to start, and 1Gi memory caused OOM crashes
- Add TCP readiness probe on port 18789 to prevent 502 Bad Gateway
  during startup (Traefik was routing before gateway was listening)
- Disable Goldilocks VPA via namespace label (vpa-update-mode: off)

											
										
										
											2026-03-01 14:44:22 +00:00
+								              cpu    = "100m"
-												right-size cluster memory: reduce overprovisioned, fix under-provisioned services

Phase 1 - Quick wins (~4.5 Gi saved):
- democratic-csi: add explicit sidecar resources (64-80Mi vs 256Mi LimitRange default)
- caretta: 768Mi → 600Mi (VPA upper 485Mi)
- immich-ml: 4Gi → 3584Mi (VPA upper 2.95Gi, GPU margin)
- onlyoffice: 3Gi → 2304Mi (VPA upper 1.82Gi)

Phase 2 - Safety fixes (prevent OOMKills):
- frigate: 2Gi/8Gi → 5Gi/10Gi (VPA upper 7.7Gi, was 4% headroom)
- openclaw: 1280Mi req → 2Gi req=limit (documented 2Gi requirement)

Phase 3 - Additional right-sizing:
- authentik workers: 1Gi → 896Mi x3 (VPA upper 722Mi)
- shlink: 512Mi/768Mi → 960Mi req=limit (VPA upper 780Mi, safety increase)

Phase 4 - Burstable QoS for lower tiers:
- tier-3-edge: 128Mi/128Mi → 96Mi req / 192Mi limit
- tier-4-aux: 128Mi/128Mi → 64Mi req / 256Mi limit

Phase 5 - Monitoring:
- Add ClusterMemoryRequestsHigh alert (>85% allocatable, 15m)
- Add ContainerNearOOM alert (>85% limit, 30m)
- Add PodUnschedulable alert (5m, critical)

Cluster: 92.7% → 90.8% memory requests. Stirling-pdf now schedulable.

											
										
										
											2026-03-15 15:30:18 +00:00
+								              memory = "2Gi"
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								            }
 								          }
 								        }
-												sync regenerated providers.tf + upstream changes

- Terragrunt-regenerated providers.tf across stacks (vault_root_token
  variable removed from root generate block)
- Upstream monitoring/openclaw/CLAUDE.md changes from rebase

											
										
										
											2026-03-22 02:56:04 +02:00
+								        # Sidecar: playwright-mcp — headless browser for agents
 								        container {
 								          name  = "playwright-mcp"
 								          image = "docker.io/viktorbarzin/playwright-mcp:v1"
 								          args  = ["--headless", "--browser", "chromium", "--no-sandbox", "--port", "3000", "--host", "0.0.0.0"]
 								          port {
 								            container_port = 3000
 								          }
 								          resources {
 								            requests = {
 								              cpu    = "50m"
 								              memory = "256Mi"
 								            }
 								            limits = {
 								              memory = "512Mi"
 								            }
 								          }
 								        }
-												openclaw: realtime usage dashboard via Prometheus exporter sidecar

Stdlib-only Python exporter ($1) reads ~/.openclaw/agents/*/sessions/*.jsonl
(assistant messages with usage) plus auth-profiles.json (OAuth expiry,
Plus-tier label) and exposes Prometheus text format on :9099/metrics.
Container is python:3.12-slim; pod template gets prometheus.io/scrape
annotations so the existing kubernetes-pods job picks it up — no
ServiceMonitor needed.

Metrics exported:
  openclaw_codex_messages_total{provider,model,session_kind}    counter
  openclaw_codex_input/output/cache_read/cache_write_tokens_total
  openclaw_codex_message_errors_total{reason}
  openclaw_codex_active_sessions{kind}                          gauge
  openclaw_codex_oauth_expiry_seconds{provider,account,plan}    gauge
  openclaw_codex_last_run_timestamp                             gauge

Grafana dashboard "OpenClaw — Codex Usage" (Applications folder, 30s
refresh): messages/5h vs Plus rate-card, % of 1,200 floor, tokens/5h,
cache hit %, OAuth expiry days, active sessions, last-turn age, errors,
plus per-model timeseries + bar gauge + error table.

Plus rate-card thresholds in the gauge are conservative (1,200/5h floor;
real cap is dynamic 1,200–7,000). Re-baseline if throttling shows up
below 80%.

											
										
										
											2026-05-07 09:04:25 +00:00
+								        # Sidecar: openclaw-exporter — Prometheus exporter for Codex/OAuth usage.
 								        # Reads sessions JSONL files + auth-profiles.json, exposes /metrics on :9099.
 								        # Stdlib-only Python; no pip install at startup.
 								        container {
 								          name    = "openclaw-exporter"
 								          image   = "docker.io/library/python:3.12-slim"
 								          command = ["python3", "/scripts/exporter.py"]
 								          port {
 								            container_port = 9099
 								            name           = "metrics"
 								          }
 								          env {
 								            name  = "OPENCLAW_HOME"
 								            value = "/home/node/.openclaw"
 								          }
 								          env {
 								            name  = "METRICS_PORT"
 								            value = "9099"
 								          }
 								          volume_mount {
 								            name       = "openclaw-exporter-script"
 								            mount_path = "/scripts"
 								            read_only  = true
 								          }
 								          volume_mount {
 								            name       = "openclaw-home"
 								            mount_path = "/home/node/.openclaw"
 								            read_only  = true
 								          }
 								          readiness_probe {
 								            http_get {
 								              path = "/healthz"
 								              port = 9099
 								            }
 								            initial_delay_seconds = 5
 								            period_seconds        = 30
 								          }
 								          resources {
 								            requests = {
 								              cpu    = "10m"
 								              memory = "64Mi"
 								            }
 								            limits = {
 								              memory = "128Mi"
 								            }
 								          }
 								        }
-												[ci skip] openclaw: add modelrelay sidecar as fallback model router

- Deploy modelrelay as sidecar container (auto-routes to fastest free model)
- Configured with NVIDIA NIM + OpenRouter API keys
- Primary: Mistral Large 3 (NIM), Fallback 1: Nemotron Ultra (NIM),
  Fallback 2: modelrelay/auto-fastest (80+ free models)
- Modelrelay web UI available at pod:7352

											
										
										
											2026-03-01 15:57:31 +00:00
+								        # Sidecar: modelrelay — auto-routes to fastest healthy free model
 								        container {
 								          name  = "modelrelay"
-												fix: openclaw policy violation + reduce memory requests for capacity

- openclaw: fix Kyverno policy violation (node:22-alpine ->
  docker.io/library/node:22-alpine), reduce request to 1536Mi
  with 2Gi limit for overcommit
- rybbit/clickhouse: reduce 1Gi -> 768Mi (frees 256Mi)
- stirling-pdf: reduce 1536Mi -> 1200Mi (frees 336Mi)

											
										
										
											2026-03-15 10:37:58 +00:00
+								          image = "docker.io/library/node:22-alpine"
-												[ci skip] openclaw: add modelrelay sidecar as fallback model router

- Deploy modelrelay as sidecar container (auto-routes to fastest free model)
- Configured with NVIDIA NIM + OpenRouter API keys
- Primary: Mistral Large 3 (NIM), Fallback 1: Nemotron Ultra (NIM),
  Fallback 2: modelrelay/auto-fastest (80+ free models)
- Modelrelay web UI available at pod:7352

											
										
										
											2026-03-01 15:57:31 +00:00
+								          command = ["sh", "-c", <<-EOF
 								            if [ ! -f /tools/modelrelay/node_modules/.package-lock.json ]; then
 								              mkdir -p /tools/modelrelay
 								              cd /tools/modelrelay
 								              npm init -y > /dev/null 2>&1
 								              npm install modelrelay > /dev/null 2>&1
 								            fi
 								            cd /tools/modelrelay
 								            exec npx modelrelay --port 7352
 								          EOF
 								          ]
 								          port {
 								            container_port = 7352
 								          }
 								          env {
-												migrate consuming stacks to ESO + remove k8s-dashboard static token

Phase 9: ExternalSecret migration across 26 stacks:

Fully migrated (vault data source removed, ESO delivers secrets):
- speedtest, shadowsocks, wealthfolio, plotting-book, f1-stream, tandoor
- n8n, dawarich, diun, netbox, onlyoffice, tuya-bridge
- hackmd (ESO template for DB URL), health (ESO template for DB URL)
- trading-bot (ESO template for DATABASE_URL + 7 secret env vars)
- forgejo (removed unused vault data source)

Partially migrated (vault kept for plan-time, ESO added for runtime):
- immich, linkwarden, nextcloud, paperless-ngx (jsondecode for homepage)
- claude-memory, rybbit, url, webhook_handler (plan-time in locals/jobs)
- woodpecker, openclaw, resume (plan-time in helm values/jobs/modules)

17 stacks unchanged (all plan-time: homepage annotations, configmaps,
module inputs) — vault data source works with OIDC auth.

Phase 17a: Remove k8s-dashboard static admin token secret.
Users now get tokens via: vault write kubernetes/creds/dashboard-admin

											
										
										
											2026-03-15 19:05:04 +00:00
+								            name = "NVIDIA_API_KEY"
 								            value_from {
 								              secret_key_ref {
 								                name = "openclaw-secrets"
 								                key  = "nvidia_api_key"
 								              }
 								            }
-												[ci skip] openclaw: add modelrelay sidecar as fallback model router

- Deploy modelrelay as sidecar container (auto-routes to fastest free model)
- Configured with NVIDIA NIM + OpenRouter API keys
- Primary: Mistral Large 3 (NIM), Fallback 1: Nemotron Ultra (NIM),
  Fallback 2: modelrelay/auto-fastest (80+ free models)
- Modelrelay web UI available at pod:7352

											
										
										
											2026-03-01 15:57:31 +00:00
+								          }
 								          env {
-												migrate consuming stacks to ESO + remove k8s-dashboard static token

Phase 9: ExternalSecret migration across 26 stacks:

Fully migrated (vault data source removed, ESO delivers secrets):
- speedtest, shadowsocks, wealthfolio, plotting-book, f1-stream, tandoor
- n8n, dawarich, diun, netbox, onlyoffice, tuya-bridge
- hackmd (ESO template for DB URL), health (ESO template for DB URL)
- trading-bot (ESO template for DATABASE_URL + 7 secret env vars)
- forgejo (removed unused vault data source)

Partially migrated (vault kept for plan-time, ESO added for runtime):
- immich, linkwarden, nextcloud, paperless-ngx (jsondecode for homepage)
- claude-memory, rybbit, url, webhook_handler (plan-time in locals/jobs)
- woodpecker, openclaw, resume (plan-time in helm values/jobs/modules)

17 stacks unchanged (all plan-time: homepage annotations, configmaps,
module inputs) — vault data source works with OIDC auth.

Phase 17a: Remove k8s-dashboard static admin token secret.
Users now get tokens via: vault write kubernetes/creds/dashboard-admin

											
										
										
											2026-03-15 19:05:04 +00:00
+								            name = "OPENROUTER_API_KEY"
 								            value_from {
 								              secret_key_ref {
 								                name = "openclaw-secrets"
 								                key  = "openrouter_api_key"
 								              }
 								            }
-												[ci skip] openclaw: add modelrelay sidecar as fallback model router

- Deploy modelrelay as sidecar container (auto-routes to fastest free model)
- Configured with NVIDIA NIM + OpenRouter API keys
- Primary: Mistral Large 3 (NIM), Fallback 1: Nemotron Ultra (NIM),
  Fallback 2: modelrelay/auto-fastest (80+ free models)
- Modelrelay web UI available at pod:7352

											
										
										
											2026-03-01 15:57:31 +00:00
+								          }
 								          volume_mount {
 								            name       = "tools"
 								            mount_path = "/tools"
 								          }
 								          resources {
 								            limits = {
-												right-size memory: set requests=limits based on actual usage

- Set memory requests = limits across 56 stacks to prevent overcommit
- Right-sized limits based on actual pod usage (2x actual, rounded up)
- Scaled down trading-bot (replicas=0) to free memory
- Fixed OOMKilled services: forgejo, dawarich, health, meshcentral,
  paperless-ngx, vault auto-unseal, rybbit, whisper, openclaw, clickhouse
- Added startup+liveness probes to calibre-web
- Bumped inotify limits on nodes 2,3 (max_user_instances 128->8192)

Post node2 OOM incident (2026-03-14). Previous kubelet config had no
kubeReserved/systemReserved set, allowing pods to starve the kernel.

											
										
										
											2026-03-14 21:01:24 +00:00
+								              memory = "256Mi"
-												[ci skip] openclaw: add modelrelay sidecar as fallback model router

- Deploy modelrelay as sidecar container (auto-routes to fastest free model)
- Configured with NVIDIA NIM + OpenRouter API keys
- Primary: Mistral Large 3 (NIM), Fallback 1: Nemotron Ultra (NIM),
  Fallback 2: modelrelay/auto-fastest (80+ free models)
- Modelrelay web UI available at pod:7352

											
										
										
											2026-03-01 15:57:31 +00:00
+								            }
 								            requests = {
 								              cpu    = "25m"
-												fix: reduce openclaw memory requests for scheduling

- openclaw: request 1280Mi (limit 2Gi), modelrelay request 128Mi
  (limit 256Mi). Total request 1408Mi fits available capacity.

											
										
										
											2026-03-15 10:47:34 +00:00
+								              memory = "128Mi"
-												[ci skip] openclaw: add modelrelay sidecar as fallback model router

- Deploy modelrelay as sidecar container (auto-routes to fastest free model)
- Configured with NVIDIA NIM + OpenRouter API keys
- Primary: Mistral Large 3 (NIM), Fallback 1: Nemotron Ultra (NIM),
  Fallback 2: modelrelay/auto-fastest (80+ free models)
- Modelrelay web UI available at pod:7352

											
										
										
											2026-03-01 15:57:31 +00:00
+								            }
 								          }
 								        }
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								        volume {
 								          name = "tools"
-												[ci skip] complete NFS CSI migration: complex stacks + platform modules

Migrate remaining multi-volume stacks and all platform modules from
inline NFS volumes to CSI-backed PV/PVC with nfs-truenas StorageClass
(soft,timeo=30,retrans=3 mount options).

Complex stacks: openclaw (4 vols), immich (8 vols), frigate (2 vols),
nextcloud (2 vols + old PV replaced), rybbit (1 vol)

Remaining stacks: affine, ebook2audiobook, f1-stream, osm_routing,
real-estate-crawler

Platform modules: monitoring (prometheus, loki, alertmanager PVs
converted from native NFS to CSI), redis, dbaas, technitium,
headscale, vaultwarden, uptime-kuma, mailserver, infra-maintenance

											
										
										
											2026-03-02 01:24:07 +00:00
+								          persistent_volume_claim {
-												truenas deprecation: migrate all non-immich storage to proxmox NFS

- Migrate 7 backup CronJobs to Proxmox host NFS (192.168.1.127)
  (etcd, mysql, postgresql, nextcloud, redis, vaultwarden, plotting-book)
- Migrate headscale backup, ebook2audiobook, osm_routing to Proxmox NFS
- Migrate servarr (lidarr, readarr, soulseek) NFS refs to Proxmox
- Remove 79 orphaned TrueNAS NFS module declarations from 49 stacks
- Delete stacks/platform/modules/ (27 dead module copies, 65MB)
- Update nfs-truenas StorageClass to point to Proxmox (192.168.1.127)
- Remove iscsi DNS record from config.tfvars
- Fix woodpecker persistence config and alertmanager PV

Only Immich (8 PVCs, ~1.4TB) remains on TrueNAS.

											
										
										
											2026-04-12 14:35:39 +01:00
+								            claim_name = module.nfs_tools_host.claim_name
-												[ci skip] openclaw: cache tools on NFS for fast restarts

- Switch /tools volume from emptyDir to NFS (/mnt/main/openclaw/tools)
- Skip download of kubectl, terraform, terragrunt, pip packages if cached
- Startup time: ~2.5min → ~38s on subsequent restarts

											
										
										
											2026-03-01 13:59:07 +00:00
+								          }
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								        }
 								        volume {
 								          name = "openclaw-home"
-												[ci skip] complete NFS CSI migration: complex stacks + platform modules

Migrate remaining multi-volume stacks and all platform modules from
inline NFS volumes to CSI-backed PV/PVC with nfs-truenas StorageClass
(soft,timeo=30,retrans=3 mount options).

Complex stacks: openclaw (4 vols), immich (8 vols), frigate (2 vols),
nextcloud (2 vols + old PV replaced), rybbit (1 vol)

Remaining stacks: affine, ebook2audiobook, f1-stream, osm_routing,
real-estate-crawler

Platform modules: monitoring (prometheus, loki, alertmanager PVs
converted from native NFS to CSI), redis, dbaas, technitium,
headscale, vaultwarden, uptime-kuma, mailserver, infra-maintenance

											
										
										
											2026-03-02 01:24:07 +00:00
+								          persistent_volume_claim {
-												feat(storage): migrate 38 NFS PVCs to proxmox-lvm (Wave 2)

Add proxmox-lvm PVCs with pvc-autoresizer annotations for all
remaining single-pod app data services. Deployments updated to
use new block storage PVCs. Old NFS modules retained for rollback.

Services: affine, changedetection, diun, excalidraw, f1-stream,
hackmd, isponsorblocktv, matrix, n8n, send, grampsweb, health,
onlyoffice, owntracks, paperless-ngx, privatebin, resume,
speedtest, stirling-pdf, tandoor, rybbit (clickhouse), tor-proxy
(torrserver), whisper+piper, frigate (config), ollama (ui),
servarr (prowlarr/listenarr/qbittorrent), aiostreams, freshrss
(extensions), meshcentral (data+files), openclaw (data+home+
openlobster), technitium, mailserver (data+roundcube html+enigma),
dbaas (pgadmin).

Strategy set to Recreate where needed for RWO volumes.

											
										
										
											2026-04-04 19:25:12 +03:00
+								            claim_name = kubernetes_persistent_volume_claim.home_proxmox.metadata[0].name
-												[ci skip] openclaw: persist home directory on NFS

- Switch openclaw-home from emptyDir to NFS (/mnt/main/openclaw/home)
- Persists SOUL.md, IDENTITY.md, sessions, memory DB, telegram state,
  device identity, and all runtime files across pod restarts
- Init container still refreshes openclaw.json and kubeconfig on each start

											
										
										
											2026-03-01 16:12:07 +00:00
+								          }
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								        }
 								        volume {
 								          name = "workspace"
-												[ci skip] complete NFS CSI migration: complex stacks + platform modules

Migrate remaining multi-volume stacks and all platform modules from
inline NFS volumes to CSI-backed PV/PVC with nfs-truenas StorageClass
(soft,timeo=30,retrans=3 mount options).

Complex stacks: openclaw (4 vols), immich (8 vols), frigate (2 vols),
nextcloud (2 vols + old PV replaced), rybbit (1 vol)

Remaining stacks: affine, ebook2audiobook, f1-stream, osm_routing,
real-estate-crawler

Platform modules: monitoring (prometheus, loki, alertmanager PVs
converted from native NFS to CSI), redis, dbaas, technitium,
headscale, vaultwarden, uptime-kuma, mailserver, infra-maintenance

											
										
										
											2026-03-02 01:24:07 +00:00
+								          persistent_volume_claim {
-												truenas deprecation: migrate all non-immich storage to proxmox NFS

- Migrate 7 backup CronJobs to Proxmox host NFS (192.168.1.127)
  (etcd, mysql, postgresql, nextcloud, redis, vaultwarden, plotting-book)
- Migrate headscale backup, ebook2audiobook, osm_routing to Proxmox NFS
- Migrate servarr (lidarr, readarr, soulseek) NFS refs to Proxmox
- Remove 79 orphaned TrueNAS NFS module declarations from 49 stacks
- Delete stacks/platform/modules/ (27 dead module copies, 65MB)
- Update nfs-truenas StorageClass to point to Proxmox (192.168.1.127)
- Remove iscsi DNS record from config.tfvars
- Fix woodpecker persistence config and alertmanager PV

Only Immich (8 PVCs, ~1.4TB) remains on TrueNAS.

											
										
										
											2026-04-12 14:35:39 +01:00
+								            claim_name = module.nfs_workspace_host.claim_name
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								          }
 								        }
 								        volume {
 								          name = "data"
-												[ci skip] complete NFS CSI migration: complex stacks + platform modules

Migrate remaining multi-volume stacks and all platform modules from
inline NFS volumes to CSI-backed PV/PVC with nfs-truenas StorageClass
(soft,timeo=30,retrans=3 mount options).

Complex stacks: openclaw (4 vols), immich (8 vols), frigate (2 vols),
nextcloud (2 vols + old PV replaced), rybbit (1 vol)

Remaining stacks: affine, ebook2audiobook, f1-stream, osm_routing,
real-estate-crawler

Platform modules: monitoring (prometheus, loki, alertmanager PVs
converted from native NFS to CSI), redis, dbaas, technitium,
headscale, vaultwarden, uptime-kuma, mailserver, infra-maintenance

											
										
										
											2026-03-02 01:24:07 +00:00
+								          persistent_volume_claim {
-												feat(storage): migrate 38 NFS PVCs to proxmox-lvm (Wave 2)

Add proxmox-lvm PVCs with pvc-autoresizer annotations for all
remaining single-pod app data services. Deployments updated to
use new block storage PVCs. Old NFS modules retained for rollback.

Services: affine, changedetection, diun, excalidraw, f1-stream,
hackmd, isponsorblocktv, matrix, n8n, send, grampsweb, health,
onlyoffice, owntracks, paperless-ngx, privatebin, resume,
speedtest, stirling-pdf, tandoor, rybbit (clickhouse), tor-proxy
(torrserver), whisper+piper, frigate (config), ollama (ui),
servarr (prowlarr/listenarr/qbittorrent), aiostreams, freshrss
(extensions), meshcentral (data+files), openclaw (data+home+
openlobster), technitium, mailserver (data+roundcube html+enigma),
dbaas (pgadmin).

Strategy set to Recreate where needed for RWO volumes.

											
										
										
											2026-04-04 19:25:12 +03:00
+								            claim_name = kubernetes_persistent_volume_claim.data_proxmox.metadata[0].name
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								          }
 								        }
 								        volume {
 								          name = "ssh-key"
 								          secret {
 								            secret_name  = kubernetes_secret.ssh_key.metadata[0].name
 								            default_mode = "0600"
 								          }
 								        }
 								        volume {
 								          name = "openclaw-config"
 								          config_map {
 								            name = kubernetes_config_map.openclaw_config.metadata[0].name
 								          }
 								        }
-												openclaw: realtime usage dashboard via Prometheus exporter sidecar

Stdlib-only Python exporter ($1) reads ~/.openclaw/agents/*/sessions/*.jsonl
(assistant messages with usage) plus auth-profiles.json (OAuth expiry,
Plus-tier label) and exposes Prometheus text format on :9099/metrics.
Container is python:3.12-slim; pod template gets prometheus.io/scrape
annotations so the existing kubernetes-pods job picks it up — no
ServiceMonitor needed.

Metrics exported:
  openclaw_codex_messages_total{provider,model,session_kind}    counter
  openclaw_codex_input/output/cache_read/cache_write_tokens_total
  openclaw_codex_message_errors_total{reason}
  openclaw_codex_active_sessions{kind}                          gauge
  openclaw_codex_oauth_expiry_seconds{provider,account,plan}    gauge
  openclaw_codex_last_run_timestamp                             gauge

Grafana dashboard "OpenClaw — Codex Usage" (Applications folder, 30s
refresh): messages/5h vs Plus rate-card, % of 1,200 floor, tokens/5h,
cache hit %, OAuth expiry days, active sessions, last-turn age, errors,
plus per-model timeseries + bar gauge + error table.

Plus rate-card thresholds in the gauge are conservative (1,200/5h floor;
real cap is dynamic 1,200–7,000). Re-baseline if throttling shows up
below 80%.

											
										
										
											2026-05-07 09:04:25 +00:00
+								        volume {
 								          name = "openclaw-exporter-script"
 								          config_map {
 								            name         = kubernetes_config_map.openclaw_exporter.metadata[0].name
 								            default_mode = "0555"
 								          }
 								        }
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								      }
 								    }
 								  }
-												[infra] Sweep dns_config ignore_changes across all pod-owning resources [ci skip]

## Context

Wave 3A (commit c9d221d5) added the `# KYVERNO_LIFECYCLE_V1` marker to the
27 pre-existing `ignore_changes = [...dns_config]` sites so they could be
grepped and audited. It did NOT address pod-owning resources that were
simply missing the suppression entirely. Post-Wave-3A sampling (2026-04-18)
found that navidrome, f1-stream, frigate, servarr, monitoring, crowdsec,
and many other stacks showed perpetual `dns_config` drift every plan
because their `kubernetes_deployment` / `kubernetes_stateful_set` /
`kubernetes_cron_job_v1` resources had no `lifecycle {}` block at all.

Root cause (same as Wave 3A): Kyverno's admission webhook stamps
`dns_config { option { name = "ndots"; value = "2" } }` on every pod's
`spec.template.spec.dns_config` to prevent NxDomain search-domain flooding
(see `k8s-ndots-search-domain-nxdomain-flood` skill). Without `ignore_changes`
on every Terraform-managed pod-owner, Terraform repeatedly tries to strip
the injected field.

## This change

Extends the Wave 3A convention by sweeping EVERY `kubernetes_deployment`,
`kubernetes_stateful_set`, `kubernetes_daemon_set`, `kubernetes_cron_job_v1`,
`kubernetes_job_v1` (+ their `_v1` variants) in the repo and ensuring each
carries the right `ignore_changes` path:

- **kubernetes_deployment / stateful_set / daemon_set / job_v1**:
  `spec[0].template[0].spec[0].dns_config`
- **kubernetes_cron_job_v1**:
  `spec[0].job_template[0].spec[0].template[0].spec[0].dns_config`
  (extra `job_template[0]` nesting — the CronJob's PodTemplateSpec is
  one level deeper)

Each injection / extension is tagged `# KYVERNO_LIFECYCLE_V1: Kyverno
admission webhook mutates dns_config with ndots=2` inline so the
suppression is discoverable via `rg 'KYVERNO_LIFECYCLE_V1' stacks/`.

Two insertion paths are handled by a Python pass (`/tmp/add_dns_config_ignore.py`):

1. **No existing `lifecycle {}`**: inject a brand-new block just before the
   resource's closing `}`. 108 new blocks on 93 files.
2. **Existing `lifecycle {}` (usually for `DRIFT_WORKAROUND: CI owns image tag`
   from Wave 4, commit a62b43d1)**: extend its `ignore_changes` list with the
   dns_config path. Handles both inline (`= [x]`) and multiline
   (`= [\n  x,\n]`) forms; ensures the last pre-existing list item carries
   a trailing comma so the extended list is valid HCL. 34 extensions.

The script skips anything already mentioning `dns_config` inside an
`ignore_changes`, so re-running is a no-op.

## Scale

- 142 total lifecycle injections/extensions
- 93 `.tf` files touched
- 108 brand-new `lifecycle {}` blocks + 34 extensions of existing ones
- Every Tier 0 and Tier 1 stack with a pod-owning resource is covered
- Together with Wave 3A's 27 pre-existing markers → **169 greppable
  `KYVERNO_LIFECYCLE_V1` dns_config sites across the repo**

## What is NOT in this change

- `stacks/trading-bot/main.tf` — entirely commented-out block (`/* … */`).
  Python script touched the file, reverted manually.
- `_template/main.tf.example` skeleton — kept minimal on purpose; any
  future stack created from it should either inherit the Wave 3A one-line
  form or add its own on first `kubernetes_deployment`.
- `terraform fmt` fixes to pre-existing alignment issues in meshcentral,
  nvidia/modules/nvidia, vault — unrelated to this commit. Left for a
  separate fmt-only pass.
- Non-pod resources (`kubernetes_service`, `kubernetes_secret`,
  `kubernetes_manifest`, etc.) — they don't own pods so they don't get
  Kyverno dns_config mutation.

## Verification

Random sample post-commit:
```
$ cd stacks/navidrome && ../../scripts/tg plan  → No changes.
$ cd stacks/f1-stream && ../../scripts/tg plan  → No changes.
$ cd stacks/frigate && ../../scripts/tg plan    → No changes.

$ rg -c 'KYVERNO_LIFECYCLE_V1' stacks/ --include='*.tf' --include='*.tf.example' \
    | awk -F: '{s+=$2} END {print s}'
169
```

## Reproduce locally
1. `git pull`
2. `rg 'KYVERNO_LIFECYCLE_V1' stacks/ | wc -l` → 169+
3. `cd stacks/navidrome && ../../scripts/tg plan` → expect 0 drift on
   the deployment's dns_config field.

Refs: code-seq (Wave 3B dns_config class closed; kubernetes_manifest
annotation class handled separately in 8d94688d for tls_secret)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-18 21:19:48 +00:00
+								  lifecycle {
 								    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
 								    ignore_changes = [spec[0].template[0].spec[0].dns_config]
 								  }
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								}
 								resource "kubernetes_service" "openclaw" {
 								  metadata {
 								    name      = "openclaw"
 								    namespace = kubernetes_namespace.openclaw.metadata[0].name
 								    labels = {
 								      app = "openclaw"
 								    }
 								  }
 								  spec {
 								    selector = {
 								      app = "openclaw"
 								    }
 								    port {
 								      port        = 80
 								      target_port = 18789
 								    }
 								  }
 								}
 								module "ingress" {
 								  source          = "../../modules/kubernetes/ingress_factory"
-												[infra] Auto-create Cloudflare DNS records from ingress_factory

## Context

Deploying new services required manually adding hostnames to
cloudflare_proxied_names/cloudflare_non_proxied_names in config.tfvars —
a separate file from the service stack. This was frequently forgotten,
leaving services unreachable externally.

## This change:

- Add `dns_type` parameter to `ingress_factory` and `reverse_proxy/factory`
  modules. Setting `dns_type = "proxied"` or `"non-proxied"` auto-creates
  the Cloudflare DNS record (CNAME to tunnel or A/AAAA to public IP).
- Simplify cloudflared tunnel from 100 per-hostname rules to wildcard
  `*.viktorbarzin.me → Traefik`. Traefik still handles host-based routing.
- Add global Cloudflare provider via terragrunt.hcl (separate
  cloudflare_provider.tf with Vault-sourced API key).
- Migrate 118 hostnames from centralized config.tfvars to per-service
  dns_type. 17 hostnames remain centrally managed (Helm ingresses,
  special cases).
- Update docs, AGENTS.md, CLAUDE.md, dns.md runbook.

```
BEFORE                          AFTER
config.tfvars (manual list)     stacks/<svc>/main.tf
        |                         module "ingress" {
        v                           dns_type = "proxied"
stacks/cloudflared/               }
  for_each = list                     |
  cloudflare_record               auto-creates
  tunnel per-hostname             cloudflare_record + annotation
```

## What is NOT in this change:

- Uptime Kuma monitor migration (still reads from config.tfvars)
- 17 remaining centrally-managed hostnames (Helm, special cases)
- Removal of allow_overwrite (keep until migration confirmed stable)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-16 13:45:04 +00:00
+								  dns_type        = "non-proxied"
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								  namespace       = kubernetes_namespace.openclaw.metadata[0].name
 								  name            = "openclaw"
 								  tls_secret_name = var.tls_secret_name
 								  port            = 80
 								  protected       = true
-												[ci skip] add Homepage gethomepage.dev annotations to all services

Add Kubernetes ingress annotations for Homepage auto-discovery across
~88 services organized into 11 groups. Enable serviceAccount for RBAC,
configure group layouts, and add Grafana/Frigate/Speedtest widgets.

											
										
										
											2026-03-07 16:41:36 +00:00
+								  extra_annotations = {
 								    "gethomepage.dev/enabled"      = "true"
 								    "gethomepage.dev/name"         = "OpenClaw"
 								    "gethomepage.dev/description"  = "AI assistant"
 								    "gethomepage.dev/icon"         = "openai.png"
 								    "gethomepage.dev/group"        = "AI & Data"
 								    "gethomepage.dev/pod-selector" = ""
 								  }
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
+								}
-												[ci skip] add Forgejo task pipeline for OpenClaw AI agent

Forgejo issues as a task queue for OpenClaw:
- Forgejo OAuth2 with Authentik SSO, self-registration disabled
- Webhook-triggered task processing (instant) + CronJob backup (5min poll)
- Tasks processed via Mistral Large 3 (NVIDIA NIM API)
- Results posted as issue comments, auto-labeled and closed
- Comment follow-ups and reopened issues supported
- n8n RBAC for OpenClaw pod exec (future workflow integration)

											
										
										
											2026-03-07 21:09:31 +00:00
+								# --- Webhook receiver: triggers task-processor Job on Forgejo issue events ---
 								resource "kubernetes_config_map" "task_webhook" {
 								  metadata {
 								    name      = "task-webhook"
 								    namespace = kubernetes_namespace.openclaw.metadata[0].name
 								  }
 								  data = {
 								    "server.py" = <<-PYEOF
 								      from http.server import HTTPServer, BaseHTTPRequestHandler
 								      import subprocess, time, json, os
 								      BOT_USER = os.environ.get('FORGEJO_BOT_USER', 'viktor')
 								      class Handler(BaseHTTPRequestHandler):
 								          def do_POST(self):
 								              try:
 								                  body = self.rfile.read(int(self.headers.get('Content-Length', 0)))
 								                  data = json.loads(body)
 								                  action = data.get('action', '')
 								                  # Trigger on: new issue, reopened issue, or new comment
 								                  trigger = False
 								                  if action in ('opened', 'reopened'):
 								                      issue = data.get('issue', {})
 								                      print(f"Issue #{issue.get('number','?')} {action}: {issue.get('title','?')}")
 								                      trigger = True
 								                  elif action == 'created' and 'comment' in data:
 								                      comment = data.get('comment', {})
 								                      commenter = comment.get('user', {}).get('login', '')
 								                      # Skip comments from the bot itself to avoid loops
 								                      if commenter != BOT_USER:
 								                          issue = data.get('issue', {})
 								                          print(f"Comment on #{issue.get('number','?')} by {commenter}")
 								                          trigger = True
 								                      else:
 								                          print(f"Skipping own comment on #{data.get('issue',{}).get('number','?')}")
 								                  if trigger:
 								                      job_name = f"task-processor-{int(time.time())}"
 								                      subprocess.run([
 								                          'kubectl', 'create', 'job', job_name,
 								                          '--from=cronjob/task-processor',
 								                          '-n', 'openclaw'
 								                      ], check=True)
 								                      self.send_response(200)
 								                      self.end_headers()
 								                      self.wfile.write(b'{"ok":true}')
 								                  else:
 								                      self.send_response(200)
 								                      self.end_headers()
 								                      self.wfile.write(b'{"ok":true,"skipped":true}')
 								              except Exception as e:
 								                  print(f"Error: {e}")
 								                  self.send_response(500)
 								                  self.end_headers()
 								                  self.wfile.write(f'{{"error":"{e}"}}'.encode())
 								          def do_GET(self):
 								              self.send_response(200)
 								              self.end_headers()
 								              self.wfile.write(b'{"status":"ok"}')
 								          def log_message(self, fmt, *args):
 								              print(f"[webhook] {args[0]} {args[1]} {args[2]}")
 								      print("Task webhook receiver listening on :8080")
 								      HTTPServer(('', 8080), Handler).serve_forever()
 								    PYEOF
 								  }
 								}
 								resource "kubernetes_service_account" "task_webhook" {
 								  metadata {
 								    name      = "task-webhook"
 								    namespace = kubernetes_namespace.openclaw.metadata[0].name
 								  }
 								}
 								resource "kubernetes_role" "task_webhook" {
 								  metadata {
 								    name      = "task-webhook-job-creator"
 								    namespace = kubernetes_namespace.openclaw.metadata[0].name
 								  }
 								  rule {
 								    api_groups = ["batch"]
 								    resources  = ["jobs", "cronjobs"]
 								    verbs      = ["get", "list", "create"]
 								  }
 								}
 								resource "kubernetes_role_binding" "task_webhook" {
 								  metadata {
 								    name      = "task-webhook-job-creator"
 								    namespace = kubernetes_namespace.openclaw.metadata[0].name
 								  }
 								  subject {
 								    kind      = "ServiceAccount"
 								    name      = kubernetes_service_account.task_webhook.metadata[0].name
 								    namespace = kubernetes_namespace.openclaw.metadata[0].name
 								  }
 								  role_ref {
 								    api_group = "rbac.authorization.k8s.io"
 								    kind      = "Role"
 								    name      = kubernetes_role.task_webhook.metadata[0].name
 								  }
 								}
 								resource "kubernetes_deployment" "task_webhook" {
 								  metadata {
 								    name      = "task-webhook"
 								    namespace = kubernetes_namespace.openclaw.metadata[0].name
 								    labels = {
 								      app  = "task-webhook"
 								      tier = local.tiers.aux
 								    }
 								  }
 								  spec {
 								    replicas = 1
 								    selector {
 								      match_labels = {
 								        app = "task-webhook"
 								      }
 								    }
 								    template {
 								      metadata {
 								        labels = {
 								          app = "task-webhook"
 								        }
 								      }
 								      spec {
 								        service_account_name = kubernetes_service_account.task_webhook.metadata[0].name
 								        container {
-												Remove all CPU limits cluster-wide to eliminate CFS throttling

CPU limits cause CFS throttling even when nodes have idle capacity.
Move to a request-only CPU model: keep CPU requests for scheduling
fairness but remove all CPU limits. Memory limits stay (incompressible).

Changes across 108 files:
- Kyverno LimitRange policy: remove cpu from default/max in all 6 tiers
- Kyverno ResourceQuota policy: remove limits.cpu from all 5 tiers
- Custom ResourceQuotas: remove limits.cpu from 8 namespace quotas
- Custom LimitRanges: remove cpu from default/max (nextcloud, onlyoffice)
- RBAC module: remove cpu_limits variable and quota reference
- Freedify factory: remove cpu_limit variable and limits reference
- 86 deployment files: remove cpu from all limits blocks
- 6 Helm values files: remove cpu under limits sections

											
										
										
											2026-03-14 08:51:45 +00:00
+								          name    = "webhook"
 								          image   = "python:3-alpine"
-												[ci skip] add Forgejo task pipeline for OpenClaw AI agent

Forgejo issues as a task queue for OpenClaw:
- Forgejo OAuth2 with Authentik SSO, self-registration disabled
- Webhook-triggered task processing (instant) + CronJob backup (5min poll)
- Tasks processed via Mistral Large 3 (NVIDIA NIM API)
- Results posted as issue comments, auto-labeled and closed
- Comment follow-ups and reopened issues supported
- n8n RBAC for OpenClaw pod exec (future workflow integration)

											
										
										
											2026-03-07 21:09:31 +00:00
+								          command = ["sh", "-c", "apk add --no-cache curl > /dev/null 2>&1 && curl -sfL https://dl.k8s.io/release/v1.34.2/bin/linux/amd64/kubectl -o /usr/local/bin/kubectl && chmod +x /usr/local/bin/kubectl && exec python3 -u /app/server.py"]
 								          port {
 								            container_port = 8080
 								          }
 								          volume_mount {
 								            name       = "app"
 								            mount_path = "/app"
 								          }
 								          resources {
 								            requests = {
 								              cpu    = "5m"
-												right-size memory: set requests=limits based on actual usage

- Set memory requests = limits across 56 stacks to prevent overcommit
- Right-sized limits based on actual pod usage (2x actual, rounded up)
- Scaled down trading-bot (replicas=0) to free memory
- Fixed OOMKilled services: forgejo, dawarich, health, meshcentral,
  paperless-ngx, vault auto-unseal, rybbit, whisper, openclaw, clickhouse
- Added startup+liveness probes to calibre-web
- Bumped inotify limits on nodes 2,3 (max_user_instances 128->8192)

Post node2 OOM incident (2026-03-14). Previous kubelet config had no
kubeReserved/systemReserved set, allowing pods to starve the kernel.

											
										
										
											2026-03-14 21:01:24 +00:00
+								              memory = "64Mi"
-												[ci skip] add Forgejo task pipeline for OpenClaw AI agent

Forgejo issues as a task queue for OpenClaw:
- Forgejo OAuth2 with Authentik SSO, self-registration disabled
- Webhook-triggered task processing (instant) + CronJob backup (5min poll)
- Tasks processed via Mistral Large 3 (NVIDIA NIM API)
- Results posted as issue comments, auto-labeled and closed
- Comment follow-ups and reopened issues supported
- n8n RBAC for OpenClaw pod exec (future workflow integration)

											
										
										
											2026-03-07 21:09:31 +00:00
+								            }
 								            limits = {
 								              memory = "64Mi"
 								            }
 								          }
 								        }
 								        volume {
 								          name = "app"
 								          config_map {
 								            name = kubernetes_config_map.task_webhook.metadata[0].name
 								          }
 								        }
 								      }
 								    }
 								  }
-												[infra] Sweep dns_config ignore_changes across all pod-owning resources [ci skip]

## Context

Wave 3A (commit c9d221d5) added the `# KYVERNO_LIFECYCLE_V1` marker to the
27 pre-existing `ignore_changes = [...dns_config]` sites so they could be
grepped and audited. It did NOT address pod-owning resources that were
simply missing the suppression entirely. Post-Wave-3A sampling (2026-04-18)
found that navidrome, f1-stream, frigate, servarr, monitoring, crowdsec,
and many other stacks showed perpetual `dns_config` drift every plan
because their `kubernetes_deployment` / `kubernetes_stateful_set` /
`kubernetes_cron_job_v1` resources had no `lifecycle {}` block at all.

Root cause (same as Wave 3A): Kyverno's admission webhook stamps
`dns_config { option { name = "ndots"; value = "2" } }` on every pod's
`spec.template.spec.dns_config` to prevent NxDomain search-domain flooding
(see `k8s-ndots-search-domain-nxdomain-flood` skill). Without `ignore_changes`
on every Terraform-managed pod-owner, Terraform repeatedly tries to strip
the injected field.

## This change

Extends the Wave 3A convention by sweeping EVERY `kubernetes_deployment`,
`kubernetes_stateful_set`, `kubernetes_daemon_set`, `kubernetes_cron_job_v1`,
`kubernetes_job_v1` (+ their `_v1` variants) in the repo and ensuring each
carries the right `ignore_changes` path:

- **kubernetes_deployment / stateful_set / daemon_set / job_v1**:
  `spec[0].template[0].spec[0].dns_config`
- **kubernetes_cron_job_v1**:
  `spec[0].job_template[0].spec[0].template[0].spec[0].dns_config`
  (extra `job_template[0]` nesting — the CronJob's PodTemplateSpec is
  one level deeper)

Each injection / extension is tagged `# KYVERNO_LIFECYCLE_V1: Kyverno
admission webhook mutates dns_config with ndots=2` inline so the
suppression is discoverable via `rg 'KYVERNO_LIFECYCLE_V1' stacks/`.

Two insertion paths are handled by a Python pass (`/tmp/add_dns_config_ignore.py`):

1. **No existing `lifecycle {}`**: inject a brand-new block just before the
   resource's closing `}`. 108 new blocks on 93 files.
2. **Existing `lifecycle {}` (usually for `DRIFT_WORKAROUND: CI owns image tag`
   from Wave 4, commit a62b43d1)**: extend its `ignore_changes` list with the
   dns_config path. Handles both inline (`= [x]`) and multiline
   (`= [\n  x,\n]`) forms; ensures the last pre-existing list item carries
   a trailing comma so the extended list is valid HCL. 34 extensions.

The script skips anything already mentioning `dns_config` inside an
`ignore_changes`, so re-running is a no-op.

## Scale

- 142 total lifecycle injections/extensions
- 93 `.tf` files touched
- 108 brand-new `lifecycle {}` blocks + 34 extensions of existing ones
- Every Tier 0 and Tier 1 stack with a pod-owning resource is covered
- Together with Wave 3A's 27 pre-existing markers → **169 greppable
  `KYVERNO_LIFECYCLE_V1` dns_config sites across the repo**

## What is NOT in this change

- `stacks/trading-bot/main.tf` — entirely commented-out block (`/* … */`).
  Python script touched the file, reverted manually.
- `_template/main.tf.example` skeleton — kept minimal on purpose; any
  future stack created from it should either inherit the Wave 3A one-line
  form or add its own on first `kubernetes_deployment`.
- `terraform fmt` fixes to pre-existing alignment issues in meshcentral,
  nvidia/modules/nvidia, vault — unrelated to this commit. Left for a
  separate fmt-only pass.
- Non-pod resources (`kubernetes_service`, `kubernetes_secret`,
  `kubernetes_manifest`, etc.) — they don't own pods so they don't get
  Kyverno dns_config mutation.

## Verification

Random sample post-commit:
```
$ cd stacks/navidrome && ../../scripts/tg plan  → No changes.
$ cd stacks/f1-stream && ../../scripts/tg plan  → No changes.
$ cd stacks/frigate && ../../scripts/tg plan    → No changes.

$ rg -c 'KYVERNO_LIFECYCLE_V1' stacks/ --include='*.tf' --include='*.tf.example' \
    | awk -F: '{s+=$2} END {print s}'
169
```

## Reproduce locally
1. `git pull`
2. `rg 'KYVERNO_LIFECYCLE_V1' stacks/ | wc -l` → 169+
3. `cd stacks/navidrome && ../../scripts/tg plan` → expect 0 drift on
   the deployment's dns_config field.

Refs: code-seq (Wave 3B dns_config class closed; kubernetes_manifest
annotation class handled separately in 8d94688d for tls_secret)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-18 21:19:48 +00:00
+								  lifecycle {
 								    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
 								    ignore_changes = [spec[0].template[0].spec[0].dns_config]
 								  }
-												[ci skip] add Forgejo task pipeline for OpenClaw AI agent

Forgejo issues as a task queue for OpenClaw:
- Forgejo OAuth2 with Authentik SSO, self-registration disabled
- Webhook-triggered task processing (instant) + CronJob backup (5min poll)
- Tasks processed via Mistral Large 3 (NVIDIA NIM API)
- Results posted as issue comments, auto-labeled and closed
- Comment follow-ups and reopened issues supported
- n8n RBAC for OpenClaw pod exec (future workflow integration)

											
										
										
											2026-03-07 21:09:31 +00:00
+								}
 								resource "kubernetes_service" "task_webhook" {
 								  metadata {
 								    name      = "task-webhook"
 								    namespace = kubernetes_namespace.openclaw.metadata[0].name
 								    labels = {
 								      app = "task-webhook"
 								    }
 								  }
 								  spec {
 								    selector = {
 								      app = "task-webhook"
 								    }
 								    port {
 								      port        = 80
 								      target_port = 8080
 								    }
 								  }
 								}
 								module "task_webhook_ingress" {
-												[openclaw,tor-proxy] Opt task-webhook + torrserver out of external monitoring

Adds `external_monitor = false` to the ingress_factory calls for
task-webhook and torrserver so the `external-monitor-sync` CronJob
stops auto-creating `[External] <name>` monitors for them. Both
services remain deployed and reachable; only the Uptime Kuma monitors
are dropped.

											
										
										
											2026-04-19 13:01:36 +00:00
+								  source           = "../../modules/kubernetes/ingress_factory"
 								  namespace        = kubernetes_namespace.openclaw.metadata[0].name
 								  name             = "task-webhook"
 								  tls_secret_name  = var.tls_secret_name
 								  host             = "task-webhook"
 								  port             = 80
 								  external_monitor = false
-												[ci skip] add Forgejo task pipeline for OpenClaw AI agent

Forgejo issues as a task queue for OpenClaw:
- Forgejo OAuth2 with Authentik SSO, self-registration disabled
- Webhook-triggered task processing (instant) + CronJob backup (5min poll)
- Tasks processed via Mistral Large 3 (NVIDIA NIM API)
- Results posted as issue comments, auto-labeled and closed
- Comment follow-ups and reopened issues supported
- n8n RBAC for OpenClaw pod exec (future workflow integration)

											
										
										
											2026-03-07 21:09:31 +00:00
+								}
-												[cluster-health] Expand to 42 checks, remove pod CronJob path

- scripts/cluster_healthcheck.sh: add 12 new checks (cert-manager
  readiness/expiry/requests, backup freshness per-DB/offsite/LVM,
  monitoring prom+AM/vault-sealed/CSS, external reachability cloudflared
  +authentik/ExternalAccessDivergence/traefik-5xx). Bump TOTAL_CHECKS
  to 42, add --no-fix flag.
- Remove the duplicate pod-version .claude/cluster-health.sh (1728
  lines) and the openclaw cluster_healthcheck CronJob (local CLI is
  now the single authoritative runner). Keep the healthcheck SA +
  Role + RoleBinding — still reused by task_processor CronJob.
- Remove SLACK_WEBHOOK_URL env from openclaw deployment and delete
  the unused setup-monitoring.sh.
- Rewrite .claude/skills/cluster-health/SKILL.md: mandates running
  the script first, refreshes the 42-check table, drops stale
  CronJob/Slack/post-mortem sections, documents the monorepo-canonical
  + hardlink layout. File is hardlinked to
  /home/wizard/code/.claude/skills/cluster-health/SKILL.md for
  dual discovery.
- AGENTS.md + k8s-portal agent page: 25-check → 42-check.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-19 15:13:03 +00:00
+								# --- Shared ServiceAccount: grants pod-exec into the openclaw pod ---
 								# Used by the task_processor CronJob (below). Previously also used by the
 								# cluster_healthcheck CronJob, which has been decommissioned — the local
 								# `scripts/cluster_healthcheck.sh` is now the single authoritative runner.
-												[ci skip] Flatten module wrappers into stack roots

Remove the module "xxx" { source = "./module" } indirection layer
from all 66 service stacks. Resources are now defined directly in
each stack's main.tf instead of through a wrapper module.

- Merge module/main.tf contents into stack main.tf
- Apply variable replacements (var.tier -> local.tiers.X, renamed vars)
- Fix shared module paths (one fewer ../ at each level)
- Move extra files/dirs (factory/, chart_values, subdirs) to stack root
- Update state files to strip module.<name>. prefix
- Update CLAUDE.md to reflect flat structure

Verified: terragrunt plan shows 0 add, 0 destroy across all stacks.

											
										
										
											2026-02-22 15:13:55 +00:00
 								resource "kubernetes_service_account" "healthcheck" {
 								  metadata {
 								    name      = "cluster-healthcheck"
 								    namespace = kubernetes_namespace.openclaw.metadata[0].name
 								  }
 								}
 								resource "kubernetes_role" "healthcheck_exec" {
 								  metadata {
 								    name      = "healthcheck-pod-exec"
 								    namespace = kubernetes_namespace.openclaw.metadata[0].name
 								  }
 								  rule {
 								    api_groups = [""]
 								    resources  = ["pods"]
 								    verbs      = ["get", "list"]
 								  }
 								  rule {
 								    api_groups = [""]
 								    resources  = ["pods/exec"]
 								    verbs      = ["create"]
 								  }
 								}
 								resource "kubernetes_role_binding" "healthcheck_exec" {
 								  metadata {
 								    name      = "healthcheck-pod-exec"
 								    namespace = kubernetes_namespace.openclaw.metadata[0].name
 								  }
 								  subject {
 								    kind      = "ServiceAccount"
 								    name      = kubernetes_service_account.healthcheck.metadata[0].name
 								    namespace = kubernetes_namespace.openclaw.metadata[0].name
 								  }
 								  role_ref {
 								    api_group = "rbac.authorization.k8s.io"
 								    kind      = "Role"
 								    name      = kubernetes_role.healthcheck_exec.metadata[0].name
 								  }
 								}
-												[ci skip] add Forgejo task pipeline for OpenClaw AI agent

Forgejo issues as a task queue for OpenClaw:
- Forgejo OAuth2 with Authentik SSO, self-registration disabled
- Webhook-triggered task processing (instant) + CronJob backup (5min poll)
- Tasks processed via Mistral Large 3 (NVIDIA NIM API)
- Results posted as issue comments, auto-labeled and closed
- Comment follow-ups and reopened issues supported
- n8n RBAC for OpenClaw pod exec (future workflow integration)

											
										
										
											2026-03-07 21:09:31 +00:00
+								# --- CronJob: Task processor — polls Forgejo issues and triggers OpenClaw ---
 								resource "kubernetes_cron_job_v1" "task_processor" {
 								  metadata {
 								    name      = "task-processor"
 								    namespace = kubernetes_namespace.openclaw.metadata[0].name
 								    labels = {
 								      app  = "task-processor"
 								      tier = local.tiers.aux
 								    }
 								  }
 								  spec {
 								    schedule                      = "*/5 * * * *"
 								    concurrency_policy            = "Forbid"
 								    failed_jobs_history_limit     = 3
 								    successful_jobs_history_limit = 3
 								    job_template {
 								      metadata {
 								        labels = {
 								          app = "task-processor"
 								        }
 								      }
 								      spec {
-												[cluster-health] Expand to 42 checks, remove pod CronJob path

- scripts/cluster_healthcheck.sh: add 12 new checks (cert-manager
  readiness/expiry/requests, backup freshness per-DB/offsite/LVM,
  monitoring prom+AM/vault-sealed/CSS, external reachability cloudflared
  +authentik/ExternalAccessDivergence/traefik-5xx). Bump TOTAL_CHECKS
  to 42, add --no-fix flag.
- Remove the duplicate pod-version .claude/cluster-health.sh (1728
  lines) and the openclaw cluster_healthcheck CronJob (local CLI is
  now the single authoritative runner). Keep the healthcheck SA +
  Role + RoleBinding — still reused by task_processor CronJob.
- Remove SLACK_WEBHOOK_URL env from openclaw deployment and delete
  the unused setup-monitoring.sh.
- Rewrite .claude/skills/cluster-health/SKILL.md: mandates running
  the script first, refreshes the 42-check table, drops stale
  CronJob/Slack/post-mortem sections, documents the monorepo-canonical
  + hardlink layout. File is hardlinked to
  /home/wizard/code/.claude/skills/cluster-health/SKILL.md for
  dual discovery.
- AGENTS.md + k8s-portal agent page: 25-check → 42-check.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-19 15:13:03 +00:00
+								        active_deadline_seconds    = 600
 								        backoff_limit              = 0
 								        ttl_seconds_after_finished = 86400
-												[ci skip] add Forgejo task pipeline for OpenClaw AI agent

Forgejo issues as a task queue for OpenClaw:
- Forgejo OAuth2 with Authentik SSO, self-registration disabled
- Webhook-triggered task processing (instant) + CronJob backup (5min poll)
- Tasks processed via Mistral Large 3 (NVIDIA NIM API)
- Results posted as issue comments, auto-labeled and closed
- Comment follow-ups and reopened issues supported
- n8n RBAC for OpenClaw pod exec (future workflow integration)

											
										
										
											2026-03-07 21:09:31 +00:00
+								        template {
 								          metadata {
 								            labels = {
 								              app = "task-processor"
 								            }
 								          }
 								          spec {
 								            service_account_name = kubernetes_service_account.healthcheck.metadata[0].name
 								            restart_policy       = "Never"
 								            container {
 								              name  = "task-processor"
 								              image = "bitnami/kubectl:latest"
 								              command = ["bash", "-c", <<-EOF
 								                # Find the openclaw pod
 								                POD=$(kubectl get pods -n openclaw -l app=openclaw -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)
 								                if [ -z "$POD" ]; then
 								                  echo "ERROR: OpenClaw pod not found"
 								                  exit 1
 								                fi
 								                echo "Executing task processor in pod $POD..."
 								                kubectl exec -n openclaw "$POD" -c openclaw -- \
 								                  env FORGEJO_TOKEN="$FORGEJO_TOKEN" \
-												fix: openclaw task-processor use internal Forgejo URL

The task-processor CronJob was failing every 5min because
it used https://forgejo.viktorbarzin.me (external, via Cloudflare
tunnel) which is unreachable from within the cluster. Changed to
http://forgejo.forgejo.svc.cluster.local (internal ClusterIP).

											
										
										
											2026-03-24 19:40:15 +02:00
+								                      FORGEJO_URL="http://forgejo.forgejo.svc.cluster.local" \
-												[ci skip] add Forgejo task pipeline for OpenClaw AI agent

Forgejo issues as a task queue for OpenClaw:
- Forgejo OAuth2 with Authentik SSO, self-registration disabled
- Webhook-triggered task processing (instant) + CronJob backup (5min poll)
- Tasks processed via Mistral Large 3 (NVIDIA NIM API)
- Results posted as issue comments, auto-labeled and closed
- Comment follow-ups and reopened issues supported
- n8n RBAC for OpenClaw pod exec (future workflow integration)

											
										
										
											2026-03-07 21:09:31 +00:00
+								                      OPENCLAW_TOKEN="$OPENCLAW_TOKEN" \
 								                      OPENCLAW_URL="https://integrate.api.nvidia.com" \
 								                  bash /workspace/infra/scripts/task-processor.sh
 								              EOF
 								              ]
 								              env {
-												migrate consuming stacks to ESO + remove k8s-dashboard static token

Phase 9: ExternalSecret migration across 26 stacks:

Fully migrated (vault data source removed, ESO delivers secrets):
- speedtest, shadowsocks, wealthfolio, plotting-book, f1-stream, tandoor
- n8n, dawarich, diun, netbox, onlyoffice, tuya-bridge
- hackmd (ESO template for DB URL), health (ESO template for DB URL)
- trading-bot (ESO template for DATABASE_URL + 7 secret env vars)
- forgejo (removed unused vault data source)

Partially migrated (vault kept for plan-time, ESO added for runtime):
- immich, linkwarden, nextcloud, paperless-ngx (jsondecode for homepage)
- claude-memory, rybbit, url, webhook_handler (plan-time in locals/jobs)
- woodpecker, openclaw, resume (plan-time in helm values/jobs/modules)

17 stacks unchanged (all plan-time: homepage annotations, configmaps,
module inputs) — vault data source works with OIDC auth.

Phase 17a: Remove k8s-dashboard static admin token secret.
Users now get tokens via: vault write kubernetes/creds/dashboard-admin

											
										
										
											2026-03-15 19:05:04 +00:00
+								                name = "FORGEJO_TOKEN"
 								                value_from {
 								                  secret_key_ref {
 								                    name = "openclaw-secrets"
 								                    key  = "forgejo_api_token"
 								                  }
 								                }
-												[ci skip] add Forgejo task pipeline for OpenClaw AI agent

Forgejo issues as a task queue for OpenClaw:
- Forgejo OAuth2 with Authentik SSO, self-registration disabled
- Webhook-triggered task processing (instant) + CronJob backup (5min poll)
- Tasks processed via Mistral Large 3 (NVIDIA NIM API)
- Results posted as issue comments, auto-labeled and closed
- Comment follow-ups and reopened issues supported
- n8n RBAC for OpenClaw pod exec (future workflow integration)

											
										
										
											2026-03-07 21:09:31 +00:00
+								              }
 								              env {
-												migrate consuming stacks to ESO + remove k8s-dashboard static token

Phase 9: ExternalSecret migration across 26 stacks:

Fully migrated (vault data source removed, ESO delivers secrets):
- speedtest, shadowsocks, wealthfolio, plotting-book, f1-stream, tandoor
- n8n, dawarich, diun, netbox, onlyoffice, tuya-bridge
- hackmd (ESO template for DB URL), health (ESO template for DB URL)
- trading-bot (ESO template for DATABASE_URL + 7 secret env vars)
- forgejo (removed unused vault data source)

Partially migrated (vault kept for plan-time, ESO added for runtime):
- immich, linkwarden, nextcloud, paperless-ngx (jsondecode for homepage)
- claude-memory, rybbit, url, webhook_handler (plan-time in locals/jobs)
- woodpecker, openclaw, resume (plan-time in helm values/jobs/modules)

17 stacks unchanged (all plan-time: homepage annotations, configmaps,
module inputs) — vault data source works with OIDC auth.

Phase 17a: Remove k8s-dashboard static admin token secret.
Users now get tokens via: vault write kubernetes/creds/dashboard-admin

											
										
										
											2026-03-15 19:05:04 +00:00
+								                name = "OPENCLAW_TOKEN"
 								                value_from {
 								                  secret_key_ref {
 								                    name = "openclaw-secrets"
 								                    key  = "nvidia_api_key"
 								                  }
 								                }
-												[ci skip] add Forgejo task pipeline for OpenClaw AI agent

Forgejo issues as a task queue for OpenClaw:
- Forgejo OAuth2 with Authentik SSO, self-registration disabled
- Webhook-triggered task processing (instant) + CronJob backup (5min poll)
- Tasks processed via Mistral Large 3 (NVIDIA NIM API)
- Results posted as issue comments, auto-labeled and closed
- Comment follow-ups and reopened issues supported
- n8n RBAC for OpenClaw pod exec (future workflow integration)

											
										
										
											2026-03-07 21:09:31 +00:00
+								              }
 								              resources {
 								                requests = {
 								                  cpu    = "50m"
 								                  memory = "64Mi"
 								                }
 								                limits = {
-												right-size memory: set requests=limits based on actual usage

- Set memory requests = limits across 56 stacks to prevent overcommit
- Right-sized limits based on actual pod usage (2x actual, rounded up)
- Scaled down trading-bot (replicas=0) to free memory
- Fixed OOMKilled services: forgejo, dawarich, health, meshcentral,
  paperless-ngx, vault auto-unseal, rybbit, whisper, openclaw, clickhouse
- Added startup+liveness probes to calibre-web
- Bumped inotify limits on nodes 2,3 (max_user_instances 128->8192)

Post node2 OOM incident (2026-03-14). Previous kubelet config had no
kubeReserved/systemReserved set, allowing pods to starve the kernel.

											
										
										
											2026-03-14 21:01:24 +00:00
+								                  memory = "64Mi"
-												[ci skip] add Forgejo task pipeline for OpenClaw AI agent

Forgejo issues as a task queue for OpenClaw:
- Forgejo OAuth2 with Authentik SSO, self-registration disabled
- Webhook-triggered task processing (instant) + CronJob backup (5min poll)
- Tasks processed via Mistral Large 3 (NVIDIA NIM API)
- Results posted as issue comments, auto-labeled and closed
- Comment follow-ups and reopened issues supported
- n8n RBAC for OpenClaw pod exec (future workflow integration)

											
										
										
											2026-03-07 21:09:31 +00:00
+								                }
 								              }
 								            }
 								          }
 								        }
 								      }
 								    }
 								  }
-												[infra] Sweep dns_config ignore_changes across all pod-owning resources [ci skip]

## Context

Wave 3A (commit c9d221d5) added the `# KYVERNO_LIFECYCLE_V1` marker to the
27 pre-existing `ignore_changes = [...dns_config]` sites so they could be
grepped and audited. It did NOT address pod-owning resources that were
simply missing the suppression entirely. Post-Wave-3A sampling (2026-04-18)
found that navidrome, f1-stream, frigate, servarr, monitoring, crowdsec,
and many other stacks showed perpetual `dns_config` drift every plan
because their `kubernetes_deployment` / `kubernetes_stateful_set` /
`kubernetes_cron_job_v1` resources had no `lifecycle {}` block at all.

Root cause (same as Wave 3A): Kyverno's admission webhook stamps
`dns_config { option { name = "ndots"; value = "2" } }` on every pod's
`spec.template.spec.dns_config` to prevent NxDomain search-domain flooding
(see `k8s-ndots-search-domain-nxdomain-flood` skill). Without `ignore_changes`
on every Terraform-managed pod-owner, Terraform repeatedly tries to strip
the injected field.

## This change

Extends the Wave 3A convention by sweeping EVERY `kubernetes_deployment`,
`kubernetes_stateful_set`, `kubernetes_daemon_set`, `kubernetes_cron_job_v1`,
`kubernetes_job_v1` (+ their `_v1` variants) in the repo and ensuring each
carries the right `ignore_changes` path:

- **kubernetes_deployment / stateful_set / daemon_set / job_v1**:
  `spec[0].template[0].spec[0].dns_config`
- **kubernetes_cron_job_v1**:
  `spec[0].job_template[0].spec[0].template[0].spec[0].dns_config`
  (extra `job_template[0]` nesting — the CronJob's PodTemplateSpec is
  one level deeper)

Each injection / extension is tagged `# KYVERNO_LIFECYCLE_V1: Kyverno
admission webhook mutates dns_config with ndots=2` inline so the
suppression is discoverable via `rg 'KYVERNO_LIFECYCLE_V1' stacks/`.

Two insertion paths are handled by a Python pass (`/tmp/add_dns_config_ignore.py`):

1. **No existing `lifecycle {}`**: inject a brand-new block just before the
   resource's closing `}`. 108 new blocks on 93 files.
2. **Existing `lifecycle {}` (usually for `DRIFT_WORKAROUND: CI owns image tag`
   from Wave 4, commit a62b43d1)**: extend its `ignore_changes` list with the
   dns_config path. Handles both inline (`= [x]`) and multiline
   (`= [\n  x,\n]`) forms; ensures the last pre-existing list item carries
   a trailing comma so the extended list is valid HCL. 34 extensions.

The script skips anything already mentioning `dns_config` inside an
`ignore_changes`, so re-running is a no-op.

## Scale

- 142 total lifecycle injections/extensions
- 93 `.tf` files touched
- 108 brand-new `lifecycle {}` blocks + 34 extensions of existing ones
- Every Tier 0 and Tier 1 stack with a pod-owning resource is covered
- Together with Wave 3A's 27 pre-existing markers → **169 greppable
  `KYVERNO_LIFECYCLE_V1` dns_config sites across the repo**

## What is NOT in this change

- `stacks/trading-bot/main.tf` — entirely commented-out block (`/* … */`).
  Python script touched the file, reverted manually.
- `_template/main.tf.example` skeleton — kept minimal on purpose; any
  future stack created from it should either inherit the Wave 3A one-line
  form or add its own on first `kubernetes_deployment`.
- `terraform fmt` fixes to pre-existing alignment issues in meshcentral,
  nvidia/modules/nvidia, vault — unrelated to this commit. Left for a
  separate fmt-only pass.
- Non-pod resources (`kubernetes_service`, `kubernetes_secret`,
  `kubernetes_manifest`, etc.) — they don't own pods so they don't get
  Kyverno dns_config mutation.

## Verification

Random sample post-commit:
```
$ cd stacks/navidrome && ../../scripts/tg plan  → No changes.
$ cd stacks/f1-stream && ../../scripts/tg plan  → No changes.
$ cd stacks/frigate && ../../scripts/tg plan    → No changes.

$ rg -c 'KYVERNO_LIFECYCLE_V1' stacks/ --include='*.tf' --include='*.tf.example' \
    | awk -F: '{s+=$2} END {print s}'
169
```

## Reproduce locally
1. `git pull`
2. `rg 'KYVERNO_LIFECYCLE_V1' stacks/ | wc -l` → 169+
3. `cd stacks/navidrome && ../../scripts/tg plan` → expect 0 drift on
   the deployment's dns_config field.

Refs: code-seq (Wave 3B dns_config class closed; kubernetes_manifest
annotation class handled separately in 8d94688d for tls_secret)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-18 21:19:48 +00:00
+								  lifecycle {
 								    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
 								    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
 								  }
-												[ci skip] add Forgejo task pipeline for OpenClaw AI agent

Forgejo issues as a task queue for OpenClaw:
- Forgejo OAuth2 with Authentik SSO, self-registration disabled
- Webhook-triggered task processing (instant) + CronJob backup (5min poll)
- Tasks processed via Mistral Large 3 (NVIDIA NIM API)
- Results posted as issue comments, auto-labeled and closed
- Comment follow-ups and reopened issues supported
- n8n RBAC for OpenClaw pod exec (future workflow integration)

											
										
										
											2026-03-07 21:09:31 +00:00
+								}
-												state(dbaas): update encrypted state

											
										
										
											2026-03-19 20:23:59 +00:00
 								# --- OpenLobster: Multi-user Telegram AI assistant (trial) ---
-												feat(storage): migrate 38 NFS PVCs to proxmox-lvm (Wave 2)

Add proxmox-lvm PVCs with pvc-autoresizer annotations for all
remaining single-pod app data services. Deployments updated to
use new block storage PVCs. Old NFS modules retained for rollback.

Services: affine, changedetection, diun, excalidraw, f1-stream,
hackmd, isponsorblocktv, matrix, n8n, send, grampsweb, health,
onlyoffice, owntracks, paperless-ngx, privatebin, resume,
speedtest, stirling-pdf, tandoor, rybbit (clickhouse), tor-proxy
(torrserver), whisper+piper, frigate (config), ollama (ui),
servarr (prowlarr/listenarr/qbittorrent), aiostreams, freshrss
(extensions), meshcentral (data+files), openclaw (data+home+
openlobster), technitium, mailserver (data+roundcube html+enigma),
dbaas (pgadmin).

Strategy set to Recreate where needed for RWO volumes.

											
										
										
											2026-04-04 19:25:12 +03:00
+								resource "kubernetes_persistent_volume_claim" "openlobster_data_proxmox" {
 								  wait_until_bound = false
 								  metadata {
 								    name      = "openlobster-data-proxmox"
 								    namespace = kubernetes_namespace.openclaw.metadata[0].name
 								    annotations = {
 								      "resize.topolvm.io/threshold"     = "80%"
 								      "resize.topolvm.io/increase"      = "100%"
 								      "resize.topolvm.io/storage_limit" = "5Gi"
 								    }
 								  }
 								  spec {
 								    access_modes       = ["ReadWriteOnce"]
 								    storage_class_name = "proxmox-lvm"
 								    resources {
 								      requests = {
 								        storage = "1Gi"
 								      }
 								    }
 								  }
 								}
-												state(dbaas): update encrypted state

											
										
										
											2026-03-19 20:23:59 +00:00
+								resource "random_password" "openlobster_graphql_token" {
 								  length  = 32
 								  special = false
 								}
 								resource "kubernetes_deployment" "openlobster" {
 								  metadata {
 								    name      = "openlobster"
 								    namespace = kubernetes_namespace.openclaw.metadata[0].name
 								    labels = {
 								      app  = "openlobster"
 								      tier = local.tiers.aux
 								    }
 								  }
 								  spec {
 								    strategy {
 								      type = "Recreate"
 								    }
 								    replicas = 0
 								    selector {
 								      match_labels = {
 								        app = "openlobster"
 								      }
 								    }
 								    template {
 								      metadata {
 								        labels = {
 								          app = "openlobster"
 								        }
 								      }
 								      spec {
 								        # node4 has corrupted containerd content store — avoid it
 								        affinity {
 								          node_affinity {
 								            required_during_scheduling_ignored_during_execution {
 								              node_selector_term {
 								                match_expressions {
 								                  key      = "kubernetes.io/hostname"
 								                  operator = "NotIn"
 								                  values   = ["k8s-node4"]
 								                }
 								              }
 								            }
 								          }
 								        }
 								        container {
 								          name  = "openlobster"
 								          image = "ghcr.io/neirth/openlobster/openlobster:latest"
 								          port {
 								            container_port = 8080
 								          }
 								          env {
 								            name  = "OPENLOBSTER_GRAPHQL_AUTH_TOKEN"
 								            value = random_password.openlobster_graphql_token.result
 								          }
 								          env {
 								            name = "OPENLOBSTER_PROVIDERS_ANTHROPIC_API_KEY"
 								            value_from {
 								              secret_key_ref {
 								                name = "openclaw-secrets"
 								                key  = "anthropic_api_key"
 								              }
 								            }
 								          }
 								          env {
 								            name  = "OPENLOBSTER_PROVIDERS_ANTHROPIC_MODEL"
 								            value = "claude-sonnet-4-20250514"
 								          }
 								          env {
 								            name = "OPENLOBSTER_CHANNELS_TELEGRAM_TOKEN"
 								            value_from {
 								              secret_key_ref {
 								                name = "openclaw-secrets"
 								                key  = "telegram_bot_token"
 								              }
 								            }
 								          }
 								          env {
 								            name  = "OPENLOBSTER_DATABASE_DRIVER"
 								            value = "sqlite"
 								          }
 								          env {
 								            name  = "OPENLOBSTER_DATABASE_DSN"
 								            value = "/app/data/openlobster.db"
 								          }
 								          env {
 								            name  = "OPENLOBSTER_AGENT_NAME"
 								            value = "Lobster"
 								          }
 								          env {
 								            name  = "OPENLOBSTER_MEMORY_BACKEND"
 								            value = "file"
 								          }
 								          volume_mount {
 								            name       = "openlobster-data"
 								            mount_path = "/app/data"
 								          }
 								          resources {
 								            requests = {
 								              cpu    = "10m"
 								              memory = "64Mi"
 								            }
 								            limits = {
 								              memory = "256Mi"
 								            }
 								          }
 								        }
 								        volume {
 								          name = "openlobster-data"
 								          persistent_volume_claim {
-												feat(storage): migrate 38 NFS PVCs to proxmox-lvm (Wave 2)

Add proxmox-lvm PVCs with pvc-autoresizer annotations for all
remaining single-pod app data services. Deployments updated to
use new block storage PVCs. Old NFS modules retained for rollback.

Services: affine, changedetection, diun, excalidraw, f1-stream,
hackmd, isponsorblocktv, matrix, n8n, send, grampsweb, health,
onlyoffice, owntracks, paperless-ngx, privatebin, resume,
speedtest, stirling-pdf, tandoor, rybbit (clickhouse), tor-proxy
(torrserver), whisper+piper, frigate (config), ollama (ui),
servarr (prowlarr/listenarr/qbittorrent), aiostreams, freshrss
(extensions), meshcentral (data+files), openclaw (data+home+
openlobster), technitium, mailserver (data+roundcube html+enigma),
dbaas (pgadmin).

Strategy set to Recreate where needed for RWO volumes.

											
										
										
											2026-04-04 19:25:12 +03:00
+								            claim_name = kubernetes_persistent_volume_claim.openlobster_data_proxmox.metadata[0].name
-												state(dbaas): update encrypted state

											
										
										
											2026-03-19 20:23:59 +00:00
+								          }
 								        }
 								      }
 								    }
 								  }
 								  lifecycle {
-												[infra] Establish KYVERNO_LIFECYCLE_V1 drift-suppression convention [ci skip]

## Context

Phase 1 of the state-drift consolidation audit (plan Wave 3) identified that
the entire repo leans on a repeated `lifecycle { ignore_changes = [...dns_config] }`
snippet to suppress Kyverno's admission-webhook dns_config mutation (the ndots=2
override that prevents NxDomain search-domain flooding). 27 occurrences across
19 stacks. Without this suppression, every pod-owning resource shows perpetual
TF plan drift.

The original plan proposed a shared `modules/kubernetes/kyverno_lifecycle/`
module emitting the ignore-paths list as an output that stacks would consume in
their `ignore_changes` blocks. That approach is architecturally impossible:
Terraform's `ignore_changes` meta-argument accepts only static attribute paths
— it rejects module outputs, locals, variables, and any expression (the HCL
spec evaluates `lifecycle` before the regular expression graph). So a DRY
module cannot exist. The canonical pattern IS the repeated snippet.

What the snippet was missing was a *discoverability tag* so that (a) new
resources can be validated for compliance, (b) the existing 27 sites can be
grep'd in a single command, and (c) future maintainers understand the
convention rather than each reinventing it.

## This change

- Introduces `# KYVERNO_LIFECYCLE_V1` as the canonical marker comment.
  Attached inline on every `spec[0].template[0].spec[0].dns_config` line
  (or `spec[0].job_template[0].spec[0]...` for CronJobs) across all 27
  existing suppression sites.
- Documents the convention with rationale and copy-paste snippets in
  `AGENTS.md` → new "Kyverno Drift Suppression" section.
- Expands the existing `.claude/CLAUDE.md` Kyverno ndots note to reference
  the marker and explain why the module approach is blocked.
- Updates `_template/main.tf.example` so every new stack starts compliant.

## What is NOT in this change

- The `kubernetes_manifest` Kyverno annotation drift (beads `code-seq`)
  — that is Phase B with a sibling `# KYVERNO_MANIFEST_V1` marker.
- Behavioral changes — every `ignore_changes` list is byte-identical
  save for the inline comment.
- The fallback module the original plan anticipated — skipped because
  Terraform rejects expressions in `ignore_changes`.
- `terraform fmt` cleanup on adjacent unrelated blocks in three files
  (claude-agent-service, freedify/factory, hermes-agent). Reverted to
  keep this commit scoped to the convention rollout.

## Before / after

Before (cannot distinguish accidental-forgotten from intentional-convention):
```hcl
lifecycle {
  ignore_changes = [spec[0].template[0].spec[0].dns_config]
}
```

After (greppable, self-documenting, discoverable by tooling):
```hcl
lifecycle {
  ignore_changes = [spec[0].template[0].spec[0].dns_config] # KYVERNO_LIFECYCLE_V1
}
```

## Test Plan

### Automated
```
$ rg -c 'KYVERNO_LIFECYCLE_V1' stacks/ --include='*.tf' --include='*.tf.example' \
    | awk -F: '{s+=$2} END {print s}'
27

$ git diff --stat | grep -E '\.(tf|tf\.example|md)$' | wc -l
21

# All code-file diffs are 1 insertion + 1 deletion per marker site,
# except beads-server (3), ebooks (4), immich (3), uptime-kuma (2).
$ git diff --stat stacks/ | tail -1
20 files changed, 45 insertions(+), 28 deletions(-)
```

### Manual Verification

No apply required — HCL comments only. Zero effect on any stack's plan output.
Future audits: `rg 'KYVERNO_LIFECYCLE_V1' stacks/ | wc -l` must grow as new
pod-owning resources are added.

## Reproduce locally
1. `cd infra && git pull`
2. `rg 'KYVERNO_LIFECYCLE_V1' stacks/` → expect 27 hits in 19 files
3. Grep any new `kubernetes_deployment` for the marker; absence = missing
   suppression.

Closes: code-28m

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-18 14:15:51 +00:00
+								    ignore_changes = [spec[0].template[0].spec[0].dns_config] # KYVERNO_LIFECYCLE_V1
-												state(dbaas): update encrypted state

											
										
										
											2026-03-19 20:23:59 +00:00
+								  }
 								}
 								resource "kubernetes_service" "openlobster" {
 								  metadata {
 								    name      = "openlobster"
 								    namespace = kubernetes_namespace.openclaw.metadata[0].name
 								    labels = {
 								      app = "openlobster"
 								    }
 								  }
 								  spec {
 								    selector = {
 								      app = "openlobster"
 								    }
 								    port {
 								      port        = 80
 								      target_port = 8080
 								    }
 								  }
 								}
 								module "openlobster_ingress" {
 								  source          = "../../modules/kubernetes/ingress_factory"
-												[infra] Auto-create Cloudflare DNS records from ingress_factory

## Context

Deploying new services required manually adding hostnames to
cloudflare_proxied_names/cloudflare_non_proxied_names in config.tfvars —
a separate file from the service stack. This was frequently forgotten,
leaving services unreachable externally.

## This change:

- Add `dns_type` parameter to `ingress_factory` and `reverse_proxy/factory`
  modules. Setting `dns_type = "proxied"` or `"non-proxied"` auto-creates
  the Cloudflare DNS record (CNAME to tunnel or A/AAAA to public IP).
- Simplify cloudflared tunnel from 100 per-hostname rules to wildcard
  `*.viktorbarzin.me → Traefik`. Traefik still handles host-based routing.
- Add global Cloudflare provider via terragrunt.hcl (separate
  cloudflare_provider.tf with Vault-sourced API key).
- Migrate 118 hostnames from centralized config.tfvars to per-service
  dns_type. 17 hostnames remain centrally managed (Helm ingresses,
  special cases).
- Update docs, AGENTS.md, CLAUDE.md, dns.md runbook.

```
BEFORE                          AFTER
config.tfvars (manual list)     stacks/<svc>/main.tf
        |                         module "ingress" {
        v                           dns_type = "proxied"
stacks/cloudflared/               }
  for_each = list                     |
  cloudflare_record               auto-creates
  tunnel per-hostname             cloudflare_record + annotation
```

## What is NOT in this change:

- Uptime Kuma monitor migration (still reads from config.tfvars)
- 17 remaining centrally-managed hostnames (Helm, special cases)
- Removal of allow_overwrite (keep until migration confirmed stable)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-16 13:45:04 +00:00
+								  dns_type        = "proxied"
-												state(dbaas): update encrypted state

											
										
										
											2026-03-19 20:23:59 +00:00
+								  namespace       = kubernetes_namespace.openclaw.metadata[0].name
 								  name            = "openlobster"
 								  tls_secret_name = var.tls_secret_name
 								  host            = "openlobster"
 								  port            = 80
 								  protected       = true
 								}