claude-agent: replace unused 10Gi PVC with 5Gi NFS-backed /persistent

The 10Gi proxmox-lvm-encrypted PVC `claude-agent-workspace-encrypted` was declared in TF but never wired into the deployment — the `workspace` volume_mount pointed at an emptyDir, so the PVC sat allocated and idle from 2026-04-15 to 2026-05-11. Restructured per the design intent: * `workspace` (emptyDir) — fast per-pod ephemeral scratch for git clones. Each agent job clones the infra repo fresh, so persistence doesn't buy anything and emptyDir avoids RWO contention if the deployment is ever scaled past 1 replica. * `persistent` (5Gi NFS-backed RWX) — mounted at /persistent for cases where the agent needs to write state that should survive pod restarts (caches, ad-hoc outputs). RWX so all replicas share it; the service's sequential-mutex lock prevents concurrent writes. Also fixed `fix-perms` init container: the Dockerfile's `WORKDIR /workspace/infra` causes kubelet to create that path inside the emptyDir as root:fsGroup with the setgid bit, which uid 1000 can't write to. Pre-create the path + chmod 0775 to make it writable. NFS export already exists on the PVE host (/srv/nfs/claude-agent-persistent, owned 1000:1000). Verified: pod runs 1/1; `/persistent` writable as agent uid 1000; git-init successfully clones infra into /workspace/infra.
2026-05-11 20:40:21 +00:00 · 2026-05-11 20:40:21 +00:00 · 701b0e3c57
commit 701b0e3c57
parent cd13b9d062
1 changed files with 42 additions and 31 deletions
--- a/stacks/claude-agent-service/main.tf
+++ b/stacks/claude-agent-service/main.tf
@ -191,34 +191,25 @@ resource "kubernetes_cluster_role_binding" "claude_agent" {
 }

 # --- Storage ---
-
-resource "kubernetes_persistent_volume_claim" "workspace" {
-  wait_until_bound = false
-  metadata {
-    name      = "claude-agent-workspace-encrypted"
-    namespace = kubernetes_namespace.claude_agent.metadata[0].name
-    annotations = {
-      "resize.topolvm.io/threshold"     = "10%"
-      "resize.topolvm.io/increase"      = "100%"
-      "resize.topolvm.io/storage_limit" = "20Gi"
-    }
-  }
-  spec {
-    access_modes       = ["ReadWriteOnce"]
-    storage_class_name = "proxmox-lvm-encrypted"
-    resources {
-      requests = {
-        storage = "10Gi"
-      }
-    }
-  }
-  lifecycle {
-    # The autoresizer expands requests.storage up to storage_limit and
-    # PVCs can't shrink. Without this, every TF apply tries to revert
-    # to the spec value, K8s rejects the shrink, and the PVC ends up
-    # in Terminating-but-in-use limbo.
-    ignore_changes = [spec[0].resources[0].requests]
-  }
+#
+# The `workspace` volume in the deployment is intentionally emptyDir — agent
+# jobs do fresh git clones each run, so a per-pod scratch dir on node disk
+# is faster and isolated. The 10Gi `claude-agent-workspace-encrypted` PVC
+# that previously sat next to this comment was created but never wired
+# into the deployment (sat idle from 2026-04-15 to 2026-05-11).
+#
+# For cases where the agent DOES need to persist state across pod restarts
+# (caches, ad-hoc outputs, anything that should survive a pod reschedule),
+# `module.persistent` below provides a 5Gi NFS-backed RWX volume mounted
+# at /persistent. RWX so all 3 replicas can read/write the same dir;
+# sequential job mutex in the service prevents concurrent writes.
+module "persistent" {
+  source     = "../../modules/kubernetes/nfs_volume"
+  name       = "claude-agent-persistent"
+  namespace  = kubernetes_namespace.claude_agent.metadata[0].name
+  nfs_server = "192.168.1.127"
+  nfs_path   = "/srv/nfs/claude-agent-persistent"
+  storage    = "5Gi"
 }

 # --- Deployment ---
@ -258,11 +249,15 @@ resource "kubernetes_deployment" "claude_agent" {
          fs_group     = 1000
        }

-        # Fix workspace ownership (PVC may have root-owned files from prior run)
+        # Fix workspace ownership. Kubelet creates the Dockerfile WORKDIR
+        # (/workspace/infra) inside the emptyDir as root:gid=fsGroup with
+        # the setgid bit — uid 1000 can't write into it without explicit
+        # chown + chmod. Pre-create so the path is guaranteed, then chown
+        # recursively and chmod the infra subdir for safety.
        init_container {
          name    = "fix-perms"
          image   = "busybox:1.37"
-          command = ["sh", "-c", "chown -R 1000:1000 /workspace"]
+          command = ["sh", "-c", "mkdir -p /workspace/infra /persistent && chown -R 1000:1000 /workspace /persistent && chmod 0775 /workspace/infra /persistent"]
          security_context {
            run_as_user = 0
          }
@ -270,6 +265,10 @@ resource "kubernetes_deployment" "claude_agent" {
            name       = "workspace"
            mount_path = "/workspace"
          }
+          volume_mount {
+            name       = "persistent"
+            mount_path = "/persistent"
+          }
          resources {
            requests = {
              memory = "32Mi"
@ -438,6 +437,10 @@ resource "kubernetes_deployment" "claude_agent" {
            name       = "workspace"
            mount_path = "/workspace"
          }
+          volume_mount {
+            name       = "persistent"
+            mount_path = "/persistent"
+          }
          volume_mount {
            name       = "sops-age-key"
            mount_path = "/home/agent/.config/sops/age"
@ -460,8 +463,16 @@ resource "kubernetes_deployment" "claude_agent" {

        volume {
          name = "workspace"
+          # Per-pod ephemeral scratch — agent does fresh git clones each
+          # job, so node-disk emptyDir is faster than a network-backed PVC
+          # and avoids RWO contention across the 3 replicas.
+          empty_dir {}
+        }
+
+        volume {
+          name = "persistent"
          persistent_volume_claim {
-            claim_name = kubernetes_persistent_volume_claim.workspace.metadata[0].name
+            claim_name = module.persistent.claim_name
          }
        }