From 701b0e3c57928408252532278cfddce49dc4e75c Mon Sep 17 00:00:00 2001
From: Viktor Barzin <vbarzin@gmail.com>
Date: Mon, 11 May 2026 20:40:21 +0000
Subject: [PATCH] claude-agent: replace unused 10Gi PVC with 5Gi NFS-backed
 /persistent
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The 10Gi proxmox-lvm-encrypted PVC `claude-agent-workspace-encrypted` was
declared in TF but never wired into the deployment — the `workspace`
volume_mount pointed at an emptyDir, so the PVC sat allocated and idle
from 2026-04-15 to 2026-05-11.

Restructured per the design intent:
  * `workspace` (emptyDir) — fast per-pod ephemeral scratch for git clones.
    Each agent job clones the infra repo fresh, so persistence doesn't
    buy anything and emptyDir avoids RWO contention if the deployment
    is ever scaled past 1 replica.
  * `persistent` (5Gi NFS-backed RWX) — mounted at /persistent for cases
    where the agent needs to write state that should survive pod
    restarts (caches, ad-hoc outputs). RWX so all replicas share it;
    the service's sequential-mutex lock prevents concurrent writes.

Also fixed `fix-perms` init container: the Dockerfile's `WORKDIR
/workspace/infra` causes kubelet to create that path inside the
emptyDir as root:fsGroup with the setgid bit, which uid 1000 can't
write to. Pre-create the path + chmod 0775 to make it writable.

NFS export already exists on the PVE host
(/srv/nfs/claude-agent-persistent, owned 1000:1000).

Verified: pod runs 1/1; `/persistent` writable as agent uid 1000;
git-init successfully clones infra into /workspace/infra.
---
 stacks/claude-agent-service/main.tf | 73 +++++++++++++++++------------
 1 file changed, 42 insertions(+), 31 deletions(-)

diff --git a/stacks/claude-agent-service/main.tf b/stacks/claude-agent-service/main.tf
index a8f8e5af..4d4ff3e9 100644
--- a/stacks/claude-agent-service/main.tf
+++ b/stacks/claude-agent-service/main.tf
@@ -191,34 +191,25 @@ resource "kubernetes_cluster_role_binding" "claude_agent" {
 }
 
 # --- Storage ---
-
-resource "kubernetes_persistent_volume_claim" "workspace" {
-  wait_until_bound = false
-  metadata {
-    name      = "claude-agent-workspace-encrypted"
-    namespace = kubernetes_namespace.claude_agent.metadata[0].name
-    annotations = {
-      "resize.topolvm.io/threshold"     = "10%"
-      "resize.topolvm.io/increase"      = "100%"
-      "resize.topolvm.io/storage_limit" = "20Gi"
-    }
-  }
-  spec {
-    access_modes       = ["ReadWriteOnce"]
-    storage_class_name = "proxmox-lvm-encrypted"
-    resources {
-      requests = {
-        storage = "10Gi"
-      }
-    }
-  }
-  lifecycle {
-    # The autoresizer expands requests.storage up to storage_limit and
-    # PVCs can't shrink. Without this, every TF apply tries to revert
-    # to the spec value, K8s rejects the shrink, and the PVC ends up
-    # in Terminating-but-in-use limbo.
-    ignore_changes = [spec[0].resources[0].requests]
-  }
+#
+# The `workspace` volume in the deployment is intentionally emptyDir — agent
+# jobs do fresh git clones each run, so a per-pod scratch dir on node disk
+# is faster and isolated. The 10Gi `claude-agent-workspace-encrypted` PVC
+# that previously sat next to this comment was created but never wired
+# into the deployment (sat idle from 2026-04-15 to 2026-05-11).
+#
+# For cases where the agent DOES need to persist state across pod restarts
+# (caches, ad-hoc outputs, anything that should survive a pod reschedule),
+# `module.persistent` below provides a 5Gi NFS-backed RWX volume mounted
+# at /persistent. RWX so all 3 replicas can read/write the same dir;
+# sequential job mutex in the service prevents concurrent writes.
+module "persistent" {
+  source     = "../../modules/kubernetes/nfs_volume"
+  name       = "claude-agent-persistent"
+  namespace  = kubernetes_namespace.claude_agent.metadata[0].name
+  nfs_server = "192.168.1.127"
+  nfs_path   = "/srv/nfs/claude-agent-persistent"
+  storage    = "5Gi"
 }
 
 # --- Deployment ---
@@ -258,11 +249,15 @@ resource "kubernetes_deployment" "claude_agent" {
           fs_group     = 1000
         }
 
-        # Fix workspace ownership (PVC may have root-owned files from prior run)
+        # Fix workspace ownership. Kubelet creates the Dockerfile WORKDIR
+        # (/workspace/infra) inside the emptyDir as root:gid=fsGroup with
+        # the setgid bit — uid 1000 can't write into it without explicit
+        # chown + chmod. Pre-create so the path is guaranteed, then chown
+        # recursively and chmod the infra subdir for safety.
         init_container {
           name    = "fix-perms"
           image   = "busybox:1.37"
-          command = ["sh", "-c", "chown -R 1000:1000 /workspace"]
+          command = ["sh", "-c", "mkdir -p /workspace/infra /persistent && chown -R 1000:1000 /workspace /persistent && chmod 0775 /workspace/infra /persistent"]
           security_context {
             run_as_user = 0
           }
@@ -270,6 +265,10 @@ resource "kubernetes_deployment" "claude_agent" {
             name       = "workspace"
             mount_path = "/workspace"
           }
+          volume_mount {
+            name       = "persistent"
+            mount_path = "/persistent"
+          }
           resources {
             requests = {
               memory = "32Mi"
@@ -438,6 +437,10 @@ resource "kubernetes_deployment" "claude_agent" {
             name       = "workspace"
             mount_path = "/workspace"
           }
+          volume_mount {
+            name       = "persistent"
+            mount_path = "/persistent"
+          }
           volume_mount {
             name       = "sops-age-key"
             mount_path = "/home/agent/.config/sops/age"
@@ -460,8 +463,16 @@ resource "kubernetes_deployment" "claude_agent" {
 
         volume {
           name = "workspace"
+          # Per-pod ephemeral scratch — agent does fresh git clones each
+          # job, so node-disk emptyDir is faster than a network-backed PVC
+          # and avoids RWO contention across the 3 replicas.
+          empty_dir {}
+        }
+
+        volume {
+          name = "persistent"
           persistent_volume_claim {
-            claim_name = kubernetes_persistent_volume_claim.workspace.metadata[0].name
+            claim_name = module.persistent.claim_name
           }
         }