From bebe8fbd740248946f4affb5469fbb4d07fb0d7b Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Mon, 29 Jun 2026 12:06:17 +0000 Subject: [PATCH] workflows: add read-only memory-overcommit + node-removal capacity analysis MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reusable Workflow script that audits whether the cluster is memory-overcommitted and whether a single k8s worker can be removed to return RAM to the PVE host without sacrificing N-1 failover. Read-only throughout: gathers PVE host memory (qm config / free / KSM via SSH), k8s per-node capacity + cluster 30d peak working set, and per-workload right-sizing, then models N-1 two ways (physical actual-usage and scheduling-by-request) and adversarially verifies the conclusion with 3 skeptics. Sizes requests (scheduling reservation) and limits (OOM ceiling) as SEPARATE knobs — an earlier ad-hoc pass conflated them by sizing requests to 30d peak, which manufactured a false N-1 shortfall. Invoke via Workflow {scriptPath}, or by name when cwd is the infra repo. Requested by Viktor: identify memory overcommit and whether deployment requests can be trimmed to free PVE host RAM by removing a node, without sacrificing service reliability. 
Co-Authored-By: Claude Opus 4.8
---
 ...memory-overcommit-node-removal.workflow.js | 223 ++++++++++++++++++
 1 file changed, 223 insertions(+)
 create mode 100644 .claude/workflows/memory-overcommit-node-removal.workflow.js

diff --git a/.claude/workflows/memory-overcommit-node-removal.workflow.js b/.claude/workflows/memory-overcommit-node-removal.workflow.js
new file mode 100644
index 00000000..bfbf84c9
--- /dev/null
+++ b/.claude/workflows/memory-overcommit-node-removal.workflow.js
@@ -0,0 +1,223 @@
+export const meta = {
+  name: 'memory-overcommit-node-removal',
+  description: 'Read-only: assess PVE host + k8s memory overcommit, right-size deployment REQUESTS (scheduling) and LIMITS (OOM) separately from 30d usage, then test whether one worker node can be removed while preserving N-1 by BOTH a physical-usage and a scheduling-request model. Emits a gated plan.',
+  phases: [
+    { title: 'Gather' },
+    { title: 'Model' },
+    { title: 'Verify' },
+  ],
+}
+
+// ---------- confirmed read-only access paths ----------
+const SSH = "ssh -o BatchMode=yes -o ConnectTimeout=8 root@192.168.1.127";
+const PROM = "https://prometheus-query.viktorbarzin.lan/api/v1/query";
+// Render MiB as Gi for log lines; "?" for anything that is not a finite number (null, undefined, NaN).
+const G = (mib) => (Number.isFinite(mib) ? (mib / 1024).toFixed(1) + "Gi" : "?");
+// Rounded MiB, or null when the value is missing/NaN, so the JSON output never carries a silent NaN.
+const roundOrNull = (x) => (Number.isFinite(x) ? Math.round(x) : null);
+// a/b to 3 decimals, or null when an operand is missing or the divisor is not positive (no Infinity/NaN ratios).
+const ratio = (a, b) => (Number.isFinite(a) && Number.isFinite(b) && b > 0 ? +(a / b).toFixed(3) : null);
+
+// ---------- schema helpers ----------
+const num = { type: "number" }, str = { type: "string" }, bool = { type: "boolean" };
+const arr = (items) => ({ type: "array", items });
+const obj = (props) => ({ type: "object", additionalProperties: false, required: Object.keys(props), properties: props });
+
+const HOST = obj({
+  host_total_mib: num, host_used_mib: num, host_free_mib: num, host_available_mib: num,
+  swap_total_mib: num, swap_used_mib: num, ksm_saved_mib: num,
+  vms: arr(obj({ vmid: num, name: str, configured_mib: num, balloon_mib: num, rss_mib: num, is_k8s_node: bool })),
+  sum_vm_configured_mib: num, sum_vm_rss_mib: num, notes: str,
+});
+
+const K8S = obj({
+  nodes: arr(obj({
+    name: str, role: str, is_gpu: bool, is_control_plane: bool, gpu_tainted: bool, schedulable: bool,
+    capacity_mib: num, allocatable_mib: num, requests_mib: num, ds_requests_mib: num, limits_mib: num, usage_now_mib: num, peak_30d_mib: num, pod_count: num,
+  })),
+  cluster_allocatable_mib: num, cluster_requests_mib: num, cluster_usage_now_mib: num, cluster_peak_30d_mib: num, notes: str,
+});
+
+// NOTE the v2 split: requests are sized for SCHEDULING (cover normal load, can shrink below current),
+// limits are sized for OOM SAFETY (cover peak). They are DIFFERENT knobs and must not be conflated.
+const USAGE = obj({
+  totals: obj({
+    sum_current_requests_mib: num, sum_recommended_requests_mib: num, net_request_reclaim_mib: num,
+    reschedulable_request_recommended_mib: num, ds_request_recommended_per_node_mib: num, gpu_request_recommended_mib: num,
+    largest_single_request_mib: num, count_request_shrink: num, count_limit_raise_oom: num,
+  }),
+  request_shrinks: arr(obj({ namespace: str, name: str, kind: str, replicas: num, current_request_mib: num, p95_30d_mib: num, recommended_request_mib: num, delta_mib: num, rationale: str })),
+  limit_raises_oom: arr(obj({ namespace: str, name: str, container: str, current_limit_mib: num, peak_max_30d_mib: num, recommended_limit_mib: num, risk: str })),
+  spiky_periodic: arr(obj({ namespace: str, name: str, note: str })),
+  method_notes: str,
+});
+
+const TOPO = obj({
+  nodes: arr(obj({ name: str, sticky_pods: arr(str), local_pv_count: num, volumeattachments: num, cnpg_primary: bool, gpu_workloads: bool, evac_difficulty: str, evac_notes: str })),
+  spofs: arr(obj({ namespace: str, name: str, replicas: num, has_pdb: bool, issue: str })),
+  antiaffinity_risks: arr(str),
+  csi_pinning_note: str,
+  priority_classes_note: str,
+  notes: str,
+});
+
+const VERDICT = obj({ refuted: bool, confidence: str, reasoning: str, corrections: arr(str) });
+
+// ---------- prompts ----------
+const HOST_PROMPT = `Read-only PVE host memory audit. SSH (key-based): ${SSH} '<cmd>' (host 'pve', the Proxmox r730 at 192.168.1.127). Read-only ONLY; NEVER a state-changing qm/pvesh/ha-manager command.
+- 'free -m' -> host_total/used/free/available_mib + swap_total/swap_used_mib.
+- KSM: cat /sys/kernel/mm/ksm/pages_sharing ; ksm_saved_mib = pages_sharing*4096/1048576.
+- 'qm list'; for each running VM 'qm config <vmid>' -> memory (configured_mib), balloon (balloon_mib; if balloon==memory or balloon==0 ballooning is effectively OFF -> host RSS pins near configured = the headroom RATCHET).
+- Per-VM host RSS: read /var/run/qemu-server/<vmid>.pid then 'ps -o rss= -p <pid>' (KiB->MiB).
+- is_k8s_node = VMs named k8s-*.
+Return per-VM rows + sum_vm_configured_mib + sum_vm_rss_mib over ALL RUNNING VMs. notes: overcommit ratio, swap pressure, ballooning state.`;
+
+const K8S_PROMPT = `Read-only Kubernetes node-capacity audit. kubectl read access confirmed. For every node (k8s-master + k8s-node1..6):
+- capacity_mib & allocatable_mib from 'kubectl get node -o json' (Ki->MiB).
+- is_control_plane (node-role.kubernetes.io/control-plane), is_gpu (k8s-node1; nvidia.com/gpu in capacity), gpu_tainted (a NoSchedule taint general pods would NOT tolerate), schedulable.
+- requests_mib, limits_mib, ds_requests_mib (DaemonSet-owned pods only), usage_now_mib, pod_count.
+  Prefer Prometheus (curl -sk -G '${PROM}' --data-urlencode 'query=<promql>'):
+    sum by (node)(kube_pod_container_resource_requests{resource="memory"}) [these metrics HAVE a node label]
+    usage_now: cAdvisor container_memory_working_set_bytes has NO node label - join: sum by (node)(container_memory_working_set_bytes{container!="",container!="POD"} * on(namespace,pod) group_left(node) kube_pod_info)
+- peak_30d_mib per node: max_over_time of that joined per-node sum over [30d:5m] (best effort; if the join is flaky leave 0 and rely on cluster figure).
+ALSO return cluster-wide:
+- cluster_allocatable_mib, cluster_requests_mib, cluster_usage_now_mib.
+- cluster_peak_30d_mib = max_over_time(sum(container_memory_working_set_bytes{container!="",container!="POD"})[30d:5m]) /1024/1024 (this is the PHYSICAL reliability bedrock - the highest the whole cluster ever simultaneously used in 30d).
+notes: host-vs-k8s overcommit contrast (requests vs allocatable vs actual usage).`;
+
+const USAGE_PROMPT = `Read-only memory RIGHT-SIZING from 30-day usage. CRITICAL: requests and limits are DIFFERENT knobs - size them separately. Do NOT set requests to peak (that is what a flawed earlier run did; it manufactured a false capacity shortfall).
+- REQUEST (scheduling reservation, drives bin-packing & node-removal feasibility): size to cover NORMAL operation = recommended_request_mib = ceil(max(p95_30d * 1.15, 64)). This SHRINKS the many over-provisioned requests toward real usage. requests should sit BELOW limits (Burstable). Be moderately conservative for stateful/db/critical infra (mysql, postgres/CNPG, redis, vault, prometheus, mailserver): use p99 instead of p95.
+- LIMIT (OOM ceiling): recommended_limit_mib = ceil(peak_max_30d * 1.25). FLAG any container whose peak_max_30d >= 95% of current limit as an OOM risk (limit_raises_oom) - these are real reliability bugs to fix REGARDLESS of node removal.
+
+Sources: kubectl (current requests/limits/replicas for Deployments/StatefulSets/DaemonSets, all namespaces); Prometheus (curl -sk -G '${PROM}' --data-urlencode 'query=<promql>'):
+  p95: quantile_over_time(0.95, container_memory_working_set_bytes{container!="",container!="POD"}[30d])
+  p99: quantile_over_time(0.99, ...[30d])
+  peak: max_over_time(...[30d])
+  Aggregate by (namespace,pod,container), map pod->workload (strip hash suffixes), take MAX across a workload's pods as per-replica value.
+
+Splits for the N-1 model (use the REQUEST recommendation; multiply per-replica by replicas):
+- reschedulable_request_recommended_mib = SUM recommended_request of Deployment+StatefulSet pods that are NON-GPU and schedulable on general workers (everything that must reschedule if a worker is removed).
+- ds_request_recommended_per_node_mib = SUM recommended_request of DaemonSet containers (one set per node).
+- gpu_request_recommended_mib = SUM recommended_request of workloads pinned to GPU node k8s-node1 (REAL value; do not inflate).
+- largest_single_request_mib = largest single recommended per-replica request among reschedulable.
+Return totals (sum_current_requests_mib, sum_recommended_requests_mib, net_request_reclaim_mib = sum of POSITIVE request deltas i.e. shrinks, the splits, count_request_shrink, count_limit_raise_oom), request_shrinks (top ~30 by delta), limit_raises_oom (every OOM-tight container), spiky_periodic (mailserver/immich-ml/backups/dumps/postiz). NEVER mutate.`;
+
+const TOPO_PROMPT = `Read-only reliability-topology audit: which worker is safest to remove? Candidates: k8s-node2..node6 (NOT master, NOT GPU node1). For each worker (k8s-node1..6): sticky_pods (StatefulSet members; pods with local/hostPath PVCs; single-replica critical), local_pv_count, volumeattachments, cnpg_primary (CNPG 'pg-cluster' PRIMARY here? check pod role labels), gpu_workloads, evac_difficulty (easy|medium|hard)+evac_notes.
+Cluster-wide: spofs (1 replica AND no PDB); antiaffinity_risks (hard podAntiAffinity / topologySpread DoNotSchedule that becomes UNSATISFIABLE at one fewer worker - check replica counts vs surviving distinct hosts); csi_pinning_note (do Proxmox-CSI PVs pin to a node, or share one host-level topology so they reattach anywhere? check volumeHandle / topology zone/region on the PVs - this decides whether removal STRANDS data); priority_classes_note. NEVER mutate.`;
+
+// ============================================================
+phase('Gather');
+log('Gather (read-only): PVE host memory, k8s capacity + cluster 30d peak, request/limit right-sizing, reliability topology');
+const [host, k8s, usage, topo] = await parallel([
+  () => agent(HOST_PROMPT, { label: 'gather:pve-host', phase: 'Gather', schema: HOST }),
+  () => agent(K8S_PROMPT, { label: 'gather:k8s-capacity', phase: 'Gather', schema: K8S }),
+  () => agent(USAGE_PROMPT, { label: 'gather:rightsize', phase: 'Gather', schema: USAGE }),
+  () => agent(TOPO_PROMPT, { label: 'gather:reliability', phase: 'Gather', schema: TOPO }),
+]);
+// k8s + usage feed every number in the model; without them (or their core fields) nothing below can be computed.
+if (!k8s || !usage || !usage.totals || !Array.isArray(k8s.nodes)) return { error: 'Critical gather agent failed (k8s/usage).', host, k8s, usage, topo };
+
+// ============================================================
+phase('Model');
+const T = usage.totals;
+const workers = k8s.nodes.filter((n) => !n.is_control_plane);
+// Survivors that can absorb rescheduled general pods: not GPU-tainted and not cordoned (schedulable === false).
+const generalPool = workers.filter((n) => !n.gpu_tainted && n.schedulable !== false);
+const candidates = workers.filter((n) => !n.is_gpu); // node2..node6
+// null = 30d peak unknown. A missing/zero peak must FAIL the physical test, never pass it by default.
+const clusterPeak = Number.isFinite(k8s.cluster_peak_30d_mib) && k8s.cluster_peak_30d_mib > 0 ? k8s.cluster_peak_30d_mib : null;
+
+// Memory left for general pods on a node once per-node DaemonSets (and, on the GPU node, GPU-pinned workloads) are reserved.
+const freeGeneral = (n) => n.allocatable_mib - (T.ds_request_recommended_per_node_mib || 0) - (n.is_gpu ? (T.gpu_request_recommended_mib || 0) : 0);
+// N-1 capacity: total of the pool minus its largest member (the worst single failure); 0 for an empty pool.
+const sumMinusLargest = (xs) => xs.reduce((a, b) => a + b, 0) - (xs.length ? Math.max(...xs) : 0);
+
+// Evaluate removing one worker: both N-1 models, the host RAM returned, and the evacuation metadata from topo.
+function evalRemove(removeName) {
+  const pool = generalPool.filter((n) => n.name !== removeName);
+  // --- scheduling N-1 (realistic requests): fit reschedulable load even if the largest survivor then fails ---
+  const schedCap = sumMinusLargest(pool.map(freeGeneral));
+  const schedNeed = T.reschedulable_request_recommended_mib;
+  const schedMargin = schedCap - schedNeed;
+  const schedPass = schedMargin >= 0;
+  // --- physical N-1 (actual peak usage): cluster 30d peak must fit on survivors after losing the largest too ---
+  const physCap = sumMinusLargest(pool.map((n) => n.allocatable_mib));
+  const physMargin = clusterPeak == null ? null : physCap - clusterPeak;
+  const physPass = physMargin != null && physMargin >= 0;
+  const t = topo && topo.nodes ? topo.nodes.find((n) => n.name === removeName) : null;
+  return {
+    removeName, pool: pool.map((n) => n.name),
+    sched_capacityN1_mib: roundOrNull(schedCap), sched_need_mib: roundOrNull(schedNeed), sched_margin_mib: roundOrNull(schedMargin), sched_pass: schedPass,
+    phys_capacityN1_mib: roundOrNull(physCap), cluster_peak_mib: roundOrNull(clusterPeak), phys_margin_mib: roundOrNull(physMargin), phys_pass: physPass,
+    pass: schedPass && physPass,
+    host_freed_mib: hostFreedFor(removeName),
+    evac_difficulty: t ? t.evac_difficulty : 'unknown', cnpg_primary: t ? t.cnpg_primary : false, sticky_pods: t ? t.sticky_pods : [],
+  };
+}
+// Host RAM returned by deleting the node's VM: the VM's configured memory, else the node's k8s capacity, else 0.
+function hostFreedFor(nodeName) {
+  if (host && host.vms) {
+    const s = nodeName.replace('k8s-', '');
+    // Exact name wins; the substring match is only a fallback so e.g. "node1" in "k8s-node10" cannot shadow a later exact hit.
+    const vm = host.vms.find((v) => v.name === nodeName) || host.vms.find((v) => v.name && v.name.includes(s));
+    if (vm) return vm.configured_mib;
+  }
+  const n = k8s.nodes.find((x) => x.name === nodeName);
+  return n ? n.capacity_mib : 0;
+}
+
+const evalCandidates = candidates.map((c) => evalRemove(c.name));
+const diffRank = { easy: 0, medium: 1, hard: 2, unknown: 3 };
+// Unrecognised difficulty strings rank as 'unknown' instead of feeding NaN into the comparator.
+const rankOf = (c) => (Object.prototype.hasOwnProperty.call(diffRank, c.evac_difficulty) ? diffRank[c.evac_difficulty] : diffRank.unknown);
+const passing = evalCandidates.filter((c) => c.pass && !c.cnpg_primary)
+  .sort((a, b) => (rankOf(a) - rankOf(b)) || (b.phys_margin_mib - a.phys_margin_mib));
+const best = passing[0] || null;
+
+const hostOvercommit = host ? { sum_vm_configured_mib: host.sum_vm_configured_mib, host_total_mib: host.host_total_mib, ratio: ratio(host.sum_vm_configured_mib, host.host_total_mib), free_mib: host.host_free_mib, available_mib: host.host_available_mib, swap_used_mib: host.swap_used_mib, swap_total_mib: host.swap_total_mib, ksm_saved_mib: host.ksm_saved_mib } : null;
+const k8sOvercommit = { cluster_requests_mib: k8s.cluster_requests_mib, cluster_allocatable_mib: k8s.cluster_allocatable_mib, cluster_usage_now_mib: k8s.cluster_usage_now_mib, cluster_peak_30d_mib: clusterPeak, request_ratio: ratio(k8s.cluster_requests_mib, k8s.cluster_allocatable_mib), usage_ratio: ratio(clusterPeak, k8s.cluster_allocatable_mib) };
+
+if (clusterPeak == null) log('WARNING: cluster 30d peak is missing or zero - physical N-1 cannot be verified, so every candidate FAILS the physical test.');
+log(`Host overcommit ${hostOvercommit && hostOvercommit.ratio != null ? hostOvercommit.ratio : '?'}x (${G(hostOvercommit && hostOvercommit.free_mib)} free, swap ${G(hostOvercommit && hostOvercommit.swap_used_mib)}/${G(hostOvercommit && hostOvercommit.swap_total_mib)})`);
+log(`K8s: requests ${G(k8s.cluster_requests_mib)} / 30d-peak-usage ${G(clusterPeak)} / allocatable ${G(k8s.cluster_allocatable_mib)} -> requests are ${clusterPeak == null ? '?' : (k8s.cluster_requests_mib / clusterPeak).toFixed(2)}x real peak`);
+log(`Request right-sizing: ${G(T.net_request_reclaim_mib)} of over-provisioned requests can be trimmed (${T.count_request_shrink} workloads); ${T.count_limit_raise_oom} workloads are OOM-tight on LIMITS (raise regardless).`);
+for (const c of evalCandidates) log(`  remove ${c.removeName}: phys-N1 ${c.phys_pass ? 'PASS' : 'FAIL'} (${G(c.phys_margin_mib)}) | sched-N1 ${c.sched_pass ? 'PASS' : 'FAIL'} (${G(c.sched_margin_mib)}) | frees ~${G(c.host_freed_mib)} host | evac ${c.evac_difficulty}${c.cnpg_primary ? ' CNPG-PRIMARY' : ''}`);
+log(best ? `Best candidate: ${best.removeName} (phys margin ${G(best.phys_margin_mib)}, frees ~${G(best.host_freed_mib)})` : 'No candidate passes both N-1 tests.');
+
+// ============================================================
+phase('Verify');
+const headline = best
+  ? `${best.removeName} can be removed while preserving N-1: cluster 30d peak usage ${G(clusterPeak)} fits on survivors-minus-one (${G(best.phys_capacityN1_mib)}); after trimming over-provisioned requests, scheduling also fits (${G(best.sched_margin_mib)} margin). Frees ~${G(best.host_freed_mib)} to the PVE host.`
+  : `No worker can be removed while preserving N-1 by BOTH physical-usage and scheduling-request models.`;
+const verifyData = JSON.stringify({ hostOvercommit, k8sOvercommit, k8s_nodes: k8s.nodes, usage_totals: T, evalCandidates, best, csi_pinning_note: topo ? topo.csi_pinning_note : null, generalPool: generalPool.map((n) => n.name) }, null, 2);
+const lenses = [
+  { key: 'math', ask: 'Recompute BOTH N-1 models independently. Physical: cluster 30d peak vs (sum survivor allocatable - largest survivor). Scheduling: reschedulable recommended REQUESTS (not limits, not peak) vs (sum survivor freeGeneral - largest). Verify GPU node reserve uses REAL gpu requests, allocatable not capacity, DaemonSets are per-node fixed load. Are pool selection and numbers right?' },
+  { key: 'temporal', ask: 'Challenge the 30-DAY peak window and the request shrinks. Could a monthly/quarterly peak exceed cluster_peak_30d (compare a 90d peak)? Are the shrunk REQUESTS safe given each workload keeps a limit above its peak (Burstable)? Name any shrink or any still-tight limit that is reckless.' },
+  { key: 'stateful', ask: 'Check the chosen candidate for STRANDED state and drain blockers: CSI PV pinning (do volumes reattach anywhere?), CNPG primary, VolumeAttachment caps, anti-affinity/topologySpread unsatisfiable at one fewer worker, PDBs that block drain (disruptionsAllowed=0). Is removal actually safe, and what drain ORDERING is required?' },
+];
+const verdicts = (await parallel(lenses.map((l) => () =>
+  agent(`Adversarial reviewer. Try to REFUTE:\n"${headline}"\n\nLens: ${l.ask}\n\nData (read-only). Verify LIVE: kubectl, Prometheus (curl -sk -G '${PROM}' --data-urlencode 'query=...'), ${SSH} '<cmd>'.\n\n${verifyData}\n\nDefault refuted=true if evidence does not clearly hold. Give concrete corrections.`,
+    { label: `verify:${l.key}`, phase: 'Verify', schema: VERDICT }))
+)).filter(Boolean);
+
+return {
+  headline,
+  hostOvercommit, k8sOvercommit,
+  rightsizing: T,
+  request_shrinks: usage.request_shrinks,
+  limit_raises_oom: usage.limit_raises_oom,
+  spiky_periodic: usage.spiky_periodic,
+  candidates: evalCandidates,
+  recommendation: best,
+  k8s_nodes: k8s.nodes,
+  host_vms: host ? host.vms : null,
+  topo_spofs: topo ? topo.spofs : [],
+  topo_nodes: topo ? topo.nodes : [],
+  csi_pinning_note: topo ? topo.csi_pinning_note : null,
+  antiaffinity_risks: topo ? topo.antiaffinity_risks : [],
+  verdicts,
+  // With zero verdicts "0/0 refuted" would read like a clean pass, so state explicitly that nothing was verified.
+  verdict_summary: verdicts.length
+    ? `${verdicts.filter((v) => v.refuted).length}/${verdicts.length} reviewers refuted the headline`
+    : 'no reviewer verdicts returned - headline is UNVERIFIED',
+};