From 7e558de8f06a8fdb2da60fd36957a61f479a1cff Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Fri, 22 May 2026 10:20:00 +0000 Subject: [PATCH] openclaw: SSH + tmux task fallback to devvm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Give the OpenClaw pod two new capabilities: 1. Host-tools bundle. New init container `install-host-tools` extracts openssh-client + dnsutils + tmux + jq + ripgrep + fd + vault + yq + friends into /tools/host-tools/, with the bookworm-slim libs the binaries need. PATH + LD_LIBRARY_PATH on the main container point ld.so at the bundle. Idempotent via /tools/host-tools/.installed-v1 marker; smoke test (ldd-based) fails the init at deploy time if any binary has unresolved deps. Bundle is ~558 MB on the existing /srv/nfs/openclaw/tools NFS. 2. devvm SSH + async task pattern. New init `setup-ssh-config` writes id_rsa/config/known_hosts under /home/node/.openclaw/.ssh; main container startup symlinks /home/node/.ssh → there. New /usr/local/bin/openclaw-task wrapper on devvm manages long-running work as tmux sessions on devvm (sessions and logs survive pod restarts — they live on devvm, not in the pod). New init container `seed-devvm-memory-note` drops a markdown note teaching the pattern; main container startup now runs `openclaw memory index --force` so the note is searchable on first boot. Design + verified E2E flow in docs/plans/2026-05-22-openclaw-devvm-access-design.md. Persistence test green: spawned a 50s task from pod A, deleted pod A, new pod B saw the task finish and read its full log. Pre-existing keel.sh annotation drift on openclaw/{openlobster, task_webhook} cleaned up in the same apply. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.7 --- ...2026-05-22-openclaw-devvm-access-design.md | 269 ++++++++++++++++++ stacks/openclaw/files/openclaw-task.sh | 184 ++++++++++++ stacks/openclaw/main.tf | 243 +++++++++++++++- 3 files changed, 694 insertions(+), 2 deletions(-) create mode 100644 docs/plans/2026-05-22-openclaw-devvm-access-design.md create mode 100644 stacks/openclaw/files/openclaw-task.sh diff --git a/docs/plans/2026-05-22-openclaw-devvm-access-design.md b/docs/plans/2026-05-22-openclaw-devvm-access-design.md new file mode 100644 index 00000000..2f5d17c2 --- /dev/null +++ b/docs/plans/2026-05-22-openclaw-devvm-access-design.md @@ -0,0 +1,269 @@ +# OpenClaw devvm access + async task pattern — design + +**Date:** 2026-05-22 +**Stack:** `infra/stacks/openclaw` +**Status:** Approved (in-session, see chat history 2026-05-22) + +## Goal + +Give the OpenClaw pod (running in K8s) two new capabilities: + +1. **Host-tools bundle** — common Linux CLIs the upstream OpenClaw image + doesn't ship (`ssh`, `scp`, `vault`, `dig`, `jq`, `yq`, `ripgrep`, `fd`, + `gnupg`, `tmux`, etc.). OpenClaw can't `apt install` because the + container runs as non-root `node` (uid 1000). +2. **devvm async task pattern** — OpenClaw spawns long-running work as + `tmux` sessions on devvm, sends prompts via `tmux send-keys`, captures + progress via `tmux capture-pane`. Sessions live on devvm, so they + survive OpenClaw pod restarts. + +OpenClaw uses this combination as a **trusted fallback** for tasks too +expensive, sensitive, or stateful for in-pod execution: Vault lookups, +multi-step `claude-code` work, anything needing wizard's full home-lab +access. + +## Why now + +- The in-pod sandbox is `security=full` but the container is minimal — + no `ssh`, no `vault`, no `dig`, no `tmux`. +- The user wants OpenClaw to be a first-line agent that delegates heavy + work to the dev VM rather than duplicate that work in a constrained pod. 
+- Long-running work (multi-minute `claude-code` sessions) shouldn't be + tied to a single synchronous `claude -p` invocation — needs persistence + and pollability. + +## Architecture decision: stay on K8s + +Discussed migrating OpenClaw to run directly on devvm (would obviate the +host-tools bundle + most of the SSH setup). Decision: **stay on K8s**. + +Reasons: +- Keeps HA (5-node cluster vs single devvm reboot) +- Keeps ingress/Authentik/Telegram entry chain intact +- Keeps Prometheus scrape + exporter sidecar +- Keeps PVC backup pipeline (LVM snapshots + Synology offsite) +- Resource isolation — a runaway LLM session can't stress wizard's daily-driver VM +- Migration cost is several days; this design is ~150 LoC + an 80-line wrapper + +The mental model — "OpenClaw is sandboxed, delegates to wizard@devvm for +trusted heavy lifting" — is a clean security boundary. Worth preserving. + +## Architecture + +### Pod side (`infra/stacks/openclaw/main.tf`) + +Two new init containers added to the OpenClaw Deployment, after the +existing four: + +#### Init 5 — `install-host-tools` + +- Image: `debian:bookworm-slim` (matches main container base for glibc compat) +- Idempotent: skips if `/tools/host-tools/.installed-v1` exists +- `apt-get install --download-only --no-install-recommends` for: + `openssh-client dnsutils iputils-ping wget gnupg jq ripgrep fd-find ncdu htop strace tcpdump tmux unzip` +- Iterates `.deb` files in `/var/cache/apt/archives/`, `dpkg-deb -x` each + into `/tools/host-tools/root/` (preserves `usr/bin`, `usr/sbin`, + `usr/lib` layout) +- Downloads static binaries to `/tools/host-tools/bin/`: + - `vault` (HashiCorp releases, pinned version) + - `yq` (mikefarah/yq GitHub releases, pinned version) +- Smoke test: invokes `--version` on each bundled binary; fails init if + any won't load (catches glibc / shared-lib drift at deploy time, not + runtime) +- Writes marker file with version + +#### Init 6 — `setup-ssh-config` + +- Image: uses the just-installed 
host-tools (debian:bookworm-slim base + with `/tools/host-tools/root/usr/bin` on PATH so `ssh-keyscan` works) +- Runs after `install-host-tools` +- Idempotent: skips if `/home/node/.openclaw/.ssh/.configured-v1` exists +- Creates `/home/node/.openclaw/.ssh/` (uid 1000) +- Copies `/ssh/id_rsa` (tmpfs secret mount) → `~/.ssh/id_rsa` with 0600 + (the secret tmpfs mount has wider perms that openssh rejects) +- Writes `~/.ssh/config`: + + ```ssh-config + Host devvm + HostName 10.0.10.10 + User wizard + IdentityFile ~/.ssh/id_rsa + UserKnownHostsFile ~/.ssh/known_hosts + StrictHostKeyChecking yes + ``` + + PATH handling on the remote side: devvm's sshd uses the default + non-interactive PATH (`/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin`) + and does NOT load `~/.profile` or `~/.bashrc` (memory id=740). Client-side + `SetEnv PATH=…` doesn't help because sshd's `AcceptEnv` is `LANG LC_*` only. + Solution: install the binaries openclaw cares about into `/usr/local/bin/` + on devvm (see "Devvm side" below). + +- Pre-seeds `~/.ssh/known_hosts` via `ssh-keyscan -H 10.0.10.10` +- Writes marker file + +#### Main container + +- `PATH` env updated: prepend + `/tools/host-tools/root/usr/bin:/tools/host-tools/root/usr/sbin:/tools/host-tools/bin` +- No other changes to the startup command + +### Devvm side + +#### `/usr/local/bin/openclaw-task` wrapper + +Canonical source: `infra/stacks/openclaw/files/openclaw-task.sh`. +Installed to devvm at `/usr/local/bin/openclaw-task` (`sudo cp`, `sudo +chmod +x`) so non-interactive SSH finds it on the default PATH without +needing `~/.profile`. Updates: re-run the install steps from the +canonical source. + +Also: `sudo ln -s /home/wizard/.local/bin/claude /usr/local/bin/claude` +so `ssh devvm claude …` works in non-interactive mode. `vault` and `tmux` +are already at `/usr/bin/` (system packages) so no symlink needed for +those. + +Bash script (shebang is `#!/usr/bin/env bash`; it relies on `set -o pipefail` and `local`, so it is not POSIX `sh`). 
Subcommands: + +| Subcommand | Behavior | +|---|---| +| `new <id> <command...>` | Spawns detached tmux session `openclaw-task-<id>`, pipes pane output to `~/openclaw-tasks/<id>.log` | +| `claude <id> [prompt...]` | Convenience: spawns interactive `claude` in a tmux session, send-keys the prompt + Enter | +| `send <id> <keys...>` | `tmux send-keys -t openclaw-task-<id> "$@"` — caller supplies `Enter` literal if needed | +| `capture <id> [lines]` | `tmux capture-pane -t … -p -S -<lines>` (default last 1000) | +| `log <id>` | `cat ~/openclaw-tasks/<id>.log` | +| `tail <id>` | `tail -n 100 -f ~/openclaw-tasks/<id>.log` (mainly for human ops) | +| `list` | tmux session list filtered to `openclaw-task-*`, one id per line | +| `status <id>` | `running` if tmux session alive, `ended` otherwise | +| `kill <id>` | `tmux kill-session -t openclaw-task-<id>` (log file is kept) | +| `purge <id>` | `kill` + `rm -f ~/openclaw-tasks/<id>.log` | + +Task state lives entirely on devvm: + +- tmux sessions persist across SSH disconnects and OpenClaw pod restarts +- `~/openclaw-tasks/<id>.log` is the durable transcript even after a + session is killed +- No central database — `tmux list-sessions` is the source of truth for + "what's running" + +Naming convention: tmux sessions are prefixed `openclaw-task-` so they +don't collide with wizard's own tmux work (`0`, `Openclaw`, `read-only`). + +### Memory note + +File at `/workspace/memory/projects/openclaw-runtime/devvm-fallback.md` +teaching OpenClaw the pattern. Indexed by the existing daily +`memory-sync` CronJob (or via manual `node openclaw.mjs memory index +--force` for the initial seed). + +Content (verbatim): + +```markdown +# Using devvm as a fallback + +When in-pod tools/permissions block you, SSH to devvm and use it. The +devvm runs as wizard with full home-lab access (Vault, kubectl, git +repos, Cloudflare, etc.) and has Claude Code v2+ installed. 
+ +## One-shot lookup + ssh devvm 'vault kv get -field=brave_api_key secret/openclaw' + ssh devvm 'claude -p "investigate why frigate is restarting"' + +## Long-running async work — USE THIS for anything > ~2 min +Spawn in a tmux session on devvm. Sessions survive OpenClaw pod restarts. + + # spawn + ssh devvm openclaw-task new my-task "claude -p --dangerously-skip-permissions 'do the thing'" + + # poll progress (last 1000 lines of pane) + ssh devvm openclaw-task capture my-task + + # interactive claude (send follow-up prompts) + ssh devvm openclaw-task claude my-task "initial prompt" + ssh devvm openclaw-task send my-task "follow-up prompt" Enter + + # housekeeping + ssh devvm openclaw-task list + ssh devvm openclaw-task status my-task + ssh devvm openclaw-task kill my-task + +Logs persist at ~/openclaw-tasks/<id>.log on devvm even after a session +is killed. Use `ssh devvm openclaw-task log <id>` to retrieve them. +``` + +## Devvm: no infra changes + +Pre-existing state verified 2026-05-22: + +- pubkey from `/ssh/id_rsa` (Vault `secret/openclaw → ssh_key`) matches the + `ssh-ed25519 AAAA…lug node@openclaw-58cd9f7987-884bv` line in + `~/.ssh/authorized_keys` (the comment is a stale pod name; the key + itself is stable from Vault) +- sshd listens on 0.0.0.0:22 ✓ +- `claude` v2.1.126 at `/home/wizard/.local/bin/claude` ✓ +- `tmux` 3.4 installed, server already running with existing user sessions ✓ + +Only changes (one-time, done in the same session via `sudo`): +- Install `openclaw-task` wrapper to `/usr/local/bin/openclaw-task` +- Symlink `/home/wizard/.local/bin/claude` → `/usr/local/bin/claude` + +## Tradeoffs / risks + +- **Bundle size on NFS**: ~558 MB extracted (measured after implementation). Acceptable on + `/srv/nfs/openclaw/tools`. +- **Library version drift**: bundled binaries link against bookworm libs. + Smoke test in `install-host-tools` catches breakage on the next pod + restart if upstream OpenClaw image rebases. +- **Full-shell SSH**: explicit user choice. 
Blast radius if openclaw is + prompt-injected = full wizard access. Mitigation: keep OpenClaw's + plugin allowlist tight (current allow list: `memory-core, recruiter-api, + telegram, openrouter, brave, openai, codex`). +- **tmux server lifecycle on devvm**: if wizard's tmux server dies (rare — + usually only on devvm reboot), in-flight openclaw tasks are killed. + Acceptable for home lab. Task logs persist regardless. +- **Task log unbounded growth**: `~/openclaw-tasks/*.log` grows forever. + Out of scope here. User can add a `find -mtime +N -delete` cron later. +- **Init container order**: `setup-ssh-config` depends on + `install-host-tools` finishing first. K8s init containers run + sequentially in declaration order — natural ordering, no explicit + dependency mechanism needed. + +## Testing — E2E flows required by user + +1. **Tools present**: + `kubectl -n openclaw exec <pod> -c openclaw -- ssh -V` returns version, + same for `dig`, `vault`, `jq`, `yq`, `tmux`, `rg`. +2. **SSH happy path**: + `kubectl -n openclaw exec <pod> -c openclaw -- ssh devvm 'hostname'` + returns `devvm`. +3. **Claude one-shot**: + `kubectl -n openclaw exec <pod> -c openclaw -- ssh devvm 'claude -p "what is 1+1"'` + returns `2`. +4. **Async task lifecycle**: + - `ssh devvm openclaw-task new test-1 "sleep 30; echo done"` + - `ssh devvm openclaw-task list` contains `test-1` + - `ssh devvm openclaw-task status test-1` returns `running` + - wait 35s + - `ssh devvm openclaw-task log test-1` contains `done` + - `ssh devvm openclaw-task status test-1` returns `ended` +5. 
**Persistence test** (the key requirement): + - Spawn long task: `ssh devvm openclaw-task new persist-1 "sleep 120; echo survived > /tmp/persist-1.proof"` + - `kubectl -n openclaw delete pod <pod>` — pod recreated + - Wait for new pod ready (init containers run, skip via marker, fast) + - `kubectl -n openclaw exec <pod> -c openclaw -- ssh devvm openclaw-task list` + contains `persist-1` + - Wait for original sleep to finish; verify `/tmp/persist-1.proof` + contains `survived` from new pod +6. **Memory note lookup**: + `kubectl -n openclaw exec <pod> -c openclaw -- node openclaw.mjs memory search 'devvm fallback'` + returns the note. + +## Docs to update with the change + +- `infra/docs/plans/2026-05-22-openclaw-devvm-access-design.md` (this doc) +- `infra/docs/plans/2026-05-22-openclaw-devvm-access-plan.md` (implementation plan) +- `infra/.claude/reference/service-catalog.md` (one-line addition under + OpenClaw: "Has SSH to devvm with host-tools bundle; long-running async + tasks via `openclaw-task` wrapper on devvm") +- `infra/.claude/CLAUDE.md` "Known Issues" section is left alone — none of + the existing OpenClaw caveats change. diff --git a/stacks/openclaw/files/openclaw-task.sh b/stacks/openclaw/files/openclaw-task.sh new file mode 100644 index 00000000..57025e90 --- /dev/null +++ b/stacks/openclaw/files/openclaw-task.sh @@ -0,0 +1,184 @@ +#!/usr/bin/env bash +# openclaw-task — manage long-running tmux tasks on devvm +# +# Canonical source: infra/stacks/openclaw/files/openclaw-task.sh +# Installed to /usr/local/bin/openclaw-task on devvm so non-interactive +# SSH (e.g. `ssh devvm openclaw-task list`) finds it on the default PATH. +# +# Sessions are prefixed `openclaw-task-` to avoid colliding with the +# user's own tmux work. Persistent transcripts live in +# ~/openclaw-tasks/<id>.log via `tmux pipe-pane`. Sessions and logs +# survive OpenClaw pod restarts (they live on devvm, not in the pod). 
+
+set -euo pipefail
+
+# Use full paths because non-interactive SSH does not source ~/.profile
+# or ~/.bashrc (see memory id=740).
+TMUX_BIN=/usr/bin/tmux
+CLAUDE_BIN=/usr/local/bin/claude  # installed as symlink to /home/wizard/.local/bin/claude
+
+PREFIX=openclaw-task-
+TASK_DIR=${OPENCLAW_TASK_DIR:-$HOME/openclaw-tasks}
+mkdir -p "$TASK_DIR"
+
+# Print an error prefixed with the tool name to stderr and exit 1.
+die() { echo "openclaw-task: $*" >&2; exit 1; }
+
+# Reject task ids that are empty or contain anything but letters,
+# digits, '_' and '-'. The id is embedded in the log path
+# ($TASK_DIR/<id>.log, which `purge` deletes), in a single-quoted
+# pipe-pane shell command, and in a tmux target where '.' and ':'
+# are separators — so '/', '..', quotes, dots and colons must not
+# get through.
+validate_id() {
+  case "$1" in
+    ''|*[!A-Za-z0-9_-]*)
+      die "invalid task id '$1' (allowed: letters, digits, '_' and '-')" ;;
+  esac
+}
+
+# Map a task id to its tmux session name ($PREFIX + id).
+session_name() { printf '%s%s' "$PREFIX" "$1"; }
+
+# True if a session with EXACTLY this name exists. The '=' prefix
+# matters: a bare `-t name` also matches by prefix / fnmatch, so
+# `status foo` would report 'running' (and `kill foo` would kill)
+# when only `foo-2` exists.
+has_session() { "$TMUX_BIN" has-session -t "=$1" 2>/dev/null; }
+
+# Abort unless the named session exists (exact match).
+require_session() {
+  local name="$1"
+  has_session "$name" || die "no session '$name' (use 'openclaw-task list')"
+}
+
+# Print the command reference.
+usage() {
+  cat <<'EOF'
+USAGE
+  openclaw-task new <id> <command...>      spawn detached tmux session
+  openclaw-task claude <id> [prompt...]    spawn interactive claude in a session;
+                                           if prompt given, send-keys it + Enter
+  openclaw-task send <id> <keys...>        tmux send-keys passthrough (you must
+                                           pass 'Enter' literal for newline)
+  openclaw-task capture <id> [lines]       last <lines> of pane (default 1000)
+  openclaw-task log <id>                   cat the persistent pipe-pane log
+  openclaw-task tail <id>                  tail -f the persistent log
+  openclaw-task list                       all openclaw task ids (one per line)
+  openclaw-task status <id>                'running' or 'ended'
+  openclaw-task kill <id>                  kill session (log file kept)
+  openclaw-task purge <id>                 kill + delete log file
+
+EXAMPLES
+  openclaw-task new build-foo "cd ~/code/foo && make all 2>&1"
+  openclaw-task claude diag-frigate
+  openclaw-task send diag-frigate "investigate gpu crashloop" Enter
+  openclaw-task capture diag-frigate 200
+  openclaw-task list
+EOF
+}
+
+# new <id> <command...> — run a shell command in a fresh detached
+# session, logging pane output to $TASK_DIR/<id>.log.
+cmd_new() {
+  [ $# -lt 2 ] && die "usage: openclaw-task new <id> <command...>"
+  local id="$1"; shift
+  validate_id "$id"
+  local name; name=$(session_name "$id")
+  if has_session "$name"; then
+    die "session '$name' already exists"
+  fi
+  local log="$TASK_DIR/$id.log"
+  : > "$log"
+  # Start an idle interactive bash so pipe-pane can attach BEFORE the
+  # user's command runs. If we passed the command directly to
+  # new-session, its first lines beat pipe-pane to the pane and never
+  # land in the log.
+  "$TMUX_BIN" new-session -d -s "$name" bash --norc -i
+  "$TMUX_BIN" pipe-pane -o -t "=$name:" "cat >> '$log'"
+  sleep 0.2
+  # -l + '--': type the command literally. Without them tmux treats an
+  # argument that happens to be a key name (e.g. "Enter", "Space") as
+  # that key and a leading '-' as an option.
+  "$TMUX_BIN" send-keys -l -t "=$name:" -- "$*"
+  "$TMUX_BIN" send-keys -t "=$name:" Enter
+  # Auto-exit propagating the command's status so the tmux session
+  # ends when the command does. NOTE: this line is typed into the same
+  # terminal, so a command that reads from its terminal may consume it.
+  "$TMUX_BIN" send-keys -t "=$name:" 'exit $?' Enter
+  printf 'session: %s\nlog: %s\n' "$name" "$log"
+}
+
+# claude <id> [prompt...] — start interactive claude in a fresh
+# session and optionally type an initial prompt.
+cmd_claude() {
+  [ $# -lt 1 ] && die "usage: openclaw-task claude <id> [prompt...]"
+  local id="$1"; shift
+  validate_id "$id"
+  local name; name=$(session_name "$id")
+  if has_session "$name"; then
+    die "session '$name' already exists (use 'send' to add prompts)"
+  fi
+  local log="$TASK_DIR/$id.log"
+  : > "$log"
+  # sleep+exec lets pipe-pane attach before claude prints its banner.
+  "$TMUX_BIN" new-session -d -s "$name" bash -c "sleep 0.3; exec '$CLAUDE_BIN'"
+  "$TMUX_BIN" pipe-pane -o -t "=$name:" "cat >> '$log'"
+  if [ $# -gt 0 ]; then
+    # Wait for claude to come up before sending the prompt
+    sleep 2
+    # Literal mode so prompt text is never parsed as key names/options.
+    "$TMUX_BIN" send-keys -l -t "=$name:" -- "$*"
+    "$TMUX_BIN" send-keys -t "=$name:" Enter
+  fi
+  printf 'session: %s\nlog: %s\n' "$name" "$log"
+}
+
+# send <id> <keys...> — raw tmux send-keys passthrough (key names such
+# as 'Enter' are interpreted by tmux on purpose here).
+cmd_send() {
+  [ $# -lt 2 ] && die "usage: openclaw-task send <id> <keys...>"
+  local id="$1"; shift
+  validate_id "$id"
+  local name; name=$(session_name "$id")
+  require_session "$name"
+  "$TMUX_BIN" send-keys -t "=$name:" "$@"
+}
+
+# capture <id> [lines] — print the last <lines> lines of the pane
+# (default 1000).
+cmd_capture() {
+  [ $# -lt 1 ] && die "usage: openclaw-task capture <id> [lines]"
+  local id="$1"
+  local lines="${2:-1000}"
+  validate_id "$id"
+  case "$lines" in
+    ''|*[!0-9]*) die "lines must be a non-negative integer (got '$lines')" ;;
+  esac
+  local name; name=$(session_name "$id")
+  require_session "$name"
+  "$TMUX_BIN" capture-pane -t "=$name:" -p -S "-$lines"
+}
+
+# log <id> — print the whole persistent log.
+cmd_log() {
+  [ $# -lt 1 ] && die "usage: openclaw-task log <id>"
+  local id="$1"
+  validate_id "$id"
+  local log="$TASK_DIR/$id.log"
+  [ -f "$log" ] || die "no log file for '$id' (looked at $log)"
+  cat "$log"
+}
+
+# tail <id> — follow the persistent log.
+cmd_tail() {
+  [ $# -lt 1 ] && die "usage: openclaw-task tail <id>"
+  local id="$1"
+  validate_id "$id"
+  local log="$TASK_DIR/$id.log"
+  [ -f "$log" ] || die "no log file for '$id' (looked at $log)"
+  tail -n 100 -f "$log"
+}
+
+# list — ids of all sessions carrying $PREFIX, one per line. Prints
+# nothing (and still succeeds) when there are none or no tmux server.
+cmd_list() {
+  "$TMUX_BIN" list-sessions -F '#{session_name}' 2>/dev/null \
+    | grep "^$PREFIX" \
+    | sed "s|^$PREFIX||" \
+    || true
+}
+
+# status <id> — 'running' if the session exists, otherwise 'ended'.
+cmd_status() {
+  [ $# -lt 1 ] && die "usage: openclaw-task status <id>"
+  local id="$1"
+  validate_id "$id"
+  local name; name=$(session_name "$id")
+  if has_session "$name"; then
+    echo running
+  else
+    echo ended
+  fi
+}
+
+# kill <id> — kill the session; the log file is kept.
+cmd_kill() {
+  [ $# -lt 1 ] && die "usage: openclaw-task kill <id>"
+  local id="$1"
+  validate_id "$id"
+  local name; name=$(session_name "$id")
+  require_session "$name"
+  "$TMUX_BIN" kill-session -t "=$name"
+}
+
+# purge <id> — kill the session if it exists and delete its log.
+cmd_purge() {
+  [ $# -lt 1 ] && die "usage: openclaw-task purge <id>"
+  local id="$1"
+  validate_id "$id"
+  local name; name=$(session_name "$id")
+  "$TMUX_BIN" kill-session -t "=$name" 2>/dev/null || true
+  rm -f "$TASK_DIR/$id.log"
+  echo "purged: $id"
+}
+
+case "${1:-help}" in
+  new)      shift; cmd_new "$@" ;;
+  claude)   shift; cmd_claude "$@" ;;
+  send)     shift; cmd_send "$@" ;;
+  capture)  shift; cmd_capture "$@" ;;
+  log)      shift; cmd_log "$@" ;;
+  tail)     shift; cmd_tail "$@" ;;
+  list)     shift; cmd_list "$@" ;;
+  status)   shift; cmd_status "$@" ;;
+  kill)     shift; cmd_kill "$@" ;;
+  purge)    shift; cmd_purge "$@" ;;
+  help|-h|--help) usage ;;
+  *) usage; exit 2 ;;
+esac
diff --git a/stacks/openclaw/main.tf b/stacks/openclaw/main.tf
index 3d670f7f..69186038 100644
--- a/stacks/openclaw/main.tf
+++ b/stacks/openclaw/main.tf
@@ -496,6 +496,223 @@ resource "kubernetes_deployment" "openclaw" {
           }
         }
 
+        # Init 4: install host-tools bundle (ssh, vault, jq, ripgrep, tmux, …)
+        # into /tools/host-tools/ so the in-pod agent reaches CLI parity
+        # with the dev VM. Upstream OpenClaw image is minimal Debian
+        # bookworm running as uid 1000 — can't apt-install at runtime.
+        # Idempotent via marker file; bump suffix to force reinstall.
+        # See docs/plans/2026-05-22-openclaw-devvm-access-design.md.
+        init_container {
+          name  = "install-host-tools"
+          image = "debian:bookworm-slim"
+          command = ["bash", "-c", <<-EOT
+            set -euo pipefail
+            DEST=/tools/host-tools
+            MARKER="$DEST/.installed-v1"
+            if [ -f "$MARKER" ]; then
+              echo "host-tools v1 already installed (skipping)"
+              exit 0
+            fi
+            echo "installing host-tools v1 ..."
+            rm -rf "$DEST"
+            mkdir -p "$DEST/root" "$DEST/bin"
+
+            export DEBIAN_FRONTEND=noninteractive
+            apt-get update -qq
+
+            # Download + extract the bundle BEFORE installing any helper
+            # packages into this container. apt only downloads packages
+            # that are not already installed, so installing wget / unzip /
+            # ca-certificates first would silently drop them (and any
+            # libraries they pull in) from the bundle even though they
+            # are listed in APT_PKGS.
+            #
+            # NOTE: we deliberately do NOT pass --no-install-recommends to
+            # the download step. ssh links against libgssapi-krb5-2 which
+            # is a hard Depends but its transitive deps (libkrb5-3 etc.)
+            # need to come along too. The bundle is a self-contained
+            # /usr-like tree that the openclaw container can use via
+            # LD_LIBRARY_PATH, so missing deps = broken binaries.
+            APT_PKGS="openssh-client dnsutils iputils-ping wget gnupg jq ripgrep fd-find ncdu htop strace tcpdump tmux unzip ca-certificates"
+            apt-get install -y --download-only $APT_PKGS
+
+            for d in /var/cache/apt/archives/*.deb; do
+              dpkg-deb -x "$d" "$DEST/root/"
+            done
+
+            # debian:bookworm-slim doesn't ship wget/unzip; install
+            # transiently into this init container's filesystem so we
+            # can download the static binaries below.
+            apt-get install -y --no-install-recommends wget unzip ca-certificates
+
+            VAULT_VER=1.18.3
+            YQ_VER=v4.44.3
+            wget -qO /tmp/vault.zip \
+              "https://releases.hashicorp.com/vault/$${VAULT_VER}/vault_$${VAULT_VER}_linux_amd64.zip"
+            unzip -o /tmp/vault.zip vault -d "$DEST/bin/"
+            chmod +x "$DEST/bin/vault"
+            wget -qO "$DEST/bin/yq" \
+              "https://github.com/mikefarah/yq/releases/download/$${YQ_VER}/yq_linux_amd64"
+            chmod +x "$DEST/bin/yq"
+
+            # Smoke test — fail init if any bundled binary has unresolved
+            # shared-lib deps, so glibc / shared-lib drift surfaces at
+            # deploy time. We don't run --version because flag support
+            # varies (older scp returns non-zero, ping/nslookup use weird
+            # conventions). ldd is the reliable signal: if any "not
+            # found" appears, the binary won't load when called.
+            # LD_LIBRARY_PATH points ld.so at the bundled libs (the
+            # openclaw main container sets the same env).
+            export PATH="$DEST/root/usr/bin:$DEST/root/usr/sbin:$DEST/root/bin:$DEST/root/sbin:$DEST/bin:$PATH"
+            export LD_LIBRARY_PATH="$DEST/root/usr/lib/x86_64-linux-gnu:$DEST/root/lib/x86_64-linux-gnu"
+            for t in ssh scp ssh-keyscan dig host nslookup ping wget gpg jq rg fdfind tmux vault yq; do
+              bin=$(command -v "$t" 2>/dev/null) || { echo "FAIL: $t not on PATH"; exit 1; }
+              if ldd "$bin" 2>&1 | grep -q "not found"; then
+                echo "FAIL: $t has unresolved shared libs:"
+                ldd "$bin"
+                exit 1
+              fi
+              echo "OK: $t"
+            done
+
+            chown -R 1000:1000 "$DEST"
+            touch "$MARKER"
+            echo "host-tools v1 install complete ($(du -sh "$DEST" | cut -f1))"
+          EOT
+          ]
+          volume_mount {
+            name       = "tools"
+            mount_path = "/tools"
+          }
+          resources {
+            requests = { cpu = "100m", memory = "256Mi" }
+            limits   = { memory = "512Mi" }
+          }
+        }
+
+        # Init 5: write /home/node/.openclaw/.ssh/{id_rsa,config,known_hosts}
+        # so the agent can `ssh devvm` without device-trust prompts. The
+        # main container symlinks /home/node/.ssh → here at startup so
+        # the ssh client picks it up via $HOME/.ssh. Installs
+        # openssh-client transiently into this init container so
+        # ssh-keyscan works without LD_LIBRARY_PATH gymnastics.
+        init_container {
+          name  = "setup-ssh-config"
+          image = "debian:bookworm-slim"
+          command = ["bash", "-c", <<-EOT
+            set -euo pipefail
+            SSH=/home/node/.openclaw/.ssh
+            MARKER="$SSH/.configured-v1"
+            if [ -f "$MARKER" ]; then
+              echo "ssh-config v1 already set up (skipping)"
+              exit 0
+            fi
+            echo "installing openssh-client for ssh-keyscan ..."
+            export DEBIAN_FRONTEND=noninteractive
+            apt-get update -qq
+            apt-get install -y --no-install-recommends openssh-client >/dev/null
+
+            echo "configuring ssh ..."
+            mkdir -p "$SSH"
+
+            # Copy the secret-mounted private key into ~/.ssh with 0600 —
+            # the secret's tmpfs mount has wider perms (1777 + symlinks)
+            # that openssh refuses.
+            cp /ssh/id_rsa "$SSH/id_rsa"
+            chmod 0600 "$SSH/id_rsa"
+
+            cat > "$SSH/config" <<'SSH_EOF'
+            Host devvm
+              HostName 10.0.10.10
+              User wizard
+              IdentityFile ~/.ssh/id_rsa
+              UserKnownHostsFile ~/.ssh/known_hosts
+              StrictHostKeyChecking yes
+            SSH_EOF
+            chmod 0600 "$SSH/config"
+
+            # '|| true': under `set -e` a non-zero ssh-keyscan exit would
+            # abort right here, before the check below can print the
+            # captured stderr. The empty-file check is the real gate.
+            ssh-keyscan -H 10.0.10.10 > "$SSH/known_hosts" 2>/tmp/keyscan.err || true
+            if [ ! -s "$SSH/known_hosts" ]; then
+              echo "ssh-keyscan produced empty known_hosts; stderr:"
+              cat /tmp/keyscan.err
+              exit 1
+            fi
+            chmod 0644 "$SSH/known_hosts"
+
+            chown -R 1000:1000 "$SSH"
+            touch "$MARKER"
+            echo "ssh-config v1 set up"
+          EOT
+          ]
+          volume_mount {
+            name       = "openclaw-home"
+            mount_path = "/home/node/.openclaw"
+          }
+          volume_mount {
+            name       = "ssh-key"
+            mount_path = "/ssh"
+          }
+          resources {
+            requests = { cpu = "50m", memory = "64Mi" }
+            limits   = { memory = "256Mi" }
+          }
+        }
+
+        # Init 6: seed the devvm-fallback memory note into
+        # /workspace/memory/projects/openclaw-runtime/. The note teaches
+        # openclaw the SSH+tmux pattern. The main container's startup
+        # runs `memory index --force` so it's searchable immediately;
+        # the daily memory-sync CronJob also keeps it indexed afterward.
+        # Always rewrites — the configmap-baked note is canonical.
+        init_container {
+          name  = "seed-devvm-memory-note"
+          image = "busybox:1.37"
+          command = ["sh", "-c", <<-EOT
+            set -eu
+            DIR=/workspace/memory/projects/openclaw-runtime
+            mkdir -p "$DIR"
+            cat > "$DIR/devvm-fallback.md" <<'NOTE_EOF'
+            # Using devvm as a fallback
+
+            When in-pod tools/permissions block you, SSH to devvm and use it.
+            Devvm runs as wizard with full home-lab access (Vault, kubectl,
+            git repos, Cloudflare, etc.) and Claude Code v2+ is installed.
+
+            ## One-shot lookup
+
+                ssh devvm 'vault kv get -field=brave_api_key secret/openclaw'
+                ssh devvm 'claude -p "investigate why frigate is restarting"'
+
+            ## Long-running async work — USE THIS for anything > ~2 min
+
+            Spawn in a tmux session on devvm. Sessions and logs survive
+            OpenClaw pod restarts (they live on devvm, not in this pod).
+
+                # spawn
+                ssh devvm openclaw-task new my-task "claude -p --dangerously-skip-permissions 'do the thing'"
+
+                # poll progress (last 1000 lines of pane)
+                ssh devvm openclaw-task capture my-task
+
+                # interactive claude (send follow-up prompts)
+                ssh devvm openclaw-task claude my-task "initial prompt"
+                ssh devvm openclaw-task send my-task "follow-up prompt" Enter
+
+                # housekeeping
+                ssh devvm openclaw-task list
+                ssh devvm openclaw-task status my-task
+                ssh devvm openclaw-task kill my-task
+
+            Logs persist at ~/openclaw-tasks/<id>.log on devvm even after a
+            session is killed. Use `ssh devvm openclaw-task log <id>` to
+            retrieve them.
+            NOTE_EOF
+            chown -R 1000:1000 "$DIR"
+            echo "memory note seeded at $DIR/devvm-fallback.md"
+          EOT
+          ]
+          volume_mount {
+            name       = "workspace"
+            mount_path = "/workspace"
+          }
+          resources {
+            requests = { cpu = "10m", memory = "32Mi" }
+            limits   = { memory = "32Mi" }
+          }
+        }
+
         # Main container: OpenClaw
         container {
           name  = "openclaw"
@@ -511,6 +728,11 @@ resource "kubernetes_deployment" "openclaw" {
           #      others hard-coded.
           #   4. gateway — exec into the gateway process
           command = ["sh", "-c", <<-EOC
+            # Symlink /home/node/.ssh → persistent .ssh so the ssh client
+            # finds id_rsa/config/known_hosts via $HOME/.ssh. HOME is
+            # /home/node (image overlay), .ssh files live on the PVC
+            # at /home/node/.openclaw/.ssh (set up by init 5).
+            ln -sfn /home/node/.openclaw/.ssh /home/node/.ssh
             node openclaw.mjs doctor --fix 2>/dev/null
             node openclaw.mjs models set openai-codex/gpt-5.4-mini 2>/dev/null
             node openclaw.mjs mcp set ha "{\"url\":\"$HA_SOFIA_MCP_URL\",\"transport\":\"streamable-http\"}" 2>/dev/null
@@ -522,6 +744,10 @@ resource "kubernetes_deployment" "openclaw" {
             echo '{"plugins":{"allow":["memory-core","recruiter-api","telegram","openrouter","brave","openai","codex"]}}' \
               | node openclaw.mjs config patch --stdin 2>/dev/null || true
             node openclaw.mjs plugins enable recruiter-api 2>/dev/null || true
+            # Reindex memory-core so the seeded devvm-fallback note (and
+            # anything else dropped under /workspace/memory/) is searchable
+            # on first boot; daily memory-sync CronJob also keeps it indexed.
+            node openclaw.mjs memory index --force 2>/dev/null || true
             exec node openclaw.mjs gateway --allow-unconfigured --bind lan
           EOC
           ]
@@ -544,8 +770,21 @@ resource "kubernetes_deployment" "openclaw" {
             value = random_password.gateway_token.result
           }
           env {
-            name  = "PATH"
-            value = "/tools:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
+            name = "PATH"
+            # Host-tools bundle (installed by init 4: install-host-tools)
+            # comes first so ssh/scp/dig/vault/jq/etc. resolve to the
+            # extracted Debian binaries + the static-binary downloads.
+            # /bin + /sbin are needed because iputils-ping installs ping
+            # under /bin (not /usr/bin) on Debian.
+            value = "/tools/host-tools/root/usr/bin:/tools/host-tools/root/usr/sbin:/tools/host-tools/root/bin:/tools/host-tools/root/sbin:/tools/host-tools/bin:/tools:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
+          }
+          env {
+            # Point ld.so at the bundled libs so the host-tools binaries
+            # find their shared-lib deps (libgssapi_krb5, libkrb5, etc.).
+            # Both base images are bookworm so the libs match the
+            # openclaw image's libc/libssl — no ABI conflicts expected.
+            name  = "LD_LIBRARY_PATH"
+            value = "/tools/host-tools/root/usr/lib/x86_64-linux-gnu:/tools/host-tools/root/lib/x86_64-linux-gnu"
           }
           env {
             name  = "TF_VAR_prod"