breakglass: in-cluster emergency-recovery UI for the devvm

Viktor wanted a web UI on the claude service to act as his breakglass when the devvm is down: open it, have Claude SSH in to diagnose/repair, and power-cycle the VM via the Proxmox host if needed. This is the app half (the infra stack + host bootstrap live in the infra repo). New, ISOLATED ASGI app under app/breakglass/ (never imports app.main, so the untrusted-input agents — recruiter-triage, nextcloud-todos — can't share a process with the root-on-devvm / PVE-reset SSH key): - pve.py: the LLM-independent power-verb path (status|forensics|reset|stop| start|cycle on VM 102), whitelist-validated client-side, executed over the forced-command SSH key (list argv, no shell). - agent_session.py: multi-turn streamed chat — claude -p --session-id / --resume with --output-format stream-json, translated to a small SSE vocabulary (session/text/tool/result/error/done). - auth.py: edge Authentik header OR bearer; fail-closed. - server.py: FastAPI (session/chat-SSE/pve-verb routes) + serves the Svelte UI. - Svelte SPA (frontend/, built into app/breakglass/static/ and committed — no in-cluster build, per ADR-0002): streamed chat + danger-styled manual VM controls with confirm-on-mutate. - agents/breakglass.md: narrow tools (Bash/Read/Grep/Glob, no web), taught the ssh devvm / ssh pve aliases and cycle-vs-reset. - docker-entrypoint-breakglass.sh: ssh-agent bootstrap from the mounted key + ssh aliases, then uvicorn app.breakglass.server. The breakglass Deployment overrides the image CMD with this; the existing service is untouched. 26 new tests (verb whitelist incl. injection attempts, stream-json→SSE translation, auth gating, route behaviour); full suite 58 green. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-12 21:36:05 +00:00 · 2026-06-12 21:36:05 +00:00 · 4f361d91eb
commit 4f361d91eb
parent 694530135d
28 changed files with 3889 additions and 0 deletions
--- a/app/breakglass/agent_session.py
+++ b/app/breakglass/agent_session.py
@ -0,0 +1,145 @@
+"""Drive the breakglass Claude agent and stream its work to the browser.
+
+Each chat turn runs ``claude -p --output-format stream-json`` in the session's
+persistent workspace; the first turn opens the session with ``--session-id`` and
+later turns ``--resume`` it, so the conversation has memory across turns. The
+CLI's JSON events are translated to a small, stable SSE vocabulary the UI
+renders (``session`` / ``text`` / ``tool`` / ``result`` / ``error``) — we do not
+leak the raw event firehose to the client.
+
+Subprocesses use ``asyncio.create_subprocess_exec`` (list argv, no shell): the
+prompt and ids are argv elements, never interpreted by a shell.
+"""
+import asyncio
+import json
+import os
+from subprocess import PIPE
+from typing import AsyncIterator
+
+from . import config
+
+# Sessions we've already opened (so the next turn resumes instead of re-creating).
+_started: set[str] = set()
+
+
+def _turn_argv(session_id: str, prompt: str, resume: bool, model: str) -> list[str]:
+    argv = [
+        "claude", "-p",
+        "--agent", config.BREAKGLASS_AGENT,
+        "--dangerously-skip-permissions",
+        "--output-format", "stream-json",
+        "--verbose",                      # required for stream-json output
+        "--model", model,
+    ]
+    # --session-id opens a brand-new session with that id; --resume continues it.
+    argv += (["--resume", session_id] if resume else ["--session-id", session_id])
+    argv.append(prompt)
+    return argv
+
+
+def translate_event(obj: dict) -> dict | None:
+    """Map one raw stream-json event to a UI event, or None to drop it.
+
+    Pure function — the unit tests pin this contract. Keeps the noisy
+    hook/thinking-token/system chatter off the wire and exposes only what an
+    operator watching a recovery needs: which session, assistant prose, which
+    tools ran, and the final result.
+    """
+    etype = obj.get("type")
+
+    if etype == "system":
+        if obj.get("subtype") == "init":
+            return {"kind": "session", "session_id": obj.get("session_id", "")}
+        return None  # hook_started/hook_response/thinking_tokens/etc. — noise
+
+    if etype == "assistant":
+        events: list[dict] = []
+        for block in obj.get("message", {}).get("content", []) or []:
+            btype = block.get("type")
+            if btype == "text" and block.get("text"):
+                events.append({"kind": "text", "text": block["text"]})
+            elif btype == "tool_use":
+                events.append({
+                    "kind": "tool",
+                    "name": block.get("name", ""),
+                    "input": block.get("input", {}),
+                })
+        if not events:
+            return None
+        # The server flattens a "batch" into individual SSE frames.
+        return events[0] if len(events) == 1 else {"kind": "batch", "events": events}
+
+    if etype == "result":
+        return {
+            "kind": "result",
+            "is_error": bool(obj.get("is_error")),
+            "result": obj.get("result", ""),
+            "duration_ms": obj.get("duration_ms"),
+        }
+
+    return None
+
+
+async def run_turn(
+    session_id: str, prompt: str, model: str | None = None
+) -> AsyncIterator[dict]:
+    """Run one chat turn, yielding translated UI events as they arrive."""
+    resume = session_id in _started
+    model = model or config.DEFAULT_MODEL
+    workspace = os.path.join(config.SESSIONS_DIR, session_id)
+    os.makedirs(workspace, exist_ok=True)
+
+    argv = _turn_argv(session_id, prompt, resume, model)
+    proc = await asyncio.create_subprocess_exec(
+        *argv, cwd=workspace, stdout=PIPE, stderr=PIPE,
+    )
+    _started.add(session_id)
+    assert proc.stdout is not None and proc.stderr is not None
+
+    try:
+        async def _pump() -> AsyncIterator[dict]:
+            async for raw in proc.stdout:
+                line = raw.decode(errors="replace").strip()
+                if not line:
+                    continue
+                try:
+                    obj = json.loads(line)
+                except json.JSONDecodeError:
+                    continue
+                ev = translate_event(obj)
+                if ev is None:
+                    continue
+                if ev.get("kind") == "batch":
+                    for sub in ev["events"]:
+                        yield sub
+                else:
+                    yield ev
+
+        async for ev in _with_timeout(_pump(), config.TURN_TIMEOUT_SECONDS):
+            yield ev
+    except asyncio.TimeoutError:
+        proc.kill()
+        await proc.wait()
+        yield {"kind": "error", "error": f"turn timed out after {config.TURN_TIMEOUT_SECONDS}s"}
+        return
+
+    await proc.wait()
+    if proc.returncode not in (0, None):
+        err = (await proc.stderr.read()).decode(errors="replace")
+        yield {"kind": "error", "error": err.strip()[:500] or f"exit {proc.returncode}"}
+
+
+async def _with_timeout(agen: AsyncIterator[dict], timeout: float) -> AsyncIterator[dict]:
+    """Yield from an async generator but raise TimeoutError if the WHOLE turn
+    exceeds ``timeout`` seconds (a wedged agent shouldn't stream forever)."""
+    loop = asyncio.get_event_loop()
+    deadline = loop.time() + timeout
+    it = agen.__aiter__()
+    while True:
+        remaining = deadline - loop.time()
+        if remaining <= 0:
+            raise asyncio.TimeoutError
+        try:
+            yield await asyncio.wait_for(it.__anext__(), timeout=remaining)
+        except StopAsyncIteration:
+            return