breakglass: in-cluster emergency-recovery UI for the devvm
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful

Viktor wanted a web UI on the claude service to act as his breakglass when
the devvm is down: open it, have Claude SSH in to diagnose/repair, and
power-cycle the VM via the Proxmox host if needed. This is the app half
(the infra stack + host bootstrap live in the infra repo).

New, ISOLATED ASGI app under app/breakglass/ (never imports app.main, so the
untrusted-input agents — recruiter-triage, nextcloud-todos — can't share a
process with the root-on-devvm / PVE-reset SSH key):
- pve.py: the LLM-independent power-verb path (status|forensics|reset|stop|
  start|cycle on VM 102), whitelist-validated client-side, executed over the
  forced-command SSH key (list argv, no shell).
- agent_session.py: multi-turn streamed chat — claude -p --session-id /
  --resume with --output-format stream-json, translated to a small SSE
  vocabulary (session/text/tool/result/error/done).
- auth.py: edge Authentik header OR bearer; fail-closed.
- server.py: FastAPI (session/chat-SSE/pve-verb routes) + serves the Svelte UI.
- Svelte SPA (frontend/, built into app/breakglass/static/ and committed — no
  in-cluster build, per ADR-0002): streamed chat + danger-styled manual VM
  controls with confirm-on-mutate.
- agents/breakglass.md: narrow tools (Bash/Read/Grep/Glob, no web), taught the
  ssh devvm / ssh pve aliases and cycle-vs-reset.
- docker-entrypoint-breakglass.sh: ssh-agent bootstrap from the mounted key +
  ssh aliases, then uvicorn app.breakglass.server. The breakglass Deployment
  overrides the image CMD with this; the existing service is untouched.

26 new tests (verb whitelist incl. injection attempts, stream-json→SSE
translation, auth gating, route behaviour); full suite 58 green.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-06-12 21:36:05 +00:00
parent 694530135d
commit 4f361d91eb
28 changed files with 3889 additions and 0 deletions

View file

@ -0,0 +1,10 @@
"""Breakglass: an isolated emergency-recovery surface for the devvm.
This package is a SEPARATE ASGI app from ``app.main``. The breakglass
deployment runs ``uvicorn app.breakglass.server:app`` and mounts the SSH keys;
the ordinary claude-agent-service deployment keeps running ``app.main:app`` and
never sees those keys. Nothing here imports ``app.main`` and vice versa, so the
untrusted-input agents (recruiter-triage, nextcloud-todos) can never share a
process with the root-on-devvm / PVE-reset credentials. See
``docs/adr/0001-breakglass-security-architecture.md``.
"""

View file

@ -0,0 +1,145 @@
"""Drive the breakglass Claude agent and stream its work to the browser.
Each chat turn runs ``claude -p --output-format stream-json`` in the session's
persistent workspace; the first turn opens the session with ``--session-id`` and
later turns ``--resume`` it, so the conversation has memory across turns. The
CLI's JSON events are translated to a small, stable SSE vocabulary the UI
renders (``session`` / ``text`` / ``tool`` / ``result`` / ``error``) we do not
leak the raw event firehose to the client.
Subprocesses use ``asyncio.create_subprocess_exec`` (list argv, no shell): the
prompt and ids are argv elements, never interpreted by a shell.
"""
import asyncio
import json
import os
from subprocess import PIPE
from typing import AsyncIterator
from . import config
# Sessions we've already opened (so the next turn resumes instead of re-creating).
_started: set[str] = set()
def _turn_argv(session_id: str, prompt: str, resume: bool, model: str) -> list[str]:
argv = [
"claude", "-p",
"--agent", config.BREAKGLASS_AGENT,
"--dangerously-skip-permissions",
"--output-format", "stream-json",
"--verbose", # required for stream-json output
"--model", model,
]
# --session-id opens a brand-new session with that id; --resume continues it.
argv += (["--resume", session_id] if resume else ["--session-id", session_id])
argv.append(prompt)
return argv
def translate_event(obj: dict) -> dict | None:
"""Map one raw stream-json event to a UI event, or None to drop it.
Pure function the unit tests pin this contract. Keeps the noisy
hook/thinking-token/system chatter off the wire and exposes only what an
operator watching a recovery needs: which session, assistant prose, which
tools ran, and the final result.
"""
etype = obj.get("type")
if etype == "system":
if obj.get("subtype") == "init":
return {"kind": "session", "session_id": obj.get("session_id", "")}
return None # hook_started/hook_response/thinking_tokens/etc. — noise
if etype == "assistant":
events: list[dict] = []
for block in obj.get("message", {}).get("content", []) or []:
btype = block.get("type")
if btype == "text" and block.get("text"):
events.append({"kind": "text", "text": block["text"]})
elif btype == "tool_use":
events.append({
"kind": "tool",
"name": block.get("name", ""),
"input": block.get("input", {}),
})
if not events:
return None
# The server flattens a "batch" into individual SSE frames.
return events[0] if len(events) == 1 else {"kind": "batch", "events": events}
if etype == "result":
return {
"kind": "result",
"is_error": bool(obj.get("is_error")),
"result": obj.get("result", ""),
"duration_ms": obj.get("duration_ms"),
}
return None
async def run_turn(
session_id: str, prompt: str, model: str | None = None
) -> AsyncIterator[dict]:
"""Run one chat turn, yielding translated UI events as they arrive."""
resume = session_id in _started
model = model or config.DEFAULT_MODEL
workspace = os.path.join(config.SESSIONS_DIR, session_id)
os.makedirs(workspace, exist_ok=True)
argv = _turn_argv(session_id, prompt, resume, model)
proc = await asyncio.create_subprocess_exec(
*argv, cwd=workspace, stdout=PIPE, stderr=PIPE,
)
_started.add(session_id)
assert proc.stdout is not None and proc.stderr is not None
try:
async def _pump() -> AsyncIterator[dict]:
async for raw in proc.stdout:
line = raw.decode(errors="replace").strip()
if not line:
continue
try:
obj = json.loads(line)
except json.JSONDecodeError:
continue
ev = translate_event(obj)
if ev is None:
continue
if ev.get("kind") == "batch":
for sub in ev["events"]:
yield sub
else:
yield ev
async for ev in _with_timeout(_pump(), config.TURN_TIMEOUT_SECONDS):
yield ev
except asyncio.TimeoutError:
proc.kill()
await proc.wait()
yield {"kind": "error", "error": f"turn timed out after {config.TURN_TIMEOUT_SECONDS}s"}
return
await proc.wait()
if proc.returncode not in (0, None):
err = (await proc.stderr.read()).decode(errors="replace")
yield {"kind": "error", "error": err.strip()[:500] or f"exit {proc.returncode}"}
async def _with_timeout(agen: AsyncIterator[dict], timeout: float) -> AsyncIterator[dict]:
"""Yield from an async generator but raise TimeoutError if the WHOLE turn
exceeds ``timeout`` seconds (a wedged agent shouldn't stream forever)."""
loop = asyncio.get_event_loop()
deadline = loop.time() + timeout
it = agen.__aiter__()
while True:
remaining = deadline - loop.time()
if remaining <= 0:
raise asyncio.TimeoutError
try:
yield await asyncio.wait_for(it.__anext__(), timeout=remaining)
except StopAsyncIteration:
return

36
app/breakglass/auth.py Normal file
View file

@ -0,0 +1,36 @@
"""Auth for the breakglass app.
The app sits behind the ingress ``auth = "required"`` resilience proxy
(Authentik SSO normally, HTTP basic-auth fallback when Authentik is down), so a
browser request that reaches us is already edge-authenticated and carries the
proxy-injected ``X-authentik-username`` header. We also accept a bearer token
for machine/CLI callers. Either is sufficient.
When neither a token is configured nor a trusted header is present, we fail
closed.
"""
import hmac
from fastapi import Header, HTTPException
from . import config
def require_auth(
authorization: str | None = Header(default=None),
x_authentik_username: str | None = Header(default=None),
) -> str:
"""FastAPI dependency. Returns the identity (username or 'bearer'); raises
401 otherwise."""
# Edge-authenticated human: the auth-proxy sets this and overwrites any
# client-supplied value, so its presence is trustworthy.
if x_authentik_username:
return x_authentik_username
# Machine caller with the shared bearer token.
if config.API_TOKEN and authorization and authorization.startswith("Bearer "):
token = authorization.removeprefix("Bearer ")
if hmac.compare_digest(token, config.API_TOKEN):
return "bearer"
raise HTTPException(status_code=401, detail="unauthenticated")

36
app/breakglass/config.py Normal file
View file

@ -0,0 +1,36 @@
"""Environment-driven config for the breakglass app.
Targets are hardcoded IPs by default (the breakglass must not depend on cluster
DNS it has to work when things are broken). Everything is overridable via env
for tests and future re-IPing.
"""
import os
# SSH targets. IPs, not names — no DNS dependency in an incident.
DEVVM_HOST = os.environ.get("BREAKGLASS_DEVVM_HOST", "10.0.10.10")
DEVVM_USER = os.environ.get("BREAKGLASS_DEVVM_USER", "breakglass")
PVE_HOST = os.environ.get("BREAKGLASS_PVE_HOST", "192.168.1.127")
PVE_USER = os.environ.get("BREAKGLASS_PVE_USER", "root")
# The Claude agent the breakglass UI drives. Narrow tool surface, no web tools.
BREAKGLASS_AGENT = os.environ.get("BREAKGLASS_AGENT", "breakglass")
DEFAULT_MODEL = os.environ.get("BREAKGLASS_MODEL", "sonnet")
# Where claude session state + per-session scratch live. emptyDir in prod.
SESSIONS_DIR = os.environ.get("BREAKGLASS_SESSIONS_DIR", "/workspace/sessions")
# A single human operator per incident — no need for the job-runner's fan-out.
MAX_CONCURRENT_TURNS = int(os.environ.get("BREAKGLASS_MAX_CONCURRENT_TURNS", "2"))
# A chat turn that runs longer than this is killed (the agent is wedged).
TURN_TIMEOUT_SECONDS = int(os.environ.get("BREAKGLASS_TURN_TIMEOUT_SECONDS", "1800"))
# A single PVE power verb must return fast; a wedged host shouldn't hang the UI.
PVE_VERB_TIMEOUT_SECONDS = int(os.environ.get("BREAKGLASS_PVE_VERB_TIMEOUT_SECONDS", "120"))
# Auth. The app sits behind the ingress `auth = "required"` resilience proxy
# (Authentik SSO, basic-auth fallback when Authentik is down). We additionally
# accept a bearer token for machine/CLI callers. Either gate is sufficient;
# the edge is the primary one for the browser UI.
API_TOKEN = os.environ.get("API_BEARER_TOKEN", "")
# Header the auth-proxy injects for an authenticated human (set by Authentik, or
# by the basic-auth fallback's `$remote_user`). Presence ⇒ edge-authenticated.
TRUSTED_USER_HEADER = "x-authentik-username"

89
app/breakglass/pve.py Normal file
View file

@ -0,0 +1,89 @@
"""PVE power verbs — the LLM-independent recovery path.
The manual UI buttons hit this directly (no ``claude`` in the path), so reset
works even when the Anthropic API is down. The real enforcement is the
forced-command on the PVE host (``/usr/local/bin/breakglass-pve``): whatever we
send as the SSH command is ignored except as ``$SSH_ORIGINAL_COMMAND``, and the
host script only honours the verbs below against VM 102. We validate here too
defense in depth + a clean error before a round-trip.
All subprocesses use ``asyncio.create_subprocess_exec`` (list argv, no shell),
so the verb string is never interpreted by a shell there is no injection
surface even though the allowlist already constrains the input.
"""
import asyncio
from subprocess import PIPE
from . import config
# Must mirror /usr/local/bin/breakglass-pve on the PVE host.
ALLOWED_VERBS: frozenset[str] = frozenset(
{"status", "forensics", "reset", "stop", "start", "cycle"}
)
# Verbs that change VM state — the UI flags these for an explicit confirm and
# the host script captures forensics before running them.
MUTATING_VERBS: frozenset[str] = frozenset({"reset", "stop", "start", "cycle"})
def _ssh_argv(user: str, host: str, remote_command: str) -> list[str]:
"""Build an ssh argv (list form, no shell). ``remote_command`` is passed as
a single token; on the PVE host the forced-command ignores it except as
``$SSH_ORIGINAL_COMMAND``.
Host-key checking is disabled deliberately: a devvm REBUILD changes its host
key (e.g. 2026-05-23), and strict checking would lock the breakglass out at
exactly the moment it's needed. The targets are on the trusted internal LAN;
availability beats MITM hardening here. Auth is still by key (ssh-agent)."""
return [
"ssh",
"-o", "BatchMode=yes",
"-o", "ConnectTimeout=10",
"-o", "StrictHostKeyChecking=no",
"-o", "UserKnownHostsFile=/dev/null",
"-o", "LogLevel=ERROR",
f"{user}@{host}",
remote_command,
]
def is_allowed(verb: str) -> bool:
return verb in ALLOWED_VERBS
async def run_verb(verb: str, timeout: float | None = None) -> dict:
"""Run a single PVE verb against VM 102 over the forced-command SSH key.
Returns ``{"verb", "exit_code", "stdout", "stderr", "rejected"}``. A verb
not in the allowlist is rejected locally (``rejected=True``) without any
SSH at all.
"""
if verb not in ALLOWED_VERBS:
return {
"verb": verb,
"exit_code": None,
"stdout": "",
"stderr": f"rejected: '{verb}' is not an allowed verb",
"rejected": True,
}
timeout = timeout if timeout is not None else config.PVE_VERB_TIMEOUT_SECONDS
argv = _ssh_argv(config.PVE_USER, config.PVE_HOST, verb)
proc = await asyncio.create_subprocess_exec(*argv, stdout=PIPE, stderr=PIPE)
try:
out, err = await asyncio.wait_for(proc.communicate(), timeout=timeout)
except asyncio.TimeoutError:
proc.kill()
await proc.wait()
return {
"verb": verb,
"exit_code": None,
"stdout": "",
"stderr": f"timeout after {timeout}s talking to PVE host",
"rejected": False,
}
return {
"verb": verb,
"exit_code": proc.returncode,
"stdout": out.decode(errors="replace"),
"stderr": err.decode(errors="replace"),
"rejected": False,
}

96
app/breakglass/server.py Normal file
View file

@ -0,0 +1,96 @@
"""Breakglass FastAPI app — the in-cluster emergency recovery UI.
Routes:
GET /health liveness (no auth)
GET / the single-page UI (static)
POST /api/session open a chat session, returns {session_id}
POST /api/chat run one turn, streams SSE events (text/tool/result)
POST /api/pve/{verb} LLM-independent PVE power verb (manual buttons)
GET /api/pve/verbs list allowed verbs + which mutate
Everything under /api requires auth (edge Authentik header or bearer token).
"""
import json
import os
import uuid
from fastapi import Depends, FastAPI, HTTPException
from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel, Field
from . import agent_session, config, pve
from .auth import require_auth
app = FastAPI(title="Claude Breakglass")
_STATIC_DIR = os.path.join(os.path.dirname(__file__), "static")
class SessionResponse(BaseModel):
session_id: str
class ChatRequest(BaseModel):
session_id: str
prompt: str = Field(..., min_length=1)
model: str | None = None
@app.get("/health")
async def health():
return {"status": "ok", "service": "claude-breakglass"}
@app.post("/api/session", response_model=SessionResponse)
async def open_session(_identity: str = Depends(require_auth)):
# Claude wants a UUID for --session-id.
return SessionResponse(session_id=str(uuid.uuid4()))
@app.post("/api/chat")
async def chat(req: ChatRequest, _identity: str = Depends(require_auth)):
"""Stream one chat turn as Server-Sent Events. The browser reads the
response body incrementally (fetch + ReadableStream)."""
async def _sse():
try:
async for ev in agent_session.run_turn(req.session_id, req.prompt, req.model):
yield f"data: {json.dumps(ev)}\n\n"
except Exception as exc: # noqa: BLE001 — surface any failure to the UI
yield f"data: {json.dumps({'kind': 'error', 'error': str(exc)[:500]})}\n\n"
yield f"data: {json.dumps({'kind': 'done'})}\n\n"
return StreamingResponse(
_sse(),
media_type="text/event-stream",
headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
)
@app.get("/api/pve/verbs")
async def pve_verbs(_identity: str = Depends(require_auth)):
return {
"verbs": sorted(pve.ALLOWED_VERBS),
"mutating": sorted(pve.MUTATING_VERBS),
}
@app.post("/api/pve/{verb}")
async def pve_verb(verb: str, _identity: str = Depends(require_auth)):
"""Run a PVE power verb directly (no LLM in the path). Mutating verbs
capture forensics first on the host, unconditionally."""
if not pve.is_allowed(verb):
raise HTTPException(status_code=400, detail=f"unknown verb '{verb}'")
result = await pve.run_verb(verb)
status = 200 if result.get("exit_code") == 0 else 502
return JSONResponse(status_code=status, content=result)
# Serve the SPA. Mounted last so it doesn't shadow /api or /health.
if os.path.isdir(_STATIC_DIR):
@app.get("/")
async def index():
return FileResponse(os.path.join(_STATIC_DIR, "index.html"))
app.mount("/", StaticFiles(directory=_STATIC_DIR, html=True), name="static")

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,15 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<meta name="color-scheme" content="dark" />
<meta name="robots" content="noindex, nofollow" />
<title>devvm breakglass</title>
<script type="module" crossorigin src="./assets/index-DNECe1Jo.js"></script>
<link rel="stylesheet" crossorigin href="./assets/index-DKeuidum.css">
</head>
<body>
<div id="app"></div>
</body>
</html>