infra/stacks/claude-breakglass/main.tf
Viktor Barzin 32cf75635f claude-breakglass: in-cluster warm break-glass UI for the devvm
Stand up the infra for Viktor's break-glass: when the devvm is wedged (cluster
healthy), open breakglass.viktorbarzin.me, have Claude SSH in to diagnose/fix,
and power-cycle VM 102 via the Proxmox host if needed. App half landed in the
claude-agent-service repo.

New stack stacks/claude-breakglass/ — own namespace + SA, NO Vault role (ESO
syncs only its key, so the pod has zero direct Vault access). Hardened to
survive the pressure it exists to fix: priorityClassName tier-0-core, broad
node-pressure tolerations, anti-affinity off node1, imagePullPolicy Always.
auth="required" ingress so it rides the Authentik resilience proxy and stays
reachable via the basic-auth fallback during an auth-stack outage. Runs the
shared claude-agent-service image with the breakglass entrypoint.
files/breakglass-pve is the PVE forced-command (status|forensics|reset|stop|
start|cycle on VM 102, forensics-first).

Isolation: the shared claude-agent pod's terraform-state Vault policy is
explicitly DENIED secret/claude-breakglass/* (stacks/vault/main.tf) so a
prompt-injected agent on that pod can't read the root-on-devvm key.

traefik: add a checksum/auth-proxy-htpasswd annotation so the auth-proxy rolls
when the emergency basic-auth password rotates (it's a subPath mount that
doesn't auto-update) — regenerated this session so Viktor has a known
emergency credential, which the auth-stack-outage failure domain requires.

Docs: docs/runbooks/breakglass-ui.md (full incident + bootstrap procedure,
incl. the per-host from= NAT quirks) and a security.md note recording the two
new privileged footholds.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-12 21:40:17 +00:00

361 lines
11 KiB
HCL

# claude-breakglass — in-cluster emergency-recovery UI for the devvm.
#
# A SEPARATE deployment from claude-agent-service (own namespace, own
# ServiceAccount, NO Vault K8s-auth role) that runs ONLY the breakglass agent.
# It shares the claude-agent-service image but overrides the command with the
# breakglass entrypoint. The untrusted-input agents (recruiter-triage,
# nextcloud-todos) never share this process or these credentials.
# See claude-agent-service/docs/adr/0001-breakglass-security-architecture.md.
#
# Scope is the WARM case: devvm wedged while the cluster is healthy. The cold,
# cluster-down path is the break-glass SSH on PVE :52222 (docs/runbooks/breakglass-ssh.md)
# + the server-lifecycle iDRAC CLI — out of scope here.
variable "tls_secret_name" {
  # Name of the TLS Secret; passed straight through to the ingress module at
  # the bottom of this file. Marked sensitive so Terraform redacts the value
  # in plan/apply output.
  sensitive = true
  type      = string
}
locals {
  # Namespace name, used by the namespace resource and by both ExternalSecret
  # manifests in this file.
  namespace = "claude-breakglass"

  # Labels shared by the Deployment metadata, its pod template/selector, and
  # the Service selector.
  labels = {
    app = "claude-breakglass"
  }

  # The breakglass code ships inside the claude-agent-service image (under
  # app/breakglass/ in that repo); the Deployment in this file swaps the
  # command for the breakglass entrypoint rather than using a separate image.
  image     = "forgejo.viktorbarzin.me/viktor/claude-agent-service"
  image_tag = "latest"
}
# --- Namespace ---
resource "kubernetes_namespace" "breakglass" {
  metadata {
    name = local.namespace

    labels = {
      # local.tiers is not defined in this file — presumably injected by a
      # shared/generated locals file; verify.
      # NOTE(review): the namespace is tagged with the aux tier while the
      # Deployment in this file requests priorityClassName "tier-0-core".
      # Confirm no tier-driven policy conflicts with that priority class.
      tier = local.tiers.aux
    }
  }

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: the goldilocks vpa-update-mode label is stamped
    # onto the namespace out-of-band; ignoring it avoids a perpetual diff
    # (harmless if the label is absent).
    ignore_changes = [
      metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"],
    ]
  }
}
resource "kubernetes_service_account" "breakglass" {
  # Dedicated ServiceAccount for the breakglass pod. No role or binding is
  # attached to it anywhere in this file.
  metadata {
    namespace = kubernetes_namespace.breakglass.metadata[0].name
    name      = "claude-breakglass"
  }
}
# --- Secrets (synced by ESO; the pod itself has NO Vault access) ---
# SSH private key (devvm sudo + PVE forced-command). Mounted as a file the
# entrypoint loads into ssh-agent. Dedicated path secret/claude-breakglass/* —
# the claude-agent namespace's terraform-state Vault policy is explicitly
# DENIED this path (see stacks/vault/main.tf) so the shared, prompt-injectable
# pod can never read it.
resource "kubernetes_manifest" "external_secret_ssh" {
  # ExternalSecret that materialises the Kubernetes Secret "breakglass-ssh"
  # (single key: private_key) from the "vault-kv" ClusterSecretStore. The
  # Deployment in this file mounts that Secret as a file volume.
  manifest = {
    apiVersion = "external-secrets.io/v1beta1"
    kind       = "ExternalSecret"

    metadata = {
      namespace = local.namespace
      name      = "breakglass-ssh"
    }

    spec = {
      # Re-sync from the store hourly.
      refreshInterval = "1h"

      secretStoreRef = {
        kind = "ClusterSecretStore"
        name = "vault-kv"
      }

      target = {
        name = "breakglass-ssh"
      }

      data = [
        {
          secretKey = "private_key"
          remoteRef = {
            key      = "claude-breakglass/ssh_key"
            property = "private_key"
          }
        },
      ]
    }
  }

  # The manifest names the namespace via local.namespace (a plain string), so
  # the ordering against the namespace resource has to be declared explicitly.
  depends_on = [kubernetes_namespace.breakglass]
}
# Env secrets: the Anthropic OAuth token (shared with claude-agent-service —
# same account) and the app bearer token (in-cluster/CLI fallback caller auth).
resource "kubernetes_manifest" "external_secret_env" {
  # ExternalSecret that materialises the Kubernetes Secret "breakglass-env"
  # from the "vault-kv" ClusterSecretStore. The Deployment in this file loads
  # every key of that Secret as an environment variable via env_from.
  manifest = {
    apiVersion = "external-secrets.io/v1beta1"
    kind       = "ExternalSecret"

    metadata = {
      namespace = local.namespace
      name      = "breakglass-env"
    }

    spec = {
      # Re-sync from the store hourly.
      refreshInterval = "1h"

      secretStoreRef = {
        kind = "ClusterSecretStore"
        name = "vault-kv"
      }

      target = {
        name = "breakglass-env"
      }

      data = [
        # Anthropic OAuth token, read from the claude-agent-service entry
        # (same account as that service).
        {
          secretKey = "CLAUDE_CODE_OAUTH_TOKEN"
          remoteRef = {
            key      = "claude-agent-service"
            property = "claude_oauth_token"
          }
        },
        # App bearer token (in-cluster/CLI fallback caller auth).
        # NOTE(review): this is read from key "claude-breakglass" itself, not
        # from beneath "claude-breakglass/". A Vault deny rule written as
        # secret/claude-breakglass/* would presumably not match this exact
        # path — confirm the shared claude-agent policy cannot read it.
        {
          secretKey = "API_BEARER_TOKEN"
          remoteRef = {
            key      = "claude-breakglass"
            property = "api_bearer_token"
          }
        },
      ]
    }
  }

  # The manifest names the namespace via local.namespace (a plain string), so
  # the ordering against the namespace resource has to be declared explicitly.
  depends_on = [kubernetes_namespace.breakglass]
}
# --- Deployment ---
resource "kubernetes_deployment" "breakglass" {
  # Single-replica Deployment that runs the breakglass entrypoint from the
  # shared claude-agent-service image. Secrets arrive via the two
  # ExternalSecret-backed Kubernetes Secrets (breakglass-env as environment
  # variables, breakglass-ssh as a read-only file mount).
  metadata {
    name      = "claude-breakglass"
    namespace = kubernetes_namespace.breakglass.metadata[0].name
    labels    = local.labels
  }

  spec {
    replicas = 1

    # Recreate: the old pod is torn down before the new one starts, so two
    # breakglass instances never run side by side.
    strategy {
      type = "Recreate"
    }

    selector {
      match_labels = local.labels
    }

    template {
      metadata {
        labels = local.labels
      }

      spec {
        service_account_name = kubernetes_service_account.breakglass.metadata[0].name

        image_pull_secrets {
          name = "registry-credentials"
        }

        # Survive the very pressure event the breakglass exists to fix: high
        # priority (resist eviction), tolerate node pressure, and prefer NOT to
        # land on the contended GPU node1. Pull policy is Always: nodes already
        # cache the OLD claude-agent-service:latest (no breakglass entrypoint),
        # so IfNotPresent would run stale code. A registry-down-on-restart is
        # the cluster-down (cold) case, which this UI doesn't cover anyway.
        priority_class_name = "tier-0-core"

        # Still schedulable onto nodes reporting memory or disk pressure.
        toleration {
          key      = "node.kubernetes.io/memory-pressure"
          operator = "Exists"
          effect   = "NoSchedule"
        }
        toleration {
          key      = "node.kubernetes.io/disk-pressure"
          operator = "Exists"
          effect   = "NoSchedule"
        }

        # Stay bound to a not-ready/unreachable node for 300s before eviction.
        toleration {
          key                = "node.kubernetes.io/not-ready"
          operator           = "Exists"
          effect             = "NoExecute"
          toleration_seconds = 300
        }
        toleration {
          key                = "node.kubernetes.io/unreachable"
          operator           = "Exists"
          effect             = "NoExecute"
          toleration_seconds = 300
        }

        # Soft preference only: the scheduler avoids k8s-node1 when it can but
        # may still place the pod there if nothing else fits.
        affinity {
          node_affinity {
            preferred_during_scheduling_ignored_during_execution {
              weight = 100
              preference {
                match_expressions {
                  key      = "kubernetes.io/hostname"
                  operator = "NotIn"
                  values   = ["k8s-node1"]
                }
              }
            }
          }
        }

        # Pod-level default identity; the init container overrides the user.
        security_context {
          run_as_user  = 1000
          run_as_group = 1000
          fs_group     = 1000
        }

        # Seed the breakglass agent into the fresh ~/.claude emptyDir and make
        # the session dir writable by uid 1000.
        init_container {
          name              = "seed-agent"
          image             = "${local.image}:${local.image_tag}"
          image_pull_policy = "Always"

          command = ["sh", "-c", <<-EOT
            set -e
            mkdir -p /home/agent/.claude/agents /workspace/sessions
            cp /usr/share/agent-seed/breakglass.md /home/agent/.claude/agents/breakglass.md
            chown -R 1000:1000 /home/agent/.claude /workspace
          EOT
          ]

          # Runs as root so the chown above can hand the volumes to uid 1000.
          security_context {
            run_as_user = 0
          }

          volume_mount {
            name       = "claude-home"
            mount_path = "/home/agent/.claude"
          }
          volume_mount {
            name       = "sessions"
            mount_path = "/workspace"
          }

          resources {
            requests = { memory = "32Mi" }
            limits   = { memory = "64Mi" }
          }
        }

        container {
          name              = "claude-breakglass"
          image             = "${local.image}:${local.image_tag}"
          image_pull_policy = "Always"

          # Override the image's default CMD (the claude-agent-service uvicorn)
          # with the breakglass entrypoint: ssh-agent bootstrap + ssh aliases,
          # then uvicorn app.breakglass.server:app.
          command = ["/srv/docker-entrypoint-breakglass.sh"]

          port {
            container_port = 8080
          }

          # OAuth token (claude -p) + app bearer token.
          env_from {
            secret_ref {
              name = "breakglass-env"
            }
          }

          # Points at the private_key file inside the breakglass-ssh mount.
          env {
            name  = "BREAKGLASS_KEY_PATH"
            value = "/secrets/breakglass/private_key"
          }
          # Directory created and chowned by the seed-agent init container.
          env {
            name  = "BREAKGLASS_SESSIONS_DIR"
            value = "/workspace/sessions"
          }
          env {
            name  = "HOME"
            value = "/home/agent"
          }

          # Probe timeouts are set explicitly. Kubernetes defaults
          # timeoutSeconds to 1, and a /health reply slower than that on a
          # pressured node would count as a failure; repeated liveness failures
          # restart the container, which (pull policy Always) needs the
          # registry again — the opposite of what this pod is hardened for.
          liveness_probe {
            http_get {
              path = "/health"
              port = 8080
            }
            initial_delay_seconds = 10
            period_seconds        = 30
            timeout_seconds       = 5
          }
          readiness_probe {
            http_get {
              path = "/health"
              port = 8080
            }
            initial_delay_seconds = 5
            period_seconds        = 10
            timeout_seconds       = 5
          }

          volume_mount {
            name       = "claude-home"
            mount_path = "/home/agent/.claude"
          }
          volume_mount {
            name       = "sessions"
            mount_path = "/workspace"
          }
          volume_mount {
            name       = "breakglass-ssh"
            mount_path = "/secrets/breakglass"
            read_only  = true
          }

          # Memory is capped; no CPU limit is set, only a request.
          resources {
            requests = {
              cpu    = "200m"
              memory = "512Mi"
            }
            limits = {
              memory = "4Gi"
            }
          }
        }

        # Both working directories are emptyDirs: their contents do not
        # outlive the pod.
        volume {
          name = "claude-home"
          empty_dir {}
        }
        volume {
          name = "sessions"
          empty_dir {}
        }
        volume {
          name = "breakglass-ssh"
          secret {
            secret_name = "breakglass-ssh"
            # 0440 + fsGroup 1000 ⇒ readable by uid 1000; the entrypoint copies
            # to a 0600 tmpfs file before ssh-add (which rejects group-readable).
            default_mode = "0440"
          }
        }
      }
    }
  }

  lifecycle {
    ignore_changes = [spec[0].template[0].spec[0].dns_config] # KYVERNO_LIFECYCLE_V1
  }

  # Create the ExternalSecrets first so the Secrets the pod references are
  # already being synced when the Deployment is applied.
  depends_on = [
    kubernetes_manifest.external_secret_ssh,
    kubernetes_manifest.external_secret_env,
  ]
}
# --- Service ---
resource "kubernetes_service" "breakglass" {
  # Cluster-internal Service in front of the breakglass pod; the ingress
  # module in this file targets it by name on port 8080.
  metadata {
    name      = "claude-breakglass"
    namespace = kubernetes_namespace.breakglass.metadata[0].name
    labels    = local.labels
  }

  spec {
    type     = "ClusterIP"
    selector = local.labels

    # Service port 8080 forwards to the container's port 8080.
    port {
      port        = 8080
      target_port = 8080
    }
  }
}
# --- Ingress: breakglass.viktorbarzin.me ---
# auth = "required": Authentik forward-auth via the resilience proxy, which
# FALLS BACK to HTTP basic-auth when Authentik is down — the whole point, so the
# breakglass is reachable during an auth-stack outage. CrowdSec + rate-limit are
# attached by default (not excluded). The app additionally accepts the injected
# X-authentik-username header (or a bearer) as its own gate.
module "ingress" {
  source = "../../modules/kubernetes/ingress_factory"

  # Routing target: the breakglass Service on port 8080, in this stack's
  # namespace.
  name         = "breakglass"
  namespace    = kubernetes_namespace.breakglass.metadata[0].name
  service_name = kubernetes_service.breakglass.metadata[0].name
  port         = 8080

  # Exposure settings. What "required" and "proxied" mean is defined by the
  # ingress_factory module, which is not visible from this file.
  auth            = "required"
  dns_type        = "proxied"
  tls_secret_name = var.tls_secret_name

  # gethomepage.dev annotations describing this entry (name, description,
  # icon, group).
  extra_annotations = {
    "gethomepage.dev/enabled"     = "true"
    "gethomepage.dev/name"        = "devvm breakglass"
    "gethomepage.dev/description" = "Emergency recovery UI for the devvm"
    "gethomepage.dev/icon"        = "proxmox.png"
    "gethomepage.dev/group"       = "Infrastructure"
  }
}