2026-06-09 08:45:33 +00:00
# Name of the TLS secret. Forwarded unchanged to the `tls_secret` and
# `ingress` modules in this file.
variable "tls_secret_name" {
  type      = string
  sensitive = true
}
# PostgreSQL server hostname. Used by the db-init job's psql calls and
# embedded in the templated DATABASE_URL of the DB-credentials ExternalSecret.
variable "postgresql_host" {
  type = string
}
# Optional override for the password given to the `claude_memory` role when
# the db-init job creates it. When left empty, the coalesce() in that job
# skips it and falls back to Vault `secret/claude-memory.db_password`.
variable "claude_memory_db_password" {
  type      = string
  sensitive = true
  default   = ""
}
# Reads the `claude-memory` entry from the `secret` KV v2 mount in Vault.
# The db-init job consumes its `dbaas_root_password` and `db_password` keys.
data "vault_kv_secret_v2" "secrets" {
  mount = "secret"
  name  = "claude-memory"
}
# Namespace that holds every claude-memory object declared in this file.
resource "kubernetes_namespace" "claude-memory" {
  metadata {
    name = "claude-memory"

    labels = {
      tier               = local.tiers.aux
      "keel.sh/enrolled" = "true"
    }
  }

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: the goldilocks-vpa-auto-mode ClusterPolicy stamps
    # this label on every namespace, so it must not be treated as drift.
    ignore_changes = [
      metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"],
    ]
  }
}
# ExternalSecret that extracts every key of the `claude-memory` entry from the
# `vault-kv` ClusterSecretStore into the `claude-memory-secrets` Secret,
# refreshed every 15 minutes. The deployment reads `api_keys` from it.
resource "kubernetes_manifest" "external_secret" {
  # The namespace is referenced by literal name inside the manifest, so the
  # ordering against the namespace resource has to be declared explicitly.
  depends_on = [kubernetes_namespace.claude-memory]

  manifest = {
    apiVersion = "external-secrets.io/v1beta1"
    kind       = "ExternalSecret"

    metadata = {
      name      = "claude-memory-secrets"
      namespace = "claude-memory"
    }

    spec = {
      refreshInterval = "15m"

      secretStoreRef = {
        kind = "ClusterSecretStore"
        name = "vault-kv"
      }

      target = {
        name = "claude-memory-secrets"
      }

      dataFrom = [
        {
          extract = {
            key = "claude-memory"
          }
        },
      ]
    }
  }
}
# DB credentials from the Vault database engine (rotated every 24h).
# NOTE(review): the deployment comments in this file mention a 7-day DB
# password rotation — confirm which interval is current.
#
# Pulls `password` from `static-creds/pg-claude-memory` in the `vault-database`
# ClusterSecretStore and renders it into the `claude-memory-db-creds` Secret
# as both a full DATABASE_URL and a bare DB_PASSWORD.
resource "kubernetes_manifest" "db_external_secret" {
  # The namespace is referenced by literal name inside the manifest, so the
  # ordering against the namespace resource has to be declared explicitly.
  depends_on = [kubernetes_namespace.claude-memory]

  manifest = {
    apiVersion = "external-secrets.io/v1beta1"
    kind       = "ExternalSecret"

    metadata = {
      name      = "claude-memory-db-creds"
      namespace = "claude-memory"
    }

    spec = {
      refreshInterval = "15m"

      secretStoreRef = {
        kind = "ClusterSecretStore"
        name = "vault-database"
      }

      # Source value: exposed to the template below as `.password`.
      data = [
        {
          secretKey = "password"
          remoteRef = {
            key      = "static-creds/pg-claude-memory"
            property = "password"
          }
        },
      ]

      target = {
        name = "claude-memory-db-creds"
        template = {
          data = {
            # `{{ .password }}` is expanded by External Secrets at sync time;
            # only the host is interpolated by Terraform.
            DATABASE_URL = "postgresql://claude_memory:{{ .password }}@${var.postgresql_host}:5432/claude_memory"
            DB_PASSWORD  = "{{ .password }}"
          }
        }
      }
    }
  }
}
# Instantiates the shared setup_tls_secret module for the claude-memory
# namespace, passing through the configured TLS secret name.
module "tls_secret" {
  source = "../../modules/kubernetes/setup_tls_secret"

  tls_secret_name = var.tls_secret_name
  namespace       = kubernetes_namespace.claude-memory.metadata[0].name
}
# Database init job.
#
# One-shot, idempotent bootstrap run as the `root` PostgreSQL user:
#   1. create the `claude_memory` login role if it does not exist,
#   2. create the `claude_memory` database (owned by that role) if missing,
#   3. grant the role all privileges on the database.
# Terraform waits for the job to complete (2 minute create timeout); the
# deployment depends on it.
#
# Secrets are handed to the container through environment variables rather
# than being spliced into the shell script: the previous form wrapped them in
# single-quoted shell / SQL literals, so a password containing `'` would break
# (or alter) the command. The role password is bound via a psql variable and
# quoted by psql itself (`:'app_password'`).
# NOTE(review): env values are still visible in the Job's pod spec, exactly as
# the inlined command was — consider a Secret + secret_key_ref if that matters.
resource "kubernetes_job" "db_init" {
  metadata {
    name      = "claude-memory-db-init"
    namespace = kubernetes_namespace.claude-memory.metadata[0].name
  }

  spec {
    template {
      metadata {}

      spec {
        container {
          name  = "db-init"
          image = "postgres:16-alpine"

          # Read natively by psql; used for every connection below.
          env {
            name  = "PGPASSWORD"
            value = data.vault_kv_secret_v2.secrets.data["dbaas_root_password"]
          }

          # Password for the new role: explicit variable first, otherwise the
          # Vault `db_password` key (coalesce skips null and empty strings).
          env {
            name  = "APP_DB_PASSWORD"
            value = coalesce(var.claude_memory_db_password, data.vault_kv_secret_v2.secrets.data["db_password"])
          }

          command = [
            "sh", "-c",
            <<-EOT
              set -e
              # -d postgres: psql defaults database name to username; root user
              # doesn't have a root-named database, so be explicit.
              # CREATE ROLE is fed on stdin because psql does not expand
              # :'variables' in -c commands; ON_ERROR_STOP makes a failed
              # statement exit non-zero so `set -e` still aborts the job.
              psql -h ${var.postgresql_host} -U root -d postgres -tc "SELECT 1 FROM pg_roles WHERE rolname='claude_memory'" | grep -q 1 || \
                printf '%s\n' "CREATE ROLE claude_memory WITH LOGIN PASSWORD :'app_password'" | psql -h ${var.postgresql_host} -U root -d postgres -v ON_ERROR_STOP=1 -v app_password="$APP_DB_PASSWORD"
              psql -h ${var.postgresql_host} -U root -d postgres -tc "SELECT 1 FROM pg_database WHERE datname='claude_memory'" | grep -q 1 || \
                psql -h ${var.postgresql_host} -U root -d postgres -c "CREATE DATABASE claude_memory OWNER claude_memory"
              psql -h ${var.postgresql_host} -U root -d postgres -c "GRANT ALL PRIVILEGES ON DATABASE claude_memory TO claude_memory"
              echo "Database init complete"
            EOT
          ]
        }

        restart_policy = "Never"
      }
    }

    backoff_limit = 3
  }

  wait_for_completion = true

  timeouts {
    create = "2m"
  }
}
# Deployment of the claude-memory MCP backend. Created only after the db-init
# job has finished. Listens on container port 8000 and takes DATABASE_URL and
# API_KEYS from the two Secrets produced by the ExternalSecrets in this file.
resource "kubernetes_deployment" "claude-memory" {
  depends_on = [kubernetes_job.db_init]

  metadata {
    name      = "claude-memory"
    namespace = kubernetes_namespace.claude-memory.metadata[0].name

    labels = {
      app  = "claude-memory"
      tier = local.tiers.aux
    }

    annotations = {
      "reloader.stakater.com/auto" = "true"
    }
  }

  spec {
    # 2 replicas (stateless FastAPI over shared CNPG PG) so node drains, Keel
    # bumps, Reloader restarts (7d DB rotation), CI deploys, and descheduler
    # evictions become zero-downtime rolling events instead of hard outages —
    # the latter were surfacing as recurring memory-MCP "disconnects".
    replicas = 2

    selector {
      match_labels = {
        app = "claude-memory"
      }
    }

    template {
      metadata {
        labels = {
          app = "claude-memory"
        }

        annotations = {
          "dependency.kyverno.io/wait-for" = "postgresql.dbaas:5432"
          # Skip descheduler eviction — it bounced this pod every ~5min
          # (LowNodeUtilization). The PDB below keeps drains/Keel/CI safe at
          # 2 replicas; this just stops the needless churn of the MCP backend.
          "descheduler.alpha.kubernetes.io/evict" = "false"
        }
      }

      spec {
        # Hard anti-affinity: two claude-memory pods never share a node.
        affinity {
          pod_anti_affinity {
            required_during_scheduling_ignored_during_execution {
              topology_key = "kubernetes.io/hostname"

              label_selector {
                match_labels = {
                  app = "claude-memory"
                }
              }
            }
          }
        }

        container {
          name = "claude-memory"
          # Phase 3 cutover 2026-05-07 — moved off DockerHub as part of the
          # registry consolidation. Old: viktorbarzin/claude-memory-mcp:17
          # NOTE(review): the earlier wording said the move was to Forgejo,
          # but the reference below points at ghcr.io — confirm and reword.
          # The live tag is owned by CI (see ignore_changes at the bottom).
          image = "ghcr.io/viktorbarzin/claude-memory-mcp:latest"

          port {
            container_port = 8000
          }

          env {
            name = "DATABASE_URL"
            value_from {
              secret_key_ref {
                name = "claude-memory-db-creds"
                key  = "DATABASE_URL"
              }
            }
          }

          env {
            name = "API_KEYS"
            value_from {
              secret_key_ref {
                name = "claude-memory-secrets"
                key  = "api_keys"
              }
            }
          }

          # All three probes hit GET /health on port 8000.
          # Startup: up to 30 attempts, 2s apart.
          startup_probe {
            period_seconds    = 2
            failure_threshold = 30

            http_get {
              path = "/health"
              port = 8000
            }
          }

          liveness_probe {
            initial_delay_seconds = 5
            period_seconds        = 30

            http_get {
              path = "/health"
              port = 8000
            }
          }

          readiness_probe {
            initial_delay_seconds = 3
            period_seconds        = 10

            http_get {
              path = "/health"
              port = 8000
            }
          }

          # Memory request equals the limit; no CPU limit is set.
          resources {
            requests = {
              cpu    = "10m"
              memory = "128Mi"
            }
            limits = {
              memory = "128Mi"
            }
          }
        }
      }
    }
  }

  lifecycle {
    # DRIFT_WORKAROUND: CI pipeline owns image tag (kubectl set image from Woodpecker/GHA). Reviewed 2026-04-18.
    ignore_changes = [
      spec[0].template[0].spec[0].container[0].image,
      spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
      metadata[0].annotations["keel.sh/policy"],
      metadata[0].annotations["keel.sh/trigger"],
      metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2
      metadata[0].annotations["keel.sh/match-tag"],
      metadata[0].annotations["kubernetes.io/change-cause"],
      metadata[0].annotations["deployment.kubernetes.io/revision"],
      spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1
    ]
  }
}
claude-memory: HA (replicas 2 + PDB) to stop recurring MCP disconnects
The claude-memory MCP backend ran as a single replica with no PDB, so every
voluntary disruption took it to zero for ~30-90s — which surfaced as the
memory MCP "keeps getting disconnected" problem. Disruption sources hitting
the lone pod: the descheduler (every-5-min CronJob, LowNodeUtilization —
caught evicting it live), Keel image bumps, Reloader restarts on the 7-day
DB-password rotation, node drains, and CI deploys.
The local stdio MCP subprocess itself was proven healthy (fast non-blocking
startup, stderr suppressed, graceful degradation), so the fault was purely
backend availability, not the MCP plumbing.
Fix: run 2 replicas (the backend is stateless FastAPI over shared CNPG
Postgres and already has hostname anti-affinity) + restore the PDB at
minAvailable=1 (safe now — the drain deadlock that justified removing it
only existed at 1 replica) + descheduler evict=false to stop the needless
5-min churn. All five disruption sources become zero-downtime rolling events.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-18 09:13:36 +00:00
# PDB restored alongside replicas=2 (2026-06-18). It had been removed because
# a minAvailable=1 PDB on a single replica deadlocks node drains; with two
# replicas that no longer applies: one pod can be drained/evicted while the
# other keeps serving, so voluntary disruptions never take the MCP backend
# to zero.
resource "kubernetes_pod_disruption_budget_v1" "claude-memory" {
  metadata {
    name      = "claude-memory"
    namespace = kubernetes_namespace.claude-memory.metadata[0].name
  }

  spec {
    # Matches the pods of the claude-memory deployment by their `app` label.
    selector {
      match_labels = {
        app = "claude-memory"
      }
    }

    min_available = "1"
  }
}
2026-06-09 08:45:33 +00:00
# Service in front of the claude-memory pods: exposes port 80 (named "http")
# and forwards to container port 8000 on pods labelled app=claude-memory.
resource "kubernetes_service" "claude-memory" {
  metadata {
    name      = "claude-memory"
    namespace = kubernetes_namespace.claude-memory.metadata[0].name

    labels = {
      app = "claude-memory"
    }
  }

  spec {
    selector = {
      app = "claude-memory"
    }

    port {
      name        = "http"
      port        = 80
      target_port = 8000
    }
  }
}
# Ingress for claude-memory, built by the shared ingress_factory module, with
# gethomepage.dev annotations for the dashboard entry.
module "ingress" {
  source = "../../modules/kubernetes/ingress_factory"

  name            = "claude-memory"
  namespace       = kubernetes_namespace.claude-memory.metadata[0].name
  tls_secret_name = var.tls_secret_name
  dns_type        = "proxied"

  # auth = "none": MCP server — called by Claude Code (and other tools/agents)
  # via app-layer bearer-token auth; forward-auth would break programmatic
  # clients.
  auth = "none"

  extra_annotations = {
    "gethomepage.dev/enabled"      = "true"
    "gethomepage.dev/name"         = "Claude Memory"
    "gethomepage.dev/description"  = "Shared persistent memory for Claude sessions"
    "gethomepage.dev/icon"         = "claude-ai.png"
    "gethomepage.dev/group"        = "Core Platform"
    "gethomepage.dev/pod-selector" = ""
  }
}