- Set memory requests = limits across 56 stacks to prevent overcommit
- Right-sized limits based on actual pod usage (2x actual, rounded up)
- Scaled down trading-bot (replicas=0) to free memory
- Fixed OOMKilled services: forgejo, dawarich, health, meshcentral, paperless-ngx, vault auto-unseal, rybbit, whisper, openclaw, clickhouse
- Added startup+liveness probes to calibre-web
- Bumped inotify limits on nodes 2,3 (max_user_instances 128->8192)

Post node2 OOM incident (2026-03-14). Previous kubelet config had no kubeReserved/systemReserved set, allowing pods to starve the kernel.
214 lines
5.7 KiB
HCL
214 lines
5.7 KiB
HCL
# Name of the TLS secret; forwarded unchanged to the tls_secret and ingress
# modules in this stack.
variable "tls_secret_name" {
  sensitive = true
  type      = string
}
|
|
# PostgreSQL host that the db-init job connects to and that is embedded in the
# deployment's DATABASE_URL.
variable "postgresql_host" {
  type = string
}
|
|
# Password of the claude_memory PostgreSQL role. Used when the db-init job
# creates the role and when the deployment builds its DATABASE_URL.
variable "claude_memory_db_password" {
  sensitive = true
  type      = string
}
|
|
|
|
# Vault KV v2 entry "claude-memory" under the "secret" mount. This stack reads
# the keys "dbaas_root_password" (db-init job) and "api_key" (deployment).
data "vault_kv_secret_v2" "secrets" {
  name  = "claude-memory"
  mount = "secret"
}
|
|
|
|
# Namespace that holds every Kubernetes object of this stack. The tier label
# value comes from local.tiers, which is defined outside this view.
resource "kubernetes_namespace" "claude-memory" {
  metadata {
    labels = {
      tier = local.tiers.aux
    }
    name = "claude-memory"
  }
}
|
|
|
|
# Instantiates the shared setup_tls_secret module for this stack's namespace.
# NOTE(review): presumably provisions the TLS secret named below - confirm
# against the module source.
module "tls_secret" {
  source = "../../modules/kubernetes/setup_tls_secret"

  tls_secret_name = var.tls_secret_name
  namespace       = kubernetes_namespace.claude-memory.metadata[0].name
}
|
|
|
|
# Database init job
#
# One-shot job that connects to PostgreSQL as user "root" and:
#   1. creates the claude_memory login role if it does not exist,
#   2. creates the claude_memory database (owned by that role) if it does not
#      exist,
#   3. grants the role all privileges on the database.
#
# Credentials are handed to the container as environment variables instead of
# being interpolated into the script text. Interpolating them meant that a
# quote, "$", backtick or backslash in either password was re-interpreted by
# the shell or by SQL, so the role could be created with a different password
# than the one the deployment uses (or the script could fail outright).
#
# NOTE: an already existing role keeps its current password; this job only
# sets the password when it creates the role.
resource "kubernetes_job" "db_init" {
  metadata {
    name      = "claude-memory-db-init"
    namespace = kubernetes_namespace.claude-memory.metadata[0].name
  }

  spec {
    template {
      metadata {}

      spec {
        container {
          name  = "db-init"
          image = "postgres:16-alpine"

          # libpq reads PGHOST / PGUSER / PGPASSWORD, so every psql call in the
          # script connects with these settings without repeating -h / -U.
          env {
            name  = "PGHOST"
            value = var.postgresql_host
          }
          env {
            name  = "PGUSER"
            value = "root"
          }
          env {
            name  = "PGPASSWORD"
            value = data.vault_kv_secret_v2.secrets.data["dbaas_root_password"]
          }
          # Password for the claude_memory role; consumed only via psql's
          # -v variable so psql does the SQL quoting.
          env {
            name  = "ROLE_PASSWORD"
            value = var.claude_memory_db_password
          }

          command = [
            "sh", "-c",
            <<-EOT
              set -e

              # Create the role only when it is missing. The statement is fed on
              # stdin because psql does not expand :'var' inside -c commands;
              # :'role_password' is substituted as a correctly escaped SQL literal.
              if ! psql -tc "SELECT 1 FROM pg_roles WHERE rolname='claude_memory'" | grep -q 1; then
                echo "CREATE ROLE claude_memory WITH LOGIN PASSWORD :'role_password'" | psql -v ON_ERROR_STOP=1 -v role_password="$ROLE_PASSWORD"
              fi

              # Create the database only when it is missing.
              if ! psql -tc "SELECT 1 FROM pg_database WHERE datname='claude_memory'" | grep -q 1; then
                psql -c "CREATE DATABASE claude_memory OWNER claude_memory"
              fi

              psql -c "GRANT ALL PRIVILEGES ON DATABASE claude_memory TO claude_memory"
              echo "Database init complete"
            EOT
          ]
        }

        restart_policy = "Never"
      }
    }

    backoff_limit = 3
  }

  # Block the apply until the job has finished, for at most 2 minutes.
  wait_for_completion = true

  timeouts {
    create = "2m"
  }
}
|
|
|
|
# claude-memory application deployment.
#
# Created only after the db_init job has completed. Runs two replicas with a
# required pod anti-affinity on kubernetes.io/hostname, so the two pods are
# never scheduled onto the same node.
resource "kubernetes_deployment" "claude-memory" {
  depends_on = [kubernetes_job.db_init]

  metadata {
    name      = "claude-memory"
    namespace = kubernetes_namespace.claude-memory.metadata[0].name
    labels = {
      app  = "claude-memory"
      tier = local.tiers.aux
    }
  }

  spec {
    replicas = 2

    selector {
      match_labels = {
        app = "claude-memory"
      }
    }

    template {
      metadata {
        labels = {
          app = "claude-memory"
        }
      }

      spec {
        affinity {
          pod_anti_affinity {
            required_during_scheduling_ignored_during_execution {
              label_selector {
                match_labels = {
                  app = "claude-memory"
                }
              }
              topology_key = "kubernetes.io/hostname"
            }
          }
        }

        container {
          name  = "claude-memory"
          image = "viktorbarzin/claude-memory-mcp:latest"

          port {
            container_port = 8000
          }

          env {
            name = "DATABASE_URL"
            # The password is percent-encoded before it goes into the URL's
            # userinfo part; otherwise characters such as "@", "/", ":" or "#"
            # would yield a malformed or mis-parsed URL. urlencode() encodes a
            # space as "+", which is not decoded in userinfo, so it is mapped
            # to "%20" (a literal "+" is already emitted as "%2B").
            value = "postgresql://claude_memory:${replace(urlencode(var.claude_memory_db_password), "+", "%20")}@${var.postgresql_host}:5432/claude_memory"
          }
          env {
            name  = "API_KEY"
            value = data.vault_kv_secret_v2.secrets.data["api_key"]
          }

          # All three probes hit GET /health on the container port.
          # Startup: up to 30 attempts, 2s apart, before the container is
          # considered failed to start.
          startup_probe {
            http_get {
              path = "/health"
              port = 8000
            }
            failure_threshold = 30
            period_seconds    = 2
          }
          liveness_probe {
            http_get {
              path = "/health"
              port = 8000
            }
            initial_delay_seconds = 5
            period_seconds        = 30
          }
          readiness_probe {
            http_get {
              path = "/health"
              port = 8000
            }
            initial_delay_seconds = 3
            period_seconds        = 10
          }

          # Memory request equals the memory limit; no CPU limit is set.
          resources {
            requests = {
              memory = "64Mi"
              cpu    = "10m"
            }
            limits = {
              memory = "64Mi"
            }
          }
        }
      }
    }
  }

  # Terraform does not revert changes made to the container image outside of
  # this configuration.
  # NOTE(review): presumably the image is rolled by CI/CD - confirm.
  lifecycle {
    ignore_changes = [
      spec[0].template[0].spec[0].container[0].image
    ]
  }
}
|
|
|
|
# Pod disruption budget: voluntary disruptions must leave at least one pod
# labelled app=claude-memory available.
resource "kubernetes_pod_disruption_budget_v1" "claude-memory" {
  metadata {
    namespace = kubernetes_namespace.claude-memory.metadata[0].name
    name      = "claude-memory"
  }

  spec {
    selector {
      match_labels = {
        app = "claude-memory"
      }
    }

    min_available = "1"
  }
}
|
|
|
|
# Service in front of the app=claude-memory pods: exposes port 80 ("http") and
# forwards it to container port 8000.
resource "kubernetes_service" "claude-memory" {
  metadata {
    namespace = kubernetes_namespace.claude-memory.metadata[0].name
    name      = "claude-memory"
    labels = {
      app = "claude-memory"
    }
  }

  spec {
    port {
      name        = "http"
      target_port = 8000
      port        = 80
    }

    selector = {
      app = "claude-memory"
    }
  }
}
|
|
|
|
# Instantiates the shared ingress_factory module for claude-memory and passes
# gethomepage.dev/* annotations through extra_annotations.
# NOTE(review): how the module uses these inputs is not visible here - confirm
# against the module source.
module "ingress" {
  source = "../../modules/kubernetes/ingress_factory"

  name            = "claude-memory"
  namespace       = kubernetes_namespace.claude-memory.metadata[0].name
  tls_secret_name = var.tls_secret_name

  extra_annotations = {
    "gethomepage.dev/enabled"      = "true"
    "gethomepage.dev/name"         = "Claude Memory"
    "gethomepage.dev/description"  = "Shared persistent memory for Claude sessions"
    "gethomepage.dev/icon"         = "claude-ai.png"
    "gethomepage.dev/group"        = "Core Platform"
    "gethomepage.dev/pod-selector" = ""
  }
}
|