diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index c6c024c0..774dc71e 100755 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -276,7 +276,8 @@ resource "kubernetes_persistent_volume_claim" "data_encrypted" { ## Known Issues - **CrowdSec Helm upgrade times out**: `terragrunt apply` on platform stack causes CrowdSec Helm release to get stuck in `pending-upgrade`. Workaround: `helm rollback crowdsec -n crowdsec`. Root cause: likely ResourceQuota CPU at 302% preventing pods from passing readiness probes. Needs investigation. -- **OpenClaw config is writable**: OpenClaw writes to `openclaw.json` at runtime (doctor --fix, plugin auto-enable). Never use subPath ConfigMap mounts for it — use an init container to copy into a writable volume. Needs 2Gi memory + `NODE_OPTIONS=--max-old-space-size=1536`. +- **OpenClaw config is writable**: OpenClaw writes to `openclaw.json` at runtime (doctor --fix, plugin auto-enable). Never use subPath ConfigMap mounts for it — use an init container to copy into a writable volume. Needs 2Gi memory + `NODE_OPTIONS=--max-old-space-size=1536`. **`mcp.servers` baked into the ConfigMap-loaded openclaw.json gets stripped by `doctor --fix`** — register MCP servers via `openclaw mcp set <name> <json>` in the container startup command instead (CLI-written entries persist across doctor runs). Current servers wired this way: `ha`, `context7`, `playwright` (sidecar at `localhost:3000/mcp`). +- **OpenClaw memory-core indexes `/workspace/memory/`, not `/home/node/.openclaw/memory/`**: `/home/node/.openclaw/memory/main.sqlite` is the index store, NOT a content source. Files written under `/home/node/.openclaw/memory/projects/<project>/*.md` will NOT be indexed. To populate memory-core, write Markdown under `/workspace/memory/projects/<project>/` and run `openclaw memory index --force`. This is what the daily `memory-sync` CronJob in `stacks/openclaw/` does for claude-memory → OpenClaw sync. 
def _fetch_memories(api_url: str, api_key: str) -> list[dict]:
    """Fetch the raw memory records from the claude-memory REST API.

    Returns the list stored under the response's ``memories`` key, or an
    empty list when that key is absent or null.
    """
    req = urllib.request.Request(
        f"{api_url}/api/memories?limit=10000",
        headers={"Authorization": f"Bearer {api_key}"},
    )
    with urllib.request.urlopen(req, timeout=30) as resp:
        data = json.load(resp)
    return data.get("memories") or []


def _importance(mem: dict) -> float:
    """Return the memory's importance as a float, defaulting to 0.5."""
    try:
        return float(mem.get("importance", 0.5))
    except (TypeError, ValueError):
        # JSON null or a non-numeric value: fall back to the default rather
        # than aborting the whole sync for one malformed record.
        return 0.5


def _safe_stem(cat: str) -> str:
    """Map a category name to a file stem that cannot leave the output dir.

    Path separators are replaced because ``out_dir / "a/b.md"`` would create
    a nested path and ``out_dir / "/x.md"`` would discard ``out_dir``
    entirely.
    """
    return cat.replace("/", "_").replace("\\", "_").replace("\0", "_")


def _render_category(cat: str, items: list[dict], stamp: str) -> str:
    """Render one category's memories as a Markdown document."""
    lines = [
        f"# {cat.title()} memories",
        "",
        f"_Synced from claude-memory at {stamp}. {len(items)} memories._",
        "",
    ]
    for mem in items:
        # The sort key in main() already treats "id" as optional, so the
        # renderer must not raise KeyError on a record without one.
        mem_id = mem.get("id", "?")
        content = mem.get("content") or ""
        first_line = content.splitlines()[0] if content else ""
        # Use the first content line (minus Markdown heading markers) as the
        # section title; fall back to the id when that line is empty.
        title = first_line.lstrip("# ").strip()[:120] or f"#{mem_id}"
        lines.extend([
            f"## #{mem_id} — {title}",
            "",
            f"- Tags: `{mem.get('tags', '')}`",
            f"- Importance: {_importance(mem):.2f}",
            f"- Created: {mem.get('created_at', '?')}",
            f"- Updated: {mem.get('updated_at', '?')}",
            "",
            content,
            "",
            "---",
            "",
        ])
    return "\n".join(lines)


def main() -> int:
    """Sync non-sensitive claude-memory records into per-category Markdown.

    Reads MEMORY_API_URL and MEMORY_API_KEY from the environment, fetches the
    memories, drops those flagged ``is_sensitive``, and writes one Markdown
    file per category under /workspace/memory/projects/claude-memory-sync/.
    Files from earlier runs that no longer correspond to a category are
    removed, and the stale metaclaw-export.json is deleted if present.

    Returns:
        0 on success, 1 when configuration is missing or the fetch fails.
    """
    try:
        api_url = os.environ["MEMORY_API_URL"].rstrip("/")
        api_key = os.environ["MEMORY_API_KEY"]
    except KeyError as err:
        print(f"[sync] ERROR: missing environment variable {err}", file=sys.stderr)
        return 1

    try:
        raw = _fetch_memories(api_url, api_key)
    except (OSError, ValueError) as err:
        # URLError/HTTPError/timeouts are OSError subclasses; a malformed
        # JSON body raises a ValueError subclass.
        print(f"[sync] ERROR: could not fetch memories: {err}", file=sys.stderr)
        return 1

    mems = [m for m in raw if not m.get("is_sensitive", False)]
    sensitive_count = len(raw) - len(mems)

    by_cat: dict[str, list[dict]] = {}
    for m in mems:
        by_cat.setdefault(str(m.get("category") or "uncategorized"), []).append(m)

    # Write under /workspace/memory/ rather than /home/node/.openclaw/memory/:
    # the latter holds the SQLite index and is not a content source.
    out_dir = pathlib.Path("/workspace/memory/projects/claude-memory-sync")
    out_dir.mkdir(parents=True, exist_ok=True)

    stamp = time.strftime("%Y-%m-%d %H:%M UTC", time.gmtime())
    written: set[str] = set()
    for cat, items in sorted(by_cat.items()):
        items.sort(key=lambda x: x.get("id", 0))
        target = out_dir / f"{_safe_stem(cat)}.md"
        # Explicit UTF-8: the rendered text contains non-ASCII characters and
        # must not depend on the pod's locale.
        target.write_text(_render_category(cat, items, stamp), encoding="utf-8")
        written.add(target.name)

    # Drop files left by earlier runs for categories that no longer exist
    # (renamed, emptied, or now entirely sensitive); otherwise their content
    # would stay on disk under the synced tree. Skipped when nothing was
    # written, so an empty API response cannot wipe the directory.
    if written:
        for old in out_dir.glob("*.md"):
            if old.name not in written:
                old.unlink()
                print(f"[sync] removed stale {old.name}")

    # One-shot: nuke the stale 2026-02-28 export sitting next to memory-core.
    stale = pathlib.Path("/home/node/.openclaw/memory/metaclaw-export.json")
    try:
        stale.unlink()
    except FileNotFoundError:
        pass
    else:
        print("[sync] deleted stale metaclaw-export.json")

    total = sum(len(v) for v in by_cat.values())
    print(
        f"[sync] wrote {total} memories across {len(by_cat)} categories to "
        f"{out_dir} (skipped {sensitive_count} sensitive)"
    )
    return 0
"/app/extensions"] + # /app/extensions is the legacy bundled-plugins path; OpenClaw + # already loads bundled plugins natively (doctor warning). + paths = ["/home/node/.openclaw/extensions"] } } + # Note: mcp.servers is configured via `openclaw mcp set` in the main + # container startup command (see below) rather than in this ConfigMap. + # OpenClaw's `doctor --fix` (which runs on every pod start) strips + # bulk-loaded mcp blocks from openclaw.json, but preserves CLI-set + # entries. The CLI is the canonical writer. commands = { native = true nativeSkills = true @@ -493,9 +500,25 @@ resource "kubernetes_deployment" "openclaw" { container { name = "openclaw" image = "ghcr.io/openclaw/openclaw:2026.5.4" - # Doctor --fix auto-promotes the highest-tier codex model (gpt-5-pro) after - # auth-profile-based model discovery; pin gpt-5.4-mini back to default after it. - command = ["sh", "-c", "node openclaw.mjs doctor --fix 2>/dev/null; node openclaw.mjs models set openai-codex/gpt-5.4-mini 2>/dev/null; exec node openclaw.mjs gateway --allow-unconfigured --bind lan"] + # Startup sequence: + # 1. doctor --fix — repair sessions/state (also resets some config) + # 2. models set — pin gpt-5.4-mini (doctor auto-promotes to gpt-5-pro otherwise) + # 3. mcp set — register MCP servers via the CLI (the + # ConfigMap-baked mcp.servers block gets + # stripped by doctor --fix, but CLI-written + # entries persist). Values: ha URL from + # $HA_SOFIA_MCP_URL env (Vault-sourced), + # others hard-coded. + # 4. 
gateway — exec into the gateway process + command = ["sh", "-c", <<-EOC + node openclaw.mjs doctor --fix 2>/dev/null + node openclaw.mjs models set openai-codex/gpt-5.4-mini 2>/dev/null + node openclaw.mjs mcp set ha "{\"url\":\"$HA_SOFIA_MCP_URL\",\"transport\":\"streamable-http\"}" 2>/dev/null + node openclaw.mjs mcp set context7 '{"command":"npx","args":["-y","@upstash/context7-mcp"]}' 2>/dev/null + node openclaw.mjs mcp set playwright '{"url":"http://localhost:3000/mcp","transport":"streamable-http"}' 2>/dev/null + exec node openclaw.mjs gateway --allow-unconfigured --bind lan + EOC + ] port { container_port = 18789 } @@ -547,6 +570,12 @@ resource "kubernetes_deployment" "openclaw" { name = "HOME_ASSISTANT_SOFIA_TOKEN" value = local.skill_secrets["home_assistant_sofia_token"] } + # MCP URL for ha-mcp add-on on ha-sofia (secret-path auth). + # Consumed in the startup command by `openclaw mcp set ha ...`. + env { + name = "HA_SOFIA_MCP_URL" + value = data.vault_kv_secret_v2.secrets.data["ha_sofia_mcp_url"] + } # Skill secrets - Uptime Kuma env { name = "UPTIME_KUMA_PASSWORD" @@ -1168,6 +1197,117 @@ resource "kubernetes_cron_job_v1" "task_processor" { } } +# --- CronJob: claude-memory → memory-core sync (daily) --- +# Pulls all (non-sensitive) memories from claude-memory's REST API and +# writes them into memory-core's QMD-backed tree at +# /workspace/memory/projects/claude-memory-sync/. Then runs +# `openclaw memory index --force` to rebuild the search index so the +# OpenClaw agent can `memory_search` over the shared knowledge. +# +# Note: the central claude-memory MCP transport (/mcp/mcp) is broken +# on the deployed image (beads code-z1so) — this REST sync is the +# workaround. Once that's fixed we can also wire claude_memory as a +# native MCP server via `openclaw mcp set` in the startup command above. 
# ConfigMap holding the sync script. The memory-sync CronJob mounts it
# read-only at /scripts and streams it into the openclaw pod over stdin.
resource "kubernetes_config_map" "memory_sync_script" {
  metadata {
    name      = "memory-sync-script"
    namespace = kubernetes_namespace.openclaw.metadata[0].name
  }
  data = {
    "memory-sync.py" = file("${path.module}/files/memory-sync.py")
  }
}

# Daily (03:00) job: runs memory-sync.py inside the openclaw container via
# `kubectl exec`, then forces a memory-core reindex in the same container.
resource "kubernetes_cron_job_v1" "memory_sync" {
  metadata {
    name      = "memory-sync"
    namespace = kubernetes_namespace.openclaw.metadata[0].name
    labels = {
      app  = "memory-sync"
      tier = local.tiers.aux
    }
  }
  spec {
    schedule                      = "0 3 * * *"
    concurrency_policy            = "Forbid"
    failed_jobs_history_limit     = 3
    successful_jobs_history_limit = 3

    job_template {
      metadata {
        labels = {
          app = "memory-sync"
        }
      }
      spec {
        active_deadline_seconds    = 600
        backoff_limit              = 0
        ttl_seconds_after_finished = 86400
        template {
          metadata {
            labels = {
              app = "memory-sync"
            }
          }
          spec {
            # Reuses the SA created for the (decommissioned) cluster
            # healthcheck job — already has pods + pods/exec in this ns.
            service_account_name = kubernetes_service_account.healthcheck.metadata[0].name
            restart_policy       = "Never"

            container {
              name = "memory-sync"
              # NOTE(review): `latest` is unpinned, so the kubectl version can
              # change between runs — consider pinning a tag or digest.
              image = "bitnami/kubectl:latest"
              command = ["bash", "-c", <<-EOF
                set -euo pipefail
                # jsonpath [*] prints an empty string and exits 0 when no pod
                # matches, so the explicit check below is actually reached
                # (indexing [0] on an empty list makes kubectl fail, which
                # aborts the script under set -e before the message prints).
                POD=$(kubectl get pods -n openclaw -l app=openclaw --field-selector=status.phase=Running -o jsonpath='{.items[*].metadata.name}' | cut -d' ' -f1)
                if [ -z "$POD" ]; then
                  echo "ERROR: no running openclaw pod" >&2
                  exit 1
                fi
                echo "syncing into pod $POD ..."
                kubectl exec -n openclaw "$POD" -c openclaw -i -- python3 -u - < /scripts/memory-sync.py
                echo "reindexing memory-core ..."
                # tail runs on this side of the exec so that pipefail surfaces
                # a failed reindex instead of reporting tail's exit status.
                kubectl exec -n openclaw "$POD" -c openclaw -- sh -c 'cd /app && node openclaw.mjs memory index --force 2>&1' | tail -20
                echo "memory-sync complete."
              EOF
              ]

              volume_mount {
                name       = "script"
                mount_path = "/scripts"
                read_only  = true
              }

              resources {
                requests = {
                  cpu    = "20m"
                  memory = "64Mi"
                }
                limits = {
                  memory = "64Mi"
                }
              }
            }

            volume {
              name = "script"
              config_map {
                name = kubernetes_config_map.memory_sync_script.metadata[0].name
              }
            }
          }
        }
      }
    }
  }
  lifecycle {
    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
  }
}