conversational: trim per-turn context to cut brain TTFT ~1.3s

The no-tools conversational agent was dragging the full project context (this repo's CLAUDE.md, the MCP server configs, local settings) plus the dynamic system-prompt sections into every voice turn — ~45k input tokens -> ~3.4s time-to-first-token (measured against the live pod, 2026-06-21). Add --setting-sources user + --exclude-dynamic-system-prompt-sections to both the gateway (json) and realtime (stream-json) conversational argvs: context drops to ~23k and TTFT to ~2.1s (~1.3s/turn faster) with no change to the reply. Helps the portal-assistant v1 gateway AND the v2 realtime agent (both run the same turn). The /execute agent path is untouched. Investigation ruled out the assumed culprits: CLI startup is only ~0.5s, and a warm prompt cache does NOT lower TTFT (turn 2 read all 45k from cache yet TTFT was unchanged) — the cost was the context size, not the spawn.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-21 18:00:21 +00:00 · 2026-06-21 18:00:21 +00:00 · eccf0dd407
commit eccf0dd407
parent a29bffdda3
2 changed files with 22 additions and 0 deletions
--- a/app/conversational.py
+++ b/app/conversational.py
@@ -16,6 +16,19 @@ CONVERSATIONAL_TIMEOUT_SECONDS = int(
    os.environ.get("CONVERSATIONAL_TIMEOUT_SECONDS", "120")
 )

+# Latency: the conversational agent is no-tools (ADR-0002), so the CLI's default
+# project context — this repo's CLAUDE.md, the MCP server configs, local settings
+# — plus the dynamic system-prompt sections are pure overhead on a voice turn.
+# Measured 2026-06-21: the default load is ~45k input tokens/turn -> ~3.4s TTFT;
+# restricting settings to `user` and excluding the dynamic sections more than
+# halves the context (~23k) and cuts TTFT to ~2.1s (~1.3s/turn faster) with no
+# change to the reply. Applies to BOTH the gateway (json) and realtime (stream)
+# paths, since both run the same no-tools conversational turn.
+_LEAN_CONTEXT_FLAGS = [
+    "--setting-sources", "user",
+    "--exclude-dynamic-system-prompt-sections",
+]
+
 # Session ids the Claude CLI has already opened in THIS process, so a follow-up
 # turn resumes instead of re-opening. In-memory + single-replica: a pod restart
 # clears this AND the CLI's emptyDir session state together, so they stay in sync.
@@ -42,6 +55,7 @@ def conversational_argv(
        "--agent", CONVERSATIONAL_AGENT,
        "--output-format", "json",
        "--model", model,
+        *_LEAN_CONTEXT_FLAGS,
    ]
    argv += ["--resume", session_id] if resume else ["--session-id", session_id]
    argv.append(message)
@@ -123,6 +137,7 @@ def stream_argv(prompt: str, model: str) -> list[str]:
        "--output-format", "stream-json",
        "--include-partial-messages",
        "--verbose",
+        *_LEAN_CONTEXT_FLAGS,
        prompt,
    ]

--- a/tests/test_conversational.py
+++ b/tests/test_conversational.py
@@ -30,6 +30,10 @@ def test_conversational_argv_new_session():
    assert "--dangerously-skip-permissions" not in argv
    assert argv[argv.index("--model") + 1] == "sonnet"
    assert argv[argv.index("--output-format") + 1] == "json"
+    # latency: trims project CLAUDE.md/MCP + dynamic system-prompt sections off
+    # the no-tools voice turn (~45k -> ~23k input tokens, ~1.3s faster TTFT)
+    assert argv[argv.index("--setting-sources") + 1] == "user"
+    assert "--exclude-dynamic-system-prompt-sections" in argv
    assert argv[-1] == "Hi there"


@@ -189,6 +193,9 @@ def test_stream_argv_uses_stream_json_and_is_stateless():
    assert "--include-partial-messages" in argv
    assert "--verbose" in argv
    assert "--model" in argv and "sonnet" in argv
+    # latency: same lean-context trim as the gateway path
+    assert argv[argv.index("--setting-sources") + 1] == "user"
+    assert "--exclude-dynamic-system-prompt-sections" in argv
    assert argv[-1] == "hello"
    # stateless + no tools
    assert "--resume" not in argv and "--session-id" not in argv