claude-memory-mcp/src/claude_memory/sync.py
Viktor Barzin 5151bbe0d5
Some checks failed
Build and Push / lint-and-test (push) Has been cancelled
Build and Push / build (push) Has been cancelled
Build and Push / deploy (push) Has been cancelled
Build and Push / notify-failure (push) Has been cancelled
fix(mcp): serialize local SQLite writes to end "database is locked" under concurrent stores
Under heavy concurrent memory_store (many subagents/sessions writing close
together) the local SQLite layer raced on the single SQLite writer and surfaced
sqlite3.OperationalError: database is locked, which made memory tools slow and
eventually dropped whole sessions. Two root causes:

  - The MCP server (mcp_server.py) and the background SyncEngine (sync.py) each
    opened a SEPARATE connection to the same SQLite file. WAL allows one writer;
    when the sync writer held the lock across a resync, a concurrent store blew
    past busy_timeout and raised "database is locked".
  - mcp_server's connection was opened WITHOUT check_same_thread=False, so the
    moment two requests were handled on different threads every local store/recall
    raised ProgrammingError "SQLite objects created in a thread...".

Fix: a single process-wide serialized writer.

  - New LocalStore (local_store.py) owns ONE connection (check_same_thread=False)
    guarded by ONE re-entrant lock, keeps WAL, and wraps writes in
    transaction()/write() with bounded exponential-backoff retry on the rare
    residual lock (e.g. another OS process) instead of failing the call.
  - MemoryServer builds the LocalStore and SHARES it with the SyncEngine, so the
    sync writer no longer opens a second connection — the two-connections race is
    eliminated. All server reads/writes go through the shared lock; stores stay
    snappy (enqueue-local + async sync).
  - Bound the one genuinely slow path (remote semantic memory_recall) with an
    explicit RECALL_TIMEOUT and, on timeout/unreachable backend, return a clear
    "working / retry" signal instead of hanging silently or crashing. When a
    client supplies _meta.progressToken, emit one notifications/progress so the
    call shows life.

Ships to users via the plugin (mcp/memory-mcp.json runs src/.../mcp_server.py);
no server-side/API change needed. TDD: added concurrency tests (many threads +
sync writer on one file) and recall progress/bounding tests; full gate green
(ruff + mypy strict + 185 pytest).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-19 06:06:09 +00:00

600 lines
24 KiB
Python

"""Background sync between local SQLite cache and remote API.
Uses only stdlib — no pip install required.
"""
import json
import logging
import threading
import urllib.error
import urllib.parse
import urllib.request
from typing import Any
from datetime import datetime, timezone
from claude_memory.local_store import LocalStore
logger = logging.getLogger(__name__)
# Retry budget per queued op: once a pending op's retry_count reaches this value,
# the next push pass deletes it from pending_ops instead of sending it again.
MAX_OP_RETRIES = 5
# Every Nth background cycle runs a full resync instead of an incremental
# push/pull (~10 min at the default 60s sync interval).
FULL_RESYNC_EVERY = 10
class SyncEngine:
    """Keeps the local SQLite cache and the remote memory API in step."""

    def __init__(
        self,
        db_path: str,
        api_base_url: str,
        api_key: str,
        sync_interval: int = 60,
        store: "LocalStore | None" = None,
    ):
        """Configure the engine and make sure the sync tables exist.

        Args:
            db_path: Path of the SQLite file; used to open a store when none is given.
            api_base_url: Base URL of the remote API; trailing slashes are removed.
            api_key: Bearer token sent with API requests.
            sync_interval: Seconds the background loop waits between cycles.
            store: Existing LocalStore to share. When omitted, the engine opens
                its own store and closes it again in ``stop()``.
        """
        # Connection settings.
        self.db_path = db_path
        self.api_base_url = api_base_url.rstrip("/")
        self.api_key = api_key
        self.sync_interval = sync_interval

        # Background-thread state and health flags.
        self._stop_event = threading.Event()
        self._thread: threading.Thread | None = None
        self._last_sync_success = False
        self._auth_failed = False

        # Reuse the caller's LocalStore when one is supplied so that this engine
        # does not open a second connection to the same file; otherwise (e.g. in
        # tests) open a private one. Either way every SQLite access in this class
        # goes through the store's connection while holding the store's lock.
        self._store = LocalStore.open(db_path) if store is None else store
        self._owns_store = store is None
        self._conn = self._store.conn
        self._lock = self._store.lock

        self._init_sync_tables()
def _init_sync_tables(self) -> None:
"""Create the sync bookkeeping tables and apply column migrations.

While holding the shared lock: creates ``pending_ops`` and ``sync_meta`` when
absent, adds ``memories.server_id`` and ``pending_ops.retry_count`` when the
PRAGMA column listing shows them missing, then commits.
"""
with self._lock:
self._conn.executescript("""
CREATE TABLE IF NOT EXISTS pending_ops (
id INTEGER PRIMARY KEY AUTOINCREMENT,
op_type TEXT NOT NULL,
payload TEXT NOT NULL,
created_at TEXT NOT NULL,
retry_count INTEGER DEFAULT 0
);
CREATE TABLE IF NOT EXISTS sync_meta (
key TEXT PRIMARY KEY,
value TEXT NOT NULL
);
""")
# Add server_id column to memories if missing.
# NOTE(review): the memories table itself is not created here — presumably the
# LocalStore creates it; confirm. row["name"] also presumes the connection
# returns name-addressable rows (e.g. sqlite3.Row) — confirm in LocalStore.
cursor = self._conn.execute("PRAGMA table_info(memories)")
columns = {row["name"] for row in cursor.fetchall()}
if "server_id" not in columns:
self._conn.execute("ALTER TABLE memories ADD COLUMN server_id INTEGER")
# IF NOT EXISTS makes this a no-op when the index is already present.
self._conn.execute(
"CREATE UNIQUE INDEX IF NOT EXISTS idx_memories_server_id ON memories(server_id)"
)
# Add retry_count column to pending_ops if missing (migration; presumably for
# databases whose pending_ops table predates the column).
cursor = self._conn.execute("PRAGMA table_info(pending_ops)")
po_columns = {row["name"] for row in cursor.fetchall()}
if "retry_count" not in po_columns:
self._conn.execute("ALTER TABLE pending_ops ADD COLUMN retry_count INTEGER DEFAULT 0")
self._conn.commit()
@property
def last_sync_ts(self) -> str | None:
with self._lock:
cursor = self._conn.execute(
"SELECT value FROM sync_meta WHERE key = 'last_sync_ts'"
)
row = cursor.fetchone()
return row["value"] if row else None
@last_sync_ts.setter
def last_sync_ts(self, value: str) -> None:
with self._lock:
self._conn.execute(
"INSERT OR REPLACE INTO sync_meta (key, value) VALUES ('last_sync_ts', ?)",
(value,),
)
self._conn.commit()
@property
def api_available(self) -> bool:
return self._last_sync_success
def start(self) -> None:
"""Start background sync thread. Runs a full resync on startup."""
# Full sync on startup (blocking, before background thread)
try:
self._full_resync()
self._last_sync_success = True
self._auth_failed = False
except Exception as e:
logger.warning("Startup full sync failed: %s", e)
self._thread = threading.Thread(target=self._sync_loop, daemon=True)
self._thread.start()
def stop(self) -> None:
"""Signal background thread to stop and wait."""
self._stop_event.set()
if self._thread and self._thread.is_alive():
self._thread.join(timeout=5)
# Only close the connection if we opened it; when sharing the server's
# LocalStore, the server owns the connection lifecycle.
if self._owns_store:
self._store.close()
def _sync_loop(self) -> None:
"""Periodic sync loop running in background thread."""
cycle = 0
while not self._stop_event.is_set():
self._stop_event.wait(self.sync_interval)
if self._stop_event.is_set():
break
cycle += 1
try:
# If auth previously failed, try a lightweight check first
if self._auth_failed:
if not self._check_auth():
continue # Still failing, skip this cycle
if cycle % FULL_RESYNC_EVERY == 0:
self._full_resync()
else:
self._sync_once()
self._last_sync_success = True
except Exception as e:
logger.warning("Sync cycle failed: %s", e)
self._last_sync_success = False
def _check_auth(self) -> bool:
"""Lightweight auth check. Returns True if auth is OK."""
try:
self._api_request("GET", "/api/auth-check")
self._auth_failed = False
logger.info("Auth check passed — resuming sync")
return True
except urllib.error.HTTPError as e:
if e.code in (401, 403):
logger.warning(
"Auth still failing (HTTP %d) — API key mismatch. "
"Update MEMORY_API_KEY in ~/.claude.json", e.code
)
return False
# Non-auth error (e.g. 500) — try the auth-check endpoint might not exist,
# fall back to /health
pass
except Exception:
pass
# Fallback: try /health (unauthenticated)
try:
url = f"{self.api_base_url}/health"
req = urllib.request.Request(url, method="GET")
with urllib.request.urlopen(req, timeout=5):
pass
# Server is reachable but auth-check failed — auth is still broken
return False
except Exception:
# Server unreachable — not an auth problem
return False
def _sync_once(self) -> None:
"""Push pending ops, then pull remote changes. Both run independently."""
push_ok = self._push_pending_ops()
pull_ok = self._pull_changes()
if not push_ok and not pull_ok:
raise RuntimeError("Both push and pull failed")
def _full_resync(self) -> None:
"""Full cache replacement from server — handles drift, deletes, schema changes."""
# Step 1: Push orphaned local-only records (deduplicated)
self._push_orphans()
# Step 2: Pull everything from server (no since filter = non-deleted only)
result = self._api_request("GET", "/api/memories/sync")
memories = result.get("memories", [])
server_time = result.get("server_time")
server_ids = {m["id"] for m in memories}
with self._lock:
# Delete local records whose server_id no longer exists on server
local_rows = self._conn.execute(
"SELECT id, server_id FROM memories WHERE server_id IS NOT NULL"
).fetchall()
for row in local_rows:
if row["server_id"] not in server_ids:
self._conn.execute("DELETE FROM memories WHERE id = ?", (row["id"],))
# Delete remaining orphans (already pushed or duplicates)
self._conn.execute("DELETE FROM memories WHERE server_id IS NULL")
# Upsert all server records
for mem in memories:
server_id = mem["id"]
existing = self._conn.execute(
"SELECT id FROM memories WHERE server_id = ?", (server_id,)
).fetchone()
if existing:
self._conn.execute(
"""UPDATE memories SET content=?, category=?, tags=?,
expanded_keywords=?, importance=?, is_sensitive=?,
updated_at=? WHERE server_id=?""",
(
mem["content"], mem["category"], mem.get("tags", ""),
mem.get("expanded_keywords", ""), mem["importance"],
1 if mem.get("is_sensitive") else 0,
mem.get("updated_at", ""), server_id,
),
)
else:
self._conn.execute(
"""INSERT INTO memories (content, category, tags, expanded_keywords,
importance, is_sensitive, created_at, updated_at, server_id)
VALUES (?,?,?,?,?,?,?,?,?)""",
(
mem["content"], mem["category"], mem.get("tags", ""),
mem.get("expanded_keywords", ""), mem["importance"],
1 if mem.get("is_sensitive") else 0,
mem.get("created_at", ""), mem.get("updated_at", ""), server_id,
),
)
self._conn.commit()
if server_time:
self.last_sync_ts = server_time
def _push_orphans(self) -> None:
"""Push local-only records to server, skipping content duplicates."""
with self._lock:
orphans = self._conn.execute(
"SELECT id, content, category, tags, expanded_keywords, importance "
"FROM memories WHERE server_id IS NULL"
).fetchall()
if not orphans:
return
# Get all server content for dedup comparison
result = self._api_request("GET", "/api/memories/sync")
server_contents = {m["content"] for m in result.get("memories", [])}
for orphan in orphans:
if orphan["content"] in server_contents:
continue # Skip duplicate
try:
resp = self._api_request("POST", "/api/memories", {
"content": orphan["content"],
"category": orphan["category"],
"tags": orphan["tags"],
"expanded_keywords": orphan["expanded_keywords"],
"importance": orphan["importance"],
})
server_id = resp.get("id")
if server_id:
with self._lock:
self._conn.execute(
"UPDATE memories SET server_id=? WHERE id=?",
(server_id, orphan["id"]),
)
self._conn.commit()
except Exception:
pass # Will be cleaned up by the full resync delete step
def _api_request(self, method: str, path: str, body: dict[str, Any] | None = None) -> dict[str, Any]:
"""Make an HTTP request to the memory API."""
url = f"{self.api_base_url}{path}"
data = json.dumps(body).encode() if body else None
req = urllib.request.Request(
url,
data=data,
method=method,
headers={
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json",
},
)
try:
with urllib.request.urlopen(req, timeout=15) as resp:
result: dict[str, Any] = json.loads(resp.read().decode())
return result
except urllib.error.HTTPError as e:
if e.code in (401, 403):
self._auth_failed = True
logger.warning(
"Auth failed (HTTP %d) — API key may have rotated. "
"Update MEMORY_API_KEY in ~/.claude.json", e.code
)
raise
def _push_pending_ops(self) -> bool:
"""Push queued operations to the API server. Returns True on success."""
with self._lock:
cursor = self._conn.execute(
"SELECT id, op_type, payload, retry_count FROM pending_ops ORDER BY id"
)
ops = cursor.fetchall()
if not ops:
return True
all_ok = True
for op in ops:
op_id = op["id"]
op_type = op["op_type"]
payload = json.loads(op["payload"])
retry_count = op["retry_count"] or 0
# Skip ops that have exceeded retry limit
if retry_count >= MAX_OP_RETRIES:
logger.warning(
"Skipping op %d (%s) after %d retries — removing from queue",
op_id, op_type, retry_count,
)
with self._lock:
self._conn.execute("DELETE FROM pending_ops WHERE id = ?", (op_id,))
self._conn.commit()
continue
try:
if op_type == "store":
result = self._api_request("POST", "/api/memories", payload)
server_id = result.get("id")
if server_id and payload.get("local_id"):
with self._lock:
self._conn.execute(
"UPDATE memories SET server_id = ? WHERE id = ?",
(server_id, payload["local_id"]),
)
self._conn.commit()
elif op_type == "delete":
server_id = payload.get("server_id")
if server_id:
try:
self._api_request("DELETE", f"/api/memories/{server_id}")
except urllib.error.HTTPError as e:
if e.code == 404:
pass # Already deleted on server
else:
raise
# Remove from pending queue on success
with self._lock:
self._conn.execute("DELETE FROM pending_ops WHERE id = ?", (op_id,))
self._conn.commit()
except urllib.error.HTTPError as e:
if e.code in (401, 403):
self._auth_failed = True
logger.warning("Auth failed (HTTP %d) — aborting push", e.code)
return False # Abort entire push — no point retrying with bad key
# Increment retry count for non-auth errors
with self._lock:
self._conn.execute(
"UPDATE pending_ops SET retry_count = retry_count + 1 WHERE id = ?",
(op_id,),
)
self._conn.commit()
logger.warning("Failed to push op %d (%s): HTTP %d", op_id, op_type, e.code)
all_ok = False
except Exception as e:
with self._lock:
self._conn.execute(
"UPDATE pending_ops SET retry_count = retry_count + 1 WHERE id = ?",
(op_id,),
)
self._conn.commit()
logger.warning("Failed to push op %d (%s): %s", op_id, op_type, e)
all_ok = False
return all_ok
def _pull_changes(self) -> bool:
"""Pull changes from server since last sync. Returns True on success."""
try:
params = ""
ts = self.last_sync_ts
if ts:
params = f"?since={urllib.parse.quote(ts, safe='')}"
result = self._api_request("GET", f"/api/memories/sync{params}")
memories = result.get("memories", [])
server_time = result.get("server_time")
with self._lock:
for mem in memories:
server_id = mem["id"]
deleted_at = mem.get("deleted_at")
if deleted_at:
# Remove from local cache
self._conn.execute(
"DELETE FROM memories WHERE server_id = ?", (server_id,)
)
else:
# Upsert by server_id (server wins)
existing = self._conn.execute(
"SELECT id FROM memories WHERE server_id = ?", (server_id,)
).fetchone()
if existing:
self._conn.execute(
"""UPDATE memories SET content = ?, category = ?, tags = ?,
expanded_keywords = ?, importance = ?, is_sensitive = ?,
updated_at = ? WHERE server_id = ?""",
(
mem["content"],
mem["category"],
mem.get("tags", ""),
mem.get("expanded_keywords", ""),
mem["importance"],
1 if mem.get("is_sensitive") else 0,
mem.get("updated_at", datetime.now(timezone.utc).isoformat()),
server_id,
),
)
else:
self._conn.execute(
"""INSERT INTO memories
(content, category, tags, expanded_keywords, importance,
is_sensitive, created_at, updated_at, server_id)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
(
mem["content"],
mem["category"],
mem.get("tags", ""),
mem.get("expanded_keywords", ""),
mem["importance"],
1 if mem.get("is_sensitive") else 0,
mem.get("created_at", datetime.now(timezone.utc).isoformat()),
mem.get("updated_at", datetime.now(timezone.utc).isoformat()),
server_id,
),
)
self._conn.commit()
if server_time:
self.last_sync_ts = server_time
return True
except Exception as e:
logger.warning("Pull changes failed: %s", e)
return False
def enqueue_store(
self,
local_id: int,
content: str,
category: str,
tags: str,
expanded_keywords: str,
importance: float,
force_sensitive: bool = False,
) -> None:
"""Queue a store operation for later sync."""
payload = {
"local_id": local_id,
"content": content,
"category": category,
"tags": tags,
"expanded_keywords": expanded_keywords,
"importance": importance,
"force_sensitive": force_sensitive,
}
now = datetime.now(timezone.utc).isoformat()
with self._lock:
self._conn.execute(
"INSERT INTO pending_ops (op_type, payload, created_at) VALUES (?, ?, ?)",
("store", json.dumps(payload), now),
)
self._conn.commit()
def enqueue_delete(self, server_id: int) -> None:
"""Queue a delete operation for later sync."""
payload = {"server_id": server_id}
now = datetime.now(timezone.utc).isoformat()
with self._lock:
self._conn.execute(
"INSERT INTO pending_ops (op_type, payload, created_at) VALUES (?, ?, ?)",
("delete", json.dumps(payload), now),
)
self._conn.commit()
def try_sync_store(
self,
local_id: int,
content: str,
category: str,
tags: str,
expanded_keywords: str,
importance: float,
force_sensitive: bool = False,
) -> int | None:
"""Try to sync a store immediately. Returns server_id or None if failed."""
if self._auth_failed:
self.enqueue_store(
local_id, content, category, tags, expanded_keywords, importance, force_sensitive
)
return None
try:
result = self._api_request("POST", "/api/memories", {
"content": content,
"category": category,
"tags": tags,
"expanded_keywords": expanded_keywords,
"importance": importance,
"force_sensitive": force_sensitive,
})
server_id = result.get("id")
if server_id:
with self._lock:
self._conn.execute(
"UPDATE memories SET server_id = ? WHERE id = ?",
(server_id, local_id),
)
self._conn.commit()
return server_id
except Exception:
self.enqueue_store(
local_id, content, category, tags, expanded_keywords, importance, force_sensitive
)
return None
def try_sync_delete(self, server_id: int) -> bool:
"""Try to sync a delete immediately. Returns True if successful."""
if self._auth_failed:
self.enqueue_delete(server_id)
return False
try:
self._api_request("DELETE", f"/api/memories/{server_id}")
return True
except urllib.error.HTTPError as e:
if e.code == 404:
return True # Already deleted on server — not an error
self.enqueue_delete(server_id)
return False
except Exception:
self.enqueue_delete(server_id)
return False
def get_counts(self) -> dict[str, Any]:
"""Get memory counts for diagnostics."""
with self._lock:
total = self._conn.execute("SELECT COUNT(*) as c FROM memories").fetchone()["c"]
by_cat = self._conn.execute(
"SELECT category, COUNT(*) as c FROM memories GROUP BY category ORDER BY c DESC"
).fetchall()
orphans = self._conn.execute(
"SELECT COUNT(*) as c FROM memories WHERE server_id IS NULL"
).fetchone()["c"]
pending = self._conn.execute(
"SELECT COUNT(*) as c FROM pending_ops"
).fetchone()["c"]
return {
"total": total,
"by_category": {row["category"]: row["c"] for row in by_cat},
"orphans_no_server_id": orphans,
"pending_ops": pending,
"last_sync_ts": self.last_sync_ts,
"auth_failed": self._auth_failed,
"last_sync_success": self._last_sync_success,
}