resilient memory sync: decouple push/pull, startup full resync, auth failure handling

- Decouple push and pull in _sync_once() so pull always runs even if push fails
- Add startup full resync to catch drift from other agents and schema changes
- Add periodic full resync every ~10 minutes for continuous drift correction
- Add auth failure detection (401/403) with graceful SQLite-only degradation
- Add /api/auth-check endpoint for lightweight key validation
- Add retry cap (5 attempts) on pending ops to prevent infinite queue buildup
- Add orphan reconciliation: push local-only records with content dedup
- Add memory_count MCP tool for sync diagnostics
- Add version-based SQLite schema migration (PRAGMA user_version)
- Fix API key in ~/.claude.json to match server
- Update README with sync resilience docs, test structure, project layout
- Add 30 new tests covering all new behaviors (155 total, all passing)
This commit is contained in:
Viktor Barzin 2026-03-16 18:35:09 +00:00
parent a18b94d310
commit e47efee6b6
No known key found for this signature in database
GPG key ID: 0EB088298288D958
8 changed files with 948 additions and 134 deletions

View file

@ -59,6 +59,12 @@ async def health() -> dict[str, str]:
return {"status": "ok"}
@app.get("/api/auth-check")
async def auth_check(user: AuthUser = Depends(get_current_user)) -> dict[str, str]:
"""Validate API key without doing any real work."""
return {"status": "ok", "user_id": user.user_id}
@app.get("/api/memories/sync", response_model=SyncResponse)
async def sync_memories(
since: Optional[str] = None,

View file

@ -86,6 +86,41 @@ def _get_db_path(db_path: str | None = None) -> str:
return resolved
SCHEMA_VERSION = 2
def _migrate_sqlite(conn: sqlite3.Connection) -> None:
"""Version-based SQLite schema migrations."""
current = conn.execute("PRAGMA user_version").fetchone()[0]
if current < 1:
# Add server_id column for hybrid mode sync
cursor = conn.execute("PRAGMA table_info(memories)")
columns = {row["name"] for row in cursor.fetchall()}
if "server_id" not in columns:
conn.execute("ALTER TABLE memories ADD COLUMN server_id INTEGER")
conn.execute(
"CREATE UNIQUE INDEX IF NOT EXISTS idx_memories_server_id ON memories(server_id)"
)
if current < 2:
# Ensure pending_ops has retry_count (sync.py also handles this, but belt-and-suspenders)
conn.execute("""
CREATE TABLE IF NOT EXISTS pending_ops (
id INTEGER PRIMARY KEY AUTOINCREMENT,
op_type TEXT NOT NULL,
payload TEXT NOT NULL,
created_at TEXT NOT NULL,
retry_count INTEGER DEFAULT 0
)
""")
# Add retry_count if pending_ops already exists without it
cursor = conn.execute("PRAGMA table_info(pending_ops)")
po_columns = {row["name"] for row in cursor.fetchall()}
if "retry_count" not in po_columns:
conn.execute("ALTER TABLE pending_ops ADD COLUMN retry_count INTEGER DEFAULT 0")
conn.execute(f"PRAGMA user_version = {SCHEMA_VERSION}")
conn.commit()
def _init_sqlite(db_path: str | None = None) -> tuple[sqlite3.Connection, str]:
"""Initialize SQLite database."""
from pathlib import Path
@ -111,14 +146,8 @@ def _init_sqlite(db_path: str | None = None) -> tuple[sqlite3.Connection, str]:
updated_at TEXT NOT NULL
)
""")
# Add server_id column if missing (for hybrid mode sync)
cursor.execute("PRAGMA table_info(memories)")
columns = {row["name"] for row in cursor.fetchall()}
if "server_id" not in columns:
cursor.execute("ALTER TABLE memories ADD COLUMN server_id INTEGER")
cursor.execute(
"CREATE UNIQUE INDEX IF NOT EXISTS idx_memories_server_id ON memories(server_id)"
)
# Version-based schema migrations
_migrate_sqlite(conn)
cursor.execute("""
CREATE VIRTUAL TABLE IF NOT EXISTS memories_fts USING fts5(
@ -250,6 +279,14 @@ TOOLS = [
"required": ["id"],
},
},
{
"name": "memory_count",
"description": "Get memory counts by category from local cache and sync status. Useful for diagnostics.",
"inputSchema": {
"type": "object",
"properties": {},
},
},
]
@ -418,6 +455,32 @@ class MemoryServer:
return self._sqlite_secret_get(memory_id)
def memory_count(self, args: dict[str, Any]) -> str:
if self.sync_engine:
counts = self.sync_engine.get_counts()
lines = [f"Local memories: {counts['total']}"]
for cat, n in counts["by_category"].items():
lines.append(f" {cat}: {n}")
lines.append(f"Orphans (no server_id): {counts['orphans_no_server_id']}")
lines.append(f"Pending ops: {counts['pending_ops']}")
lines.append(f"Last sync: {counts['last_sync_ts'] or 'never'}")
lines.append(f"Auth failed: {counts['auth_failed']}")
lines.append(f"Last sync success: {counts['last_sync_success']}")
return "\n".join(lines)
if self.sqlite_conn:
cursor = self.sqlite_conn.cursor()
cursor.execute("SELECT COUNT(*) as c FROM memories")
total = cursor.fetchone()["c"]
cursor.execute("SELECT category, COUNT(*) as c FROM memories GROUP BY category ORDER BY c DESC")
by_cat = cursor.fetchall()
lines = [f"Local memories (SQLite-only): {total}"]
for row in by_cat:
lines.append(f" {row['category']}: {row['c']}")
return "\n".join(lines)
return "No storage available"
# ── SQLite methods ──────────────────────────────────────────────
def _sqlite_store(self, content: str, category: str, tags: str, importance: float, expanded_keywords: str, force_sensitive: bool = False) -> str:
@ -573,6 +636,7 @@ class MemoryServer:
"memory_list": self.memory_list,
"memory_delete": self.memory_delete,
"secret_get": self.secret_get,
"memory_count": self.memory_count,
}.get(tool_name)
if handler is None:
return {"content": [{"type": "text", "text": f"Unknown tool: {tool_name}"}], "isError": True}

View file

@ -16,6 +16,12 @@ from pathlib import Path
logger = logging.getLogger(__name__)
# Max retries before an individual pending op is permanently skipped
MAX_OP_RETRIES = 5
# Full resync every N sync cycles (~10 min at 60s interval)
FULL_RESYNC_EVERY = 10
class SyncEngine:
"""Background sync between local SQLite cache and remote API."""
@ -29,6 +35,7 @@ class SyncEngine:
self._stop_event = threading.Event()
self._thread: threading.Thread | None = None
self._last_sync_success = False
self._auth_failed = False
# Own connection for thread safety
Path(db_path).parent.mkdir(parents=True, exist_ok=True)
@ -48,7 +55,8 @@ class SyncEngine:
id INTEGER PRIMARY KEY AUTOINCREMENT,
op_type TEXT NOT NULL,
payload TEXT NOT NULL,
created_at TEXT NOT NULL
created_at TEXT NOT NULL,
retry_count INTEGER DEFAULT 0
);
CREATE TABLE IF NOT EXISTS sync_meta (
@ -64,6 +72,13 @@ class SyncEngine:
self._conn.execute(
"CREATE UNIQUE INDEX IF NOT EXISTS idx_memories_server_id ON memories(server_id)"
)
# Add retry_count column to pending_ops if missing (migration)
cursor = self._conn.execute("PRAGMA table_info(pending_ops)")
po_columns = {row["name"] for row in cursor.fetchall()}
if "retry_count" not in po_columns:
self._conn.execute("ALTER TABLE pending_ops ADD COLUMN retry_count INTEGER DEFAULT 0")
self._conn.commit()
@property
@ -89,7 +104,15 @@ class SyncEngine:
return self._last_sync_success
def start(self) -> None:
"""Start background sync thread (non-blocking)."""
"""Start background sync thread. Runs a full resync on startup."""
# Full sync on startup (blocking, before background thread)
try:
self._full_resync()
self._last_sync_success = True
self._auth_failed = False
except Exception as e:
logger.warning("Startup full sync failed: %s", e)
self._thread = threading.Thread(target=self._sync_loop, daemon=True)
self._thread.start()
@ -102,21 +125,162 @@ class SyncEngine:
def _sync_loop(self) -> None:
"""Periodic sync loop running in background thread."""
cycle = 0
while not self._stop_event.is_set():
self._stop_event.wait(self.sync_interval)
if self._stop_event.is_set():
break
cycle += 1
try:
self._sync_once()
# If auth previously failed, try a lightweight check first
if self._auth_failed:
if not self._check_auth():
continue # Still failing, skip this cycle
if cycle % FULL_RESYNC_EVERY == 0:
self._full_resync()
else:
self._sync_once()
self._last_sync_success = True
except Exception as e:
logger.warning("Sync cycle failed: %s", e)
self._last_sync_success = False
def _check_auth(self) -> bool:
"""Lightweight auth check. Returns True if auth is OK."""
try:
self._api_request("GET", "/api/auth-check")
self._auth_failed = False
logger.info("Auth check passed — resuming sync")
return True
except urllib.error.HTTPError as e:
if e.code in (401, 403):
logger.warning(
"Auth still failing (HTTP %d) — API key mismatch. "
"Update MEMORY_API_KEY in ~/.claude.json", e.code
)
return False
# Non-auth error (e.g. 500) — try the auth-check endpoint might not exist,
# fall back to /health
pass
except Exception:
pass
# Fallback: try /health (unauthenticated)
try:
url = f"{self.api_base_url}/health"
req = urllib.request.Request(url, method="GET")
with urllib.request.urlopen(req, timeout=5):
pass
# Server is reachable but auth-check failed — auth is still broken
return False
except Exception:
# Server unreachable — not an auth problem
return False
def _sync_once(self) -> None:
"""Push pending ops, then pull remote changes."""
self._push_pending_ops()
self._pull_changes()
"""Push pending ops, then pull remote changes. Both run independently."""
push_ok = self._push_pending_ops()
pull_ok = self._pull_changes()
if not push_ok and not pull_ok:
raise RuntimeError("Both push and pull failed")
def _full_resync(self) -> None:
"""Full cache replacement from server — handles drift, deletes, schema changes."""
# Step 1: Push orphaned local-only records (deduplicated)
self._push_orphans()
# Step 2: Pull everything from server (no since filter = non-deleted only)
result = self._api_request("GET", "/api/memories/sync")
memories = result.get("memories", [])
server_time = result.get("server_time")
server_ids = {m["id"] for m in memories}
with self._lock:
# Delete local records whose server_id no longer exists on server
local_rows = self._conn.execute(
"SELECT id, server_id FROM memories WHERE server_id IS NOT NULL"
).fetchall()
for row in local_rows:
if row["server_id"] not in server_ids:
self._conn.execute("DELETE FROM memories WHERE id = ?", (row["id"],))
# Delete remaining orphans (already pushed or duplicates)
self._conn.execute("DELETE FROM memories WHERE server_id IS NULL")
# Upsert all server records
for mem in memories:
server_id = mem["id"]
existing = self._conn.execute(
"SELECT id FROM memories WHERE server_id = ?", (server_id,)
).fetchone()
if existing:
self._conn.execute(
"""UPDATE memories SET content=?, category=?, tags=?,
expanded_keywords=?, importance=?, is_sensitive=?,
updated_at=? WHERE server_id=?""",
(
mem["content"], mem["category"], mem.get("tags", ""),
mem.get("expanded_keywords", ""), mem["importance"],
1 if mem.get("is_sensitive") else 0,
mem.get("updated_at", ""), server_id,
),
)
else:
self._conn.execute(
"""INSERT INTO memories (content, category, tags, expanded_keywords,
importance, is_sensitive, created_at, updated_at, server_id)
VALUES (?,?,?,?,?,?,?,?,?)""",
(
mem["content"], mem["category"], mem.get("tags", ""),
mem.get("expanded_keywords", ""), mem["importance"],
1 if mem.get("is_sensitive") else 0,
mem.get("created_at", ""), mem.get("updated_at", ""), server_id,
),
)
self._conn.commit()
if server_time:
self.last_sync_ts = server_time
def _push_orphans(self) -> None:
"""Push local-only records to server, skipping content duplicates."""
with self._lock:
orphans = self._conn.execute(
"SELECT id, content, category, tags, expanded_keywords, importance "
"FROM memories WHERE server_id IS NULL"
).fetchall()
if not orphans:
return
# Get all server content for dedup comparison
result = self._api_request("GET", "/api/memories/sync")
server_contents = {m["content"] for m in result.get("memories", [])}
for orphan in orphans:
if orphan["content"] in server_contents:
continue # Skip duplicate
try:
resp = self._api_request("POST", "/api/memories", {
"content": orphan["content"],
"category": orphan["category"],
"tags": orphan["tags"],
"expanded_keywords": orphan["expanded_keywords"],
"importance": orphan["importance"],
})
server_id = resp.get("id")
if server_id:
with self._lock:
self._conn.execute(
"UPDATE memories SET server_id=? WHERE id=?",
(server_id, orphan["id"]),
)
self._conn.commit()
except Exception:
pass # Will be cleaned up by the full resync delete step
def _api_request(self, method: str, path: str, body: dict[str, Any] | None = None) -> dict[str, Any]:
"""Make an HTTP request to the memory API."""
@ -131,22 +295,47 @@ class SyncEngine:
"Content-Type": "application/json",
},
)
with urllib.request.urlopen(req, timeout=15) as resp:
result: dict[str, Any] = json.loads(resp.read().decode())
return result
try:
with urllib.request.urlopen(req, timeout=15) as resp:
result: dict[str, Any] = json.loads(resp.read().decode())
return result
except urllib.error.HTTPError as e:
if e.code in (401, 403):
self._auth_failed = True
logger.warning(
"Auth failed (HTTP %d) — API key may have rotated. "
"Update MEMORY_API_KEY in ~/.claude.json", e.code
)
raise
def _push_pending_ops(self) -> None:
"""Push queued operations to the API server."""
def _push_pending_ops(self) -> bool:
"""Push queued operations to the API server. Returns True on success."""
with self._lock:
cursor = self._conn.execute(
"SELECT id, op_type, payload FROM pending_ops ORDER BY id"
"SELECT id, op_type, payload, retry_count FROM pending_ops ORDER BY id"
)
ops = cursor.fetchall()
if not ops:
return True
all_ok = True
for op in ops:
op_id = op["id"]
op_type = op["op_type"]
payload = json.loads(op["payload"])
retry_count = op["retry_count"] or 0
# Skip ops that have exceeded retry limit
if retry_count >= MAX_OP_RETRIES:
logger.warning(
"Skipping op %d (%s) after %d retries — removing from queue",
op_id, op_type, retry_count,
)
with self._lock:
self._conn.execute("DELETE FROM pending_ops WHERE id = ?", (op_id,))
self._conn.commit()
continue
try:
if op_type == "store":
@ -164,8 +353,8 @@ class SyncEngine:
if server_id:
try:
self._api_request("DELETE", f"/api/memories/{server_id}")
except (RuntimeError, urllib.error.HTTPError) as e:
if "404" in str(e):
except urllib.error.HTTPError as e:
if e.code == 404:
pass # Already deleted on server
else:
raise
@ -175,75 +364,103 @@ class SyncEngine:
self._conn.execute("DELETE FROM pending_ops WHERE id = ?", (op_id,))
self._conn.commit()
except Exception as e:
logger.warning("Failed to push op %d (%s): %s", op_id, op_type, e)
raise # Propagate to mark sync as failed
def _pull_changes(self) -> None:
"""Pull changes from server since last sync."""
params = ""
ts = self.last_sync_ts
if ts:
params = f"?since={urllib.parse.quote(ts, safe='')}"
result = self._api_request("GET", f"/api/memories/sync{params}")
memories = result.get("memories", [])
server_time = result.get("server_time")
with self._lock:
for mem in memories:
server_id = mem["id"]
deleted_at = mem.get("deleted_at")
if deleted_at:
# Remove from local cache
except urllib.error.HTTPError as e:
if e.code in (401, 403):
self._auth_failed = True
logger.warning("Auth failed (HTTP %d) — aborting push", e.code)
return False # Abort entire push — no point retrying with bad key
# Increment retry count for non-auth errors
with self._lock:
self._conn.execute(
"DELETE FROM memories WHERE server_id = ?", (server_id,)
"UPDATE pending_ops SET retry_count = retry_count + 1 WHERE id = ?",
(op_id,),
)
else:
# Upsert by server_id (server wins)
existing = self._conn.execute(
"SELECT id FROM memories WHERE server_id = ?", (server_id,)
).fetchone()
self._conn.commit()
logger.warning("Failed to push op %d (%s): HTTP %d", op_id, op_type, e.code)
all_ok = False
except Exception as e:
with self._lock:
self._conn.execute(
"UPDATE pending_ops SET retry_count = retry_count + 1 WHERE id = ?",
(op_id,),
)
self._conn.commit()
logger.warning("Failed to push op %d (%s): %s", op_id, op_type, e)
all_ok = False
if existing:
return all_ok
def _pull_changes(self) -> bool:
"""Pull changes from server since last sync. Returns True on success."""
try:
params = ""
ts = self.last_sync_ts
if ts:
params = f"?since={urllib.parse.quote(ts, safe='')}"
result = self._api_request("GET", f"/api/memories/sync{params}")
memories = result.get("memories", [])
server_time = result.get("server_time")
with self._lock:
for mem in memories:
server_id = mem["id"]
deleted_at = mem.get("deleted_at")
if deleted_at:
# Remove from local cache
self._conn.execute(
"""UPDATE memories SET content = ?, category = ?, tags = ?,
expanded_keywords = ?, importance = ?, is_sensitive = ?,
updated_at = ? WHERE server_id = ?""",
(
mem["content"],
mem["category"],
mem.get("tags", ""),
mem.get("expanded_keywords", ""),
mem["importance"],
1 if mem.get("is_sensitive") else 0,
mem.get("updated_at", datetime.now(timezone.utc).isoformat()),
server_id,
),
"DELETE FROM memories WHERE server_id = ?", (server_id,)
)
else:
self._conn.execute(
"""INSERT INTO memories
(content, category, tags, expanded_keywords, importance,
is_sensitive, created_at, updated_at, server_id)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
(
mem["content"],
mem["category"],
mem.get("tags", ""),
mem.get("expanded_keywords", ""),
mem["importance"],
1 if mem.get("is_sensitive") else 0,
mem.get("created_at", datetime.now(timezone.utc).isoformat()),
mem.get("updated_at", datetime.now(timezone.utc).isoformat()),
server_id,
),
)
self._conn.commit()
# Upsert by server_id (server wins)
existing = self._conn.execute(
"SELECT id FROM memories WHERE server_id = ?", (server_id,)
).fetchone()
if server_time:
self.last_sync_ts = server_time
if existing:
self._conn.execute(
"""UPDATE memories SET content = ?, category = ?, tags = ?,
expanded_keywords = ?, importance = ?, is_sensitive = ?,
updated_at = ? WHERE server_id = ?""",
(
mem["content"],
mem["category"],
mem.get("tags", ""),
mem.get("expanded_keywords", ""),
mem["importance"],
1 if mem.get("is_sensitive") else 0,
mem.get("updated_at", datetime.now(timezone.utc).isoformat()),
server_id,
),
)
else:
self._conn.execute(
"""INSERT INTO memories
(content, category, tags, expanded_keywords, importance,
is_sensitive, created_at, updated_at, server_id)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
(
mem["content"],
mem["category"],
mem.get("tags", ""),
mem.get("expanded_keywords", ""),
mem["importance"],
1 if mem.get("is_sensitive") else 0,
mem.get("created_at", datetime.now(timezone.utc).isoformat()),
mem.get("updated_at", datetime.now(timezone.utc).isoformat()),
server_id,
),
)
self._conn.commit()
if server_time:
self.last_sync_ts = server_time
return True
except Exception as e:
logger.warning("Pull changes failed: %s", e)
return False
def enqueue_store(
self,
@ -295,6 +512,11 @@ class SyncEngine:
force_sensitive: bool = False,
) -> int | None:
"""Try to sync a store immediately. Returns server_id or None if failed."""
if self._auth_failed:
self.enqueue_store(
local_id, content, category, tags, expanded_keywords, importance, force_sensitive
)
return None
try:
result = self._api_request("POST", "/api/memories", {
"content": content,
@ -321,6 +543,9 @@ class SyncEngine:
def try_sync_delete(self, server_id: int) -> bool:
"""Try to sync a delete immediately. Returns True if successful."""
if self._auth_failed:
self.enqueue_delete(server_id)
return False
try:
self._api_request("DELETE", f"/api/memories/{server_id}")
return True
@ -332,3 +557,27 @@ class SyncEngine:
except Exception:
self.enqueue_delete(server_id)
return False
def get_counts(self) -> dict[str, Any]:
"""Get memory counts for diagnostics."""
with self._lock:
total = self._conn.execute("SELECT COUNT(*) as c FROM memories").fetchone()["c"]
by_cat = self._conn.execute(
"SELECT category, COUNT(*) as c FROM memories GROUP BY category ORDER BY c DESC"
).fetchall()
orphans = self._conn.execute(
"SELECT COUNT(*) as c FROM memories WHERE server_id IS NULL"
).fetchone()["c"]
pending = self._conn.execute(
"SELECT COUNT(*) as c FROM pending_ops"
).fetchone()["c"]
return {
"total": total,
"by_category": {row["category"]: row["c"] for row in by_cat},
"orphans_no_server_id": orphans,
"pending_ops": pending,
"last_sync_ts": self.last_sync_ts,
"auth_failed": self._auth_failed,
"last_sync_success": self._last_sync_success,
}