resilient memory sync: decouple push/pull, startup full resync, auth failure handling

- Decouple push and pull in _sync_once() so pull always runs even if push fails
- Add startup full resync to catch drift from other agents and schema changes
- Add periodic full resync every ~10 minutes for continuous drift correction
- Add auth failure detection (401/403) with graceful SQLite-only degradation
- Add /api/auth-check endpoint for lightweight key validation
- Add retry cap (5 attempts) on pending ops to prevent infinite queue buildup
- Add orphan reconciliation: push local-only records with content dedup
- Add memory_count MCP tool for sync diagnostics
- Add version-based SQLite schema migration (PRAGMA user_version)
- Fix API key in ~/.claude.json to match server
- Update README with sync resilience docs, test structure, project layout
- Add 30 new tests covering all new behaviors (155 total, all passing)
This commit is contained in:
Viktor Barzin 2026-03-16 18:35:09 +00:00
parent a18b94d310
commit e47efee6b6
No known key found for this signature in database
GPG key ID: 0EB088298288D958
8 changed files with 948 additions and 134 deletions

View file

@ -3,8 +3,9 @@
import json
import os
import sys
import urllib.error
from datetime import datetime, timezone
from unittest.mock import patch
from unittest.mock import patch, MagicMock
import pytest
@ -154,21 +155,25 @@ class TestPushPendingOps:
"""A 404 on delete means already deleted on server — should still clear queue."""
engine.enqueue_delete(42)
import urllib.error
with patch.object(engine, "_api_request") as mock_api:
mock_api.side_effect = RuntimeError("API error 404: not found")
mock_api.side_effect = urllib.error.HTTPError(
url="http://fake", code=404, msg="Not Found", hdrs=None, fp=None
)
engine._push_pending_ops()
cursor = engine._conn.execute("SELECT COUNT(*) as cnt FROM pending_ops")
assert cursor.fetchone()["cnt"] == 0
def test_push_failure_keeps_queue_returns_false(self, engine):
    """Push failure should keep the op in queue and return False (not raise).

    Method of ``TestPushPendingOps``. ``_api_request`` is patched to raise a
    ``RuntimeError`` so the single enqueued store op cannot be delivered.
    """
    engine.enqueue_store(1, "test", "facts", "", "kw", 0.5)

    with patch.object(engine, "_api_request") as mock_api:
        mock_api.side_effect = RuntimeError("Connection refused")
        # The push must report failure through its return value rather
        # than by propagating the exception to the caller.
        result = engine._push_pending_ops()

    assert result is False

    # The op stays queued so it can be retried on a later push.
    cursor = engine._conn.execute("SELECT COUNT(*) as cnt FROM pending_ops")
    assert cursor.fetchone()["cnt"] == 1
@ -393,3 +398,361 @@ class TestFullSyncCycle:
# Should be gone locally
cursor = engine._conn.execute("SELECT * FROM memories WHERE server_id = 500")
assert cursor.fetchone() is None
class TestAuthFailureHandling:
    """Tests for the engine's ``_auth_failed`` flag around 401/403 responses."""

    @staticmethod
    def _http_error(code, msg):
        """Build the ``HTTPError`` that the patched ``_api_request`` raises."""
        return urllib.error.HTTPError(
            url="http://fake", code=code, msg=msg, hdrs=None, fp=None
        )

    @staticmethod
    def _pending_op_count(engine):
        """Return the number of rows currently in ``pending_ops``."""
        row = engine._conn.execute(
            "SELECT COUNT(*) as cnt FROM pending_ops"
        ).fetchone()
        return row["cnt"]

    def test_auth_flag_set_on_401(self, engine):
        """401 from _api_request should set _auth_failed flag."""
        engine.enqueue_store(1, "test", "facts", "", "kw", 0.5)
        unauthorized = self._http_error(401, "Unauthorized")

        with patch.object(engine, "_api_request", side_effect=unauthorized):
            pushed = engine._push_pending_ops()

        assert pushed is False
        assert engine._auth_failed is True

    def test_auth_flag_set_on_403(self, engine):
        """403 from _api_request should set _auth_failed flag as well."""
        engine.enqueue_store(1, "test", "facts", "", "kw", 0.5)
        forbidden = self._http_error(403, "Forbidden")

        with patch.object(engine, "_api_request", side_effect=forbidden):
            pushed = engine._push_pending_ops()

        assert pushed is False
        assert engine._auth_failed is True

    def test_push_aborts_on_auth_failure(self, engine):
        """On 401, push should abort immediately — no further ops attempted."""
        for local_id, content in ((1, "test1"), (2, "test2")):
            engine.enqueue_store(local_id, content, "facts", "", "kw", 0.5)
        unauthorized = self._http_error(401, "Unauthorized")

        with patch.object(engine, "_api_request", side_effect=unauthorized):
            engine._push_pending_ops()

        # Both ops should still be in queue (aborted before processing second)
        assert self._pending_op_count(engine) == 2

    def test_try_sync_store_queues_when_auth_failed(self, engine):
        """When auth is failed, try_sync_store should queue without attempting API call."""
        engine._auth_failed = True

        outcome = engine.try_sync_store(1, "test", "facts", "", "kw", 0.5)

        assert outcome is None
        assert self._pending_op_count(engine) == 1

    def test_try_sync_delete_queues_when_auth_failed(self, engine):
        """With the auth flag set, try_sync_delete returns False and queues one op."""
        engine._auth_failed = True

        outcome = engine.try_sync_delete(42)

        assert outcome is False
        assert self._pending_op_count(engine) == 1

    def test_check_auth_clears_flag_on_success(self, engine):
        """A successful auth check returns True and resets _auth_failed."""
        engine._auth_failed = True
        ok_payload = {"status": "ok", "user_id": "test"}

        with patch.object(engine, "_api_request", return_value=ok_payload):
            authed = engine._check_auth()

        assert authed is True
        assert engine._auth_failed is False

    def test_check_auth_stays_failed_on_401(self, engine):
        """A 401 during the auth check returns False and leaves the flag set."""
        engine._auth_failed = True
        unauthorized = self._http_error(401, "Unauthorized")

        with patch.object(engine, "_api_request", side_effect=unauthorized):
            # Also mock urlopen for /health fallback
            with patch("urllib.request.urlopen") as mock_urlopen:
                response = mock_urlopen.return_value
                response.__enter__ = MagicMock()
                response.__exit__ = MagicMock(return_value=False)
                authed = engine._check_auth()

        assert authed is False
        assert engine._auth_failed is True
class TestRetryCount:
    """Tests for the ``retry_count`` column maintained on ``pending_ops`` rows."""

    def test_retry_count_incremented_on_failure(self, engine):
        """One failed push leaves the op with retry_count == 1."""
        engine.enqueue_store(1, "test", "facts", "", "kw", 0.5)

        with patch.object(engine, "_api_request") as mock_api:
            mock_api.side_effect = RuntimeError("Connection refused")
            engine._push_pending_ops()

        cursor = engine._conn.execute("SELECT retry_count FROM pending_ops WHERE id = 1")
        assert cursor.fetchone()["retry_count"] == 1

    def test_op_skipped_after_max_retries(self, engine):
        """An op already at the retry cap is dropped without calling the API."""
        engine.enqueue_store(1, "test", "facts", "", "kw", 0.5)
        # Set retry_count to max
        engine._conn.execute("UPDATE pending_ops SET retry_count = 5 WHERE id = 1")
        engine._conn.commit()

        with patch.object(engine, "_api_request") as mock_api:
            # The return value is irrelevant here; only the queue state and
            # the absence of API calls are checked.
            engine._push_pending_ops()

        # Op should be deleted (skipped), API never called
        cursor = engine._conn.execute("SELECT COUNT(*) as cnt FROM pending_ops")
        assert cursor.fetchone()["cnt"] == 0
        mock_api.assert_not_called()

    def test_retry_count_persists_across_pushes(self, engine):
        """Three consecutive failed pushes accumulate to retry_count == 3."""
        engine.enqueue_store(1, "test", "facts", "", "kw", 0.5)

        with patch.object(engine, "_api_request") as mock_api:
            mock_api.side_effect = RuntimeError("fail")
            for _ in range(3):
                engine._push_pending_ops()

        cursor = engine._conn.execute("SELECT retry_count FROM pending_ops WHERE id = 1")
        assert cursor.fetchone()["retry_count"] == 3
class TestDecoupledPushPull:
    """Tests that ``_sync_once`` still pulls when the push half fails."""

    def test_pull_runs_even_when_push_fails(self, engine):
        """Pull should execute even if push fails — they're decoupled."""
        engine.enqueue_store(1, "test", "facts", "", "kw", 0.5)
        now = datetime.now(timezone.utc).isoformat()

        def mock_api(method, path, body=None):
            # Every POST (the push side) fails; anything else is treated as
            # the pull request and answered with one server-side memory.
            if method == "POST":
                raise RuntimeError("Push failed")
            # GET for pull
            return {
                "memories": [{
                    "id": 99, "content": "from server", "category": "facts",
                    "tags": "", "expanded_keywords": "", "importance": 0.5,
                    "is_sensitive": False, "created_at": now, "updated_at": now,
                    "deleted_at": None,
                }],
                "server_time": now,
            }

        with patch.object(engine, "_api_request", side_effect=mock_api):
            engine._sync_once()

        # Pull should have inserted the server memory
        cursor = engine._conn.execute("SELECT * FROM memories WHERE server_id = 99")
        assert cursor.fetchone() is not None

    def test_sync_once_returns_normally_on_partial_failure(self, engine):
        """If push fails but pull succeeds, _sync_once should not raise."""
        engine.enqueue_store(1, "test", "facts", "", "kw", 0.5)

        def mock_api(method, path, body=None):
            if method == "POST":
                raise RuntimeError("Push failed")
            return {"memories": [], "server_time": "2026-03-16T12:00:00+00:00"}

        with patch.object(engine, "_api_request", side_effect=mock_api):
            # Should not raise
            engine._sync_once()
class TestFullResync:
    """Tests for ``_full_resync`` driven by a patched ``_api_request``."""

    @staticmethod
    def _utc_now():
        """Return the current UTC time as an ISO-8601 string."""
        return datetime.now(timezone.utc).isoformat()

    @staticmethod
    def _server_memory(server_id, content, category, tags, importance, stamp):
        """Build one memory dict in the shape the mocked server returns."""
        return {
            "id": server_id, "content": content, "category": category,
            "tags": tags, "expanded_keywords": "", "importance": importance,
            "is_sensitive": False, "created_at": stamp, "updated_at": stamp,
        }

    @staticmethod
    def _insert_local(engine, content, stamp, server_id=None):
        """Insert and commit a local 'facts' row, optionally with a server_id."""
        if server_id is None:
            # Orphan variant: the server_id column is left out of the INSERT.
            engine._conn.execute(
                "INSERT INTO memories (content, category, tags, expanded_keywords, importance, "
                "is_sensitive, created_at, updated_at) VALUES (?,?,?,?,?,?,?,?)",
                (content, "facts", "", "", 0.5, 0, stamp, stamp),
            )
        else:
            engine._conn.execute(
                "INSERT INTO memories (content, category, tags, expanded_keywords, importance, "
                "is_sensitive, created_at, updated_at, server_id) VALUES (?,?,?,?,?,?,?,?,?)",
                (content, "facts", "", "", 0.5, 0, stamp, stamp, server_id),
            )
        engine._conn.commit()

    @staticmethod
    def _run_resync(engine, memories, server_time):
        """Run ``_full_resync`` with ``_api_request`` returning the given payload."""
        with patch.object(engine, "_api_request") as mock_api:
            mock_api.return_value = {
                "memories": memories,
                "server_time": server_time,
            }
            engine._full_resync()

    def test_full_resync_inserts_server_records(self, engine):
        """Two memories returned by the server end up as two local rows."""
        stamp = self._utc_now()
        payload = [
            self._server_memory(1, "server mem 1", "facts", "", 0.5, stamp),
            self._server_memory(2, "server mem 2", "projects", "", 0.8, stamp),
        ]

        self._run_resync(engine, payload, stamp)

        total = engine._conn.execute("SELECT COUNT(*) as cnt FROM memories").fetchone()
        assert total["cnt"] == 2

    def test_full_resync_removes_stale_local_records(self, engine):
        """Local records with server_ids not on server should be deleted."""
        stamp = self._utc_now()
        # Insert a local record with server_id=999 (not on server)
        self._insert_local(engine, "stale", stamp, server_id=999)
        payload = [self._server_memory(1, "current", "facts", "", 0.5, stamp)]

        self._run_resync(engine, payload, stamp)

        # Stale record should be gone
        stale = engine._conn.execute("SELECT * FROM memories WHERE server_id = 999")
        assert stale.fetchone() is None
        # Current record should exist
        current = engine._conn.execute("SELECT * FROM memories WHERE server_id = 1")
        assert current.fetchone() is not None

    def test_full_resync_deletes_orphans_after_push(self, engine):
        """Orphans (server_id IS NULL) should be cleaned up after push attempt."""
        stamp = self._utc_now()
        self._insert_local(engine, "orphan", stamp)

        self._run_resync(engine, [], stamp)

        leftover = engine._conn.execute("SELECT * FROM memories WHERE server_id IS NULL")
        assert leftover.fetchone() is None

    def test_full_resync_updates_last_sync_ts(self, engine):
        """``last_sync_ts`` takes the ``server_time`` from the resync response."""
        server_time = "2026-03-16T15:00:00+00:00"

        self._run_resync(engine, [], server_time)

        assert engine.last_sync_ts == server_time

    def test_full_resync_updates_existing_records(self, engine):
        """A local row sharing a server_id is overwritten with the server's fields."""
        stamp = self._utc_now()
        self._insert_local(engine, "old content", stamp, server_id=10)
        payload = [
            self._server_memory(10, "new content", "projects", "updated", 0.9, stamp),
        ]

        self._run_resync(engine, payload, stamp)

        row = engine._conn.execute(
            "SELECT * FROM memories WHERE server_id = 10"
        ).fetchone()
        assert row["content"] == "new content"
        assert row["category"] == "projects"
        assert row["importance"] == 0.9
class TestPushOrphans:
    """Tests for ``_push_orphans`` with ``_api_request`` replaced by a fake."""

    def test_push_orphans_skips_duplicates(self, engine):
        """An orphan whose content already exists on the server triggers no POST."""
        stamp = datetime.now(timezone.utc).isoformat()
        # Insert orphan with content matching server
        engine._conn.execute(
            "INSERT INTO memories (content, category, tags, expanded_keywords, importance, "
            "is_sensitive, created_at, updated_at) VALUES (?,?,?,?,?,?,?,?)",
            ("duplicate content", "facts", "", "", 0.5, 0, stamp, stamp),
        )
        engine._conn.commit()

        seen_calls = []

        def fake_api(method, path, body=None):
            # Record every request; always answer with the server-side copy.
            seen_calls.append((method, path))
            server_copy = {
                "id": 1, "content": "duplicate content", "category": "facts",
                "tags": "", "expanded_keywords": "", "importance": 0.5,
                "is_sensitive": False, "created_at": stamp, "updated_at": stamp,
            }
            return {"memories": [server_copy], "server_time": stamp}

        with patch.object(engine, "_api_request", side_effect=fake_api):
            engine._push_orphans()

        # Should have called GET for sync but NOT POST (duplicate skipped)
        methods_used = [method for method, _ in seen_calls]
        assert "POST" not in methods_used

    def test_push_orphans_posts_unique(self, engine):
        """An orphan with unique content is posted and records the returned id."""
        stamp = datetime.now(timezone.utc).isoformat()
        engine._conn.execute(
            "INSERT INTO memories (id, content, category, tags, expanded_keywords, importance, "
            "is_sensitive, created_at, updated_at) VALUES (?,?,?,?,?,?,?,?,?)",
            (1, "unique content", "facts", "", "", 0.5, 0, stamp, stamp),
        )
        engine._conn.commit()

        def fake_api(method, path, body=None):
            # Fresh dicts are built on each call, keyed by HTTP method;
            # any other method gets an empty response.
            canned = {
                "GET": {"memories": [], "server_time": stamp},
                "POST": {"id": 100, "category": "facts", "importance": 0.5},
            }
            return canned.get(method, {})

        with patch.object(engine, "_api_request", side_effect=fake_api):
            engine._push_orphans()

        # Orphan should now have server_id
        row = engine._conn.execute(
            "SELECT server_id FROM memories WHERE id = 1"
        ).fetchone()
        assert row["server_id"] == 100
class TestGetCounts:
    """Tests for the diagnostic dictionary returned by ``get_counts``."""

    def test_empty_counts(self, engine):
        """A fresh engine reports zeros, no categories and no auth failure."""
        report = engine.get_counts()

        for key in ("total", "orphans_no_server_id", "pending_ops"):
            assert report[key] == 0
        assert report["by_category"] == {}
        assert report["auth_failed"] is False

    def test_counts_with_data(self, engine):
        """One synced row, one orphan and one queued op are each counted."""
        stamp = datetime.now(timezone.utc).isoformat()
        conn = engine._conn

        # Row that already carries a server_id.
        conn.execute(
            "INSERT INTO memories (content, category, tags, expanded_keywords, importance, "
            "is_sensitive, created_at, updated_at, server_id) VALUES (?,?,?,?,?,?,?,?,?)",
            ("mem1", "facts", "", "", 0.5, 0, stamp, stamp, 1),
        )
        # Row inserted without a server_id column value (the orphan).
        conn.execute(
            "INSERT INTO memories (content, category, tags, expanded_keywords, importance, "
            "is_sensitive, created_at, updated_at) VALUES (?,?,?,?,?,?,?,?)",
            ("orphan", "projects", "", "", 0.5, 0, stamp, stamp),
        )
        engine.enqueue_store(99, "queued", "facts", "", "", 0.5)
        conn.commit()

        report = engine.get_counts()

        assert report["total"] == 2
        per_category = report["by_category"]
        assert per_category["facts"] == 1
        assert per_category["projects"] == 1
        assert report["orphans_no_server_id"] == 1
        assert report["pending_ops"] == 1