claude-memory-mcp/src/claude_memory/sync.py

"""Background sync between local SQLite cache and remote API.

Uses only stdlib — no pip install required.
"""

import json
import logging
import threading
import urllib.error
import urllib.parse
import urllib.request
from typing import Any
from datetime import datetime, timezone

from claude_memory.local_store import LocalStore

logger = logging.getLogger(__name__)

# Max retries before an individual pending op is permanently skipped
MAX_OP_RETRIES = 5

# Full resync every N sync cycles (~10 min at 60s interval)
FULL_RESYNC_EVERY = 10


class SyncEngine:
    """Background sync between local SQLite cache and remote API."""

    def __init__(
        self,
        db_path: str,
        api_base_url: str,
        api_key: str,
        sync_interval: int = 60,
        store: "LocalStore | None" = None,
    ):
        self.db_path = db_path
        self.api_base_url = api_base_url.rstrip("/")
        self.api_key = api_key
        self.sync_interval = sync_interval

        self._stop_event = threading.Event()
        self._thread: threading.Thread | None = None
        self._last_sync_success = False
        self._auth_failed = False

        # Share the caller's LocalStore (one connection, one lock) when given, so the
        # background sync writer never opens a SECOND connection to the same file and
        # never races the store path on the single SQLite writer. When run standalone
        # (e.g. tests), open our own store. Either way, all SQLite access below goes
        # through self._store / self._conn / self._lock, which now point at one shared
        # connection guarded by one re-entrant lock.
        if store is None:
            self._store = LocalStore.open(db_path)
            self._owns_store = True
        else:
            self._store = store
            self._owns_store = False
        self._conn = self._store.conn
        self._lock = self._store.lock

        self._init_sync_tables()

    def _init_sync_tables(self) -> None:
        """Create sync-specific tables if they don't exist."""
        with self._lock:
            self._conn.executescript("""
                CREATE TABLE IF NOT EXISTS pending_ops (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    op_type TEXT NOT NULL,
                    payload TEXT NOT NULL,
                    created_at TEXT NOT NULL,
                    retry_count INTEGER DEFAULT 0
                );

                CREATE TABLE IF NOT EXISTS sync_meta (
                    key TEXT PRIMARY KEY,
                    value TEXT NOT NULL
                );
            """)
            # Add server_id column to memories if missing
            cursor = self._conn.execute("PRAGMA table_info(memories)")
            columns = {row["name"] for row in cursor.fetchall()}
            if "server_id" not in columns:
                self._conn.execute("ALTER TABLE memories ADD COLUMN server_id INTEGER")
                self._conn.execute(
                    "CREATE UNIQUE INDEX IF NOT EXISTS idx_memories_server_id ON memories(server_id)"
                )

            # Add retry_count column to pending_ops if missing (migration)
            cursor = self._conn.execute("PRAGMA table_info(pending_ops)")
            po_columns = {row["name"] for row in cursor.fetchall()}
            if "retry_count" not in po_columns:
                self._conn.execute("ALTER TABLE pending_ops ADD COLUMN retry_count INTEGER DEFAULT 0")

            self._conn.commit()

    @property
    def last_sync_ts(self) -> str | None:
        with self._lock:
            cursor = self._conn.execute(
                "SELECT value FROM sync_meta WHERE key = 'last_sync_ts'"
            )
            row = cursor.fetchone()
            return row["value"] if row else None

    @last_sync_ts.setter
    def last_sync_ts(self, value: str) -> None:
        with self._lock:
            self._conn.execute(
                "INSERT OR REPLACE INTO sync_meta (key, value) VALUES ('last_sync_ts', ?)",
                (value,),
            )
            self._conn.commit()

    @property
    def api_available(self) -> bool:
        return self._last_sync_success

    def start(self) -> None:
        """Start background sync thread. Runs a full resync on startup."""
        # Full sync on startup (blocking, before background thread)
        try:
            self._full_resync()
            self._last_sync_success = True
            self._auth_failed = False
        except Exception as e:
            logger.warning("Startup full sync failed: %s", e)

        self._thread = threading.Thread(target=self._sync_loop, daemon=True)
        self._thread.start()

    def stop(self) -> None:
        """Signal background thread to stop and wait."""
        self._stop_event.set()
        if self._thread and self._thread.is_alive():
            self._thread.join(timeout=5)
        # Only close the connection if we opened it; when sharing the server's
        # LocalStore, the server owns the connection lifecycle.
        if self._owns_store:
            self._store.close()

    def _sync_loop(self) -> None:
        """Periodic sync loop running in background thread."""
        cycle = 0
        while not self._stop_event.is_set():
            self._stop_event.wait(self.sync_interval)
            if self._stop_event.is_set():
                break
            cycle += 1
            try:
                # If auth previously failed, try a lightweight check first
                if self._auth_failed:
                    if not self._check_auth():
                        continue  # Still failing, skip this cycle

                if cycle % FULL_RESYNC_EVERY == 0:
                    self._full_resync()
                else:
                    self._sync_once()
                self._last_sync_success = True
            except Exception as e:
                logger.warning("Sync cycle failed: %s", e)
                self._last_sync_success = False

    def _check_auth(self) -> bool:
        """Lightweight auth check. Returns True if auth is OK."""
        try:
            self._api_request("GET", "/api/auth-check")
            self._auth_failed = False
            logger.info("Auth check passed — resuming sync")
            return True
        except urllib.error.HTTPError as e:
            if e.code in (401, 403):
                logger.warning(
                    "Auth still failing (HTTP %d) — API key mismatch. "
                    "Update MEMORY_API_KEY in ~/.claude.json", e.code
                )
                return False
            # Non-auth error (e.g. 500) — try the auth-check endpoint might not exist,
            # fall back to /health
            pass
        except Exception:
            pass

        # Fallback: try /health (unauthenticated)
        try:
            url = f"{self.api_base_url}/health"
            req = urllib.request.Request(url, method="GET")
            with urllib.request.urlopen(req, timeout=5):
                pass
            # Server is reachable but auth-check failed — auth is still broken
            return False
        except Exception:
            # Server unreachable — not an auth problem
            return False

    def _sync_once(self) -> None:
        """Push pending ops, then pull remote changes. Both run independently."""
        push_ok = self._push_pending_ops()
        pull_ok = self._pull_changes()
        if not push_ok and not pull_ok:
            raise RuntimeError("Both push and pull failed")

    def _full_resync(self) -> None:
        """Full cache replacement from server — handles drift, deletes, schema changes."""
        # Step 1: Push orphaned local-only records (deduplicated)
        self._push_orphans()

        # Step 2: Pull everything from server (no since filter = non-deleted only)
        result = self._api_request("GET", "/api/memories/sync")
        memories = result.get("memories", [])
        server_time = result.get("server_time")
        server_ids = {m["id"] for m in memories}

        with self._lock:
            # Delete local records whose server_id no longer exists on server
            local_rows = self._conn.execute(
                "SELECT id, server_id FROM memories WHERE server_id IS NOT NULL"
            ).fetchall()
            for row in local_rows:
                if row["server_id"] not in server_ids:
                    self._conn.execute("DELETE FROM memories WHERE id = ?", (row["id"],))

            # Delete remaining orphans (already pushed or duplicates)
            self._conn.execute("DELETE FROM memories WHERE server_id IS NULL")

            # Upsert all server records
            for mem in memories:
                server_id = mem["id"]
                existing = self._conn.execute(
                    "SELECT id FROM memories WHERE server_id = ?", (server_id,)
                ).fetchone()

                if existing:
                    self._conn.execute(
                        """UPDATE memories SET content=?, category=?, tags=?,
                           expanded_keywords=?, importance=?, is_sensitive=?,
                           updated_at=? WHERE server_id=?""",
                        (
                            mem["content"], mem["category"], mem.get("tags", ""),
                            mem.get("expanded_keywords", ""), mem["importance"],
                            1 if mem.get("is_sensitive") else 0,
                            mem.get("updated_at", ""), server_id,
                        ),
                    )
                else:
                    self._conn.execute(
                        """INSERT INTO memories (content, category, tags, expanded_keywords,
                           importance, is_sensitive, created_at, updated_at, server_id)
                           VALUES (?,?,?,?,?,?,?,?,?)""",
                        (
                            mem["content"], mem["category"], mem.get("tags", ""),
                            mem.get("expanded_keywords", ""), mem["importance"],
                            1 if mem.get("is_sensitive") else 0,
                            mem.get("created_at", ""), mem.get("updated_at", ""), server_id,
                        ),
                    )

            self._conn.commit()

        if server_time:
            self.last_sync_ts = server_time

    def _push_orphans(self) -> None:
        """Push local-only records to server, skipping content duplicates."""
        with self._lock:
            orphans = self._conn.execute(
                "SELECT id, content, category, tags, expanded_keywords, importance "
                "FROM memories WHERE server_id IS NULL"
            ).fetchall()

        if not orphans:
            return

        # Get all server content for dedup comparison
        result = self._api_request("GET", "/api/memories/sync")
        server_contents = {m["content"] for m in result.get("memories", [])}

        for orphan in orphans:
            if orphan["content"] in server_contents:
                continue  # Skip duplicate
            try:
                resp = self._api_request("POST", "/api/memories", {
                    "content": orphan["content"],
                    "category": orphan["category"],
                    "tags": orphan["tags"],
                    "expanded_keywords": orphan["expanded_keywords"],
                    "importance": orphan["importance"],
                })
                server_id = resp.get("id")
                if server_id:
                    with self._lock:
                        self._conn.execute(
                            "UPDATE memories SET server_id=? WHERE id=?",
                            (server_id, orphan["id"]),
                        )
                        self._conn.commit()
            except Exception:
                pass  # Will be cleaned up by the full resync delete step

    def _api_request(self, method: str, path: str, body: dict[str, Any] | None = None) -> dict[str, Any]:
        """Make an HTTP request to the memory API."""
        url = f"{self.api_base_url}{path}"
        data = json.dumps(body).encode() if body else None
        req = urllib.request.Request(
            url,
            data=data,
            method=method,
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            },
        )
        try:
            with urllib.request.urlopen(req, timeout=15) as resp:
                result: dict[str, Any] = json.loads(resp.read().decode())
                return result
        except urllib.error.HTTPError as e:
            if e.code in (401, 403):
                self._auth_failed = True
                logger.warning(
                    "Auth failed (HTTP %d) — API key may have rotated. "
                    "Update MEMORY_API_KEY in ~/.claude.json", e.code
                )
            raise

    def _push_pending_ops(self) -> bool:
        """Push queued operations to the API server. Returns True on success."""
        with self._lock:
            cursor = self._conn.execute(
                "SELECT id, op_type, payload, retry_count FROM pending_ops ORDER BY id"
            )
            ops = cursor.fetchall()

        if not ops:
            return True

        all_ok = True
        for op in ops:
            op_id = op["id"]
            op_type = op["op_type"]
            payload = json.loads(op["payload"])
            retry_count = op["retry_count"] or 0

            # Skip ops that have exceeded retry limit
            if retry_count >= MAX_OP_RETRIES:
                logger.warning(
                    "Skipping op %d (%s) after %d retries — removing from queue",
                    op_id, op_type, retry_count,
                )
                with self._lock:
                    self._conn.execute("DELETE FROM pending_ops WHERE id = ?", (op_id,))
                    self._conn.commit()
                continue

            try:
                if op_type == "store":
                    result = self._api_request("POST", "/api/memories", payload)
                    server_id = result.get("id")
                    if server_id and payload.get("local_id"):
                        with self._lock:
                            self._conn.execute(
                                "UPDATE memories SET server_id = ? WHERE id = ?",
                                (server_id, payload["local_id"]),
                            )
                            self._conn.commit()
                elif op_type == "delete":
                    server_id = payload.get("server_id")
                    if server_id:
                        try:
                            self._api_request("DELETE", f"/api/memories/{server_id}")
                        except urllib.error.HTTPError as e:
                            if e.code == 404:
                                pass  # Already deleted on server
                            else:
                                raise

                # Remove from pending queue on success
                with self._lock:
                    self._conn.execute("DELETE FROM pending_ops WHERE id = ?", (op_id,))
                    self._conn.commit()

            except urllib.error.HTTPError as e:
                if e.code in (401, 403):
                    self._auth_failed = True
                    logger.warning("Auth failed (HTTP %d) — aborting push", e.code)
                    return False  # Abort entire push — no point retrying with bad key
                # Increment retry count for non-auth errors
                with self._lock:
                    self._conn.execute(
                        "UPDATE pending_ops SET retry_count = retry_count + 1 WHERE id = ?",
                        (op_id,),
                    )
                    self._conn.commit()
                logger.warning("Failed to push op %d (%s): HTTP %d", op_id, op_type, e.code)
                all_ok = False
            except Exception as e:
                with self._lock:
                    self._conn.execute(
                        "UPDATE pending_ops SET retry_count = retry_count + 1 WHERE id = ?",
                        (op_id,),
                    )
                    self._conn.commit()
                logger.warning("Failed to push op %d (%s): %s", op_id, op_type, e)
                all_ok = False

        return all_ok

    def _pull_changes(self) -> bool:
        """Pull changes from server since last sync. Returns True on success."""
        try:
            params = ""
            ts = self.last_sync_ts
            if ts:
                params = f"?since={urllib.parse.quote(ts, safe='')}"

            result = self._api_request("GET", f"/api/memories/sync{params}")
            memories = result.get("memories", [])
            server_time = result.get("server_time")

            with self._lock:
                for mem in memories:
                    server_id = mem["id"]
                    deleted_at = mem.get("deleted_at")

                    if deleted_at:
                        # Remove from local cache
                        self._conn.execute(
                            "DELETE FROM memories WHERE server_id = ?", (server_id,)
                        )
                    else:
                        # Upsert by server_id (server wins)
                        existing = self._conn.execute(
                            "SELECT id FROM memories WHERE server_id = ?", (server_id,)
                        ).fetchone()

                        if existing:
                            self._conn.execute(
                                """UPDATE memories SET content = ?, category = ?, tags = ?,
                                   expanded_keywords = ?, importance = ?, is_sensitive = ?,
                                   updated_at = ? WHERE server_id = ?""",
                                (
                                    mem["content"],
                                    mem["category"],
                                    mem.get("tags", ""),
                                    mem.get("expanded_keywords", ""),
                                    mem["importance"],
                                    1 if mem.get("is_sensitive") else 0,
                                    mem.get("updated_at", datetime.now(timezone.utc).isoformat()),
                                    server_id,
                                ),
                            )
                        else:
                            self._conn.execute(
                                """INSERT INTO memories
                                   (content, category, tags, expanded_keywords, importance,
                                    is_sensitive, created_at, updated_at, server_id)
                                   VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
                                (
                                    mem["content"],
                                    mem["category"],
                                    mem.get("tags", ""),
                                    mem.get("expanded_keywords", ""),
                                    mem["importance"],
                                    1 if mem.get("is_sensitive") else 0,
                                    mem.get("created_at", datetime.now(timezone.utc).isoformat()),
                                    mem.get("updated_at", datetime.now(timezone.utc).isoformat()),
                                    server_id,
                                ),
                            )
                self._conn.commit()

            if server_time:
                self.last_sync_ts = server_time
            return True

        except Exception as e:
            logger.warning("Pull changes failed: %s", e)
            return False

    def enqueue_store(
        self,
        local_id: int,
        content: str,
        category: str,
        tags: str,
        expanded_keywords: str,
        importance: float,
        force_sensitive: bool = False,
    ) -> None:
        """Queue a store operation for later sync."""
        payload = {
            "local_id": local_id,
            "content": content,
            "category": category,
            "tags": tags,
            "expanded_keywords": expanded_keywords,
            "importance": importance,
            "force_sensitive": force_sensitive,
        }
        now = datetime.now(timezone.utc).isoformat()
        with self._lock:
            self._conn.execute(
                "INSERT INTO pending_ops (op_type, payload, created_at) VALUES (?, ?, ?)",
                ("store", json.dumps(payload), now),
            )
            self._conn.commit()

    def enqueue_delete(self, server_id: int) -> None:
        """Queue a delete operation for later sync."""
        payload = {"server_id": server_id}
        now = datetime.now(timezone.utc).isoformat()
        with self._lock:
            self._conn.execute(
                "INSERT INTO pending_ops (op_type, payload, created_at) VALUES (?, ?, ?)",
                ("delete", json.dumps(payload), now),
            )
            self._conn.commit()

    def try_sync_store(
        self,
        local_id: int,
        content: str,
        category: str,
        tags: str,
        expanded_keywords: str,
        importance: float,
        force_sensitive: bool = False,
    ) -> int | None:
        """Try to sync a store immediately. Returns server_id or None if failed."""
        if self._auth_failed:
            self.enqueue_store(
                local_id, content, category, tags, expanded_keywords, importance, force_sensitive
            )
            return None
        try:
            result = self._api_request("POST", "/api/memories", {
                "content": content,
                "category": category,
                "tags": tags,
                "expanded_keywords": expanded_keywords,
                "importance": importance,
                "force_sensitive": force_sensitive,
            })
            server_id = result.get("id")
            if server_id:
                with self._lock:
                    self._conn.execute(
                        "UPDATE memories SET server_id = ? WHERE id = ?",
                        (server_id, local_id),
                    )
                    self._conn.commit()
            return server_id
        except Exception:
            self.enqueue_store(
                local_id, content, category, tags, expanded_keywords, importance, force_sensitive
            )
            return None

    def try_sync_delete(self, server_id: int) -> bool:
        """Try to sync a delete immediately. Returns True if successful."""
        if self._auth_failed:
            self.enqueue_delete(server_id)
            return False
        try:
            self._api_request("DELETE", f"/api/memories/{server_id}")
            return True
        except urllib.error.HTTPError as e:
            if e.code == 404:
                return True  # Already deleted on server — not an error
            self.enqueue_delete(server_id)
            return False
        except Exception:
            self.enqueue_delete(server_id)
            return False

    def get_counts(self) -> dict[str, Any]:
        """Get memory counts for diagnostics."""
        with self._lock:
            total = self._conn.execute("SELECT COUNT(*) as c FROM memories").fetchone()["c"]
            by_cat = self._conn.execute(
                "SELECT category, COUNT(*) as c FROM memories GROUP BY category ORDER BY c DESC"
            ).fetchall()
            orphans = self._conn.execute(
                "SELECT COUNT(*) as c FROM memories WHERE server_id IS NULL"
            ).fetchone()["c"]
            pending = self._conn.execute(
                "SELECT COUNT(*) as c FROM pending_ops"
            ).fetchone()["c"]

        return {
            "total": total,
            "by_category": {row["category"]: row["c"] for row in by_cat},
            "orphans_no_server_id": orphans,
            "pending_ops": pending,
            "last_sync_ts": self.last_sync_ts,
            "auth_failed": self._auth_failed,
            "last_sync_success": self._last_sync_success,
        }