claude-memory-mcp/benchmarks/harness/baselines.py

"""Reference LEXICAL baseline retrievers that mirror the production system.

These exist so (a) the eval-set author can VERIFY a query's labels and check
that paraphrase queries genuinely defeat lexical matching, and (b) later agents
have an honest "current system" to beat.

`SqliteFtsRetriever` builds an in-memory SQLite FTS5 index over the corpus and
runs the SAME query shape the production local store uses:
    words -> '"w1" OR "w2" ...' MATCH, ORDER BY bm25(), importance as tiebreak.
(README "SQLite: FTS5 with BM25".) This is the closest faithful, dependency-free
baseline. The Postgres tsvector path is documented in the README; its ranking
differs (weighted A/B/C/D + importance-first default) but for a quality ceiling
comparison the FTS5/BM25 relevance ordering is the right lexical reference.
"""
from __future__ import annotations

import re
import sqlite3
from collections.abc import Sequence

from .types import Memory, MemoryId

# FTS5 reserved-ish tokens; we quote every term anyway, but strip embedded quotes.
_WORD_RE = re.compile(r"[A-Za-z0-9_]+")


class SqliteFtsRetriever:
    """Faithful FTS5/BM25 lexical baseline (mirrors local_store search)."""

    name = "sqlite_fts5_bm25"

    def __init__(self, sort_by: str = "relevance") -> None:
        # "relevance": ORDER BY bm25(), importance DESC  (best for quality eval)
        # "importance": ORDER BY importance DESC, ... (production default)
        self.sort_by = sort_by
        self._con: sqlite3.Connection | None = None

    def build_index(self, corpus: Sequence[Memory]) -> None:
        con = sqlite3.connect(":memory:")
        con.execute(
            """
            CREATE VIRTUAL TABLE memories_fts USING fts5(
                content, category, tags, expanded_keywords,
                memory_id UNINDEXED, importance UNINDEXED
            )
            """
        )
        con.executemany(
            "INSERT INTO memories_fts(content, category, tags, expanded_keywords, memory_id, importance)"
            " VALUES (?,?,?,?,?,?)",
            [
                (m.content, m.category, m.tags, m.expanded_keywords, m.id, m.importance)
                for m in corpus
            ],
        )
        con.commit()
        self._con = con

    def _fts_query(self, query: str) -> str:
        words = _WORD_RE.findall(query.lower())
        if not words:
            return ""
        return " OR ".join(f'"{w}"' for w in words)

    def retrieve(self, query: str, k: int) -> list[MemoryId]:
        assert self._con is not None, "call build_index first"
        match = self._fts_query(query)
        if not match:
            return []
        if self.sort_by == "importance":
            order = "importance DESC, bm25(memories_fts)"
        else:
            order = "bm25(memories_fts), importance DESC"
        try:
            rows = self._con.execute(
                f"SELECT memory_id FROM memories_fts WHERE memories_fts MATCH ? "
                f"ORDER BY {order} LIMIT ?",
                (match, k),
            ).fetchall()
        except sqlite3.OperationalError:
            # mirror production LIKE fallback on FTS syntax errors
            like = f"%{query}%"
            rows = self._con.execute(
                "SELECT memory_id FROM memories_fts WHERE content LIKE ? OR tags LIKE ? "
                "ORDER BY importance DESC LIMIT ?",
                (like, like, k),
            ).fetchall()
        return [r[0] for r in rows]

    def close(self) -> None:
        if self._con is not None:
            self._con.close()
            self._con = None
research: benchmark hybrid (lexical+dense+graph) recall vs current FTS Viktor asked to enhance the memory system with 'semantics' — remember concepts (not just tokens) linked in a graph — and to prove, by benchmarking against the current system, that it actually improves recall. A multi-phase research workflow (18 agents) did landscape research, an adversarially-reviewed integration design, a stratified eval set over the real 5,452-memory corpus, and a head-to-head prototype-vs-current benchmark. Result: hybrid (lexical FTS + dense embeddings, RRF-fused) beats FTS on every overall metric, driven by a robust paraphrase win (recall@10 +0.350). Recommend adopting lexical+dense; the concept graph is DEFERRED. Post-run adversarial review correction (applied to all docs before commit): the prototype's fusion config structurally barred the graph leg from the ranked top-k, so the 'graph contributes nothing' ablation was a math artifact, NOT an empirical result — the graph is UNEVALUATED, not disproven (deferred on cost+uncertainty). Multi-hop deltas are not statistically significant. Glossary in CONTEXT.md; framing in ADR-0001-0003; findings in ADR-0004-0006 + docs/research/. Privacy: the corpus/queries/qrels/results are the user's real memories and stay gitignored (data/, cache/, results/, build_eval_set.py); only harness code, aggregate numbers, and synthetic examples are committed. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> 2026-06-25 17:51:53 +00:00			`"""Reference LEXICAL baseline retrievers that mirror the production system.`

			`These exist so (a) the eval-set author can VERIFY a query's labels and check`
			`that paraphrase queries genuinely defeat lexical matching, and (b) later agents`
			`have an honest "current system" to beat.`

			`SqliteFtsRetriever` builds an in-memory SQLite FTS5 index over the corpus and
			`runs the SAME query shape the production local store uses:`
			`words -> '"w1" OR "w2" ...' MATCH, ORDER BY bm25(), importance as tiebreak.`
			`(README "SQLite: FTS5 with BM25".) This is the closest faithful, dependency-free`
			`baseline. The Postgres tsvector path is documented in the README; its ranking`
			`differs (weighted A/B/C/D + importance-first default) but for a quality ceiling`
			`comparison the FTS5/BM25 relevance ordering is the right lexical reference.`
			`"""`
			`from __future__ import annotations`

			`import re`
			`import sqlite3`
			`from collections.abc import Sequence`

			`from .types import Memory, MemoryId`

			`# FTS5 reserved-ish tokens; we quote every term anyway, but strip embedded quotes.`
			`_WORD_RE = re.compile(r"[A-Za-z0-9_]+")`


			`class SqliteFtsRetriever:`
			`"""Faithful FTS5/BM25 lexical baseline (mirrors local_store search)."""`

			`name = "sqlite_fts5_bm25"`

			`def __init__(self, sort_by: str = "relevance") -> None:`
			`# "relevance": ORDER BY bm25(), importance DESC (best for quality eval)`
			`# "importance": ORDER BY importance DESC, ... (production default)`
			`self.sort_by = sort_by`
			`self._con: sqlite3.Connection \| None = None`

			`def build_index(self, corpus: Sequence[Memory]) -> None:`
			`con = sqlite3.connect(":memory:")`
			`con.execute(`
			`"""`
			`CREATE VIRTUAL TABLE memories_fts USING fts5(`
			`content, category, tags, expanded_keywords,`
			`memory_id UNINDEXED, importance UNINDEXED`
			`)`
			`"""`
			`)`
			`con.executemany(`
			`"INSERT INTO memories_fts(content, category, tags, expanded_keywords, memory_id, importance)"`
			`" VALUES (?,?,?,?,?,?)",`
			`[`
			`(m.content, m.category, m.tags, m.expanded_keywords, m.id, m.importance)`
			`for m in corpus`
			`],`
			`)`
			`con.commit()`
			`self._con = con`

			`def _fts_query(self, query: str) -> str:`
			`words = _WORD_RE.findall(query.lower())`
			`if not words:`
			`return ""`
			`return " OR ".join(f'"{w}"' for w in words)`

			`def retrieve(self, query: str, k: int) -> list[MemoryId]:`
			`assert self._con is not None, "call build_index first"`
			`match = self._fts_query(query)`
			`if not match:`
			`return []`
			`if self.sort_by == "importance":`
			`order = "importance DESC, bm25(memories_fts)"`
			`else:`
			`order = "bm25(memories_fts), importance DESC"`
			`try:`
			`rows = self._con.execute(`
			`f"SELECT memory_id FROM memories_fts WHERE memories_fts MATCH ? "`
			`f"ORDER BY {order} LIMIT ?",`
			`(match, k),`
			`).fetchall()`
			`except sqlite3.OperationalError:`
			`# mirror production LIKE fallback on FTS syntax errors`
			`like = f"%{query}%"`
			`rows = self._con.execute(`
			`"SELECT memory_id FROM memories_fts WHERE content LIKE ? OR tags LIKE ? "`
			`"ORDER BY importance DESC LIMIT ?",`
			`(like, like, k),`
			`).fetchall()`
			`return [r[0] for r in rows]`

			`def close(self) -> None:`
			`if self._con is not None:`
			`self._con.close()`
			`self._con = None`