claude-memory-mcp/benchmarks/harness/runner.py
Viktor Barzin 1cc8a2b378
Some checks are pending
Build and Push / lint-and-test (push) Waiting to run
Build and Push / build (push) Blocked by required conditions
Build and Push / deploy (push) Blocked by required conditions
Build and Push / notify-failure (push) Blocked by required conditions
research: benchmark hybrid (lexical+dense+graph) recall vs current FTS
Viktor asked to enhance the memory system with 'semantics' — remember concepts
(not just tokens) linked in a graph — and to prove, by benchmarking against the
current system, that it actually improves recall. A multi-phase research workflow
(18 agents) did landscape research, an adversarially-reviewed integration design,
a stratified eval set over the real 5,452-memory corpus, and a head-to-head
prototype-vs-current benchmark.

Result: hybrid (lexical FTS + dense embeddings, RRF-fused) beats FTS on every
overall metric, driven by a robust paraphrase win (recall@10 +0.350). Recommend
adopting lexical+dense; the concept graph is DEFERRED.

Post-run adversarial review correction (applied to all docs before commit): the
prototype's fusion config structurally barred the graph leg from the ranked top-k,
so the 'graph contributes nothing' ablation was a math artifact, NOT an empirical
result — the graph is UNEVALUATED, not disproven (deferred on cost+uncertainty).
Multi-hop deltas are not statistically significant. Glossary in CONTEXT.md; framing
in ADR-0001-0003; findings in ADR-0004-0006 + docs/research/.

Privacy: the corpus/queries/qrels/results are the user's real memories and stay
gitignored (data/, cache/, results/, build_eval_set.py); only harness code,
aggregate numbers, and synthetic examples are committed.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-25 17:51:53 +00:00

223 lines
7.7 KiB
Python

"""Benchmark runner: drive a pluggable retriever over the eval set and report
overall + per-stratum quality metrics, plus per-query latency and (optional)
index build time / size.
Quality decides adoption (recall@k, nDCG@10, MRR). Latency and storage are
measured and reported but DO NOT gate the decision (ADR-0001 success metric).
"""
from __future__ import annotations
import statistics
import time
from collections.abc import Callable
from dataclasses import dataclass, field, asdict
from typing import Any
from . import metrics
from .dataset import Dataset
from .types import MemoryId, Query, Retriever
# A retriever is either an object satisfying the Retriever protocol or a bare
# callable invoked as retrieve(query, k) that returns a list of memory ids.
RetrieverLike = Retriever | Callable[[str, int], list[MemoryId]]
# Default k passed to each retrieve() call. The largest metric cutoff is 10, so
# asking for 20 leaves headroom and ties just past rank 10 don't distort results.
DEFAULT_RETRIEVE_K = 20
def _percentile(values: list[float], pct: float) -> float:
    """Return the ``pct``-th percentile of ``values`` by linear interpolation.

    The samples are sorted and the percentile is read at the fractional rank
    ``pct / 100 * (len(values) - 1)``, interpolating linearly between the two
    neighbouring sorted samples.

    Args:
        values: Samples in any order; the list itself is not modified.
        pct: Percentile to compute, in the closed range [0, 100].

    Returns:
        The interpolated percentile, or ``0.0`` when ``values`` is empty.

    Raises:
        ValueError: If ``pct`` is outside [0, 100] (NaN included).
    """
    # Validate up front: a negative pct would otherwise produce a negative
    # rank that silently wraps to the end of the list (or extrapolates below
    # the minimum), and pct > 100 would surface as an unrelated IndexError.
    if not 0.0 <= pct <= 100.0:
        raise ValueError(f"pct must be in [0, 100], got {pct!r}")
    if not values:
        return 0.0
    if len(values) == 1:
        return values[0]
    ordered = sorted(values)
    last = len(ordered) - 1
    rank = (pct / 100.0) * last
    lo = int(rank)  # rank is non-negative here, so truncation == floor
    hi = min(lo + 1, last)  # clamp so pct == 100 stays on the last sample
    return ordered[lo] + (ordered[hi] - ordered[lo]) * (rank - lo)
@dataclass
class StratumResult:
    """Averaged quality metrics for the queries that share one stratum."""

    # Label of the stratum these numbers belong to.
    stratum: str
    # Number of queries that were averaged into ``metrics``.
    n_queries: int
    # Metric name -> macro-averaged value.
    metrics: dict[str, float]
@dataclass
class BenchmarkResult:
    """Outcome of one benchmark run for a single retriever.

    Bundles the overall and per-stratum quality metrics with per-query latency
    statistics and the optional index build time / size measurements.
    """

    retriever_name: str
    n_queries: int
    retrieve_k: int
    # Metric name -> value averaged over all queries.
    overall: dict[str, float]
    # Stratum name -> averaged metrics for that stratum.
    per_stratum: dict[str, StratumResult]
    # Milliseconds; summary() reads the keys "mean", "p50", "p95" and "max".
    latency_ms: dict[str, float]
    # Optional measurements; summary() omits their lines when they are None.
    index_build_seconds: float | None = None
    index_size_bytes: int | None = None
    # One dict per evaluated query; empty by default.
    per_query: list[dict[str, Any]] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return the result as plain nested dicts and lists."""
        payload = asdict(self)
        payload["per_stratum"] = {
            stratum: asdict(result) for stratum, result in self.per_stratum.items()
        }
        return payload

    def summary(self) -> str:
        """Render a multi-line, human-readable text report.

        The report lists the retriever name, query count, optional index
        build time / size, latency statistics, and a metrics table with an
        "overall" row followed by one row per stratum in sorted order.
        """
        out = [f"Retriever: {self.retriever_name}"]
        out.append(f"Queries: {self.n_queries} (retrieve_k={self.retrieve_k})")
        if self.index_build_seconds is not None:
            out.append(f"Index build: {self.index_build_seconds:.3f}s")
        if self.index_size_bytes is not None:
            out.append(f"Index size: {self.index_size_bytes / 1e6:.2f} MB")

        stats = self.latency_ms
        out.append(
            f"Latency/query: p50={stats['p50']:.2f}ms p95={stats['p95']:.2f}ms "
            f"mean={stats['mean']:.2f}ms max={stats['max']:.2f}ms"
        )

        columns = metrics.METRIC_NAMES
        header = " ".join(f"{col:>10}" for col in columns)
        out += [
            "",
            f"{'stratum':<12}{'n':>5} {header}",
            "-" * (19 + len(header)),
        ]

        for label in ["overall", *sorted(self.per_stratum)]:
            # NOTE: any row labelled "overall" shows the overall metrics, so a
            # stratum that is literally named "overall" is rendered with them too.
            if label != "overall":
                stratum = self.per_stratum[label]
                values, count = stratum.metrics, stratum.n_queries
            else:
                values, count = self.overall, self.n_queries
            cells = " ".join(f"{values[col]:>10.4f}" for col in columns)
            out.append(f"{label:<12}{count:>5} {cells}")
        return "\n".join(out)
def _get_retrieve_fn(retriever: RetrieverLike) -> Callable[[str, int], list[MemoryId]]:
    """Resolve ``retriever`` to the function used for ``retrieve(query, k)`` calls.

    An object exposing a ``retrieve`` attribute takes precedence; otherwise the
    object itself is returned when it is callable.

    Raises:
        TypeError: If ``retriever`` has no ``retrieve`` attribute and is not
            callable.
    """
    absent = object()  # sentinel: tells "no attribute" apart from any real value
    found = getattr(retriever, "retrieve", absent)
    if found is not absent:
        return found  # type: ignore[return-value]
    if not callable(retriever):
        raise TypeError("retriever must implement retrieve(query, k) or be callable")
    return retriever
def _maybe_build_index(retriever: RetrieverLike, dataset: Dataset) -> tuple[float | None, int | None]:
    """Run the retriever's optional, duck-typed lifecycle hooks.

    Hooks that are looked up on ``retriever``:
      - ``build_index(corpus)``: called with ``dataset.corpus``; its wall-clock
        duration is measured.
      - ``index_size_bytes()``: its result is coerced to ``int``.

    Returns:
        ``(build_seconds, size_bytes)``; each element is ``None`` when the
        matching hook is missing or not callable. ``size_bytes`` is also
        ``None`` when the size hook (or the ``int`` coercion) raises.
    """
    elapsed: float | None = None
    builder = getattr(retriever, "build_index", None)
    if callable(builder):
        started = time.perf_counter()
        builder(dataset.corpus)
        elapsed = time.perf_counter() - started

    # The size is queried only after the (optional) build has run.
    index_bytes: int | None = None
    sizer = getattr(retriever, "index_size_bytes", None)
    if callable(sizer):
        try:
            index_bytes = int(sizer())
        except Exception:
            # Best effort: a failing size hook must not abort the benchmark.
            index_bytes = None

    return elapsed, index_bytes
def run_benchmark(
    retriever: RetrieverLike,
    dataset: Dataset,
    *,
    retrieve_k: int = DEFAULT_RETRIEVE_K,
    retriever_name: str | None = None,
    warmup: bool = True,
    collect_per_query: bool = True,
) -> BenchmarkResult:
    """Run ``retriever`` over every query in ``dataset`` and aggregate the results.

    Each query asks the retriever for ``retrieve_k`` ids. Per-query metrics are
    macro-averaged over all queries and separately per stratum. Latency covers
    only the retrieve() call (including materialising its result into a list);
    index construction is timed separately through the optional build hook.

    Args:
        retriever: Object with a ``retrieve`` attribute, or a bare callable.
        dataset: Provides ``queries``, ``qrels`` and ``corpus``.
        retrieve_k: Number of ids requested per query.
        retriever_name: Display name; falls back to the retriever's ``name``
            attribute, then to its type name.
        warmup: When true, issue one untimed call with the first query before
            measuring; an exception raised by that call is ignored.
        collect_per_query: When true, record one detail row per query.

    Returns:
        A ``BenchmarkResult`` with overall / per-stratum metrics, latency
        statistics (mean, p50, p95, max in ms) and optional index measurements.
    """
    label = retriever_name or getattr(retriever, "name", None) or type(retriever).__name__
    retrieve = _get_retrieve_fn(retriever)
    qrels = dataset.qrels
    build_seconds, size_bytes = _maybe_build_index(retriever, dataset)

    queries = dataset.queries

    # The first call may pay one-off import/JIT/connection costs that would
    # skew p95, so it is issued once up front and kept out of the statistics.
    if warmup and queries:
        try:
            retrieve(queries[0].text, retrieve_k)
        except Exception:
            pass  # a genuine failure will surface on the measured call below

    metric_names = metrics.METRIC_NAMES
    rows: list[dict[str, Any]] = []
    latencies_ms: list[float] = []
    totals = dict.fromkeys(metric_names, 0.0)
    # Running metric sums and query counts per stratum, for the macro-average.
    stratum_totals: dict[str, dict[str, float]] = {}
    stratum_counts: dict[str, int] = {}

    for query in queries:
        relevant = qrels[query.query_id]

        started = time.perf_counter()
        ranked = list(retrieve(query.text, retrieve_k))
        elapsed_ms = (time.perf_counter() - started) * 1000.0
        latencies_ms.append(elapsed_ms)

        scores = metrics.per_query_metrics(ranked, relevant)

        stratum = query.stratum
        if stratum not in stratum_totals:
            stratum_totals[stratum] = dict.fromkeys(metric_names, 0.0)
        stratum_counts[stratum] = stratum_counts.get(stratum, 0) + 1
        bucket = stratum_totals[stratum]
        for metric, value in scores.items():
            totals[metric] += value
            bucket[metric] += value

        if collect_per_query:
            row: dict[str, Any] = {
                "query_id": query.query_id,
                "stratum": stratum,
                "n_relevant": len(relevant),
                "latency_ms": round(elapsed_ms, 3),
                "retrieved": ranked[:retrieve_k],
            }
            row.update((metric, round(value, 6)) for metric, value in scores.items())
            rows.append(row)

    total = len(queries)
    overall = {
        metric: (totals[metric] / total if total else 0.0) for metric in metric_names
    }

    per_stratum: dict[str, StratumResult] = {}
    for stratum, sums in stratum_totals.items():
        count = stratum_counts[stratum]
        per_stratum[stratum] = StratumResult(
            stratum=stratum,
            n_queries=count,
            metrics={
                metric: (sums[metric] / count if count else 0.0)
                for metric in metric_names
            },
        )

    if latencies_ms:
        mean_ms = statistics.fmean(latencies_ms)
        max_ms = max(latencies_ms)
    else:
        mean_ms = max_ms = 0.0
    latency_stats = {
        "mean": mean_ms,
        "p50": _percentile(latencies_ms, 50),
        "p95": _percentile(latencies_ms, 95),
        "max": max_ms,
    }

    return BenchmarkResult(
        retriever_name=label,
        n_queries=total,
        retrieve_k=retrieve_k,
        overall=overall,
        per_stratum=per_stratum,
        latency_ms=latency_stats,
        index_build_seconds=build_seconds,
        index_size_bytes=size_bytes,
        per_query=rows,
    )