claude-memory-mcp/benchmarks/scripts/dataset_stats.py

#!/usr/bin/env python3
"""Validate the eval set and print AGGREGATE stats (safe to share / commit-able
numbers only — prints NO raw memory content)."""
from __future__ import annotations

import json
import statistics
import sys
from collections import Counter
from pathlib import Path

sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
from harness import load_dataset  # noqa: E402


def main() -> None:
    """Load the validated eval set and print aggregate statistics as JSON.

    The report holds only counts, stratum labels and summary statistics:
    corpus size, query count, queries per stratum, the min/median/max/mean
    number of relevant ids per query within each stratum, and how many
    distinct ids appear as relevant anywhere in the qrels.
    """
    # validate=True: the harness is expected to raise on any
    # referential-integrity issue, so reaching the next line means it passed.
    dataset = load_dataset(validate=True)

    queries_per_stratum = Counter(query.stratum for query in dataset.queries)

    # Group the relevant-id count of every query under its stratum.
    sizes_by_stratum = {}
    for query in dataset.queries:
        judged = dataset.qrels[query.query_id]
        sizes_by_stratum.setdefault(query.stratum, []).append(len(judged))

    def summarize(sizes):
        """Return min/median/max/mean for one stratum's relevant-id counts."""
        return {
            "min": min(sizes),
            "median": statistics.median(sizes),
            "max": max(sizes),
            "mean": round(statistics.fmean(sizes), 2),
        }

    per_stratum_summary = {}
    for stratum, sizes in sizes_by_stratum.items():
        per_stratum_summary[stratum] = summarize(sizes)

    # Every id judged relevant for at least one query, counted once.
    covered_ids = set()
    for judged in dataset.qrels.values():
        covered_ids |= judged

    report = {}
    report["corpus_count"] = len(dataset.corpus)
    report["query_count"] = len(dataset.queries)
    report["strata"] = dict(queries_per_stratum)
    report["relevant_ids_per_query"] = per_stratum_summary
    report["distinct_relevant_memories"] = len(covered_ids)
    report["validation"] = (
        "PASS (all qrels ids exist in corpus; every query has qrels)"
    )
    print(json.dumps(report, indent=2))


# Run the stats report only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
research: benchmark hybrid (lexical+dense+graph) recall vs current FTS

Viktor asked to enhance the memory system with 'semantics' — remember concepts (not just tokens) linked in a graph — and to prove, by benchmarking against the current system, that it actually improves recall.

A multi-phase research workflow (18 agents) did landscape research, an adversarially-reviewed integration design, a stratified eval set over the real 5,452-memory corpus, and a head-to-head prototype-vs-current benchmark.

Result: hybrid (lexical FTS + dense embeddings, RRF-fused) beats FTS on every overall metric, driven by a robust paraphrase win (recall@10 +0.350). Recommend adopting lexical+dense; the concept graph is DEFERRED.

Post-run adversarial review correction (applied to all docs before commit): the prototype's fusion config structurally barred the graph leg from the ranked top-k, so the 'graph contributes nothing' ablation was a math artifact, NOT an empirical result — the graph is UNEVALUATED, not disproven (deferred on cost+uncertainty). Multi-hop deltas are not statistically significant.

Glossary in CONTEXT.md; framing in ADR-0001-0003; findings in ADR-0004-0006 + docs/research/.

Privacy: the corpus/queries/qrels/results are the user's real memories and stay gitignored (data/, cache/, results/, build_eval_set.py); only harness code, aggregate numbers, and synthetic examples are committed.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-25 17:51:53 +00:00
			`#!/usr/bin/env python3`
			`"""Validate the eval set and print AGGREGATE stats (safe to share / commit-able`
			`numbers only — prints NO raw memory content)."""`
			`from __future__ import annotations`

			`import json`
			`import statistics`
			`import sys`
			`from collections import Counter`
			`from pathlib import Path`

			`sys.path.insert(0, str(Path(__file__).resolve().parents[1]))`
			`from harness import load_dataset # noqa: E402`


			`def main() -> None:`
			`ds = load_dataset(validate=True) # raises on any referential-integrity issue`

			`strata = Counter(q.stratum for q in ds.queries)`
			`rel_per_q = {s: [] for s in strata}`
			`for q in ds.queries:`
			`rel_per_q[q.stratum].append(len(ds.qrels[q.query_id]))`

			`# how many DISTINCT corpus memories are exercised as relevant`
			`relevant_union = set()`
			`for rels in ds.qrels.values():`
			`relevant_union |= rels`

			`out = {`
			`"corpus_count": len(ds.corpus),`
			`"query_count": len(ds.queries),`
			`"strata": dict(strata),`
			`"relevant_ids_per_query": {`
			`s: {`
			`"min": min(v),`
			`"median": statistics.median(v),`
			`"max": max(v),`
			`"mean": round(statistics.fmean(v), 2),`
			`}`
			`for s, v in rel_per_q.items()`
			`},`
			`"distinct_relevant_memories": len(relevant_union),`
			`"validation": "PASS (all qrels ids exist in corpus; every query has qrels)",`
			`}`
			`print(json.dumps(out, indent=2))`


			`if __name__ == "__main__":`
			`main()`