claude-memory-mcp/benchmarks/harness/dataset.py
Viktor Barzin 1cc8a2b378
Some checks are pending
Build and Push / lint-and-test (push) Waiting to run
Build and Push / build (push) Blocked by required conditions
Build and Push / deploy (push) Blocked by required conditions
Build and Push / notify-failure (push) Blocked by required conditions
research: benchmark hybrid (lexical+dense+graph) recall vs current FTS
Viktor asked to enhance the memory system with 'semantics' — remember concepts
(not just tokens) linked in a graph — and to prove, by benchmarking against the
current system, that it actually improves recall. A multi-phase research workflow
(18 agents) did landscape research, an adversarially-reviewed integration design,
a stratified eval set over the real 5,452-memory corpus, and a head-to-head
prototype-vs-current benchmark.

Result: hybrid (lexical FTS + dense embeddings, RRF-fused) beats FTS on every
overall metric, driven by a robust paraphrase win (recall@10 +0.350). Recommend
adopting lexical+dense; the concept graph is DEFERRED.

Post-run adversarial review correction (applied to all docs before commit): the
prototype's fusion config structurally barred the graph leg from the ranked top-k,
so the 'graph contributes nothing' ablation was a math artifact, NOT an empirical
result — the graph is UNEVALUATED, not disproven (deferred on cost+uncertainty).
Multi-hop deltas are not statistically significant. Glossary in CONTEXT.md; framing
in ADR-0001-0003; findings in ADR-0004-0006 + docs/research/.

Privacy: the corpus/queries/qrels/results are the user's real memories and stay
gitignored (data/, cache/, results/, build_eval_set.py); only harness code,
aggregate numbers, and synthetic examples are committed.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-25 17:51:53 +00:00

115 lines
3.2 KiB
Python

"""Load corpus / queries / qrels JSONL into typed objects."""
from __future__ import annotations
import json
from dataclasses import dataclass
from pathlib import Path
from .types import Memory, Query, Qrels, MemoryId
# Default location of the JSONL inputs: the "data" directory that sits one
# level above the directory containing this module. Used as the fallback by
# the load_* functions when no explicit path is passed.
_DATA_DIR = Path(__file__).resolve().parents[1] / "data"
@dataclass
class Dataset:
    """Bundle of the three inputs an evaluation run needs."""

    # Memories to search over.
    corpus: list[Memory]
    # Queries to run against the corpus.
    queries: list[Query]
    # Relevance judgments (see the ``Qrels`` type).
    qrels: Qrels

    @property
    def corpus_by_id(self) -> dict[MemoryId, Memory]:
        """Return an ``id -> Memory`` lookup table for ``corpus``.

        The table is rebuilt on every access, so bind it to a local name
        before using it inside a loop. If two memories share an id, the one
        that appears later in ``corpus`` wins.
        """
        index: dict[MemoryId, Memory] = {}
        for memory in self.corpus:
            index[memory.id] = memory
        return index

    def strata(self) -> set[str]:
        """Return the distinct ``stratum`` values found across ``queries``."""
        return {query.stratum for query in self.queries}
def _read_jsonl(path: Path) -> list[dict]:
    """Parse a JSON Lines file into a list of decoded rows.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped. Rows are returned in file order.

    Args:
        path: File to read; opened as UTF-8 text.

    Returns:
        The decoded value of every non-blank line.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a line is not valid JSON. The message is
            prefixed with ``<path>:<line number>`` so the offending row can
            be located in a large file.
    """
    rows: list[dict] = []
    with path.open(encoding="utf-8") as f:
        for lineno, raw in enumerate(f, start=1):
            text = raw.strip()
            if not text:
                continue
            try:
                row = json.loads(text)
            except json.JSONDecodeError as err:
                # json only sees one line at a time, so its own "line 1"
                # is useless here. Re-raise the same exception type (callers
                # may catch it) with the file and physical line number.
                raise json.JSONDecodeError(
                    f"{path}:{lineno}: {err.msg}", err.doc, err.pos
                ) from err
            rows.append(row)
    return rows
def load_corpus(path: Path | None = None) -> list[Memory]:
    """Load the corpus JSONL file into ``Memory`` objects.

    Optional fields fall back to a default both when the key is absent and
    when it is present with a JSON ``null``: ``category`` -> ``"facts"``,
    ``tags`` / ``expanded_keywords`` -> ``""``, ``importance`` -> ``0.5``.

    Args:
        path: JSONL file to read. Defaults to ``corpus.jsonl`` inside the
            module's default data directory.

    Returns:
        One ``Memory`` per non-blank line of the file, in file order.

    Raises:
        KeyError: If a row has no ``id`` or no ``content``.
        OSError, json.JSONDecodeError: Propagated from reading the file.
    """
    path = path or (_DATA_DIR / "corpus.jsonl")

    def _coalesce(row: dict, key: str, default: object) -> object:
        # dict.get(key, default) only falls back when the key is missing; an
        # explicit null would come through as None. Compare against None
        # rather than using `or` so valid falsy values (e.g. importance 0.0)
        # are kept.
        value = row.get(key)
        return default if value is None else value

    return [
        Memory(
            id=row["id"],
            content=row["content"],
            category=_coalesce(row, "category", "facts"),
            tags=row.get("tags") or "",
            expanded_keywords=row.get("expanded_keywords") or "",
            importance=_coalesce(row, "importance", 0.5),
        )
        for row in _read_jsonl(path)
    ]
def load_queries(path: Path | None = None) -> list[Query]:
    """Load the queries JSONL file into ``Query`` objects.

    Args:
        path: JSONL file to read. Defaults to ``queries.jsonl`` inside the
            module's default data directory.

    Returns:
        One ``Query`` per non-blank line of the file, in file order. A row
        without ``relevant_ids`` yields an empty tuple for that field.

    Raises:
        KeyError: If a row lacks ``query_id``, ``text`` or ``stratum``.
    """
    source = path or (_DATA_DIR / "queries.jsonl")

    def _to_query(row: dict) -> Query:
        # Stored as a tuple so the ids on a Query are an immutable sequence.
        return Query(
            query_id=row["query_id"],
            text=row["text"],
            stratum=row["stratum"],
            relevant_ids=tuple(row.get("relevant_ids", ())),
        )

    return [_to_query(row) for row in _read_jsonl(source)]
def load_qrels(path: Path | None = None) -> Qrels:
    """Load relevance judgments, keyed by query id.

    Rows that share a ``query_id`` are merged: the result holds the union of
    their ``relevant_ids``.

    Args:
        path: JSONL file to read. Defaults to ``qrels.jsonl`` inside the
            module's default data directory.

    Returns:
        A dict mapping each query id to the set of its relevant ids.

    Raises:
        KeyError: If a row lacks ``query_id`` or ``relevant_ids``.
    """
    source = path or (_DATA_DIR / "qrels.jsonl")
    merged: Qrels = {}
    for row in _read_jsonl(source):
        query_id = row["query_id"]
        relevant = set(row["relevant_ids"])
        if query_id not in merged:
            merged[query_id] = set()
        merged[query_id] |= relevant
    return merged
def load_dataset(
    corpus_path: Path | None = None,
    queries_path: Path | None = None,
    qrels_path: Path | None = None,
    *,
    validate: bool = True,
) -> Dataset:
    """Load corpus, queries and qrels and bundle them into a ``Dataset``.

    Args:
        corpus_path: Corpus JSONL file; ``None`` uses the loader's default.
        queries_path: Queries JSONL file; ``None`` uses the loader's default.
        qrels_path: Qrels JSONL file; ``None`` uses the loader's default.
        validate: When true (the default), cross-check the three inputs
            before returning.

    Returns:
        The assembled ``Dataset``.

    Raises:
        ValueError: If ``validate`` is true and the inputs are inconsistent.
    """
    # Load order is corpus, then queries, then qrels.
    dataset = Dataset(
        corpus=load_corpus(corpus_path),
        queries=load_queries(queries_path),
        qrels=load_qrels(qrels_path),
    )
    if validate:
        _validate(dataset.corpus, dataset.queries, dataset.qrels)
    return dataset
def _validate(corpus: list[Memory], queries: list[Query], qrels: Qrels) -> None:
    """Check that corpus, queries and qrels are mutually consistent.

    Checks run in this order and stop at the first failure:

    1. every query id has a qrels entry;
    2. every qrels entry belongs to a known query;
    3. per qrels entry (in dict order): the relevant set is non-empty, and
       every id in it exists in the corpus.

    Error messages list at most ten offending ids, sorted.

    Raises:
        ValueError: On the first failed check.
    """
    known_memory_ids = {memory.id for memory in corpus}
    query_ids = {query.query_id for query in queries}
    judged_ids = set(qrels)

    # Queries and qrels must cover exactly the same set of query ids.
    unjudged = query_ids - judged_ids
    if unjudged:
        sample = sorted(unjudged)[:10]
        raise ValueError(f"queries without qrels: {sample}")

    unmatched = judged_ids - query_ids
    if unmatched:
        sample = sorted(unmatched)[:10]
        raise ValueError(f"qrels without queries: {sample}")

    # Each judgment set must be non-empty and point only at corpus members.
    for qid, rels in qrels.items():
        if not rels:
            raise ValueError(f"empty qrels for query {qid}")
        unknown = rels - known_memory_ids
        if not unknown:
            continue
        sample = sorted(unknown)[:10]
        raise ValueError(f"query {qid} references non-corpus ids {sample}")