claude-memory-mcp/benchmarks/harness/dataset.py

"""Load corpus / queries / qrels JSONL into typed objects."""
from __future__ import annotations

import json
from dataclasses import dataclass
from pathlib import Path

from .types import Memory, Query, Qrels, MemoryId

_DATA_DIR = Path(__file__).resolve().parents[1] / "data"


@dataclass
class Dataset:
    """Bundle of the three benchmark inputs: corpus, queries and qrels.

    The container performs no checking of its own; it simply holds what it
    was given and offers two derived views over that data.
    """

    # Memory records, in the order they were supplied.
    corpus: list[Memory]
    # Query records, in the order they were supplied.
    queries: list[Query]
    # Relevance judgements (see ``Qrels`` in ``.types``).
    qrels: Qrels

    @property
    def corpus_by_id(self) -> dict[MemoryId, Memory]:
        """Return a fresh ``id -> memory`` index over ``corpus``.

        The mapping is rebuilt on every access, so callers that need many
        lookups should bind the result to a local once. If two memories
        share an id, the one appearing later in ``corpus`` is kept.
        """
        index: dict[MemoryId, Memory] = {}
        for memory in self.corpus:
            index[memory.id] = memory
        return index

    def strata(self) -> set[str]:
        """Return the distinct ``stratum`` values found across ``queries``."""
        labels: set[str] = set()
        for query in self.queries:
            labels.add(query.stratum)
        return labels


def _read_jsonl(path: Path) -> list[dict]:
    """Parse a JSON Lines file into a list of decoded rows.

    Each non-blank line is decoded independently with ``json.loads``; blank
    and whitespace-only lines are skipped. A leading UTF-8 byte-order mark
    is tolerated.

    Args:
        path: Location of the ``.jsonl`` file.

    Returns:
        The decoded rows in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message is prefixed with ``<path>:<line number>`` so the bad
            row can be located in a large file.
    """
    out: list[dict] = []
    # "utf-8-sig" decodes plain UTF-8 unchanged but drops a leading BOM,
    # which str.strip() does not remove and json.loads() rejects.
    with path.open(encoding="utf-8-sig") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError as err:
                # Re-raise the same exception type so existing handlers
                # keep working, but say which file and line was malformed.
                raise json.JSONDecodeError(
                    f"{path}:{lineno}: {err.msg}", err.doc, err.pos
                ) from err
            out.append(row)
    return out


def load_corpus(path: Path | None = None) -> list[Memory]:
    """Read the corpus JSONL file and build one ``Memory`` per row.

    Args:
        path: File to read. Falls back to ``corpus.jsonl`` inside the
            module's data directory when omitted.

    Returns:
        The memories in file order.

    Raises:
        KeyError: If a row lacks ``"id"`` or ``"content"``.

    Optional fields default as follows: ``category`` to ``"facts"``,
    ``importance`` to ``0.5``, and ``tags`` / ``expanded_keywords`` to the
    empty string (also used when the stored value is null or otherwise
    falsy).
    """
    source = path or (_DATA_DIR / "corpus.jsonl")
    memories: list[Memory] = []
    for row in _read_jsonl(source):
        memories.append(
            Memory(
                id=row["id"],
                content=row["content"],
                category=row.get("category", "facts"),
                tags=row.get("tags") or "",
                expanded_keywords=row.get("expanded_keywords") or "",
                importance=row.get("importance", 0.5),
            )
        )
    return memories


def load_queries(path: Path | None = None) -> list[Query]:
    """Read the queries JSONL file and build one ``Query`` per row.

    Args:
        path: File to read. Falls back to ``queries.jsonl`` inside the
            module's data directory when omitted.

    Returns:
        The queries in file order. ``relevant_ids`` is stored as a tuple
        and is empty when the row has no such key.

    Raises:
        KeyError: If a row lacks ``"query_id"``, ``"text"`` or
            ``"stratum"``.
    """
    source = path or (_DATA_DIR / "queries.jsonl")
    loaded: list[Query] = []
    for row in _read_jsonl(source):
        relevant = tuple(row.get("relevant_ids", ()))
        loaded.append(
            Query(
                query_id=row["query_id"],
                text=row["text"],
                stratum=row["stratum"],
                relevant_ids=relevant,
            )
        )
    return loaded


def load_qrels(path: Path | None = None) -> Qrels:
    """Read the qrels JSONL file into a ``query_id -> relevant ids`` mapping.

    Args:
        path: File to read. Falls back to ``qrels.jsonl`` inside the
            module's data directory when omitted.

    Returns:
        A dict whose values are sets of relevant ids. When a query id
        appears on several rows, its sets are unioned.

    Raises:
        KeyError: If a row lacks ``"query_id"`` or ``"relevant_ids"``.
    """
    source = path or (_DATA_DIR / "qrels.jsonl")
    merged: Qrels = {}
    for row in _read_jsonl(source):
        query_id = row["query_id"]
        relevant = set(row["relevant_ids"])
        if query_id in merged:
            # Repeated query id: fold this row into the existing judgements.
            merged[query_id].update(relevant)
        else:
            merged[query_id] = relevant
    return merged


def load_dataset(
    corpus_path: Path | None = None,
    queries_path: Path | None = None,
    qrels_path: Path | None = None,
    *,
    validate: bool = True,
) -> Dataset:
    """Load corpus, queries and qrels and bundle them into a ``Dataset``.

    Args:
        corpus_path: Passed to ``load_corpus``; ``None`` selects its default.
        queries_path: Passed to ``load_queries``; ``None`` selects its default.
        qrels_path: Passed to ``load_qrels``; ``None`` selects its default.
        validate: When true (the default), cross-check the three parts with
            ``_validate`` before returning.

    Returns:
        The assembled dataset.

    Raises:
        ValueError: If ``validate`` is true and the parts are inconsistent.
    """
    # Keyword arguments are evaluated left to right, so the files are read
    # in the order corpus, queries, qrels.
    dataset = Dataset(
        corpus=load_corpus(corpus_path),
        queries=load_queries(queries_path),
        qrels=load_qrels(qrels_path),
    )
    if not validate:
        return dataset

    _validate(dataset.corpus, dataset.queries, dataset.qrels)
    return dataset


def _validate(corpus: list[Memory], queries: list[Query], qrels: Qrels) -> None:
    """Cross-check corpus, queries and qrels; raise on the first problem.

    Checks, in order:
      1. every query id has a qrels entry;
      2. every qrels entry belongs to a known query id;
      3. for each qrels entry, the relevant set is non-empty and contains
         only ids present in the corpus.

    Raises:
        ValueError: Describing the first failed check. Lists of offending
            ids in the message are sorted and capped at ten items.
    """
    known_memory_ids = {memory.id for memory in corpus}
    query_ids = {query.query_id for query in queries}
    judged_ids = set(qrels)

    # Queries and qrels must cover exactly the same set of query ids.
    missing_qrels = query_ids - judged_ids
    if missing_qrels:
        sample = sorted(missing_qrels)[:10]
        raise ValueError(f"queries without qrels: {sample}")

    orphan_qrels = judged_ids - query_ids
    if orphan_qrels:
        sample = sorted(orphan_qrels)[:10]
        raise ValueError(f"qrels without queries: {sample}")

    # Each judgement set must be non-empty and point only at corpus ids.
    for qid in qrels:
        rels = qrels[qid]
        if not rels:
            raise ValueError(f"empty qrels for query {qid}")

        unknown = rels - known_memory_ids
        if not unknown:
            continue
        sample = sorted(unknown)[:10]
        raise ValueError(f"query {qid} references non-corpus ids {sample}")
research: benchmark hybrid (lexical+dense+graph) recall vs current FTS Viktor asked to enhance the memory system with 'semantics' — remember concepts (not just tokens) linked in a graph — and to prove, by benchmarking against the current system, that it actually improves recall. A multi-phase research workflow (18 agents) did landscape research, an adversarially-reviewed integration design, a stratified eval set over the real 5,452-memory corpus, and a head-to-head prototype-vs-current benchmark. Result: hybrid (lexical FTS + dense embeddings, RRF-fused) beats FTS on every overall metric, driven by a robust paraphrase win (recall@10 +0.350). Recommend adopting lexical+dense; the concept graph is DEFERRED. Post-run adversarial review correction (applied to all docs before commit): the prototype's fusion config structurally barred the graph leg from the ranked top-k, so the 'graph contributes nothing' ablation was a math artifact, NOT an empirical result — the graph is UNEVALUATED, not disproven (deferred on cost+uncertainty). Multi-hop deltas are not statistically significant. Glossary in CONTEXT.md; framing in ADR-0001-0003; findings in ADR-0004-0006 + docs/research/. Privacy: the corpus/queries/qrels/results are the user's real memories and stay gitignored (data/, cache/, results/, build_eval_set.py); only harness code, aggregate numbers, and synthetic examples are committed. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> 2026-06-25 17:51:53 +00:00			`"""Load corpus / queries / qrels JSONL into typed objects."""`
			`from __future__ import annotations`

			`import json`
			`from dataclasses import dataclass`
			`from pathlib import Path`

			`from .types import Memory, Query, Qrels, MemoryId`

			`_DATA_DIR = Path(__file__).resolve().parents[1] / "data"`


			`@dataclass`
			`class Dataset:`
			`corpus: list[Memory]`
			`queries: list[Query]`
			`qrels: Qrels`

			`@property`
			`def corpus_by_id(self) -> dict[MemoryId, Memory]:`
			`return {m.id: m for m in self.corpus}`

			`def strata(self) -> set[str]:`
			`return {q.stratum for q in self.queries}`


			`def _read_jsonl(path: Path) -> list[dict]:`
			`out: list[dict] = []`
			`with path.open(encoding="utf-8") as f:`
			`for line in f:`
			`line = line.strip()`
			`if line:`
			`out.append(json.loads(line))`
			`return out`


			`def load_corpus(path: Path \| None = None) -> list[Memory]:`
			`path = path or (_DATA_DIR / "corpus.jsonl")`
			`rows = _read_jsonl(path)`
			`return [`
			`Memory(`
			`id=r["id"],`
			`content=r["content"],`
			`category=r.get("category", "facts"),`
			`tags=r.get("tags", "") or "",`
			`expanded_keywords=r.get("expanded_keywords", "") or "",`
			`importance=r.get("importance", 0.5),`
			`)`
			`for r in rows`
			`]`


			`def load_queries(path: Path \| None = None) -> list[Query]:`
			`path = path or (_DATA_DIR / "queries.jsonl")`
			`rows = _read_jsonl(path)`
			`return [`
			`Query(`
			`query_id=r["query_id"],`
			`text=r["text"],`
			`stratum=r["stratum"],`
			`relevant_ids=tuple(r.get("relevant_ids", [])),`
			`)`
			`for r in rows`
			`]`


			`def load_qrels(path: Path \| None = None) -> Qrels:`
			`path = path or (_DATA_DIR / "qrels.jsonl")`
			`rows = _read_jsonl(path)`
			`qrels: Qrels = {}`
			`for r in rows:`
			`qid = r["query_id"]`
			`rel = set(r["relevant_ids"])`
			`qrels.setdefault(qid, set()).update(rel)`
			`return qrels`


			`def load_dataset(`
			`corpus_path: Path \| None = None,`
			`queries_path: Path \| None = None,`
			`qrels_path: Path \| None = None,`
			`*,`
			`validate: bool = True,`
			`) -> Dataset:`
			`corpus = load_corpus(corpus_path)`
			`queries = load_queries(queries_path)`
			`qrels = load_qrels(qrels_path)`

			`if validate:`
			`_validate(corpus, queries, qrels)`

			`return Dataset(corpus=corpus, queries=queries, qrels=qrels)`


			`def _validate(corpus: list[Memory], queries: list[Query], qrels: Qrels) -> None:`
			`corpus_ids = {m.id for m in corpus}`
			`q_ids = {q.query_id for q in queries}`

			`# Every query must have a qrels entry, and vice versa.`
			`missing_qrels = q_ids - set(qrels)`
			`if missing_qrels:`
			`raise ValueError(f"queries without qrels: {sorted(missing_qrels)[:10]}")`
			`orphan_qrels = set(qrels) - q_ids`
			`if orphan_qrels:`
			`raise ValueError(f"qrels without queries: {sorted(orphan_qrels)[:10]}")`

			`# Every relevant id must exist in the corpus and the set must be non-empty.`
			`for qid, rels in qrels.items():`
			`if not rels:`
			`raise ValueError(f"empty qrels for query {qid}")`
			`unknown = rels - corpus_ids`
			`if unknown:`
			`raise ValueError(`
			`f"query {qid} references non-corpus ids {sorted(unknown)[:10]}"`
			`)`