research: benchmark hybrid (lexical+dense+graph) recall vs current FTS

Viktor asked to enhance the memory system with 'semantics' — remember concepts (not just tokens) linked in a graph — and to prove, by benchmarking against the current system, that it actually improves recall. A multi-phase research workflow (18 agents) did landscape research, an adversarially-reviewed integration design, a stratified eval set over the real 5,452-memory corpus, and a head-to-head prototype-vs-current benchmark. Result: hybrid (lexical FTS + dense embeddings, RRF-fused) beats FTS on every overall metric, driven by a robust paraphrase win (recall@10 +0.350). Recommend adopting lexical+dense; the concept graph is DEFERRED. Post-run adversarial review correction (applied to all docs before commit): the prototype's fusion config structurally barred the graph leg from the ranked top-k, so the 'graph contributes nothing' ablation was a math artifact, NOT an empirical result — the graph is UNEVALUATED, not disproven (deferred on cost+uncertainty). Multi-hop deltas are not statistically significant. Glossary in CONTEXT.md; framing in ADR-0001-0003; findings in ADR-0004-0006 + docs/research/. Privacy: the corpus/queries/qrels/results are the user's real memories and stay gitignored (data/, cache/, results/, build_eval_set.py); only harness code, aggregate numbers, and synthetic examples are committed. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-25 17:51:53 +00:00 · 2026-06-25 17:51:53 +00:00 · 1cc8a2b378
commit 1cc8a2b378
parent 7439540f8f
23 changed files with 3428 additions and 0 deletions
--- a/benchmarks/harness/dataset.py
+++ b/benchmarks/harness/dataset.py
@@ -0,0 +1,115 @@
+"""Load corpus / queries / qrels JSONL into typed objects."""
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass
+from pathlib import Path
+
+from .types import Memory, Query, Qrels, MemoryId
+
+_DATA_DIR = Path(__file__).resolve().parents[1] / "data"
+
+
+@dataclass
+class Dataset:
+    """Bundle of the three evaluation inputs: corpus, queries and qrels.
+
+    The fields are stored exactly as passed in; nothing is copied or
+    validated by this class.
+    """
+
+    # All memories in the corpus.
+    corpus: list[Memory]
+    # All evaluation queries.
+    queries: list[Query]
+    # Relevance judgements for the queries.
+    qrels: Qrels
+
+    @property
+    def corpus_by_id(self) -> dict[MemoryId, Memory]:
+        """Return a fresh ``id -> Memory`` index over ``corpus``.
+
+        The index is rebuilt on every access. When two memories share an
+        id, the later one in ``corpus`` wins.
+        """
+        index: dict[MemoryId, Memory] = {}
+        for memory in self.corpus:
+            index[memory.id] = memory
+        return index
+
+    def strata(self) -> set[str]:
+        """Return the distinct ``stratum`` values found across ``queries``."""
+        seen: set[str] = set()
+        for query in self.queries:
+            seen.add(query.stratum)
+        return seen
+
+
+def _read_jsonl(path: Path) -> list[dict]:
+    """Decode a JSON Lines file into a list, one entry per non-blank line.
+
+    The file is read as UTF-8. Each line is stripped of surrounding
+    whitespace; lines that are empty after stripping are skipped.
+
+    Args:
+        path: Location of the ``.jsonl`` file to read.
+
+    Returns:
+        The decoded JSON value of every non-blank line, in file order.
+
+    Raises:
+        json.JSONDecodeError: If a non-blank line is not valid JSON. The
+            message is prefixed with ``path:lineno`` (1-based) so the bad
+            record can be located; the original error is chained as the
+            cause.
+    """
+    out: list[dict] = []
+    with path.open(encoding="utf-8") as f:
+        for lineno, line in enumerate(f, start=1):
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                record = json.loads(line)
+            except json.JSONDecodeError as exc:
+                # Same exception type as before, so existing handlers still
+                # match; only the message gains the file/line location.
+                raise json.JSONDecodeError(
+                    f"{path}:{lineno}: {exc.msg}", exc.doc, exc.pos
+                ) from exc
+            out.append(record)
+    return out
+
+
+def load_corpus(path: Path | None = None) -> list[Memory]:
+    """Read a corpus JSONL file and build one ``Memory`` per row.
+
+    Args:
+        path: File to read. When omitted, ``corpus.jsonl`` inside the
+            module's data directory is used.
+
+    Returns:
+        The memories in file order. ``id`` and ``content`` are required
+        keys; a missing ``category`` becomes ``"facts"``, a missing
+        ``importance`` becomes ``0.5``, and missing or falsy ``tags`` /
+        ``expanded_keywords`` become ``""``.
+    """
+    source = path or (_DATA_DIR / "corpus.jsonl")
+    memories: list[Memory] = []
+    for row in _read_jsonl(source):
+        memory = Memory(
+            id=row["id"],
+            content=row["content"],
+            category=row.get("category", "facts"),
+            # `or ""` also normalises an explicit null/empty value.
+            tags=row.get("tags") or "",
+            expanded_keywords=row.get("expanded_keywords") or "",
+            importance=row.get("importance", 0.5),
+        )
+        memories.append(memory)
+    return memories
+
+
+def load_queries(path: Path | None = None) -> list[Query]:
+    """Read a queries JSONL file and build one ``Query`` per row.
+
+    Args:
+        path: File to read. When omitted, ``queries.jsonl`` inside the
+            module's data directory is used.
+
+    Returns:
+        The queries in file order. ``query_id``, ``text`` and ``stratum``
+        are required keys; ``relevant_ids`` is optional and is stored as
+        a tuple (empty when the key is absent).
+    """
+    source = path or (_DATA_DIR / "queries.jsonl")
+    queries: list[Query] = []
+    for row in _read_jsonl(source):
+        query = Query(
+            query_id=row["query_id"],
+            text=row["text"],
+            stratum=row["stratum"],
+            relevant_ids=tuple(row.get("relevant_ids", [])),
+        )
+        queries.append(query)
+    return queries
+
+
+def load_qrels(path: Path | None = None) -> Qrels:
+    """Read a qrels JSONL file into a ``query_id -> set of ids`` mapping.
+
+    Rows that repeat a ``query_id`` are merged: their ``relevant_ids``
+    are unioned into a single set.
+
+    Args:
+        path: File to read. When omitted, ``qrels.jsonl`` inside the
+            module's data directory is used.
+
+    Returns:
+        Mapping from each query id to the set of ids judged relevant.
+    """
+    source = path or (_DATA_DIR / "qrels.jsonl")
+    judgements: Qrels = {}
+    for row in _read_jsonl(source):
+        query_id = row["query_id"]
+        relevant = set(row["relevant_ids"])
+        if query_id in judgements:
+            judgements[query_id] |= relevant
+        else:
+            judgements[query_id] = relevant
+    return judgements
+
+
+def load_dataset(
+    corpus_path: Path | None = None,
+    queries_path: Path | None = None,
+    qrels_path: Path | None = None,
+    *,
+    validate: bool = True,
+) -> Dataset:
+    """Load corpus, queries and qrels and wrap them in a ``Dataset``.
+
+    Args:
+        corpus_path: Passed to ``load_corpus``; ``None`` selects its default.
+        queries_path: Passed to ``load_queries``; ``None`` selects its default.
+        qrels_path: Passed to ``load_qrels``; ``None`` selects its default.
+        validate: When true (the default), run the cross-file consistency
+            checks in ``_validate`` before returning.
+
+    Returns:
+        The assembled ``Dataset``.
+
+    Raises:
+        ValueError: If ``validate`` is true and the checks fail.
+    """
+    dataset = Dataset(
+        corpus=load_corpus(corpus_path),
+        queries=load_queries(queries_path),
+        qrels=load_qrels(qrels_path),
+    )
+    if not validate:
+        return dataset
+
+    _validate(dataset.corpus, dataset.queries, dataset.qrels)
+    return dataset
+
+
+def _validate(corpus: list[Memory], queries: list[Query], qrels: Qrels) -> None:
+    """Check that corpus, queries and qrels are mutually consistent.
+
+    Checks run in this order and stop at the first failure:
+
+    1. every query id has a qrels entry;
+    2. every qrels entry belongs to a known query;
+    3. for each qrels entry, the relevant set is non-empty and contains
+       only ids present in the corpus.
+
+    Raises:
+        ValueError: On the first violated check. Messages list at most
+            ten offending ids, sorted.
+    """
+    known_memory_ids = {memory.id for memory in corpus}
+    known_query_ids = {query.query_id for query in queries}
+    judged_query_ids = set(qrels)
+
+    # Queries and qrels must cover exactly the same set of query ids.
+    unjudged = known_query_ids - judged_query_ids
+    if unjudged:
+        raise ValueError(f"queries without qrels: {sorted(unjudged)[:10]}")
+    stray = judged_query_ids - known_query_ids
+    if stray:
+        raise ValueError(f"qrels without queries: {sorted(stray)[:10]}")
+
+    # Each judgement set must be non-empty and point only at corpus ids.
+    for qid, rels in qrels.items():
+        if len(rels) == 0:
+            raise ValueError(f"empty qrels for query {qid}")
+        dangling = rels - known_memory_ids
+        if not dangling:
+            continue
+        raise ValueError(
+            f"query {qid} references non-corpus ids {sorted(dangling)[:10]}"
+        )