Some checks are pending
Viktor asked to enhance the memory system with 'semantics' — remember concepts (not just tokens) linked in a graph — and to prove, by benchmarking against the current system, that it actually improves recall. A multi-phase research workflow (18 agents) did landscape research, an adversarially-reviewed integration design, a stratified eval set over the real 5,452-memory corpus, and a head-to-head prototype-vs-current benchmark. Result: hybrid (lexical FTS + dense embeddings, RRF-fused) beats FTS on every overall metric, driven by a robust paraphrase win (recall@10 +0.350). Recommend adopting lexical+dense; the concept graph is DEFERRED. Post-run adversarial review correction (applied to all docs before commit): the prototype's fusion config structurally barred the graph leg from the ranked top-k, so the 'graph contributes nothing' ablation was a math artifact, NOT an empirical result — the graph is UNEVALUATED, not disproven (deferred on cost+uncertainty). Multi-hop deltas are not statistically significant. Glossary in CONTEXT.md; framing in ADR-0001-0003; findings in ADR-0004-0006 + docs/research/. Privacy: the corpus/queries/qrels/results are the user's real memories and stay gitignored (data/, cache/, results/, build_eval_set.py); only harness code, aggregate numbers, and synthetic examples are committed. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
204 lines
7.9 KiB
Python
204 lines
7.9 KiB
Python
"""Unit tests for the HYBRID retriever's pure logic: concept normalisation, the
concept-graph build + 1-hop expansion, weighted RRF fusion, and graceful
degradation when the dense leg is unavailable.

These tests are MODEL-FREE on purpose — they never load sentence-transformers (a
~1.3 GB / multi-minute CPU load). The dense leg is exercised by monkeypatching the
ranking method, so the fusion + graph behaviour is verified deterministically and
fast. The full end-to-end quality run is done via scripts/run_eval.py against the
real (local, gitignored) corpus.

Run: .venv/bin/python -m pytest retrievers/test_hybrid.py -q
"""
|
|
from __future__ import annotations
|
|
|
|
import math
|
|
|
|
from harness.types import Memory
|
|
from retrievers.hybrid import (
|
|
_RRF_K,
|
|
HybridRetriever,
|
|
_concepts_for,
|
|
_normalise_concept,
|
|
)
|
|
|
|
|
|
# ---------------- concept normalisation ----------------
|
|
|
|
def test_normalise_concept_depluralisation():
    """Check `_normalise_concept` against a table of input -> expected pairs.

    The table mixes plural forms that should lose their suffix, tokens whose
    trailing "s" must be kept, and mixed-case / alphanumeric inputs.
    """
    expected_by_input = {
        # plural (and capitalised) forms collapse to a lower-case singular
        "Decisions": "decision",
        "policies": "policy",
        "addresses": "address",
        "boxes": "box",
        "tags": "tag",
        # invariants: don't over-strip
        "access": "access",
        "class": "class",
        "status": "status",
        "analysis": "analysis",
        "kubernetes": "kubernete",  # heuristic, acceptable (collapses consistently)
        "k8s": "k8s",
        "GPU": "gpu",
    }
    for raw, expected in expected_by_input.items():
        actual = _normalise_concept(raw)
        # The message shows the offending input next to what it was turned into.
        assert actual == expected, f"{raw!r} -> {actual!r}"
|
|
|
|
|
|
def test_normalise_concept_is_stable_under_repetition():
    """Normalising a second time must give the same result as normalising once.

    Idempotence is what lets the graph collapse variants consistently no matter
    which source field a token came from.
    """
    already_normalised = ("decision", "policy", "address", "tag", "gpu", "access")
    for token in already_normalised:
        once = _normalise_concept(token)
        twice = _normalise_concept(once)
        assert twice == once
|
|
|
|
|
|
def test_concepts_for_unions_tags_keywords_content():
    """`_concepts_for` should draw concepts from tags, expanded keywords and content."""
    memory = Memory(
        id=1,
        content="The Postgres cluster uses pgvector for embeddings.",
        tags="database,postgres",
        expanded_keywords="cnpg vector search",
    )
    concepts = _concepts_for(memory)

    # From tags. Note 'postgres' de-plurals to 'postgre' — a consistent heuristic
    # collapse; what matters is that every memory mentioning it lands on the SAME
    # node.
    for from_tags in ("database", "postgre"):
        assert from_tags in concepts

    # From expanded_keywords.
    for from_keywords in ("cnpg", "vector", "search"):
        assert from_keywords in concepts

    # From content: salient tokens survive ('embeddings' -> 'embedding') ...
    for from_content in ("pgvector", "embedding"):
        assert from_content in concepts

    # ... while stop-words are excluded.
    for stop_word in ("the", "for"):
        assert stop_word not in concepts
|
|
|
|
|
|
# ---------------- graph build + expansion ----------------
|
|
|
|
def _shared_concept_corpus() -> list[Memory]:
    """Return the four-memory fixture shared by the graph tests.

    Memories 10, 20 and 30 all carry "alpha" (df=3); 10 and 20 also carry "beta"
    (df=2); "gamma" appears only on 30 (df=1, links nothing); memory 40 only
    mentions "delta". The intent is that, with min_df=2 and a generous max_df,
    both alpha and beta form edges.
    """
    # (id, content, tags, expanded_keywords)
    rows = (
        (10, "alpha topic one", "alpha", "beta"),
        (20, "alpha topic two", "alpha", "beta"),
        (30, "alpha topic three", "alpha", "gamma"),
        (40, "unrelated delta", "delta", "delta"),
    )
    return [
        Memory(id=mem_id, content=text, tags=tag, expanded_keywords=keywords)
        for mem_id, text, tag, keywords in rows
    ]
|
|
|
|
|
|
def test_graph_build_links_shared_concepts():
    """Graph build should link memories sharing concepts, heavier for more overlap."""
    retriever = HybridRetriever()
    import retrievers.hybrid as hybrid_module

    # Widen max_df so small-corpus concepts aren't pruned as "hubs"; the module
    # constant is restored even if the build raises.
    saved_max_df_frac = hybrid_module._CONCEPT_MAX_DF_FRAC
    hybrid_module._CONCEPT_MAX_DF_FRAC = 1.0
    try:
        retriever._build_graph(_shared_concept_corpus())
    finally:
        hybrid_module._CONCEPT_MAX_DF_FRAC = saved_max_df_frac

    graph = retriever._graph
    assert graph is not None

    # alpha links 10-20-30 (a triangle); beta links 10-20; the shared content
    # token "topic" links 10-20-30 as well. So the full triangle must exist.
    for left, right in ((10, 20), (10, 30), (20, 30)):
        assert graph.has_edge(left, right)

    # 10-20 share alpha + beta + topic; 10-30 share alpha + topic. The exact
    # counts aren't load-bearing — the INVARIANT is w(10,20) > w(10,30).
    heavier = graph[10][20]["weight"]
    lighter = graph[10][30]["weight"]
    assert heavier > lighter

    # The unrelated memory 40 (concept 'delta', df=1) links nothing.
    assert graph.degree(40) == 0

    stats = retriever.graph_stats()
    assert stats["nodes"] == 4
    assert stats["edges"] >= 3
|
|
|
|
|
|
def test_graph_rank_expands_from_seeds_by_weight():
    """Expanding from a seed should return its neighbours, heaviest edge first."""
    retriever = HybridRetriever()
    import retrievers.hybrid as hybrid_module

    # Same max_df widening as the graph-build test, restored in `finally`.
    saved_max_df_frac = hybrid_module._CONCEPT_MAX_DF_FRAC
    hybrid_module._CONCEPT_MAX_DF_FRAC = 1.0
    try:
        retriever._build_graph(_shared_concept_corpus())
    finally:
        hybrid_module._CONCEPT_MAX_DF_FRAC = saved_max_df_frac

    # Seed from memory 10. Both neighbours 20 and 30 should surface, with 20
    # ahead of 30: in the fixture 10 and 20 additionally share 'beta', so that
    # edge is the heavier one.
    ranked_neighbours = retriever._graph_rank([10], exclude={10}, k=10)
    top_two = ranked_neighbours[:2]
    assert top_two == [20, 30]

    # Excluded seeds are never returned.
    assert 10 not in ranked_neighbours
|
|
|
|
|
|
def test_graph_rank_empty_without_graph_or_seeds():
    """`_graph_rank` returns [] when no graph was built or when no seeds are given."""
    retriever = HybridRetriever()  # no graph built
    without_graph = retriever._graph_rank([1, 2], exclude=set(), k=5)
    assert without_graph == []

    # Install a placeholder graph: an instance of an empty throwaway class is
    # truthy, and the no-seeds path is expected to return before touching it.
    placeholder_cls = type("G", (), {})
    retriever._graph = placeholder_cls()  # truthy but unused
    without_seeds = retriever._graph_rank([], exclude=set(), k=5)
    assert without_seeds == []  # no seeds
|
|
|
|
|
|
# ---------------- RRF fusion ----------------
|
|
|
|
def test_rrf_accumulate_formula():
    """Pin the weighted RRF contribution: ``weight / (_RRF_K + rank)``.

    The assertions fix two properties of ``HybridRetriever._rrf_accumulate``:
    ranks are 1-based (the first item of a list contributes
    ``weight / (_RRF_K + 1)``), and a second call on the same accumulator adds
    to the existing scores instead of replacing them.
    """
    from collections import defaultdict

    # Single accumulator; the earlier throwaway `{}` assignment was dead code
    # (immediately overwritten) and carried a misleading `dict` annotation.
    scores: defaultdict[int, float] = defaultdict(float)

    HybridRetriever._rrf_accumulate(scores, [7, 8, 9], weight=1.0)
    assert math.isclose(scores[7], 1.0 / (_RRF_K + 1))
    assert math.isclose(scores[8], 1.0 / (_RRF_K + 2))
    assert math.isclose(scores[9], 1.0 / (_RRF_K + 3))

    # A second weighted list adds on top: doc 8 is rank 1 in this list.
    HybridRetriever._rrf_accumulate(scores, [8], weight=0.5)
    assert math.isclose(scores[8], 1.0 / (_RRF_K + 2) + 0.5 / (_RRF_K + 1))
|
|
|
|
|
|
def test_retrieve_fuses_all_three_legs_and_degrades():
    """End-to-end fusion with the dense leg STUBBED (no model).

    What is actually asserted:
      (a) doc 1 — an exact lexical match for the query and the only id the
          stubbed dense ranker returns — is ranked first;
      (b) doc 2, which shares the 'alpha' concept with doc 1, appears in the
          results.

    NOTE(review): doc 2 also contains every query token, so lexical retrieval
    alone may surface it; (b) therefore does not isolate the graph leg's
    contribution. No assertion here exercises a dense-disabled retrieve — the
    dense-build-failure path is covered by
    test_graceful_degradation_records_error.
    """
    corpus = [
        Memory(id=1, content="alpha shared concept", tags="alpha", expanded_keywords="alpha"),
        Memory(id=2, content="alpha shared concept too", tags="alpha", expanded_keywords="alpha"),
        Memory(id=3, content="beta unrelated", tags="beta", expanded_keywords="beta"),
    ]
    import retrievers.hybrid as H

    # Widen max_df so the 3-doc corpus's shared concept isn't pruned as a "hub";
    # restored in `finally` so other tests see the original constant.
    old = H._CONCEPT_MAX_DF_FRAC
    H._CONCEPT_MAX_DF_FRAC = 1.0
    try:
        r = HybridRetriever()
        # Stub the dense BUILD so the test never loads the ~1.3 GB model nor writes
        # to the shared cache/ dir; build_index then only does FTS + graph.
        r._build_dense = lambda _c: None  # type: ignore[method-assign]
        r.build_index(corpus)  # FTS + graph build only
        # Stub the dense RANKER deterministically to "agree" with FTS on doc 1.
        r._dense_rank = lambda q, k: [1]  # type: ignore[method-assign]

        # Query matching doc 1 lexically; doc 2 shares concept 'alpha' with doc 1
        # (graph neighbour) even if FTS ranks it lower.
        out = r.retrieve("alpha shared concept", k=3)
        assert out, "should return something"
        assert out[0] == 1  # FTS+dense agreement puts doc 1 first
        # Doc 2 must be present; see the NOTE in the docstring — this may be
        # satisfied by FTS alone, not only by graph expansion.
        assert 2 in out
    finally:
        H._CONCEPT_MAX_DF_FRAC = old
|
|
|
|
|
|
def test_graceful_degradation_records_error(monkeypatch):
    """If the dense build raises, the retriever records it and still serves FTS."""

    def _failing_dense_build(_corpus):
        # Stand-in for the real dense build: always fails.
        raise RuntimeError("simulated embedding failure")

    memories = [
        Memory(id=i, content=f"doc number {i} content", tags="t")
        for i in range(1, 6)
    ]
    retriever = HybridRetriever()
    monkeypatch.setattr(retriever, "_build_dense", _failing_dense_build)

    retriever.build_index(memories)

    # The failure is recorded on the retriever rather than propagated ...
    dense_errors = [msg for msg in retriever.errors if "dense leg disabled" in msg]
    assert dense_errors
    # ... and no embeddings are left behind.
    assert retriever._emb is None

    # FTS still answers.
    hits = retriever.retrieve("doc number 3 content", k=5)
    assert 3 in hits
|