# claude-memory-mcp/benchmarks/retrievers/test_hybrid.py

"""Unit tests for the HYBRID retriever's pure logic: concept normalisation, the
concept-graph build + 1-hop expansion, weighted RRF fusion, and graceful
degradation when the dense leg is unavailable.

These tests are MODEL-FREE on purpose — they never load sentence-transformers (a
~1.3 GB / multi-minute CPU load). The dense leg is exercised by monkeypatching the
ranking method, so the fusion + graph behaviour is verified deterministically and
fast. The full end-to-end quality run is done via scripts/run_eval.py against the
real (local, gitignored) corpus.

Run:  .venv/bin/python -m pytest retrievers/test_hybrid.py -q
"""
from __future__ import annotations

import math

from harness.types import Memory
from retrievers.hybrid import (
    _RRF_K,
    HybridRetriever,
    _concepts_for,
    _normalise_concept,
)


# ---------------- concept normalisation ----------------

def test_normalise_concept_depluralisation():
    """Check ``_normalise_concept`` against a table of (raw, expected) pairs.

    The table mixes plurals that should be reduced with words ending in "s"
    that must survive untouched; every expected value is lower-case.
    """
    expectations = [
        # plurals are reduced
        ("Decisions", "decision"),
        ("policies", "policy"),
        ("addresses", "address"),
        ("boxes", "box"),
        ("tags", "tag"),
        # invariants: don't over-strip
        ("access", "access"),
        ("class", "class"),
        ("status", "status"),
        ("analysis", "analysis"),
        # heuristic, acceptable (collapses consistently)
        ("kubernetes", "kubernete"),
        ("k8s", "k8s"),
        ("GPU", "gpu"),
    ]
    for raw, expected in expectations:
        got = _normalise_concept(raw)
        assert got == expected, f"{raw!r} -> {got!r}"


def test_normalise_concept_is_stable_under_repetition():
    """Applying ``_normalise_concept`` twice must equal applying it once.

    Idempotence is what lets the graph collapse variants consistently no
    matter which source field a token came from.
    """
    samples = ("decision", "policy", "address", "tag", "gpu", "access")
    for token in samples:
        once = _normalise_concept(token)
        twice = _normalise_concept(once)
        assert twice == once


def test_concepts_for_unions_tags_keywords_content():
    """``_concepts_for`` must merge concepts from tags, keywords and content."""
    memory = Memory(
        id=1,
        content="The Postgres cluster uses pgvector for embeddings.",
        tags="database,postgres",
        expanded_keywords="cnpg vector search",
    )
    concepts = _concepts_for(memory)

    # From tags. Note 'postgres' de-plurals to 'postgre' — a consistent
    # heuristic collapse; what matters is that every memory mentioning it lands
    # on the SAME node.
    for from_tags in ("database", "postgre"):
        assert from_tags in concepts

    # From expanded_keywords.
    for from_keywords in ("cnpg", "vector", "search"):
        assert from_keywords in concepts

    # From content: salient tokens survive ('embeddings' -> 'embedding') ...
    for from_content in ("pgvector", "embedding"):
        assert from_content in concepts

    # ... while stop-words are excluded.
    for stop_word in ("the", "for"):
        assert stop_word not in concepts


# ---------------- graph build + expansion ----------------

def _shared_concept_corpus() -> list[Memory]:
    """Return a four-memory fixture with deliberately overlapping concepts.

    Memories 10, 20 and 30 all carry "alpha" (df=3); 10 and 20 also carry
    "beta" (df=2); "gamma" appears only on 30 (df=1, so it links nothing).
    Memory 40 carries only "delta". With min_df=2 and a generous max_df, alpha
    and beta both form edges.
    """
    # Columns: id, content, tags, expanded_keywords.
    rows = (
        (10, "alpha topic one", "alpha", "beta"),
        (20, "alpha topic two", "alpha", "beta"),
        (30, "alpha topic three", "alpha", "gamma"),
        (40, "unrelated delta", "delta", "delta"),
    )
    return [
        Memory(id=mem_id, content=text, tags=tag, expanded_keywords=keywords)
        for mem_id, text, tag, keywords in rows
    ]


def test_graph_build_links_shared_concepts():
    """Building the graph links memories that share concepts, weighted by overlap.

    Uses the ``_shared_concept_corpus`` fixture and checks that the 10-20-30
    triangle exists, that the 10-20 edge outweighs 10-30, and that memory 40
    stays isolated.
    """
    from unittest import mock

    import retrievers.hybrid as H

    r = HybridRetriever()
    # Widen max_df so small-corpus concepts aren't pruned as "hubs". The context
    # manager restores the module constant even if the build raises.
    with mock.patch.object(H, "_CONCEPT_MAX_DF_FRAC", 1.0):
        r._build_graph(_shared_concept_corpus())

    g = r._graph
    assert g is not None
    # alpha links 10-20-30 (a triangle); beta links 10-20; "topic" links 10-20-30
    # too (shared content token). So the triangle exists and 10-20 is the heaviest
    # edge (they additionally share 'beta').
    assert g.has_edge(10, 20)
    assert g.has_edge(10, 30)
    assert g.has_edge(20, 30)
    # 10-20 share alpha + beta + topic (=3); 10-30 share alpha + topic (=2). The
    # exact counts aren't load-bearing — the INVARIANT is w(10,20) > w(10,30).
    assert g[10][20]["weight"] > g[10][30]["weight"]
    # the unrelated memory 40 (concept 'delta', df=1) links nothing.
    assert g.degree(40) == 0
    # Separate asserts so a failure says which statistic is off.
    stats = r.graph_stats()
    assert stats["nodes"] == 4
    assert stats["edges"] >= 3


def test_graph_rank_expands_from_seeds_by_weight():
    """1-hop expansion from a seed returns its neighbours, heaviest edge first."""
    from unittest import mock

    import retrievers.hybrid as H

    r = HybridRetriever()
    # Widen max_df so the tiny fixture's concepts aren't pruned as "hubs"; the
    # context manager restores the module constant even if the build raises.
    with mock.patch.object(H, "_CONCEPT_MAX_DF_FRAC", 1.0):
        r._build_graph(_shared_concept_corpus())

    # Seed from memory 10; neighbours 20 and 30 should both surface, with 20
    # ranked above 30 because the 10-20 edge is heavier (10 and 20 additionally
    # share 'beta'). Only the ordering matters here, not the exact weights.
    nbrs = r._graph_rank([10], exclude={10}, k=10)
    assert nbrs[:2] == [20, 30]
    # excluded seeds are never returned
    assert 10 not in nbrs


def test_graph_rank_empty_without_graph_or_seeds():
    """``_graph_rank`` yields ``[]`` when no graph was built or no seeds are given."""
    retriever = HybridRetriever()  # no graph has been built on this instance
    ranked_without_graph = retriever._graph_rank([1, 2], exclude=set(), k=5)
    assert ranked_without_graph == []

    # Install a truthy stand-in for the graph. It has no graph API at all, so the
    # empty-seed call below can only pass if the stand-in is never used.
    placeholder_cls = type("G", (), {})
    retriever._graph = placeholder_cls()
    ranked_without_seeds = retriever._graph_rank([], exclude=set(), k=5)
    assert ranked_without_seeds == []  # no seeds


# ---------------- RRF fusion ----------------

def test_rrf_accumulate_formula():
    """Pin the weighted reciprocal-rank-fusion contribution of each ranked id.

    As asserted below, the id at 1-based rank ``i`` in a list gains
    ``weight / (_RRF_K + i)``, and contributions from later lists add to the
    existing score.
    """
    from collections import defaultdict

    # A single accumulator; missing ids start at 0.0.
    scores: defaultdict[int, float] = defaultdict(float)
    HybridRetriever._rrf_accumulate(scores, [7, 8, 9], weight=1.0)
    assert math.isclose(scores[7], 1.0 / (_RRF_K + 1))
    assert math.isclose(scores[8], 1.0 / (_RRF_K + 2))
    assert math.isclose(scores[9], 1.0 / (_RRF_K + 3))
    # a second weighted list adds on top
    HybridRetriever._rrf_accumulate(scores, [8], weight=0.5)
    assert math.isclose(scores[8], 1.0 / (_RRF_K + 2) + 0.5 / (_RRF_K + 1))


def test_retrieve_fuses_all_three_legs_and_degrades():
    """End-to-end fusion with the dense leg STUBBED (no model).

    Verifies (a) that a doc the stubbed dense ranker and the lexical query agree
    on is ranked first, and (b) that doc 2, which shares concept 'alpha' with
    doc 1, appears in the results.

    NOTE: doc 2 also contains every query term, so (b) alone does not prove the
    graph leg introduced it. Degradation with the dense leg disabled is not
    asserted here; ``test_graceful_degradation_records_error`` covers that.
    """
    from unittest import mock

    import retrievers.hybrid as H

    corpus = [
        Memory(id=1, content="alpha shared concept", tags="alpha", expanded_keywords="alpha"),
        Memory(id=2, content="alpha shared concept too", tags="alpha", expanded_keywords="alpha"),
        Memory(id=3, content="beta unrelated", tags="beta", expanded_keywords="beta"),
    ]
    # Widen max_df so the tiny corpus's concepts aren't pruned; the context
    # manager restores the module constant even if an assertion fails.
    with mock.patch.object(H, "_CONCEPT_MAX_DF_FRAC", 1.0):
        r = HybridRetriever()
        # Stub the dense BUILD so the test never loads the ~1.3 GB model nor writes
        # to the shared cache/ dir; build_index then only does FTS + graph.
        r._build_dense = lambda _c: None  # type: ignore[method-assign]
        r.build_index(corpus)  # FTS + graph build only
        # Stub the dense RANKER deterministically to "agree" with FTS on doc 1.
        r._dense_rank = lambda q, k: [1]  # type: ignore[method-assign]

        # query matching doc 1 lexically; doc 2 shares concept 'alpha' with doc 1
        # (graph neighbour) even if FTS ranks it lower.
        out = r.retrieve("alpha shared concept", k=3)
        assert out, "should return something"
        assert out[0] == 1  # FTS+dense agreement puts doc 1 first
        assert 2 in out  # doc 2 (shares 'alpha' with doc 1) must be present


def test_graceful_degradation_records_error(monkeypatch):
    """A failing dense build is recorded in ``errors`` and FTS keeps answering."""
    documents = [
        Memory(id=n, content=f"doc number {n} content", tags="t")
        for n in range(1, 6)
    ]
    retriever = HybridRetriever()

    def failing_dense_build(_corpus):
        # Stand-in for the real dense build: always fails.
        raise RuntimeError("simulated embedding failure")

    monkeypatch.setattr(retriever, "_build_dense", failing_dense_build)
    retriever.build_index(documents)

    # The failure must be recorded rather than propagated ...
    recorded = [err for err in retriever.errors if "dense leg disabled" in err]
    assert recorded
    # ... and no embeddings may be left behind.
    assert retriever._emb is None

    # FTS still answers
    hits = retriever.retrieve("doc number 3 content", k=5)
    assert 3 in hits