""" RLM-MEM - RECALL Operation Tests D3.2: High-level memory retrieval operation tests RECALL is the high-level operation that: - Takes a natural language query - Uses REPL environment for recursive search - Returns relevant memories with confidence scores - Supports filtering by tags, conversation, etc. Test Philosophy (Linus Style): 1. Tests must find bugs, not just pass 2. Integration-focused - Tests the full retrieval pipeline 3. Negative cases - No matches, invalid queries 4. Edge cases - Ambiguous queries, multiple matches 5. Verify ranking - Most relevant results first """ import unittest from unittest.mock import Mock, MagicMock, patch import tempfile import shutil from pathlib import Path # Handle both relative and direct imports try: from brain.scripts.memory_store import ChunkStore, Chunk from brain.scripts.remember_operation import RememberOperation from brain.scripts.recall_operation import RecallOperation, RecallResult from brain.scripts.repl_environment import REPLSession except ImportError: from memory_store import ChunkStore, Chunk from remember_operation import RememberOperation from recall_operation import RecallOperation, RecallResult from repl_environment import REPLSession class TestRecallBasic(unittest.TestCase): """Test basic RECALL functionality.""" def setUp(self): """Set up temp storage and sample memories.""" self.temp_dir = tempfile.mkdtemp() self.store = ChunkStore(self.temp_dir) self.remember = RememberOperation(self.store) # Create mock LLM self.mock_llm = Mock() # Create sample memories self._create_sample_memories() # Create RecallOperation (without REPL to avoid import issues in tests) self.recall = RecallOperation(self.store, llm_client=None) def tearDown(self): """Clean up temp storage.""" shutil.rmtree(self.temp_dir, ignore_errors=True) def _create_sample_memories(self): """Create sample memories for testing.""" # Memory 1: Python preference m1 = self.remember.remember( content="User prefers Python for data science and machine learning projects", conversation_id="test-conv-1", tags=["preference", "python", "datascience"], confidence=0.95 ) # Memory 2: Editor preference m2 = self.remember.remember( content="User likes VS Code with dark theme for coding", conversation_id="test-conv-1", tags=["preference", "editor", "vscode"], confidence=0.90 ) # Memory 3: Testing preference m3 = self.remember.remember( content="User prefers pytest over unittest for Python testing", conversation_id="test-conv-2", tags=["preference", "testing", "python"], confidence=0.85 ) self.seed_ids = { "python": m1["chunk_ids"][0], "editor": m2["chunk_ids"][0], "pytest": m3["chunk_ids"][0], } def test_recall_initialization(self): """Should initialize with ChunkStore.""" self.assertIsNotNone(self.recall.chunk_store) def test_recall_requires_chunk_store(self): """Should fail fast without ChunkStore.""" with self.assertRaises((ValueError, TypeError)): RecallOperation(chunk_store=None, llm_client=self.mock_llm) def test_recall_works_without_llm_client(self): """Should work without LLM client using basic search.""" recall = RecallOperation(chunk_store=self.store, llm_client=None) result = recall.recall("Python") # Should still return results using basic search self.assertIsNotNone(result) def test_recall_simple_query(self): """Should retrieve memories for simple query.""" # Mock LLM to return a search self.mock_llm.complete = Mock(return_value="FINAL('User prefers Python')") result = self.recall.recall("What language does the user prefer?") self.assertIsInstance(result, RecallResult) self.assertIsNotNone(result.answer) self.assertGreaterEqual(result.confidence, 0.0) self.assertLessEqual(result.confidence, 1.0) def test_recall_returns_relevant_memories(self): """Should return most relevant memories.""" # Mock LLM to search for Python-related memories def mock_complete(prompt): if "python" in prompt.lower(): return "FINAL('User prefers Python for data science and pytest for testing')" return "FINAL('No specific preference found')" self.mock_llm.complete = Mock(side_effect=mock_complete) result = self.recall.recall("Tell me about Python preferences") self.assertTrue(len(result.source_chunks) > 0) self.assertIn("python", result.answer.lower()) def test_recall_no_matches(self): """Should handle case with no relevant memories.""" self.mock_llm.complete = Mock(return_value="FINAL(None)") result = self.recall.recall("What does the user think about Rust programming?") # Should return empty or indicate no memories self.assertIsNotNone(result) def test_recall_respects_max_results(self): """Should limit results to max_results parameter.""" self.mock_llm.complete = Mock(return_value="FINAL('Found preferences')") result = self.recall.recall("What preferences", max_results=2) # Should return at most 2 source chunks self.assertLessEqual(len(result.source_chunks), 2) def test_recall_filters_by_conversation(self): """Should filter by conversation_id when provided.""" self.mock_llm.complete = Mock(return_value="FINAL('VS Code preference')") result = self.recall.recall( "What editor?", conversation_id="test-conv-1" ) # Should only consider memories from test-conv-1 for chunk_id in result.source_chunks: chunk = self.store.get_chunk(chunk_id) self.assertEqual(chunk.metadata.conversation_id, "test-conv-1") def test_recall_confidence_scoring(self): """Should return appropriate confidence score.""" self.mock_llm.complete = Mock(return_value="FINAL('High confidence match')") result = self.recall.recall("Python preferences") # Confidence should be based on match quality self.assertIsInstance(result.confidence, float) self.assertGreaterEqual(result.confidence, 0.0) self.assertLessEqual(result.confidence, 1.0) def test_recall_typo_tolerance_finds_relevant_chunk(self): """Should handle minor typos in non-LLM mode.""" result = self.recall.recall("pytesst prefernce") self.assertGreater(len(result.source_chunks), 0) self.assertEqual(result.source_chunks[0], self.seed_ids["pytest"]) self.assertIn("pytest", result.answer.lower()) def test_recall_tag_boost_can_match_without_content_term(self): """Tag matches should be strong enough even without term in content.""" tagged = self.remember.remember( content="Framework decision captured for future setup.", conversation_id="test-conv-3", tags=["pytest"], confidence=0.92 ) untagged = self.remember.remember( content="Framework decision captured for future setup.", conversation_id="test-conv-3", tags=["framework"], confidence=0.92 ) result = self.recall.recall("pytest", conversation_id="test-conv-3") self.assertGreater(len(result.source_chunks), 0) self.assertEqual(result.source_chunks[0], tagged["chunk_ids"][0]) self.assertNotEqual(result.source_chunks[0], untagged["chunk_ids"][0]) def test_recall_prefers_higher_confidence_on_equal_text_match(self): """Confidence should break ties for otherwise-equal matches.""" high = self.remember.remember( content="User prefers strict linting rules in CI", conversation_id="test-conv-4", tags=["lint", "ci"], confidence=0.95 ) low = self.remember.remember( content="User prefers strict linting rules in CI", conversation_id="test-conv-4", tags=["lint", "ci"], confidence=0.40 ) result = self.recall.recall("strict linting ci", conversation_id="test-conv-4") self.assertGreater(len(result.source_chunks), 0) self.assertEqual(result.source_chunks[0], high["chunk_ids"][0]) self.assertNotEqual(result.source_chunks[0], low["chunk_ids"][0]) def test_recall_tracks_iterations_when_using_repl(self): """Should track iterations when using REPL.""" # This test would need a full REPL setup, skip for basic mode result = self.recall.recall("Query") # Should report iterations (0 for basic search mode) self.assertIsInstance(result.iterations_used, int) def test_recall_empty_query(self): """Should handle empty query gracefully.""" result = self.recall.recall("") # Should return empty result or error gracefully self.assertIsNotNone(result) def test_recall_tracks_cost(self): """Should track LLM API cost.""" # Mock LLM response with cost info mock_response = Mock() mock_response.text = "FINAL('Answer')" mock_response.cost_usd = 0.001 self.mock_llm.complete = Mock(return_value=mock_response) result = self.recall.recall("Query") # Should track cost self.assertIsInstance(result.cost_usd, float) self.assertGreaterEqual(result.cost_usd, 0.0) class TestRecallResult(unittest.TestCase): """Test RecallResult dataclass.""" def test_recall_result_creation(self): """Should create RecallResult with all fields.""" result = RecallResult( answer="User prefers Python", confidence=0.95, source_chunks=["chunk-1", "chunk-2"], iterations_used=3, cost_usd=0.002 ) self.assertEqual(result.answer, "User prefers Python") self.assertEqual(result.confidence, 0.95) self.assertEqual(len(result.source_chunks), 2) self.assertEqual(result.iterations_used, 3) self.assertEqual(result.cost_usd, 0.002) def test_recall_result_defaults(self): """Should have sensible defaults.""" result = RecallResult(answer="Test") self.assertEqual(result.answer, "Test") self.assertEqual(result.confidence, 0.0) self.assertEqual(result.source_chunks, []) self.assertEqual(result.iterations_used, 0) self.assertEqual(result.cost_usd, 0.0) class TestRecallIntegration(unittest.TestCase): """Integration tests for RECALL.""" def setUp(self): """Set up full integration environment.""" self.temp_dir = tempfile.mkdtemp() self.store = ChunkStore(self.temp_dir) self.remember = RememberOperation(self.store) # Create diverse memories self._create_diverse_memories() # Set up mock LLM with intelligent responses self.mock_llm = Mock() self.recall = RecallOperation(self.store, self.mock_llm) def tearDown(self): """Clean up.""" shutil.rmtree(self.temp_dir, ignore_errors=True) def _create_diverse_memories(self): """Create diverse test memories.""" memories = [ ("User prefers Python for backend development", ["python", "backend"]), ("User likes React for frontend", ["javascript", "frontend"]), ("User uses Docker for deployment", ["devops", "docker"]), ("User prefers PostgreSQL over MySQL", ["database", "postgresql"]), ("User likes dark mode in all apps", ["ui", "preference"]), ] for content, tags in memories: self.remember.remember( content=content, conversation_id="test-conv", tags=tags, confidence=0.9 ) @unittest.skip("Requires full REPL setup with LLM") def test_recall_end_to_end(self): """End-to-end test with realistic LLM simulation.""" # Simulate LLM that uses search_chunks and read_chunk def smart_llm(prompt): if "python" in prompt.lower(): return """ results = search_chunks('python', limit=3) if results: chunks = [read_chunk(r) for r in results] content = ' '.join([c['content'] for c in chunks if c]) FINAL(content) else: FINAL('No Python memories found') """ return "FINAL('No relevant memories')" self.mock_llm.complete = Mock(side_effect=smart_llm) result = self.recall.recall("What does the user prefer for backend?") # Should find Python-related memory self.assertIsNotNone(result.answer) self.assertGreater(len(result.source_chunks), 0) if __name__ == "__main__": unittest.main()