""" RLM-MEM - Chunking Engine D1.2: Semantic content chunking for RLM Memory System Splits content into bounded semantic chunks (100-800 tokens) with content type detection. """ import re from typing import List, Optional from dataclasses import dataclass, field # Try to import tiktoken for accurate token counting try: import tiktoken TIKTOKEN_AVAILABLE = True except ImportError: TIKTOKEN_AVAILABLE = False try: from .memory_store import Chunk, ChunkMetadata, ChunkLinks, ChunkType except ImportError: # Fallback for direct execution from memory_store import Chunk, ChunkMetadata, ChunkLinks, ChunkType @dataclass class ChunkResult: """Result of chunking a piece of content.""" content: str tokens: int type: str tags: List[str] = field(default_factory=list) class ChunkingEngine: """ Splits content into bounded semantic chunks. Strategy: Simple Bounded Semantic 1. Split on paragraphs (\n\n) 2. Merge small paragraphs (< min_tokens) with next 3. Split large paragraphs (> max_tokens) at sentence boundaries 4. Detect content type (fact, preference, pattern, note, decision) """ def __init__(self, min_tokens: int = 100, max_tokens: int = 800): """ Initialize the chunking engine. Args: min_tokens: Minimum tokens per chunk (default: 100) max_tokens: Maximum tokens per chunk (default: 800) """ self.min_tokens = min_tokens self.max_tokens = max_tokens # Initialize tiktoken encoder if available self._encoder = None if TIKTOKEN_AVAILABLE: try: self._encoder = tiktoken.get_encoding("cl100k_base") except Exception: pass # Fall back to character-based estimation def count_tokens(self, text: str) -> int: """ Estimate token count. Uses tiktoken if available, otherwise uses len/4 approximation which works reasonably well for English text. Args: text: Text to count tokens for Returns: Estimated token count """ if text is None or text == "": return 0 if self._encoder is not None: try: return len(self._encoder.encode(text)) except Exception: pass # Fall back to approximation # Character-based approximation: ~4 chars per token for English # This is a rough estimate but works for most cases return max(1, len(text) // 4) def detect_content_type(self, content: str) -> str: """ Detect if content is fact, preference, pattern, note, or decision. Detection rules (case-insensitive, word boundaries respected): - Decision: "decided", "chose", "selected", "going with" - Preference: "prefer", "like", "want", "rather" - Fact: "is a", "are a", "works as", "located in" - Pattern: "usually", "often", "tends to", "pattern" - Default: "note" Args: content: Content to analyze Returns: Content type string """ if not content: return ChunkType.NOTE.value content_lower = content.lower() # Decision indicators (highest priority - explicit actions) decision_patterns = [ r'\bdecided\b', r'\bchose\b', r'\bselected\b', r'\bgoing with\b', r'\bwent with\b', r'\bopted for\b', r'\bsettled on\b', r'\bconcluded\b' ] for pattern in decision_patterns: if re.search(pattern, content_lower): return ChunkType.DECISION.value # Pattern indicators (habits, recurring behaviors) - check BEFORE preference # because phrases like "generally prefer" describe patterns, not preferences pattern_patterns = [ r'\busually\b', r'\boften\b', r'\btends to\b', r'\bpattern\b', r'\balways\b', r'\btypically\b', r'\bgenerally\b', r'\bfrequently\b', r'\bregularly\b', r'\bevery time\b', r'\bmost of the time\b', r'\bwhenever\b' ] for pattern in pattern_patterns: if re.search(pattern, content_lower): return ChunkType.PATTERN.value # Preference indicators preference_patterns = [ r'\bprefer\b', r'\blike\b', r'\bwant\b', r'\brather\b', r'\bdislike\b', r'\bhate\b', r'\bwish\b', r'\bwould like\b', r'\bfavorite\b', r'\bfavour\b' ] for pattern in preference_patterns: if re.search(pattern, content_lower): return ChunkType.PREFERENCE.value # Fact indicators (statements of truth) fact_patterns = [ r'\bis a\b', r'\bare a\b', r'\bworks as\b', r'\blocated in\b', r'\bis an\b', r'\bare an\b', r'\bwas a\b', r'\bwere a\b', r'\bworks at\b', r'\bworks for\b', r'\blives in\b', r'\bborn in\b', r'\bstudied at\b', r'\bgraduated from\b', r'\bhas\s+\d+', r'\bthere are\s+\d+', r'\bthere is\s+' ] for pattern in fact_patterns: if re.search(pattern, content_lower): return ChunkType.FACT.value # Default: note return ChunkType.NOTE.value def _split_into_paragraphs(self, content: str) -> List[str]: """ Split content into paragraphs on double newlines. Handles edge cases like multiple consecutive newlines and whitespace. """ # Split on double newlines raw_paragraphs = re.split(r'\n\n+', content) # Clean up each paragraph paragraphs = [] for p in raw_paragraphs: # Strip whitespace and normalize internal whitespace cleaned = p.strip() if cleaned: # Normalize internal newlines (preserve single newlines within paragraphs) cleaned = re.sub(r'[ \t]+', ' ', cleaned) paragraphs.append(cleaned) return paragraphs def _split_sentences(self, text: str) -> List[str]: """ Split text into sentences. Handles abbreviations and edge cases reasonably well. """ # Pattern for sentence boundaries # Matches . ? or ! followed by space or end of string # Handles quotes and parentheses sentence_pattern = r'(?<=[.!?])\s+(?=[A-Z"\'\(])|(?<=[.!?])$' sentences = re.split(sentence_pattern, text) # Clean up result = [] for s in sentences: cleaned = s.strip() if cleaned: result.append(cleaned) return result def _split_large_chunk(self, content: str) -> List[str]: """ Split a large chunk (> max_tokens) at sentence boundaries. Tries to create chunks that are as close to max_tokens as possible without exceeding it. """ sentences = self._split_sentences(content) if len(sentences) <= 1: # Cannot split by sentences, force split by token count return self._force_split(content) chunks = [] current_chunk = [] current_tokens = 0 for sentence in sentences: sentence_tokens = self.count_tokens(sentence) # If a single sentence exceeds max_tokens, force split it if sentence_tokens > self.max_tokens: # First, flush current chunk if any if current_chunk: chunks.append(' '.join(current_chunk)) current_chunk = [] current_tokens = 0 # Force split this long sentence chunks.extend(self._force_split(sentence)) continue # Check if adding this sentence would exceed max_tokens if current_tokens + sentence_tokens > self.max_tokens and current_chunk: # Flush current chunk chunks.append(' '.join(current_chunk)) current_chunk = [sentence] current_tokens = sentence_tokens else: # Add to current chunk current_chunk.append(sentence) current_tokens += sentence_tokens # Don't forget the last chunk if current_chunk: chunks.append(' '.join(current_chunk)) return chunks def _force_split(self, content: str) -> List[str]: """ Force split content into chunks of approximately max_tokens. Used when sentence splitting isn't sufficient. """ total_tokens = self.count_tokens(content) if total_tokens <= self.max_tokens: return [content] # Calculate approximate characters per chunk # We use character count as a proxy for token count chars_per_token = len(content) / total_tokens chars_per_chunk = int(self.max_tokens * chars_per_token * 0.95) # 5% safety margin chunks = [] start = 0 while start < len(content): end = start + chars_per_chunk if end >= len(content): # Last chunk chunks.append(content[start:].strip()) break # Try to find a word boundary # Look for space, period, or other punctuation search_end = min(end + 50, len(content)) # Look ahead 50 chars boundary = end # Find the last space or punctuation before search_end for i in range(search_end - 1, start, -1): if content[i] in ' \t\n.,;:!?': boundary = i + 1 break chunk = content[start:boundary].strip() if chunk: chunks.append(chunk) start = boundary return chunks def chunk(self, content: str, conversation_id: str, tags: List[str] = None) -> List[ChunkResult]: """ Split content into bounded semantic chunks. Strategy: Simple Bounded Semantic 1. Split on paragraphs (\n\n) 2. Merge small paragraphs (< min_tokens) with next 3. Split large paragraphs (> max_tokens) at sentence boundaries 4. Detect content type (fact, preference, pattern, note, decision) Args: content: Text content to chunk conversation_id: Source conversation ID tags: Optional list of tags to apply to all chunks Returns: List of ChunkResult objects ready for storage """ if not content or not content.strip(): return [] tags = tags or [] # Step 1: Split into paragraphs paragraphs = self._split_into_paragraphs(content) # Step 2: Process paragraphs - handle size bounds raw_chunks = [] for paragraph in paragraphs: tokens = self.count_tokens(paragraph) if tokens > self.max_tokens: # Split large paragraph at sentence boundaries split_chunks = self._split_large_chunk(paragraph) raw_chunks.extend(split_chunks) else: raw_chunks.append(paragraph) # Step 3: Merge small chunks merged_chunks = self._merge_small_chunks(raw_chunks) # Step 4: Create ChunkResult objects with type detection results = [] for chunk_content in merged_chunks: chunk_tokens = self.count_tokens(chunk_content) content_type = self.detect_content_type(chunk_content) result = ChunkResult( content=chunk_content, tokens=chunk_tokens, type=content_type, tags=tags.copy() ) results.append(result) return results def _merge_small_chunks(self, chunks: List[str]) -> List[str]: """ Merge chunks that are below min_tokens with adjacent chunks. Strategy: - Try to merge with next chunk (if same content type) - If merging would exceed max_tokens, keep as-is (it's the best we can do) - Don't merge chunks with different content types (semantic boundaries) - Handle the last chunk specially - merge with previous if possible """ if not chunks: return [] if len(chunks) == 1: return chunks result = [] i = 0 while i < len(chunks): current = chunks[i] current_tokens = self.count_tokens(current) current_type = self.detect_content_type(current) # If current chunk is large enough, add it if current_tokens >= self.min_tokens: result.append(current) i += 1 continue # Current chunk is too small - try to merge with next if i + 1 < len(chunks): next_chunk = chunks[i + 1] next_tokens = self.count_tokens(next_chunk) next_type = self.detect_content_type(next_chunk) # Don't merge if content types differ (preserve semantic boundaries) if current_type != next_type: result.append(current) # Add as-is even if small i += 1 continue # Check if merging would exceed max_tokens combined_tokens = current_tokens + next_tokens if combined_tokens <= self.max_tokens: # Merge current with next merged = current + "\n\n" + next_chunk # Replace next chunk with merged version chunks[i + 1] = merged i += 1 continue else: # Can't merge without exceeding max # Add current as-is (it's below min but we can't help it) result.append(current) i += 1 continue else: # This is the last chunk and it's too small # Try to merge with previous result if possible if result: prev = result[-1] prev_tokens = self.count_tokens(prev) prev_type = self.detect_content_type(prev) combined_tokens = prev_tokens + current_tokens # Only merge if types match if combined_tokens <= self.max_tokens and prev_type == current_type: # Merge with previous result[-1] = prev + "\n\n" + current else: # Can't merge, add as-is result.append(current) else: # No previous chunk, add as-is result.append(current) i += 1 return result def chunk_and_store(content: str, conversation_id: str, store, tags: List[str] = None, min_tokens: int = 100, max_tokens: int = 800) -> List[Chunk]: """ Convenience function to chunk content and store in ChunkStore. Args: content: Text to chunk and store conversation_id: Source conversation ID store: ChunkStore instance tags: Optional tags for all chunks min_tokens: Minimum tokens per chunk max_tokens: Maximum tokens per chunk Returns: List of created Chunk objects """ engine = ChunkingEngine(min_tokens=min_tokens, max_tokens=max_tokens) chunk_results = engine.chunk(content, conversation_id, tags) created_chunks = [] for result in chunk_results: chunk = store.create_chunk( content=result.content, chunk_type=result.type, conversation_id=conversation_id, tokens=result.tokens, tags=result.tags ) created_chunks.append(chunk) return created_chunks # ============== Testing ============== if __name__ == "__main__": print("=" * 60) print("Chunking Engine - Self Test") print("=" * 60) # Test 1: Basic multi-paragraph content print("\n[Test 1] Multi-paragraph content") content = """Paragraph 1. Short. Paragraph 2 is longer with multiple sentences. It should stand alone. This is a decision: We chose to use RLM architecture.""" engine = ChunkingEngine() chunks = engine.chunk(content, "test-conv") print(f"Input paragraphs: 3") print(f"Output chunks: {len(chunks)}") for i, c in enumerate(chunks, 1): print(f" Chunk {i}: {c.type}, {c.tokens} tokens") print(f" Content: {c.content[:60]}...") # Test 2: Content type detection print("\n[Test 2] Content type detection") test_cases = [ ("I prefer chocolate over vanilla", "preference"), ("We decided to use Python", "decision"), ("Python is a programming language", "fact"), ("I usually wake up early", "pattern"), ("This is just a random note", "note"), ] for text, expected in test_cases: detected = engine.detect_content_type(text) status = "[OK]" if detected == expected else "[FAIL]" print(f" {status} '{text[:40]}...' -> {detected} (expected: {expected})") # Test 3: Small paragraph merging print("\n[Test 3] Small paragraph merging") content = """A. B. C is a longer paragraph with more content that should stand on its own.""" chunks = engine.chunk(content, "test-conv") print(f"Input paragraphs: 3 (two very short)") print(f"Output chunks: {len(chunks)}") for i, c in enumerate(chunks, 1): print(f" Chunk {i}: {c.tokens} tokens - {c.content[:50]}...") # Test 4: Large paragraph splitting print("\n[Test 4] Large paragraph splitting") # Generate a paragraph that's definitely over 800 tokens large_content = " ".join([f"This is sentence number {i} in a very long paragraph." for i in range(1, 201)]) # ~200 sentences chunks = engine.chunk(large_content, "test-conv") total_tokens = sum(c.tokens for c in chunks) print(f"Input: ~{engine.count_tokens(large_content)} tokens") print(f"Output chunks: {len(chunks)}") for i, c in enumerate(chunks, 1): status = "[OK]" if 100 <= c.tokens <= 800 else "[FAIL]" print(f" {status} Chunk {i}: {c.tokens} tokens") # Test 5: Token counting comparison print("\n[Test 5] Token counting") test_text = "This is a test sentence with exactly twelve tokens." estimated = engine.count_tokens(test_text) print(f" Text: '{test_text}'") print(f" Estimated tokens: {estimated}") print(f" Tiktoken available: {TIKTOKEN_AVAILABLE}") # Test 6: Integration with ChunkStore print("\n[Test 6] Integration with ChunkStore") try: from .memory_store import ChunkStore store = ChunkStore("brain/memory") test_content = """First fact: Python is a programming language. Second decision: We chose to implement async support. Third preference: I prefer using type hints.""" created = chunk_and_store( content=test_content, conversation_id="integration-test", store=store, tags=["test", "integration"] ) print(f" Created {len(created)} chunks:") for c in created: print(f" - {c.id}: {c.type}, {c.tokens} tokens") # Cleanup - archive the test chunks for c in created: store.delete_chunk(c.id, permanent=False) print(" ✓ Test chunks archived") except Exception as e: print(f" [SKIP] Integration test skipped: {e}") print("\n" + "=" * 60) print("All tests completed!") print("=" * 60)