# beadboard/.agents/skills/rlm-mem/brain/scripts/chunking_engine.py
#
# 582 lines
# 21 KiB
# Python

"""
RLM-MEM - Chunking Engine
D1.2: Semantic content chunking for RLM Memory System
Splits content into bounded semantic chunks (100-800 tokens) with content type detection.
"""
import re
from typing import List, Optional
from dataclasses import dataclass, field
# Try to import tiktoken for accurate token counting
try:
import tiktoken
TIKTOKEN_AVAILABLE = True
except ImportError:
TIKTOKEN_AVAILABLE = False
try:
from .memory_store import Chunk, ChunkMetadata, ChunkLinks, ChunkType
except ImportError:
# Fallback for direct execution
from memory_store import Chunk, ChunkMetadata, ChunkLinks, ChunkType
@dataclass
class ChunkResult:
    """A single chunk produced by the chunking engine, before it is stored."""

    # Text of the chunk.
    content: str
    # Token count reported for ``content``.
    tokens: int
    # Content-type label as a plain string (e.g. "fact", "note").
    type: str
    # Tags attached to the chunk; every instance gets its own list.
    tags: List[str] = field(default_factory=list)
class ChunkingEngine:
    """
    Split text into bounded, typed chunks.

    Strategy (simple bounded semantic chunking):
    1. Split the input into paragraphs on blank lines.
    2. Split paragraphs larger than ``max_tokens`` at sentence boundaries,
       falling back to a character-budget split for oversized sentences.
    3. Merge chunks smaller than ``min_tokens`` into a neighbour of the same
       content type when the result still fits in ``max_tokens``.
    4. Label each chunk as decision, pattern, preference, fact or note.
    """

    # All patterns are compiled once here instead of on every call.

    # A paragraph break is a newline followed by one or more blank lines.
    # Blank lines may contain spaces/tabs and may use CRLF line endings.
    _PARAGRAPH_BREAK = re.compile(r'\r?\n(?:[ \t]*\r?\n)+')
    _SPACE_RUN = re.compile(r'[ \t]+')
    # Whitespace after . ! ? that is followed by a capital letter, quote or
    # opening parenthesis -- or the very end of the text after . ! ?
    _SENTENCE_BREAK = re.compile(r'(?<=[.!?])\s+(?=[A-Z"\'\(])|(?<=[.!?])$')
    # Characters after which a forced split may be placed.
    _SPLIT_BOUNDARIES = ' \t\n.,;:!?'

    # Content-type indicators; matched against lower-cased text.
    _DECISION_RE = re.compile(
        r'\bdecided\b|\bchose\b|\bselected\b'
        r'|\bgoing with\b|\bwent with\b|\bopted for\b'
        r'|\bsettled on\b|\bconcluded\b'
    )
    _PATTERN_RE = re.compile(
        r'\busually\b|\boften\b|\btends to\b|\bpattern\b'
        r'|\balways\b|\btypically\b|\bgenerally\b'
        r'|\bfrequently\b|\bregularly\b|\bevery time\b'
        r'|\bmost of the time\b|\bwhenever\b'
    )
    _PREFERENCE_RE = re.compile(
        r'\bprefer\b|\blike\b|\bwant\b|\brather\b'
        r'|\bdislike\b|\bhate\b|\bwish\b|\bwould like\b'
        r'|\bfavorite\b|\bfavour\b'
    )
    _FACT_RE = re.compile(
        r'\bis a\b|\bare a\b|\bworks as\b|\blocated in\b'
        r'|\bis an\b|\bare an\b|\bwas a\b|\bwere a\b'
        r'|\bworks at\b|\bworks for\b|\blives in\b'
        r'|\bborn in\b|\bstudied at\b|\bgraduated from\b'
        r'|\bhas\s+\d+|\bthere are\s+\d+|\bthere is\s+'
    )

    def __init__(self, min_tokens: int = 100, max_tokens: int = 800):
        """
        Initialize the chunking engine.

        Args:
            min_tokens: Minimum tokens per chunk (default: 100)
            max_tokens: Maximum tokens per chunk (default: 800)
        """
        self.min_tokens = min_tokens
        self.max_tokens = max_tokens
        # Use the tiktoken encoder when the package imported successfully;
        # ``None`` selects the character-based estimate in count_tokens().
        self._encoder = None
        if TIKTOKEN_AVAILABLE:
            try:
                self._encoder = tiktoken.get_encoding("cl100k_base")
            except Exception:
                # Deliberate best effort: any failure to obtain the encoding
                # leaves the engine on the character-based estimate.
                pass

    def count_tokens(self, text: str) -> int:
        """
        Estimate the token count of ``text``.

        Uses the tiktoken encoder when one was loaded; otherwise (or when
        encoding raises) falls back to ``len(text) // 4``, a rough estimate
        for English text.

        Args:
            text: Text to count tokens for

        Returns:
            0 for ``None`` or an empty string, otherwise a count of at least 1
            from the fallback estimate or the encoder's token count.
        """
        if not text:
            return 0
        if self._encoder is not None:
            try:
                return len(self._encoder.encode(text))
            except Exception:
                pass  # Fall back to the approximation below
        # ~4 characters per token; never report 0 for non-empty text.
        return max(1, len(text) // 4)

    def detect_content_type(self, content: str) -> str:
        """
        Classify content as decision, pattern, preference, fact or note.

        Matching is case-insensitive (the text is lower-cased first) and
        respects word boundaries. Categories are checked in priority order
        and the first one with a matching indicator wins:
        - Decision: "decided", "chose", "selected", "going with", ...
        - Pattern: "usually", "often", "tends to", "always", ...
        - Preference: "prefer", "like", "want", "rather", ...
        - Fact: "is a", "works as", "located in", "has <number>", ...
        - Default: note

        Args:
            content: Content to analyze

        Returns:
            The ``value`` of the matching ``ChunkType`` member
        """
        if not content:
            return ChunkType.NOTE.value
        lowered = content.lower()
        # Decisions are explicit actions, so they take precedence.
        if self._DECISION_RE.search(lowered):
            return ChunkType.DECISION.value
        # Patterns are checked BEFORE preferences because phrases such as
        # "generally prefer" describe a habit rather than a preference.
        if self._PATTERN_RE.search(lowered):
            return ChunkType.PATTERN.value
        if self._PREFERENCE_RE.search(lowered):
            return ChunkType.PREFERENCE.value
        if self._FACT_RE.search(lowered):
            return ChunkType.FACT.value
        return ChunkType.NOTE.value

    def _split_into_paragraphs(self, content: str) -> List[str]:
        """
        Split content into paragraphs on blank lines.

        A blank line may be empty or contain only spaces/tabs, and both LF
        and CRLF line endings are recognised. Each paragraph is stripped,
        runs of spaces/tabs are collapsed to one space, and single newlines
        inside a paragraph are preserved. Empty paragraphs are dropped.
        """
        paragraphs = []
        for raw in self._PARAGRAPH_BREAK.split(content):
            stripped = raw.strip()
            if stripped:
                paragraphs.append(self._SPACE_RUN.sub(' ', stripped))
        return paragraphs

    def _split_sentences(self, text: str) -> List[str]:
        """
        Split text into sentences.

        A boundary is whitespace that follows ``.``, ``!`` or ``?`` and
        precedes a capital letter, a quote or an opening parenthesis. This is
        a heuristic: an abbreviation followed by a capitalised word is also
        treated as a boundary. Empty pieces are dropped.
        """
        pieces = (piece.strip() for piece in self._SENTENCE_BREAK.split(text))
        return [piece for piece in pieces if piece]

    def _split_large_chunk(self, content: str) -> List[str]:
        """
        Split an oversized chunk at sentence boundaries.

        Sentences are packed greedily into chunks that stay within
        ``max_tokens``. The budget is checked against the token count of the
        joined text (including the separating spaces), not against a sum of
        per-sentence counts, which can under-estimate the joined size.
        Sentences that are too large on their own are force-split.
        """
        sentences = self._split_sentences(content)
        if len(sentences) <= 1:
            # Cannot split by sentences, force split by token count
            return self._force_split(content)

        chunks = []
        pending = ""  # text of the chunk currently being assembled
        for sentence in sentences:
            if self.count_tokens(sentence) > self.max_tokens:
                # Flush what we have, then break the long sentence up.
                if pending:
                    chunks.append(pending)
                    pending = ""
                chunks.extend(self._force_split(sentence))
                continue
            candidate = f"{pending} {sentence}" if pending else sentence
            if pending and self.count_tokens(candidate) > self.max_tokens:
                chunks.append(pending)
                pending = sentence
            else:
                pending = candidate
        if pending:
            chunks.append(pending)
        return chunks

    def _force_split(self, content: str) -> List[str]:
        """
        Split content into pieces of approximately ``max_tokens`` by length.

        Used when sentence splitting is not sufficient. Character count is
        used as a proxy for token count, with a 5% safety margin. Each cut is
        moved to just after the last space or punctuation mark in range when
        one exists; the look-ahead past the target is at most 50 characters
        and never more than the safety margin allows. Every iteration
        advances by at least one character, so the loop always terminates,
        and empty pieces are never returned.
        """
        total_tokens = self.count_tokens(content)
        if total_tokens <= self.max_tokens:
            return [content]

        chars_per_token = len(content) / total_tokens
        budget_chars = int(self.max_tokens * chars_per_token)
        # 5% safety margin; at least 1 so ``start`` always moves forward.
        chars_per_chunk = max(1, int(self.max_tokens * chars_per_token * 0.95))
        lookahead = max(0, min(50, budget_chars - chars_per_chunk))

        chunks = []
        length = len(content)
        start = 0
        while start < length:
            end = start + chars_per_chunk
            if end >= length:
                tail = content[start:].strip()
                if tail:
                    chunks.append(tail)
                break
            # Search backwards from the look-ahead limit for a boundary.
            search_end = min(end + lookahead, length)
            boundary = end
            for i in range(search_end - 1, start, -1):
                if content[i] in self._SPLIT_BOUNDARIES:
                    boundary = i + 1
                    break
            piece = content[start:boundary].strip()
            if piece:
                chunks.append(piece)
            start = boundary
        return chunks

    def chunk(self, content: str, conversation_id: str,
              tags: Optional[List[str]] = None) -> "List[ChunkResult]":
        """
        Split content into bounded semantic chunks.

        Args:
            content: Text content to chunk
            conversation_id: Source conversation ID (accepted for interface
                compatibility; not used by the chunking itself)
            tags: Optional tags; every result receives its own copy

        Returns:
            List of ChunkResult objects, empty for blank input
        """
        if not content or not content.strip():
            return []
        shared_tags = list(tags) if tags else []

        # Steps 1-2: paragraphs, with oversized ones split further.
        pieces = []
        for paragraph in self._split_into_paragraphs(content):
            if self.count_tokens(paragraph) > self.max_tokens:
                pieces.extend(self._split_large_chunk(paragraph))
            else:
                pieces.append(paragraph)

        # Steps 3-4: merge undersized pieces, then count and classify.
        return [
            ChunkResult(
                content=text,
                tokens=self.count_tokens(text),
                type=self.detect_content_type(text),
                tags=list(shared_tags),
            )
            for text in self._merge_small_chunks(pieces)
        ]

    def _merge_small_chunks(self, chunks: List[str]) -> List[str]:
        """
        Merge chunks below ``min_tokens`` with adjacent chunks.

        Rules:
        - A small chunk is merged into the next chunk when both have the
          same content type and the merged text fits in ``max_tokens``.
        - Otherwise it is kept as-is, even though it is below the minimum.
        - A small final chunk is merged into the previous output chunk under
          the same two conditions.

        The fit is checked on the merged text itself, and the input list is
        not modified.
        """
        if not chunks:
            return []

        result = []
        current = chunks[0]
        for following in chunks[1:]:
            if self.count_tokens(current) >= self.min_tokens:
                result.append(current)
                current = following
                continue
            candidate = current + "\n\n" + following
            same_type = (self.detect_content_type(current)
                         == self.detect_content_type(following))
            if same_type and self.count_tokens(candidate) <= self.max_tokens:
                # Carry the merged text forward as the new current chunk.
                current = candidate
            else:
                result.append(current)
                current = following

        # ``current`` now holds the final chunk, possibly already merged.
        if self.count_tokens(current) >= self.min_tokens or not result:
            result.append(current)
            return result
        previous = result[-1]
        candidate = previous + "\n\n" + current
        same_type = (self.detect_content_type(previous)
                     == self.detect_content_type(current))
        if same_type and self.count_tokens(candidate) <= self.max_tokens:
            result[-1] = candidate
        else:
            result.append(current)
        return result
def chunk_and_store(content: str, conversation_id: str,
                    store, tags: List[str] = None,
                    min_tokens: int = 100, max_tokens: int = 800) -> List[Chunk]:
    """
    Chunk ``content`` and persist every resulting chunk through ``store``.

    Args:
        content: Text to chunk and store
        conversation_id: Source conversation ID, passed on to the store
        store: Object providing ``create_chunk(content=, chunk_type=,
            conversation_id=, tokens=, tags=)``
        tags: Optional tags for all chunks
        min_tokens: Minimum tokens per chunk
        max_tokens: Maximum tokens per chunk

    Returns:
        The values returned by ``store.create_chunk``, in chunk order
    """
    chunker = ChunkingEngine(min_tokens=min_tokens, max_tokens=max_tokens)
    return [
        store.create_chunk(
            content=piece.content,
            chunk_type=piece.type,
            conversation_id=conversation_id,
            tokens=piece.tokens,
            tags=piece.tags,
        )
        for piece in chunker.chunk(content, conversation_id, tags)
    ]
# ============== Testing ==============
def _run_self_test():
    """Run the manual smoke tests and print their results to stdout."""
    banner = "=" * 60
    print(banner)
    print("Chunking Engine - Self Test")
    print(banner)

    engine = ChunkingEngine()

    # Test 1: Basic multi-paragraph content.
    # Paragraphs are joined with a blank line, which is what the engine
    # splits on; without it the sample would be a single paragraph.
    print("\n[Test 1] Multi-paragraph content")
    content = "\n\n".join([
        "Paragraph 1. Short.",
        "Paragraph 2 is longer with multiple sentences. It should stand alone.",
        "This is a decision: We chose to use RLM architecture.",
    ])
    chunks = engine.chunk(content, "test-conv")
    print("Input paragraphs: 3")
    print(f"Output chunks: {len(chunks)}")
    for i, c in enumerate(chunks, 1):
        print(f" Chunk {i}: {c.type}, {c.tokens} tokens")
        print(f" Content: {c.content[:60]}...")

    # Test 2: Content type detection
    print("\n[Test 2] Content type detection")
    test_cases = [
        ("I prefer chocolate over vanilla", "preference"),
        ("We decided to use Python", "decision"),
        ("Python is a programming language", "fact"),
        ("I usually wake up early", "pattern"),
        ("This is just a random note", "note"),
    ]
    for text, expected in test_cases:
        detected = engine.detect_content_type(text)
        status = "[OK]" if detected == expected else "[FAIL]"
        print(f" {status} '{text[:40]}...' -> {detected} (expected: {expected})")

    # Test 3: Small paragraph merging
    print("\n[Test 3] Small paragraph merging")
    content = "\n\n".join([
        "A.",
        "B.",
        "C is a longer paragraph with more content that should stand on its own.",
    ])
    chunks = engine.chunk(content, "test-conv")
    print("Input paragraphs: 3 (two very short)")
    print(f"Output chunks: {len(chunks)}")
    for i, c in enumerate(chunks, 1):
        print(f" Chunk {i}: {c.tokens} tokens - {c.content[:50]}...")

    # Test 4: Large paragraph splitting
    print("\n[Test 4] Large paragraph splitting")
    # Generate a paragraph that's definitely over the maximum (~200 sentences)
    large_content = " ".join(
        f"This is sentence number {i} in a very long paragraph."
        for i in range(1, 201)
    )
    chunks = engine.chunk(large_content, "test-conv")
    print(f"Input: ~{engine.count_tokens(large_content)} tokens")
    print(f"Output chunks: {len(chunks)}")
    for i, c in enumerate(chunks, 1):
        in_bounds = engine.min_tokens <= c.tokens <= engine.max_tokens
        status = "[OK]" if in_bounds else "[FAIL]"
        print(f" {status} Chunk {i}: {c.tokens} tokens")

    # Test 5: Token counting comparison
    print("\n[Test 5] Token counting")
    test_text = "This is a test sentence with exactly twelve tokens."
    estimated = engine.count_tokens(test_text)
    print(f" Text: '{test_text}'")
    print(f" Estimated tokens: {estimated}")
    print(f" Tiktoken available: {TIKTOKEN_AVAILABLE}")

    # Test 6: Integration with ChunkStore
    print("\n[Test 6] Integration with ChunkStore")
    try:
        # A relative import cannot succeed when this file runs as a script,
        # so fall back to the plain module name in that case.
        try:
            from .memory_store import ChunkStore
        except ImportError:
            from memory_store import ChunkStore
        store = ChunkStore("brain/memory")
        test_content = "\n\n".join([
            "First fact: Python is a programming language.",
            "Second decision: We chose to implement async support.",
            "Third preference: I prefer using type hints.",
        ])
        created = chunk_and_store(
            content=test_content,
            conversation_id="integration-test",
            store=store,
            tags=["test", "integration"],
        )
        print(f" Created {len(created)} chunks:")
        for c in created:
            print(f" - {c.id}: {c.type}, {c.tokens} tokens")
        # Cleanup - archive the test chunks
        for c in created:
            store.delete_chunk(c.id, permanent=False)
        # ASCII marker, consistent with the other tests; a non-ASCII glyph
        # can fail to encode on some consoles and be misreported as a skip.
        print(" [OK] Test chunks archived")
    except Exception as e:
        print(f" [SKIP] Integration test skipped: {e}")

    print("\n" + banner)
    print("All tests completed!")
    print(banner)


if __name__ == "__main__":
    _run_self_test()