# beadboard/.agents/skills/rlm-mem/brain/scripts/chunking_engine.py

"""
RLM-MEM - Chunking Engine
D1.2: Semantic content chunking for RLM Memory System

Splits content into bounded semantic chunks (100-800 tokens) with content type detection.
"""

import re
from typing import List, Optional
from dataclasses import dataclass, field

# Try to import tiktoken for accurate token counting
try:
    import tiktoken
    TIKTOKEN_AVAILABLE = True
except ImportError:
    TIKTOKEN_AVAILABLE = False

try:
    from .memory_store import Chunk, ChunkMetadata, ChunkLinks, ChunkType
except ImportError:
    # Fallback for direct execution
    from memory_store import Chunk, ChunkMetadata, ChunkLinks, ChunkType


@dataclass
class ChunkResult:
    """
    Result of chunking a piece of content.

    ``ChunkingEngine.chunk`` returns one instance per output chunk.
    """
    # Text of the chunk.
    content: str
    # Token count of ``content`` (``ChunkingEngine.chunk`` fills this from
    # ``count_tokens``).
    tokens: int
    # Detected content type (``ChunkingEngine.chunk`` fills this from
    # ``detect_content_type``).
    type: str
    # Tags attached to the chunk; each instance gets its own fresh list.
    tags: List[str] = field(default_factory=list)


class ChunkingEngine:
    """
    Splits content into bounded semantic chunks.

    Strategy: Simple Bounded Semantic
    1. Split on paragraphs (blank lines)
    2. Merge small paragraphs (< min_tokens) with next
    3. Split large paragraphs (> max_tokens) at sentence boundaries
    4. Detect content type (fact, preference, pattern, note, decision)
    """

    def __init__(self, min_tokens: int = 100, max_tokens: int = 800):
        """
        Record the chunk size bounds and pick a token counter.

        Args:
            min_tokens: Minimum tokens per chunk (default: 100)
            max_tokens: Maximum tokens per chunk (default: 800)
        """
        self.min_tokens, self.max_tokens = min_tokens, max_tokens

        # The encoder stays None unless tiktoken is importable and the
        # encoding loads; count_tokens then uses its character estimate.
        encoder = None
        if TIKTOKEN_AVAILABLE:
            try:
                encoder = tiktoken.get_encoding("cl100k_base")
            except Exception:
                encoder = None
        self._encoder = encoder

    def count_tokens(self, text: str) -> int:
        """
        Estimate token count.

        Uses tiktoken if available, otherwise uses len/4 approximation
        which works reasonably well for English text.

        Args:
            text: Text to count tokens for

        Returns:
            Estimated token count
        """
        if text is None or text == "":
            return 0

        if self._encoder is not None:
            try:
                return len(self._encoder.encode(text))
            except Exception:
                pass  # Fall back to approximation

        # Character-based approximation: ~4 chars per token for English
        # This is a rough estimate but works for most cases
        return max(1, len(text) // 4)

    def detect_content_type(self, content: str) -> str:
        """
        Classify content as decision, pattern, preference, fact, or note.

        Matching is case-insensitive and respects word boundaries. The rule
        groups are tried in this order and the first group with a hit wins:

        1. Decision: "decided", "chose", "selected", "going with", ...
        2. Pattern: "usually", "often", "tends to", "pattern", ...
        3. Preference: "prefer", "like", "want", "rather", ...
        4. Fact: "is a", "are a", "works as", "located in", ...
        5. Otherwise: note

        Args:
            content: Content to analyze

        Returns:
            Content type string (the ``value`` of the matching ChunkType)
        """
        if not content:
            return ChunkType.NOTE.value

        ordered_rules = (
            # Explicit actions take priority over everything else.
            (ChunkType.DECISION, (
                r'\bdecided\b', r'\bchose\b', r'\bselected\b',
                r'\bgoing with\b', r'\bwent with\b', r'\bopted for\b',
                r'\bsettled on\b', r'\bconcluded\b',
            )),
            # Habits come before preferences: a phrase such as
            # "generally prefer" describes a pattern, not a preference.
            (ChunkType.PATTERN, (
                r'\busually\b', r'\boften\b', r'\btends to\b', r'\bpattern\b',
                r'\balways\b', r'\btypically\b', r'\bgenerally\b',
                r'\bfrequently\b', r'\bregularly\b', r'\bevery time\b',
                r'\bmost of the time\b', r'\bwhenever\b',
            )),
            (ChunkType.PREFERENCE, (
                r'\bprefer\b', r'\blike\b', r'\bwant\b', r'\brather\b',
                r'\bdislike\b', r'\bhate\b', r'\bwish\b', r'\bwould like\b',
                r'\bfavorite\b', r'\bfavour\b',
            )),
            # Statements of truth.
            (ChunkType.FACT, (
                r'\bis a\b', r'\bare a\b', r'\bworks as\b', r'\blocated in\b',
                r'\bis an\b', r'\bare an\b', r'\bwas a\b', r'\bwere a\b',
                r'\bworks at\b', r'\bworks for\b', r'\blives in\b',
                r'\bborn in\b', r'\bstudied at\b', r'\bgraduated from\b',
                r'\bhas\s+\d+', r'\bthere are\s+\d+', r'\bthere is\s+',
            )),
        )

        lowered = content.lower()
        for kind, expressions in ordered_rules:
            if any(re.search(expression, lowered) for expression in expressions):
                return kind.value

        return ChunkType.NOTE.value

    def _split_into_paragraphs(self, content: str) -> List[str]:
        """
        Split content into paragraphs on double newlines.

        Handles edge cases like multiple consecutive newlines and whitespace.
        """
        # Split on double newlines
        raw_paragraphs = re.split(r'\n\n+', content)

        # Clean up each paragraph
        paragraphs = []
        for p in raw_paragraphs:
            # Strip whitespace and normalize internal whitespace
            cleaned = p.strip()
            if cleaned:
                # Normalize internal newlines (preserve single newlines within paragraphs)
                cleaned = re.sub(r'[ \t]+', ' ', cleaned)
                paragraphs.append(cleaned)

        return paragraphs

    def _split_sentences(self, text: str) -> List[str]:
        """
        Split text into sentences.

        Handles abbreviations and edge cases reasonably well.
        """
        # Pattern for sentence boundaries
        # Matches . ? or ! followed by space or end of string
        # Handles quotes and parentheses
        sentence_pattern = r'(?<=[.!?])\s+(?=[A-Z"\'\(])|(?<=[.!?])$'

        sentences = re.split(sentence_pattern, text)

        # Clean up
        result = []
        for s in sentences:
            cleaned = s.strip()
            if cleaned:
                result.append(cleaned)

        return result

    def _split_large_chunk(self, content: str) -> List[str]:
        """
        Split a large chunk (> max_tokens) at sentence boundaries.

        Tries to create chunks that are as close to max_tokens as possible
        without exceeding it.
        """
        sentences = self._split_sentences(content)

        if len(sentences) <= 1:
            # Cannot split by sentences, force split by token count
            return self._force_split(content)

        chunks = []
        current_chunk = []
        current_tokens = 0

        for sentence in sentences:
            sentence_tokens = self.count_tokens(sentence)

            # If a single sentence exceeds max_tokens, force split it
            if sentence_tokens > self.max_tokens:
                # First, flush current chunk if any
                if current_chunk:
                    chunks.append(' '.join(current_chunk))
                    current_chunk = []
                    current_tokens = 0

                # Force split this long sentence
                chunks.extend(self._force_split(sentence))
                continue

            # Check if adding this sentence would exceed max_tokens
            if current_tokens + sentence_tokens > self.max_tokens and current_chunk:
                # Flush current chunk
                chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_tokens = sentence_tokens
            else:
                # Add to current chunk
                current_chunk.append(sentence)
                current_tokens += sentence_tokens

        # Don't forget the last chunk
        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    def _force_split(self, content: str) -> List[str]:
        """
        Force split content into chunks of approximately max_tokens.

        Used when sentence splitting isn't sufficient.
        """
        total_tokens = self.count_tokens(content)

        if total_tokens <= self.max_tokens:
            return [content]

        # Calculate approximate characters per chunk
        # We use character count as a proxy for token count
        chars_per_token = len(content) / total_tokens
        chars_per_chunk = int(self.max_tokens * chars_per_token * 0.95)  # 5% safety margin

        chunks = []
        start = 0

        while start < len(content):
            end = start + chars_per_chunk

            if end >= len(content):
                # Last chunk
                chunks.append(content[start:].strip())
                break

            # Try to find a word boundary
            # Look for space, period, or other punctuation
            search_end = min(end + 50, len(content))  # Look ahead 50 chars
            boundary = end

            # Find the last space or punctuation before search_end
            for i in range(search_end - 1, start, -1):
                if content[i] in ' \t\n.,;:!?':
                    boundary = i + 1
                    break

            chunk = content[start:boundary].strip()
            if chunk:
                chunks.append(chunk)

            start = boundary

        return chunks

    def chunk(self, content: str, conversation_id: str,
              tags: List[str] = None) -> List[ChunkResult]:
        """
        Split content into bounded semantic chunks.

        Strategy: Simple Bounded Semantic
        1. Split on paragraphs (\n\n)
        2. Merge small paragraphs (< min_tokens) with next
        3. Split large paragraphs (> max_tokens) at sentence boundaries
        4. Detect content type (fact, preference, pattern, note, decision)

        Args:
            content: Text content to chunk
            conversation_id: Source conversation ID
            tags: Optional list of tags to apply to all chunks

        Returns:
            List of ChunkResult objects ready for storage
        """
        if not content or not content.strip():
            return []

        tags = tags or []

        # Step 1: Split into paragraphs
        paragraphs = self._split_into_paragraphs(content)

        # Step 2: Process paragraphs - handle size bounds
        raw_chunks = []

        for paragraph in paragraphs:
            tokens = self.count_tokens(paragraph)

            if tokens > self.max_tokens:
                # Split large paragraph at sentence boundaries
                split_chunks = self._split_large_chunk(paragraph)
                raw_chunks.extend(split_chunks)
            else:
                raw_chunks.append(paragraph)

        # Step 3: Merge small chunks
        merged_chunks = self._merge_small_chunks(raw_chunks)

        # Step 4: Create ChunkResult objects with type detection
        results = []
        for chunk_content in merged_chunks:
            chunk_tokens = self.count_tokens(chunk_content)
            content_type = self.detect_content_type(chunk_content)

            result = ChunkResult(
                content=chunk_content,
                tokens=chunk_tokens,
                type=content_type,
                tags=tags.copy()
            )
            results.append(result)

        return results

    def _merge_small_chunks(self, chunks: List[str]) -> List[str]:
        """
        Merge chunks that are below min_tokens with adjacent chunks.

        Strategy:
        - Try to merge with next chunk (if same content type)
        - If merging would exceed max_tokens, keep as-is (it's the best we can do)
        - Don't merge chunks with different content types (semantic boundaries)
        - Handle the last chunk specially - merge with previous if possible
        """
        if not chunks:
            return []

        if len(chunks) == 1:
            return chunks

        result = []
        i = 0

        while i < len(chunks):
            current = chunks[i]
            current_tokens = self.count_tokens(current)
            current_type = self.detect_content_type(current)

            # If current chunk is large enough, add it
            if current_tokens >= self.min_tokens:
                result.append(current)
                i += 1
                continue

            # Current chunk is too small - try to merge with next
            if i + 1 < len(chunks):
                next_chunk = chunks[i + 1]
                next_tokens = self.count_tokens(next_chunk)
                next_type = self.detect_content_type(next_chunk)

                # Don't merge if content types differ (preserve semantic boundaries)
                if current_type != next_type:
                    result.append(current)  # Add as-is even if small
                    i += 1
                    continue

                # Check if merging would exceed max_tokens
                combined_tokens = current_tokens + next_tokens

                if combined_tokens <= self.max_tokens:
                    # Merge current with next
                    merged = current + "\n\n" + next_chunk
                    # Replace next chunk with merged version
                    chunks[i + 1] = merged
                    i += 1
                    continue
                else:
                    # Can't merge without exceeding max
                    # Add current as-is (it's below min but we can't help it)
                    result.append(current)
                    i += 1
                    continue
            else:
                # This is the last chunk and it's too small
                # Try to merge with previous result if possible
                if result:
                    prev = result[-1]
                    prev_tokens = self.count_tokens(prev)
                    prev_type = self.detect_content_type(prev)
                    combined_tokens = prev_tokens + current_tokens

                    # Only merge if types match
                    if combined_tokens <= self.max_tokens and prev_type == current_type:
                        # Merge with previous
                        result[-1] = prev + "\n\n" + current
                    else:
                        # Can't merge, add as-is
                        result.append(current)
                else:
                    # No previous chunk, add as-is
                    result.append(current)

                i += 1

        return result


def chunk_and_store(content: str, conversation_id: str,
                    store, tags: List[str] = None,
                    min_tokens: int = 100, max_tokens: int = 800) -> List[Chunk]:
    """
    Chunk content with a fresh ChunkingEngine and persist every chunk.

    Each chunk is handed to ``store.create_chunk`` in order, together with
    its detected type, token count and tags.

    Args:
        content: Text to chunk and store
        conversation_id: Source conversation ID
        store: ChunkStore instance
        tags: Optional tags for all chunks
        min_tokens: Minimum tokens per chunk
        max_tokens: Maximum tokens per chunk

    Returns:
        The objects returned by ``store.create_chunk``, in chunk order
    """
    engine = ChunkingEngine(min_tokens=min_tokens, max_tokens=max_tokens)

    return [
        store.create_chunk(
            content=piece.content,
            chunk_type=piece.type,
            conversation_id=conversation_id,
            tokens=piece.tokens,
            tags=piece.tags,
        )
        for piece in engine.chunk(content, conversation_id, tags)
    ]


# ============== Testing ==============

if __name__ == "__main__":
    # Manual self-test: prints results for inspection; it does not assert.
    print("=" * 60)
    print("Chunking Engine - Self Test")
    print("=" * 60)

    # Test 1: Basic multi-paragraph content
    print("\n[Test 1] Multi-paragraph content")
    content = """Paragraph 1. Short.

Paragraph 2 is longer with multiple sentences. It should stand alone.

This is a decision: We chose to use RLM architecture."""

    engine = ChunkingEngine()
    chunks = engine.chunk(content, "test-conv")

    print(f"Input paragraphs: 3")
    print(f"Output chunks: {len(chunks)}")
    for i, c in enumerate(chunks, 1):
        print(f"  Chunk {i}: {c.type}, {c.tokens} tokens")
        print(f"    Content: {c.content[:60]}...")

    # Test 2: Content type detection
    print("\n[Test 2] Content type detection")
    test_cases = [
        ("I prefer chocolate over vanilla", "preference"),
        ("We decided to use Python", "decision"),
        ("Python is a programming language", "fact"),
        ("I usually wake up early", "pattern"),
        ("This is just a random note", "note"),
    ]

    for text, expected in test_cases:
        detected = engine.detect_content_type(text)
        status = "[OK]" if detected == expected else "[FAIL]"
        print(f"  {status} '{text[:40]}...' -> {detected} (expected: {expected})")

    # Test 3: Small paragraph merging
    print("\n[Test 3] Small paragraph merging")
    content = """A.

B.

C is a longer paragraph with more content that should stand on its own."""

    chunks = engine.chunk(content, "test-conv")
    print(f"Input paragraphs: 3 (two very short)")
    print(f"Output chunks: {len(chunks)}")
    for i, c in enumerate(chunks, 1):
        print(f"  Chunk {i}: {c.tokens} tokens - {c.content[:50]}...")

    # Test 4: Large paragraph splitting
    print("\n[Test 4] Large paragraph splitting")
    # Generate a paragraph that's definitely over 800 tokens
    large_content = " ".join([f"This is sentence number {i} in a very long paragraph."
                              for i in range(1, 201)])  # ~200 sentences

    chunks = engine.chunk(large_content, "test-conv")
    total_tokens = sum(c.tokens for c in chunks)
    print(f"Input: ~{engine.count_tokens(large_content)} tokens")
    print(f"Output chunks: {len(chunks)}")
    for i, c in enumerate(chunks, 1):
        status = "[OK]" if 100 <= c.tokens <= 800 else "[FAIL]"
        print(f"  {status} Chunk {i}: {c.tokens} tokens")

    # Test 5: Token counting comparison
    print("\n[Test 5] Token counting")
    test_text = "This is a test sentence with exactly twelve tokens."
    estimated = engine.count_tokens(test_text)
    print(f"  Text: '{test_text}'")
    print(f"  Estimated tokens: {estimated}")
    print(f"  Tiktoken available: {TIKTOKEN_AVAILABLE}")

    # Test 6: Integration with ChunkStore
    print("\n[Test 6] Integration with ChunkStore")
    try:
        # Same relative-then-absolute fallback as the imports at the top of
        # this file: a relative import always fails when the file is run
        # directly as a script, which used to make this test skip every time.
        try:
            from .memory_store import ChunkStore
        except ImportError:
            from memory_store import ChunkStore

        store = ChunkStore("brain/memory")
        test_content = """First fact: Python is a programming language.

Second decision: We chose to implement async support.

Third preference: I prefer using type hints."""

        created = chunk_and_store(
            content=test_content,
            conversation_id="integration-test",
            store=store,
            tags=["test", "integration"]
        )

        print(f"  Created {len(created)} chunks:")
        for c in created:
            print(f"    - {c.id}: {c.type}, {c.tokens} tokens")

        # Cleanup - archive the test chunks
        for c in created:
            store.delete_chunk(c.id, permanent=False)
        print("  ✓ Test chunks archived")

    except Exception as e:
        print(f"  [SKIP] Integration test skipped: {e}")

    print("\n" + "=" * 60)
    print("All tests completed!")
    print("=" * 60)