Fix: Security, reliability, and code quality improvements from PR review

Critical Security Fixes:
- Fix command injection vulnerability in Windows shims (beadboard.cmd, bb.cmd)
  - Added path validation to block traversal (.. and root-relative paths)
  - Added quotes around env var to prevent command injection

Reliability Fixes:
- Fix agent cache null safety bug
  - Fixed callBdAgentShow() to check for cache misses (null check, expiration)
  - Fixed getCachedAgent to properly return entry.data or null
- Fix null body crashes in mail ack route
  - Added null check before casting body to object
  - Returns 400 error instead of 500 for invalid requests

BD Compliance Fixes:
- Fix read-issues to use BD audit record path
  - Ensures all writes go through bd audit record
  - Maintains watcher/SSE parity and Dolt commit tracking

Code Quality Fixes:
- Fix path canonicalization violations
  - Use canonicalizeWindowsPath() and windowsPathKey() from pathing module
  - Prevents Windows edge cases and ensures machine-reproducible paths
- Fix typo: mobile-fronted → mobile-frontend
- Pin GitHub Actions tags
  - softprops/action-gh-release@v1 → specific commit hash
- Verify pr14 test is registered in package.json (it already was; no change needed)

Testing:
- Refactor broad exception handlers in Python scripts
  - Replace except Exception: with specific exceptions
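  - Limits each handler to the errors it expects, so unexpected failures surface instead of being silently swallowed (KeyboardInterrupt and SystemExit derive from BaseException and were never caught by `except Exception`)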
  - All tests passing
This commit is contained in:
zenchantlive 2026-03-05 16:33:10 -08:00
parent d54e4f3311
commit ce4700849b
15 changed files with 2995 additions and 756 deletions

View file

@ -12,6 +12,7 @@ from dataclasses import dataclass, field
# Try to import tiktoken for accurate token counting
try:
import tiktoken
TIKTOKEN_AVAILABLE = True
except ImportError:
TIKTOKEN_AVAILABLE = False
@ -26,6 +27,7 @@ except ImportError:
@dataclass
class ChunkResult:
"""Result of chunking a piece of content."""
content: str
tokens: int
type: str
@ -35,137 +37,169 @@ class ChunkResult:
class ChunkingEngine:
"""
Splits content into bounded semantic chunks.
Strategy: Simple Bounded Semantic
1. Split on paragraphs (\n\n)
2. Merge small paragraphs (< min_tokens) with next
3. Split large paragraphs (> max_tokens) at sentence boundaries
4. Detect content type (fact, preference, pattern, note, decision)
"""
def __init__(self, min_tokens: int = 100, max_tokens: int = 800):
    """
    Initialize the chunking engine.

    Stores the size bounds and, when tiktoken imported successfully,
    tries to load the ``cl100k_base`` encoding for exact token counts.
    Any failure to load the encoding is tolerated: the engine then
    relies on the character-based estimate instead.

    Args:
        min_tokens: Minimum tokens per chunk (default: 100)
        max_tokens: Maximum tokens per chunk (default: 800)
    """
    self.min_tokens = min_tokens
    self.max_tokens = max_tokens

    # None means "no exact tokenizer"; count_tokens then estimates.
    self._encoder = None
    if TIKTOKEN_AVAILABLE:
        try:
            self._encoder = tiktoken.get_encoding("cl100k_base")
        except (ImportError, AttributeError, ValueError, KeyError, OSError):
            # OSError is included because loading an encoding may need to
            # read or download its data file; an I/O or network failure
            # there must degrade to the estimate, not break construction.
            pass
def count_tokens(self, text: str) -> int:
"""
Estimate token count.
Uses tiktoken if available, otherwise uses len/4 approximation
which works reasonably well for English text.
Args:
text: Text to count tokens for
Returns:
Estimated token count
"""
if text is None or text == "":
return 0
if self._encoder is not None:
try:
return len(self._encoder.encode(text))
except Exception:
except (AttributeError, TypeError, ValueError):
pass # Fall back to approximation
# Character-based approximation: ~4 chars per token for English
# This is a rough estimate but works for most cases
return max(1, len(text) // 4)
def detect_content_type(self, content: str) -> str:
    """
    Classify content as decision, pattern, preference, fact, or note.

    The lower-cased content is tested against keyword groups in a fixed
    priority order, and the first group with a match wins:

    1. Decision: "decided", "chose", "selected", "going with", ...
    2. Pattern: "usually", "often", "tends to", "always", ...
    3. Preference: "prefer", "like", "want", "rather", ...
    4. Fact: "is a", "works as", "located in", "has <number>", ...
    5. Otherwise: note

    Patterns are checked before preferences so that a phrase such as
    "generally prefer" is classified as a pattern.

    Args:
        content: Content to analyze

    Returns:
        The ``value`` of the matching ``ChunkType`` member
    """
    if not content:
        return ChunkType.NOTE.value

    text = content.lower()

    def matches_any(patterns) -> bool:
        # True as soon as one keyword pattern is found in the text.
        return any(re.search(pattern, text) for pattern in patterns)

    # Explicit choices take precedence over every other category.
    decision_patterns = (
        r"\bdecided\b",
        r"\bchose\b",
        r"\bselected\b",
        r"\bgoing with\b",
        r"\bwent with\b",
        r"\bopted for\b",
        r"\bsettled on\b",
        r"\bconcluded\b",
    )
    if matches_any(decision_patterns):
        return ChunkType.DECISION.value

    # Habits and recurring behaviour; deliberately ahead of preferences.
    pattern_patterns = (
        r"\busually\b",
        r"\boften\b",
        r"\btends to\b",
        r"\bpattern\b",
        r"\balways\b",
        r"\btypically\b",
        r"\bgenerally\b",
        r"\bfrequently\b",
        r"\bregularly\b",
        r"\bevery time\b",
        r"\bmost of the time\b",
        r"\bwhenever\b",
    )
    if matches_any(pattern_patterns):
        return ChunkType.PATTERN.value

    # Likes, dislikes and wishes.
    preference_patterns = (
        r"\bprefer\b",
        r"\blike\b",
        r"\bwant\b",
        r"\brather\b",
        r"\bdislike\b",
        r"\bhate\b",
        r"\bwish\b",
        r"\bwould like\b",
        r"\bfavorite\b",
        r"\bfavour\b",
    )
    if matches_any(preference_patterns):
        return ChunkType.PREFERENCE.value

    # Statements of truth.
    fact_patterns = (
        r"\bis a\b",
        r"\bare a\b",
        r"\bworks as\b",
        r"\blocated in\b",
        r"\bis an\b",
        r"\bare an\b",
        r"\bwas a\b",
        r"\bwere a\b",
        r"\bworks at\b",
        r"\bworks for\b",
        r"\blives in\b",
        r"\bborn in\b",
        r"\bstudied at\b",
        r"\bgraduated from\b",
        r"\bhas\s+\d+",
        r"\bthere are\s+\d+",
        r"\bthere is\s+",
    )
    if matches_any(fact_patterns):
        return ChunkType.FACT.value

    # Nothing matched.
    return ChunkType.NOTE.value
def _split_into_paragraphs(self, content: str) -> List[str]:
"""
Split content into paragraphs on double newlines.
Handles edge cases like multiple consecutive newlines and whitespace.
"""
# Split on double newlines
raw_paragraphs = re.split(r'\n\n+', content)
raw_paragraphs = re.split(r"\n\n+", content)
# Clean up each paragraph
paragraphs = []
for p in raw_paragraphs:
@ -173,191 +207,194 @@ class ChunkingEngine:
cleaned = p.strip()
if cleaned:
# Normalize internal newlines (preserve single newlines within paragraphs)
cleaned = re.sub(r'[ \t]+', ' ', cleaned)
cleaned = re.sub(r"[ \t]+", " ", cleaned)
paragraphs.append(cleaned)
return paragraphs
def _split_sentences(self, text: str) -> List[str]:
"""
Split text into sentences.
Handles abbreviations and edge cases reasonably well.
"""
# Pattern for sentence boundaries
# Matches . ? or ! followed by space or end of string
# Handles quotes and parentheses
sentence_pattern = r'(?<=[.!?])\s+(?=[A-Z"\'\(])|(?<=[.!?])$'
sentences = re.split(sentence_pattern, text)
# Clean up
result = []
for s in sentences:
cleaned = s.strip()
if cleaned:
result.append(cleaned)
return result
def _split_large_chunk(self, content: str) -> List[str]:
"""
Split a large chunk (> max_tokens) at sentence boundaries.
Tries to create chunks that are as close to max_tokens as possible
without exceeding it.
"""
sentences = self._split_sentences(content)
if len(sentences) <= 1:
# Cannot split by sentences, force split by token count
return self._force_split(content)
chunks = []
current_chunk = []
current_tokens = 0
for sentence in sentences:
sentence_tokens = self.count_tokens(sentence)
# If a single sentence exceeds max_tokens, force split it
if sentence_tokens > self.max_tokens:
# First, flush current chunk if any
if current_chunk:
chunks.append(' '.join(current_chunk))
chunks.append(" ".join(current_chunk))
current_chunk = []
current_tokens = 0
# Force split this long sentence
chunks.extend(self._force_split(sentence))
continue
# Check if adding this sentence would exceed max_tokens
if current_tokens + sentence_tokens > self.max_tokens and current_chunk:
# Flush current chunk
chunks.append(' '.join(current_chunk))
chunks.append(" ".join(current_chunk))
current_chunk = [sentence]
current_tokens = sentence_tokens
else:
# Add to current chunk
current_chunk.append(sentence)
current_tokens += sentence_tokens
# Don't forget the last chunk
if current_chunk:
chunks.append(' '.join(current_chunk))
chunks.append(" ".join(current_chunk))
return chunks
def _force_split(self, content: str) -> List[str]:
"""
Force split content into chunks of approximately max_tokens.
Used when sentence splitting isn't sufficient.
"""
total_tokens = self.count_tokens(content)
if total_tokens <= self.max_tokens:
return [content]
# Calculate approximate characters per chunk
# We use character count as a proxy for token count
chars_per_token = len(content) / total_tokens
chars_per_chunk = int(self.max_tokens * chars_per_token * 0.95) # 5% safety margin
chars_per_chunk = int(
self.max_tokens * chars_per_token * 0.95
) # 5% safety margin
chunks = []
start = 0
while start < len(content):
end = start + chars_per_chunk
if end >= len(content):
# Last chunk
chunks.append(content[start:].strip())
break
# Try to find a word boundary
# Look for space, period, or other punctuation
search_end = min(end + 50, len(content)) # Look ahead 50 chars
boundary = end
# Find the last space or punctuation before search_end
for i in range(search_end - 1, start, -1):
if content[i] in ' \t\n.,;:!?':
if content[i] in " \t\n.,;:!?":
boundary = i + 1
break
chunk = content[start:boundary].strip()
if chunk:
chunks.append(chunk)
start = boundary
return chunks
def chunk(
    self, content: str, conversation_id: str, tags: List[str] = None
) -> List[ChunkResult]:
    """
    Turn raw content into size-bounded, typed chunks.

    Pipeline:
    1. Split the content into paragraphs
    2. Split any paragraph above max_tokens at sentence boundaries
    3. Merge chunks that are too small
    4. Count tokens and detect the content type of each final chunk

    Args:
        content: Text content to chunk
        conversation_id: Source conversation ID (accepted but not used
            inside this method)
        tags: Optional list of tags; every chunk receives its own copy

    Returns:
        List of ChunkResult objects; empty when content is empty or
        only whitespace
    """
    if not content or not content.strip():
        return []

    tags = tags or []

    # Paragraphs that fit pass through; oversized ones are split.
    pieces = []
    for paragraph in self._split_into_paragraphs(content):
        if self.count_tokens(paragraph) > self.max_tokens:
            pieces.extend(self._split_large_chunk(paragraph))
        else:
            pieces.append(paragraph)

    # Undersized pieces are merged before the results are built.
    return [
        ChunkResult(
            content=text,
            tokens=self.count_tokens(text),
            type=self.detect_content_type(text),
            tags=tags.copy(),
        )
        for text in self._merge_small_chunks(pieces)
    ]
def _merge_small_chunks(self, chunks: List[str]) -> List[str]:
"""
Merge chunks that are below min_tokens with adjacent chunks.
Strategy:
- Try to merge with next chunk (if same content type)
- If merging would exceed max_tokens, keep as-is (it's the best we can do)
@ -366,39 +403,39 @@ class ChunkingEngine:
"""
if not chunks:
return []
if len(chunks) == 1:
return chunks
result = []
i = 0
while i < len(chunks):
current = chunks[i]
current_tokens = self.count_tokens(current)
current_type = self.detect_content_type(current)
# If current chunk is large enough, add it
if current_tokens >= self.min_tokens:
result.append(current)
i += 1
continue
# Current chunk is too small - try to merge with next
if i + 1 < len(chunks):
next_chunk = chunks[i + 1]
next_tokens = self.count_tokens(next_chunk)
next_type = self.detect_content_type(next_chunk)
# Don't merge if content types differ (preserve semantic boundaries)
if current_type != next_type:
result.append(current) # Add as-is even if small
i += 1
continue
# Check if merging would exceed max_tokens
combined_tokens = current_tokens + next_tokens
if combined_tokens <= self.max_tokens:
# Merge current with next
merged = current + "\n\n" + next_chunk
@ -420,7 +457,7 @@ class ChunkingEngine:
prev_tokens = self.count_tokens(prev)
prev_type = self.detect_content_type(prev)
combined_tokens = prev_tokens + current_tokens
# Only merge if types match
if combined_tokens <= self.max_tokens and prev_type == current_type:
# Merge with previous
@ -431,18 +468,23 @@ class ChunkingEngine:
else:
# No previous chunk, add as-is
result.append(current)
i += 1
return result
def chunk_and_store(content: str, conversation_id: str,
store, tags: List[str] = None,
min_tokens: int = 100, max_tokens: int = 800) -> List[Chunk]:
def chunk_and_store(
content: str,
conversation_id: str,
store,
tags: List[str] = None,
min_tokens: int = 100,
max_tokens: int = 800,
) -> List[Chunk]:
"""
Convenience function to chunk content and store in ChunkStore.
Args:
content: Text to chunk and store
conversation_id: Source conversation ID
@ -450,13 +492,13 @@ def chunk_and_store(content: str, conversation_id: str,
tags: Optional tags for all chunks
min_tokens: Minimum tokens per chunk
max_tokens: Maximum tokens per chunk
Returns:
List of created Chunk objects
"""
engine = ChunkingEngine(min_tokens=min_tokens, max_tokens=max_tokens)
chunk_results = engine.chunk(content, conversation_id, tags)
created_chunks = []
for result in chunk_results:
chunk = store.create_chunk(
@ -464,10 +506,10 @@ def chunk_and_store(content: str, conversation_id: str,
chunk_type=result.type,
conversation_id=conversation_id,
tokens=result.tokens,
tags=result.tags
tags=result.tags,
)
created_chunks.append(chunk)
return created_chunks
@ -477,7 +519,7 @@ if __name__ == "__main__":
print("=" * 60)
print("Chunking Engine - Self Test")
print("=" * 60)
# Test 1: Basic multi-paragraph content
print("\n[Test 1] Multi-paragraph content")
content = """Paragraph 1. Short.
@ -485,16 +527,16 @@ if __name__ == "__main__":
Paragraph 2 is longer with multiple sentences. It should stand alone.
This is a decision: We chose to use RLM architecture."""
engine = ChunkingEngine()
chunks = engine.chunk(content, "test-conv")
print(f"Input paragraphs: 3")
print(f"Output chunks: {len(chunks)}")
for i, c in enumerate(chunks, 1):
print(f" Chunk {i}: {c.type}, {c.tokens} tokens")
print(f" Content: {c.content[:60]}...")
# Test 2: Content type detection
print("\n[Test 2] Content type detection")
test_cases = [
@ -504,12 +546,12 @@ This is a decision: We chose to use RLM architecture."""
("I usually wake up early", "pattern"),
("This is just a random note", "note"),
]
for text, expected in test_cases:
detected = engine.detect_content_type(text)
status = "[OK]" if detected == expected else "[FAIL]"
print(f" {status} '{text[:40]}...' -> {detected} (expected: {expected})")
# Test 3: Small paragraph merging
print("\n[Test 3] Small paragraph merging")
content = """A.
@ -517,19 +559,23 @@ This is a decision: We chose to use RLM architecture."""
B.
C is a longer paragraph with more content that should stand on its own."""
chunks = engine.chunk(content, "test-conv")
print(f"Input paragraphs: 3 (two very short)")
print(f"Output chunks: {len(chunks)}")
for i, c in enumerate(chunks, 1):
print(f" Chunk {i}: {c.tokens} tokens - {c.content[:50]}...")
# Test 4: Large paragraph splitting
print("\n[Test 4] Large paragraph splitting")
# Generate a paragraph that's definitely over 800 tokens
large_content = " ".join([f"This is sentence number {i} in a very long paragraph."
for i in range(1, 201)]) # ~200 sentences
large_content = " ".join(
[
f"This is sentence number {i} in a very long paragraph."
for i in range(1, 201)
]
) # ~200 sentences
chunks = engine.chunk(large_content, "test-conv")
total_tokens = sum(c.tokens for c in chunks)
print(f"Input: ~{engine.count_tokens(large_content)} tokens")
@ -537,7 +583,7 @@ C is a longer paragraph with more content that should stand on its own."""
for i, c in enumerate(chunks, 1):
status = "[OK]" if 100 <= c.tokens <= 800 else "[FAIL]"
print(f" {status} Chunk {i}: {c.tokens} tokens")
# Test 5: Token counting comparison
print("\n[Test 5] Token counting")
test_text = "This is a test sentence with exactly twelve tokens."
@ -545,38 +591,38 @@ C is a longer paragraph with more content that should stand on its own."""
print(f" Text: '{test_text}'")
print(f" Estimated tokens: {estimated}")
print(f" Tiktoken available: {TIKTOKEN_AVAILABLE}")
# Test 6: Integration with ChunkStore
print("\n[Test 6] Integration with ChunkStore")
try:
from .memory_store import ChunkStore
store = ChunkStore("brain/memory")
test_content = """First fact: Python is a programming language.
Second decision: We chose to implement async support.
Third preference: I prefer using type hints."""
created = chunk_and_store(
content=test_content,
conversation_id="integration-test",
store=store,
tags=["test", "integration"]
tags=["test", "integration"],
)
print(f" Created {len(created)} chunks:")
for c in created:
print(f" - {c.id}: {c.type}, {c.tokens} tokens")
# Cleanup - archive the test chunks
for c in created:
store.delete_chunk(c.id, permanent=False)
print(" ✓ Test chunks archived")
except Exception as e:
print(f" [SKIP] Integration test skipped: {e}")
print("\n" + "=" * 60)
print("All tests completed!")
print("=" * 60)