use codex_memory::chunking::{SemanticChunker, ChunkingStrategy, BoundaryType};
use std::collections::HashMap;
/// Sentence strategy: every non-fallback chunk must stop on sentence-ending
/// punctuation (optionally followed by one space), and chunk indices must
/// count up from zero in order.
#[test]
fn test_sentence_chunking_preserves_complete_sentences() {
    // Accepted chunk endings: terminator alone or terminator plus a trailing space.
    const SENTENCE_ENDINGS: [&str; 6] = [". ", ".", "! ", "!", "? ", "?"];

    let chunker = SemanticChunker::new(ChunkingStrategy::Sentence, 100, 20);
    let text = "First complete sentence. Second sentence follows immediately. Third sentence with more details here. Fourth and final sentence.";
    let pieces = chunker.chunk(text).unwrap();

    // The input is longer than the configured size of 100, so one chunk is not enough.
    assert!(pieces.len() > 1, "Should create multiple chunks for long content");

    // Fallback chunks are exempt from the sentence-ending requirement.
    let sentence_bounded = pieces
        .iter()
        .filter(|piece| piece.boundary_type != BoundaryType::Fallback);
    for piece in sentence_bounded {
        let closes_sentence = SENTENCE_ENDINGS
            .iter()
            .any(|ending| piece.content.ends_with(*ending));
        assert!(
            closes_sentence,
            "Sentence chunk should end at sentence boundary: '{}'",
            piece.content
        );
    }

    // Each chunk's stored index has to equal its position in the result.
    pieces.iter().enumerate().for_each(|(position, piece)| {
        assert_eq!(piece.chunk_index, position, "Chunk indices should be sequential");
    });
}
/// Paragraph strategy: a four-paragraph document must be split into at least
/// two chunks, and a paragraph-bounded chunk may contain at most one
/// blank-line separator (`"\n\n"`).
#[test]
fn test_paragraph_chunking_preserves_paragraph_structure() {
    let chunker = SemanticChunker::new(ChunkingStrategy::Paragraph, 80, 15);
    let document = "First paragraph with multiple sentences. This continues in the same paragraph.\n\nSecond paragraph starts here with new content. It has several sentences too.\n\nThird paragraph is shorter.\n\nFourth paragraph concludes the document with final thoughts.";
    let pieces = chunker.chunk(document).unwrap();

    assert!(pieces.len() >= 2, "Should create multiple chunks for multi-paragraph content");

    // Only chunks tagged with a paragraph boundary are inspected.
    let paragraph_bounded = pieces
        .iter()
        .filter(|piece| piece.boundary_type == BoundaryType::Paragraph);
    for piece in paragraph_bounded {
        // Count the blank-line separators contained in this chunk.
        let separators = piece.content.matches("\n\n").count();
        assert!(
            separators <= 1,
            "Paragraph chunk should not split paragraphs: '{}'",
            piece.content
        );
    }
}
/// Hybrid strategy: long content must be split into several chunks, every
/// chunk but the last must stay within the size budget, and chunks ending on
/// semantic boundaries (sentence, paragraph, topic) must be at least as
/// numerous as fallback chunks.
#[test]
fn test_hybrid_chunking_balances_size_and_semantics() {
    let chunker = SemanticChunker::new(ChunkingStrategy::Hybrid, 150, 20);
    let content = "This is a test document with multiple sentences. Each sentence adds important context. Some sentences are longer than others, providing more detailed information. The hybrid approach should balance size constraints with semantic boundaries.\n\nThis is a new paragraph that should be considered as a separate semantic unit. It contains additional sentences that extend the content length. The chunker should prefer semantic boundaries when possible, but fall back to size limits when necessary.";
    let chunks = chunker.chunk(content).unwrap();

    // Should create multiple chunks
    assert!(chunks.len() > 1, "Should create multiple chunks for long content");

    // Every chunk except the final one must respect the size budget.
    // `len() - 1` cannot underflow: more than one chunk was asserted above.
    let last_index = chunks.len() - 1;
    for chunk in &chunks {
        if chunk.chunk_index < last_index {
            assert!(
                chunk.content.len() <= 170, // Allow small buffer over the 150 limit
                "Chunk should respect size limits: {} chars",
                chunk.content.len()
            );
        }
    }

    // Tally how many chunks ended on each boundary type.
    let mut boundary_types: HashMap<BoundaryType, usize> = HashMap::new();
    for chunk in &chunks {
        *boundary_types.entry(chunk.boundary_type.clone()).or_insert(0) += 1;
    }

    // Copy the counts out of the map so both sides of the comparison below
    // are plain `usize` values (comparing `usize` with `&usize` does not compile).
    let count_of = |kind: BoundaryType| boundary_types.get(&kind).copied().unwrap_or(0);
    let semantic_count = count_of(BoundaryType::Sentence)
        + count_of(BoundaryType::Paragraph)
        + count_of(BoundaryType::Topic);
    let fallback_count = count_of(BoundaryType::Fallback);

    // Should prefer semantic boundaries over fallbacks
    assert!(
        semantic_count >= fallback_count,
        "Hybrid should prefer semantic boundaries: {} semantic vs {} fallback",
        semantic_count, fallback_count
    );
}
/// Semantic strategy: a multi-paragraph document must yield at least two
/// chunks, at least one of them tagged with a paragraph or topic boundary,
/// no chunk may be blank, and every non-fallback chunk must end on a period
/// or a newline.
#[test]
fn test_semantic_chunking_combines_multiple_boundary_types() {
    // Endings accepted as a "natural" stopping point for a non-fallback chunk.
    const NATURAL_ENDINGS: [&str; 4] = [". ", ".", "\n\n", "\n"];

    let chunker = SemanticChunker::new(ChunkingStrategy::Semantic, 200, 25);
    let document = "Introduction paragraph sets the context. This paragraph introduces the main topic.\n\nFirst main section begins here. It has multiple sentences with detailed explanations. Each sentence builds upon the previous one.\n\nSecond main section covers different aspects. The content flows logically from one idea to the next.\n\nConclusion paragraph wraps up all points. This provides a comprehensive summary.";
    let pieces = chunker.chunk(document).unwrap();

    assert!(pieces.len() >= 2, "Should create multiple semantic chunks");

    // At least one chunk has to carry a paragraph or topic boundary tag.
    let found_semantic_boundary = pieces.iter().any(|piece| {
        [BoundaryType::Paragraph, BoundaryType::Topic].contains(&piece.boundary_type)
    });
    assert!(
        found_semantic_boundary,
        "Should detect semantic boundaries (paragraph or topic)"
    );

    for piece in &pieces {
        assert!(
            !piece.content.trim().is_empty(),
            "Chunks should not be empty"
        );

        // Fallback chunks are allowed to stop anywhere.
        if piece.boundary_type == BoundaryType::Fallback {
            continue;
        }

        let ends_properly = NATURAL_ENDINGS
            .iter()
            .any(|ending| piece.content.ends_with(*ending));
        // On failure the message shows only the last (up to) 20 bytes of the chunk.
        assert!(
            ends_properly,
            "Semantic chunk should end at natural boundary: '{}'",
            &piece.content[piece.content.len().saturating_sub(20)..]
        );
    }
}
/// Content far below the configured size of 1000 must come back as exactly
/// one chunk that spans the whole input, is tagged `Section`, and has index 0.
#[test]
fn test_small_content_creates_single_chunk() {
    let chunker = SemanticChunker::new(ChunkingStrategy::Hybrid, 1000, 100);
    let text = "This is a short document. It has only two sentences.";
    let pieces = chunker.chunk(text).unwrap();

    assert_eq!(pieces.len(), 1, "Small content should create single chunk");

    // Exactly one chunk exists at this point, so inspect it directly.
    let only = &pieces[0];
    assert_eq!(only.content, text, "Single chunk should contain all content");
    assert_eq!(only.start_char, 0, "Single chunk should start at 0");
    assert_eq!(only.end_char, text.len(), "Single chunk should end at content length");
    assert_eq!(
        only.boundary_type,
        BoundaryType::Section,
        "Small content should be marked as section"
    );
    assert_eq!(only.chunk_index, 0, "Single chunk should have index 0");
}
/// With a non-zero overlap setting, each chunk must end after the position
/// where the following chunk starts. The check is skipped when fewer than
/// two chunks are produced.
#[test]
fn test_chunk_overlap_preserves_context() {
    let chunker = SemanticChunker::new(ChunkingStrategy::Sentence, 80, 20);
    let text = "First sentence establishes context. Second sentence builds on first. Third sentence adds more detail. Fourth sentence concludes the thought. Fifth sentence starts new topic.";
    let pieces = chunker.chunk(text).unwrap();

    // Pair every chunk with its successor; zero or one chunk yields no pairs.
    let adjacent_pairs = pieces.iter().zip(pieces.iter().skip(1));
    for (position, (current, following)) in adjacent_pairs.enumerate() {
        // Overlap means the earlier chunk extends past the start of the later one.
        assert!(
            current.end_char > following.start_char,
            "Chunks should overlap: chunk {} ends at {}, chunk {} starts at {}",
            position,
            current.end_char,
            position + 1,
            following.start_char
        );
    }
}
/// Sanity-checks boundary tags produced by the hybrid strategy: chunks tagged
/// `Sentence` must contain a period, and chunks tagged `Paragraph` must
/// contain a newline or end with a period.
#[test]
fn test_boundary_type_classification_accuracy() {
    let chunker = SemanticChunker::new(ChunkingStrategy::Hybrid, 200, 30);

    // Input made only of short sentences.
    let sentence_text = "First sentence. Second sentence. Third sentence.";
    let sentence_pieces = chunker.chunk(sentence_text).unwrap();

    // Input made of paragraphs separated by blank lines.
    let paragraph_text = "First paragraph here.\n\nSecond paragraph starts.\n\nThird paragraph follows.";
    let paragraph_pieces = chunker.chunk(paragraph_text).unwrap();

    // Chunks carrying any other tag are not checked.
    let tagged_sentence = sentence_pieces
        .iter()
        .filter(|piece| piece.boundary_type == BoundaryType::Sentence);
    for piece in tagged_sentence {
        assert!(
            piece.content.contains('.'),
            "Sentence boundary chunk should contain sentence endings"
        );
    }

    let tagged_paragraph = paragraph_pieces
        .iter()
        .filter(|piece| piece.boundary_type == BoundaryType::Paragraph);
    for piece in tagged_paragraph {
        let looks_like_paragraph = piece.content.contains('\n') || piece.content.ends_with('.');
        assert!(
            looks_like_paragraph,
            "Paragraph boundary chunk should contain newlines or proper endings"
        );
    }
}
/// Runs the sentence, paragraph and hybrid strategies over the same document
/// and checks that (a) at least two of them disagree on chunk spans and
/// (b) for each strategy, the concatenated chunk text still contains the
/// first half of the whitespace-normalised original.
#[test]
fn test_chunking_strategies_produce_different_results() {
    let content = "First paragraph with multiple sentences. This continues the paragraph with more content.\n\nSecond paragraph begins here. It also has several sentences that provide context.\n\nThird paragraph is the final one. It concludes the document with summary.";
    let sentence_chunker = SemanticChunker::new(ChunkingStrategy::Sentence, 100, 20);
    let paragraph_chunker = SemanticChunker::new(ChunkingStrategy::Paragraph, 100, 20);
    let hybrid_chunker = SemanticChunker::new(ChunkingStrategy::Hybrid, 100, 20);
    let sentence_chunks = sentence_chunker.chunk(content).unwrap();
    let paragraph_chunks = paragraph_chunker.chunk(content).unwrap();
    let hybrid_chunks = hybrid_chunker.chunk(content).unwrap();

    // Reduce each result to its list of (start, end) spans for comparison.
    let sentence_boundaries: Vec<_> = sentence_chunks.iter().map(|c| (c.start_char, c.end_char)).collect();
    let paragraph_boundaries: Vec<_> = paragraph_chunks.iter().map(|c| (c.start_char, c.end_char)).collect();
    let hybrid_boundaries: Vec<_> = hybrid_chunks.iter().map(|c| (c.start_char, c.end_char)).collect();

    // At least one pair of strategies should produce different boundaries
    assert!(
        sentence_boundaries != paragraph_boundaries ||
        sentence_boundaries != hybrid_boundaries ||
        paragraph_boundaries != hybrid_boundaries,
        "Different chunking strategies should produce different results"
    );

    // Collapse every run of whitespace (including "\n\n") into one space.
    // `split_whitespace` already splits on all whitespace, so no prior
    // character replacement is needed.
    let normalize = |text: &str| text.split_whitespace().collect::<Vec<_>>().join(" ");

    // The expected text does not depend on the strategy, so compute it once.
    // The fixture is ASCII, which keeps the byte-index midpoint on a char boundary.
    let original_normalized = normalize(content);
    let expected_prefix = &original_normalized[..original_normalized.len() / 2];

    // All strategies should preserve content integrity
    for chunks in [&sentence_chunks, &paragraph_chunks, &hybrid_chunks] {
        // Concatenate chunk text in order. `push_str` avoids `join` on a
        // vector of references, whose elements do not satisfy `Borrow<str>`.
        let mut joined = String::new();
        for chunk in chunks.iter() {
            joined.push_str(&chunk.content);
        }
        let reconstructed = normalize(joined.as_str());

        // Content should be preserved (allowing for overlap differences).
        // NOTE(review): if adjacent chunks repeat overlapping text, the joined
        // string duplicates it at each seam and this containment check may not
        // hold — confirm against the chunker's overlap behaviour.
        assert!(
            reconstructed.contains(expected_prefix),
            "Chunking should preserve original content structure"
        );
    }
}
/// Empty input must produce exactly one chunk whose content is empty and
/// whose start and end positions are both 0.
#[test]
fn test_edge_case_empty_content() {
    let chunker = SemanticChunker::new(ChunkingStrategy::Hybrid, 100, 20);
    let pieces = chunker.chunk("").unwrap();

    assert_eq!(pieces.len(), 1, "Empty content should create single empty chunk");

    // Exactly one chunk exists at this point, so inspect it directly.
    let only = &pieces[0];
    assert_eq!(only.content, "", "Empty chunk should have empty content");
    assert_eq!(only.start_char, 0, "Empty chunk should start at 0");
    assert_eq!(only.end_char, 0, "Empty chunk should end at 0");
}
/// A single sentence much longer than the configured size of 50 must still be
/// split into several chunks, and at least one of them must be tagged
/// `Fallback`.
#[test]
fn test_edge_case_single_very_long_sentence() {
    let chunker = SemanticChunker::new(ChunkingStrategy::Sentence, 50, 10);
    // One sentence: the only terminating punctuation is the final period.
    let sentence = "This is an extremely long sentence that exceeds the chunk size limit significantly and should be handled gracefully by the chunker even though it cannot be split at natural sentence boundaries because it is just one very long continuous sentence without any proper ending punctuation until the very end.";
    let pieces = chunker.chunk(sentence).unwrap();

    assert!(pieces.len() > 1, "Very long sentence should be chunked");

    // Look for at least one chunk that was cut by the fallback path.
    let used_fallback = pieces
        .iter()
        .map(|piece| &piece.boundary_type)
        .any(|kind| *kind == BoundaryType::Fallback);
    assert!(used_fallback, "Should use fallback chunking for oversized sentences");
}
/// Cross-checks the metadata of every chunk: the stored index equals the
/// chunk's position, the content length equals `end_char - start_char`, and
/// (when the span lies inside the source) the content equals the source slice
/// at that span.
#[test]
fn test_chunk_metadata_consistency() {
    let chunker = SemanticChunker::new(ChunkingStrategy::Hybrid, 120, 25);
    let source = "First section with important content. This section has multiple sentences.\n\nSecond section begins here. It provides additional context and information.\n\nThird section concludes everything. The final thoughts are presented clearly.";
    let pieces = chunker.chunk(source).unwrap();

    for (position, piece) in pieces.iter().enumerate() {
        // Stored index must agree with the position in the result.
        assert_eq!(piece.chunk_index, position, "Chunk index should match position");

        // Byte length of the content must agree with the recorded span.
        assert_eq!(
            piece.content.len(),
            piece.end_char - piece.start_char,
            "Content length should match char bounds"
        );

        // Compare against the source only when the span lies inside it.
        let span_in_source = piece.start_char < source.len() && piece.end_char <= source.len();
        if span_in_source {
            assert_eq!(
                piece.content,
                &source[piece.start_char..piece.end_char],
                "Chunk content should match source slice"
            );
        }

        // No runtime effect: this exhaustive match stops compiling if a
        // variant is added to `BoundaryType` without being listed here.
        match piece.boundary_type {
            BoundaryType::Sentence
            | BoundaryType::Paragraph
            | BoundaryType::Section
            | BoundaryType::Topic
            | BoundaryType::Fallback => {}
        }
    }
}
/// Times one hybrid chunking pass over a multi-kilobyte document and checks
/// that it finishes within 100 ms, yields between 2 and 19 chunks, and that
/// every chunk is non-blank with `start_char <= end_char`.
#[test]
fn test_performance_with_large_content() {
    // Upper bound on the wall-clock time of a single `chunk` call.
    const BUDGET_MS: u128 = 100;

    let chunker = SemanticChunker::new(ChunkingStrategy::Hybrid, 1000, 100);

    // Three identical paragraphs of 20 repeated 74-byte sentences, separated
    // by blank lines: 4,444 bytes in total.
    let paragraph = "This is a test paragraph with multiple sentences for performance testing. ".repeat(20);
    let document = [paragraph.as_str(); 3].join("\n\n");

    // Only the chunking call itself is timed.
    let timer = std::time::Instant::now();
    let pieces = chunker.chunk(&document).unwrap();
    let elapsed = timer.elapsed();

    assert!(
        elapsed.as_millis() < BUDGET_MS,
        "Chunking should be fast: took {:?} for {} bytes", elapsed, document.len()
    );

    // More than one chunk, fewer than twenty.
    assert!(
        (2..20).contains(&pieces.len()),
        "Should produce reasonable number of chunks: {}", pieces.len()
    );

    for piece in &pieces {
        assert!(!piece.content.trim().is_empty(), "Chunks should not be empty");
        assert!(piece.start_char <= piece.end_char, "Chunk bounds should be valid");
    }
}