/// Splits a four-sentence fixture with `chunk_text_with_overlap(text, 30, 10)` and
/// checks that every pair of neighbouring chunks shares text at the seam.
///
/// A pair passes when the later chunk starts with the last 10 bytes of the earlier
/// one, or the earlier chunk ends with the first 10 bytes of the later one.
#[test]
fn test_chunk_text_with_overlap_basic() {
    // Byte-offset slicing helpers; the fixture below is pure ASCII, so every byte
    // offset is also a character boundary.
    fn head(s: &str, n: usize) -> &str {
        &s[..n.min(s.len())]
    }
    fn tail(s: &str, n: usize) -> &str {
        &s[s.len().saturating_sub(n)..]
    }

    let text = "First sentence. Second sentence. Third sentence. Fourth sentence.";
    let chunks = chunk_text_with_overlap(text, 30, 10);
    assert!(chunks.len() >= 2, "Should produce multiple chunks");

    for pair in chunks.windows(2) {
        let (previous, current) = (&pair[0], &pair[1]);
        let shares_seam =
            current.starts_with(tail(previous, 10)) || previous.ends_with(head(current, 10));
        // The 20-byte excerpts are only rendered when the assertion fails.
        assert!(
            shares_seam,
            "Chunks should have overlap: prev_end='{}...', curr_start='{}...'",
            tail(previous, 20),
            head(current, 20)
        );
    }
}
/// Checks that `chunk_text_recursive(text, 40, 10)` mostly ends chunks on a word or
/// sentence boundary.
///
/// A chunk counts as boundary-aligned when its last non-whitespace character is
/// alphanumeric or a full stop. At least half of all produced chunks must qualify.
///
/// NOTE(review): a chunk cut in the middle of a word also ends in an alphanumeric
/// character, so this is a loose heuristic rather than a strict boundary check.
#[test]
fn test_chunk_text_preserves_word_boundaries() {
    let text =
        "The quick brown fox jumps over the lazy dog. It runs repeatedly until exhausted.";
    let chunks = chunk_text_recursive(text, 40, 10);
    assert!(!chunks.is_empty(), "Should produce chunks");

    // `trim()` removes trailing whitespace, so the last character can never be a
    // space and that case needs no separate check. Whitespace-only chunks have no
    // last character and therefore are not counted.
    let boundary_count = chunks
        .iter()
        .filter_map(|chunk| chunk.trim().chars().last())
        .filter(|last_char| last_char.is_alphanumeric() || *last_char == '.')
        .count();

    // The denominator is the total chunk count, including whitespace-only chunks.
    let boundary_ratio = boundary_count as f64 / chunks.len() as f64;
    assert!(
        boundary_ratio >= 0.5,
        "At least half of chunks should end at word/sentence boundaries: ratio = {:.2}",
        boundary_ratio
    );
}
/// Runs `chunk_text_recursive(text, 60, 10)` on three paragraphs separated by blank
/// lines and checks that no chunk contains more than one `"\n\n"` separator.
#[test]
fn test_recursive_chunker_respects_paragraphs() {
    let text = "First paragraph with some content.\n\nSecond paragraph with different content.\n\nThird paragraph to conclude.";
    let chunks = chunk_text_recursive(text, 60, 10);

    // Pair each chunk with the number of paragraph separators found inside it.
    let break_counts = chunks
        .iter()
        .map(|piece| (piece, piece.matches("\n\n").count()));

    for (piece, paragraph_breaks) in break_counts {
        assert!(
            paragraph_breaks < 2,
            "Chunks should not contain more than one paragraph break: found {} in '{}'",
            paragraph_breaks,
            piece
        );
    }
}
/// Checks that `chunk_text_with_overlap(text, 40, 15)` produces overlapping chunks
/// that still carry the fixture's middle-section content.
///
/// Two things are asserted:
/// 1. At least one pair of neighbouring chunks shares a word across the seam: one of
///    the last five words of the earlier chunk appears among the first five words of
///    the later chunk (both chunks must have at least two words).
/// 2. The concatenation of all chunks contains at least one of the marker words
///    `target`, `keyword` or `Middle`.
#[test]
fn test_overlap_for_rag_retrieval() {
    let text = "The beginning of the document. Middle section with target keyword here. The end of the document.";
    let chunks = chunk_text_with_overlap(text, 40, 15);
    assert!(chunks.len() >= 2, "Should produce multiple chunks");

    // `any` stops at the first neighbouring pair that shares a word.
    let overlap_found = chunks.windows(2).any(|pair| {
        let prev_words: Vec<&str> = pair[0].split_whitespace().collect();
        let curr_words: Vec<&str> = pair[1].split_whitespace().collect();
        prev_words.len() >= 2
            && curr_words.len() >= 2
            && prev_words
                .iter()
                .rev()
                .take(5)
                .any(|tail_word| curr_words.iter().take(5).any(|head_word| head_word == tail_word))
    });
    // Without this assertion the overlap scan has no effect on the test outcome.
    assert!(
        overlap_found,
        "Adjacent chunks should share at least one word of overlap"
    );

    let combined: String = chunks.join("");
    assert!(
        combined.contains("target")
            || combined.contains("keyword")
            || combined.contains("Middle"),
        "Chunks should collectively contain the original content"
    );
}
/// Chunking the empty string with `chunk_text_with_overlap` is expected to return
/// no chunks at all.
#[test]
fn test_chunk_text_empty_input() {
    let empty_input = "";
    let produced = chunk_text_with_overlap(empty_input, 100, 20);
    assert!(produced.is_empty(), "Empty input should produce no chunks");
}
/// Input shorter than the chunk size (100) is expected to come back as exactly one
/// chunk whose content equals the input.
#[test]
fn test_chunk_text_single_chunk() {
    let input = "Short text.";
    let pieces = chunk_text_with_overlap(input, 100, 20);

    // Check the count first so the index below cannot go out of bounds.
    assert_eq!(pieces.len(), 1, "Small text should produce single chunk");
    assert_eq!(pieces[0], input);
}
/// Checks how `chunk_text_recursive(text, 45, 10)` treats sentence boundaries.
///
/// Every non-empty chunk that does not end with a full stop (after trimming) must
/// either be the final chunk or be shorter than 45 bytes.
#[test]
fn test_recursive_chunker_sentence_boundaries() {
    let text = "First sentence here. Second sentence follows. Third sentence now. Fourth sentence ends.";
    let chunks = chunk_text_recursive(text, 45, 10);

    // Use the enumeration index to detect the final chunk. Looking the chunk up by
    // value would match the first equal chunk, which is wrong when the final chunk
    // duplicates an earlier one.
    for (index, chunk) in chunks.iter().enumerate() {
        let trimmed = chunk.trim();
        if trimmed.is_empty() || trimmed.ends_with('.') {
            continue;
        }
        let is_last = index + 1 == chunks.len();
        assert!(
            is_last || trimmed.len() < 45,
            "Mid-sentence split should be avoided when possible: '{}'",
            trimmed
        );
    }
}
/// Exercises a two-stage pipeline: chunk Rust source with `chunk_code`, then split
/// any chunk whose content exceeds 100 bytes with `chunk_text_with_overlap(.., 80, 20)`.
///
/// Each sub-chunk copies the parent's metadata (path, type, language, line range),
/// is named `<parent>_part<i>`, and gets a checksum computed from its own text.
/// Chunks of 100 bytes or fewer are passed through unchanged. The test asserts that
/// the pipeline produces output and that at least one resulting chunk name starts
/// with `complex_function`.
#[test]
fn test_hybrid_ast_text_chunking() {
    let rust_source = r#"
/// A complex function that does many things.
/// This is a long docstring that explains the function.
fn complex_function() {
let a = 1;
let b = 2;
let c = 3;
// Many lines of code
println!("Line 1");
println!("Line 2");
println!("Line 3");
println!("Line 4");
println!("Line 5");
}
/// Another function with documentation.
fn another_function() {
println!("Hello");
}
"#;
    let ast_chunks = chunk_code(rust_source, Language::Rust)
        .expect("chunk_code should succeed on the Rust fixture");

    let mut final_chunks = Vec::new();
    for chunk in ast_chunks {
        if chunk.content.len() > 100 {
            // Oversized chunk: re-split its text and keep the parent's metadata.
            let text_chunks = chunk_text_with_overlap(&chunk.content, 80, 20);
            final_chunks.extend(text_chunks.iter().enumerate().map(|(i, text)| CodeChunk {
                file_path: chunk.file_path.clone(),
                chunk_type: chunk.chunk_type.clone(),
                chunk_name: format!("{}_part{}", chunk.chunk_name, i),
                language: chunk.language.clone(),
                start_line: chunk.start_line,
                end_line: chunk.end_line,
                content: text.clone(),
                content_checksum: compute_checksum(text),
            }));
        } else {
            final_chunks.push(chunk);
        }
    }
    assert!(!final_chunks.is_empty());

    // The prefix match covers both an unsplit `complex_function` chunk and its
    // `complex_function_part<i>` sub-chunks.
    let has_complex_function = final_chunks
        .iter()
        .any(|c| c.chunk_name.starts_with("complex_function"));
    assert!(
        has_complex_function,
        "Complex function should produce at least one chunk"
    );
}
/// Smoke test for `trueno_rag`'s `RecursiveChunker` used directly.
///
/// Chunks a three-paragraph document with `RecursiveChunker::new(50, 10)` and checks
/// that chunking succeeds, yields at least one chunk, and that every chunk has
/// non-empty content and a start offset strictly below its end offset.
#[test]
fn test_trueno_rag_chunker_integration() {
    use trueno_rag::chunk::{Chunker, RecursiveChunker};
    use trueno_rag::Document;

    let chunker = RecursiveChunker::new(50, 10);
    let doc = Document::new(
        "First paragraph content.\n\nSecond paragraph content.\n\nThird paragraph content.",
    );

    // `expect` includes the error value in the panic output, so a failure shows why
    // chunking failed.
    let chunks = chunker
        .chunk(&doc)
        .expect("trueno-rag RecursiveChunker should work");
    assert!(!chunks.is_empty(), "Should produce chunks");

    for chunk in &chunks {
        assert!(!chunk.content.is_empty(), "Chunk content should not be empty");
        assert!(
            chunk.start_offset < chunk.end_offset,
            "Chunk start offset should precede its end offset"
        );
    }
}