// pmat 3.11.0 - Docs.rs

    // TRUENO-RAG-3-CHUNKER: RecursiveChunker Integration Tests
    // RED Phase: These tests define expected behavior for RAG chunking

    /// Checks that every pair of neighbouring chunks returned by
    /// `chunk_text_with_overlap` shares text at the seam between them.
    #[test]
    fn test_chunk_text_with_overlap_basic() {
        let input = "First sentence. Second sentence. Third sentence. Fourth sentence.";
        let pieces = chunk_text_with_overlap(input, 30, 10);

        assert!(pieces.len() >= 2, "Should produce multiple chunks");

        // Walk the chunks pairwise: (0, 1), (1, 2), ...
        for pair in pieces.windows(2) {
            let (earlier, later) = (&pair[0], &pair[1]);

            // Last (up to) 10 bytes of the earlier chunk; the input is ASCII,
            // so byte slicing cannot land inside a character.
            let tail = &earlier[earlier.len().saturating_sub(10)..];

            // Accept either direction: the later chunk begins with the earlier
            // chunk's tail, or the earlier chunk ends with the later chunk's head.
            let seam_shared =
                later.starts_with(tail) || earlier.ends_with(&later[..10.min(later.len())]);

            assert!(
                seam_shared,
                "Chunks should have overlap: prev_end='{}...', curr_start='{}...'",
                &earlier[earlier.len().saturating_sub(20)..],
                &later[..20.min(later.len())]
            );
        }
    }

    /// Test that overlap preserves semantic boundaries when possible
    /// Note: With very small chunk sizes, word boundaries may not always be preserved
    ///
    /// A chunk counts as ending on a "natural boundary" when, after trimming
    /// surrounding whitespace, its last character is alphanumeric or a period.
    /// At least half of all chunks must satisfy this.
    #[test]
    fn test_chunk_text_preserves_word_boundaries() {
        // Use larger chunk size where word boundaries are achievable
        let text =
            "The quick brown fox jumps over the lazy dog. It runs repeatedly until exhausted.";
        let chunks = chunk_text_recursive(text, 40, 10);

        // Verify chunks are produced (this also guards the division below)
        assert!(!chunks.is_empty(), "Should produce chunks");

        // Count chunks whose trimmed text ends at a natural boundary.
        // Whitespace-only chunks yield no last character and are not counted.
        // No check for a trailing space is needed: `trim()` has already removed it.
        let boundary_count = chunks
            .iter()
            .filter_map(|chunk| chunk.trim().chars().next_back())
            .filter(|last_char| last_char.is_alphanumeric() || *last_char == '.')
            .count();

        // At least 50% of chunks should end at natural boundaries
        let boundary_ratio = boundary_count as f64 / chunks.len() as f64;
        assert!(
            boundary_ratio >= 0.5,
            "At least half of chunks should end at word/sentence boundaries: ratio = {:.2}",
            boundary_ratio
        );
    }

    /// Verifies that no chunk produced by `chunk_text_recursive` contains more
    /// than one blank-line paragraph separator (`"\n\n"`).
    #[test]
    fn test_recursive_chunker_respects_paragraphs() {
        let input = "First paragraph with some content.\n\nSecond paragraph with different content.\n\nThird paragraph to conclude.";
        let pieces = chunk_text_recursive(input, 60, 10);

        // Splitting is expected to favour paragraph separators, so a single
        // chunk may include at most one of them.
        pieces.iter().for_each(|piece| {
            let break_count = piece.matches("\n\n").count();
            assert!(
                break_count <= 1,
                "Chunks should not contain more than one paragraph break: found {} in '{}'",
                break_count,
                piece
            );
        });
    }

    /// Test that overlap is applied correctly for RAG retrieval
    ///
    /// Asserts that the text is split into several chunks and that the chunks,
    /// taken together, still contain content from the middle of the document.
    #[test]
    fn test_overlap_for_rag_retrieval() {
        let text = "The beginning of the document. Middle section with target keyword here. The end of the document.";
        let chunks = chunk_text_with_overlap(text, 40, 15);

        // Verify chunks are produced with overlap
        assert!(chunks.len() >= 2, "Should produce multiple chunks");

        // Word-level overlap between consecutive chunks is intentionally NOT
        // asserted: the chunker may not always achieve it with sizes this small.
        // What must hold is that the chunks collectively cover the content.
        let combined: String = chunks.join("");
        assert!(
            combined.contains("target")
                || combined.contains("keyword")
                || combined.contains("Middle"),
            "Chunks should collectively contain the original content"
        );
    }

    /// An empty string must yield an empty chunk list.
    #[test]
    fn test_chunk_text_empty_input() {
        let empty_input = "";
        let produced = chunk_text_with_overlap(empty_input, 100, 20);

        assert!(produced.is_empty(), "Empty input should produce no chunks");
    }

    /// Text shorter than the chunk size must come back as exactly one chunk
    /// holding the unmodified input.
    #[test]
    fn test_chunk_text_single_chunk() {
        let input = "Short text.";
        let pieces = chunk_text_with_overlap(input, 100, 20);

        // Exactly one chunk ...
        assert_eq!(pieces.len(), 1, "Small text should produce single chunk");
        // ... whose content is the input itself.
        assert_eq!(pieces[0], "Short text.");
    }

    /// Test chunking with sentence separators
    ///
    /// Every chunk that does not end with a period (after trimming) must either
    /// be the final chunk or be shorter than the requested chunk size of 45.
    #[test]
    fn test_recursive_chunker_sentence_boundaries() {
        let text = "First sentence here. Second sentence follows. Third sentence now. Fourth sentence ends.";
        let chunks = chunk_text_recursive(text, 45, 10);

        // Chunks should prefer to split at sentence boundaries (periods followed by space)
        for (index, chunk) in chunks.iter().enumerate() {
            let trimmed = chunk.trim();
            if trimmed.is_empty() || trimmed.ends_with('.') {
                continue;
            }

            // Identify the last chunk by position, not by content equality:
            // a content search returns the first match, which misidentifies the
            // final chunk whenever an earlier chunk has the same text.
            let is_last = index + 1 == chunks.len();

            // If not ending with period, should be last chunk or overlap continuation
            assert!(
                is_last || trimmed.len() < 45,
                "Mid-sentence split should be avoided when possible: '{}'",
                trimmed
            );
        }
    }

    /// Test integration with AST chunker - combining semantic + text chunking
    ///
    /// Runs `chunk_code` over a small Rust source, then re-splits every AST
    /// chunk whose content exceeds 100 bytes with `chunk_text_with_overlap`,
    /// naming the pieces `<chunk_name>_part<i>`. Smaller chunks pass through
    /// unchanged.
    #[test]
    fn test_hybrid_ast_text_chunking() {
        let rust_source = r#"
/// A complex function that does many things.
/// This is a long docstring that explains the function.
fn complex_function() {
    let a = 1;
    let b = 2;
    let c = 3;
    // Many lines of code
    println!("Line 1");
    println!("Line 2");
    println!("Line 3");
    println!("Line 4");
    println!("Line 5");
}

/// Another function with documentation.
fn another_function() {
    println!("Hello");
}
"#;

        // First get AST chunks
        let ast_chunks = chunk_code(rust_source, Language::Rust).unwrap();

        // Then apply text chunking with overlap to large chunks
        let mut final_chunks = Vec::new();
        for chunk in ast_chunks {
            if chunk.content.len() > 100 {
                // Large chunk - apply text chunking with overlap
                let text_chunks = chunk_text_with_overlap(&chunk.content, 80, 20);
                // Consume the pieces so each one is moved into its CodeChunk
                // instead of being cloned.
                for (i, text) in text_chunks.into_iter().enumerate() {
                    final_chunks.push(CodeChunk {
                        file_path: chunk.file_path.clone(),
                        chunk_type: chunk.chunk_type.clone(),
                        chunk_name: format!("{}_part{}", chunk.chunk_name, i),
                        language: chunk.language.clone(),
                        // Every part keeps the line span of the chunk it came from.
                        start_line: chunk.start_line,
                        end_line: chunk.end_line,
                        // Checksum is computed before `text` is moved into `content`.
                        content_checksum: compute_checksum(&text),
                        content: text,
                    });
                }
            } else {
                final_chunks.push(chunk);
            }
        }

        assert!(!final_chunks.is_empty());
        // The complex function must be present as at least one chunk, either
        // whole or as one of its `_partN` pieces (both share the name prefix).
        let has_complex_part = final_chunks
            .iter()
            .any(|c| c.chunk_name.starts_with("complex_function"));
        assert!(
            has_complex_part,
            "Complex function should produce at least one chunk"
        );
    }

    /// Test that trueno-rag Chunker trait can be used
    ///
    /// Builds a `RecursiveChunker` with arguments `(50, 10)`, chunks a
    /// three-paragraph document and checks that each resulting chunk has
    /// non-empty content and a strictly increasing offset range.
    #[test]
    fn test_trueno_rag_chunker_integration() {
        use trueno_rag::chunk::{Chunker, RecursiveChunker};
        use trueno_rag::Document;

        let chunker = RecursiveChunker::new(50, 10);
        let doc = Document::new(
            "First paragraph content.\n\nSecond paragraph content.\n\nThird paragraph content.",
        );

        // `expect` reports the underlying error alongside the message if chunking fails.
        let chunks = chunker
            .chunk(&doc)
            .expect("trueno-rag RecursiveChunker should work");
        assert!(!chunks.is_empty(), "Should produce chunks");

        // Verify chunk metadata
        for chunk in &chunks {
            assert!(!chunk.content.is_empty());
            assert!(chunk.start_offset < chunk.end_offset);
        }
    }