// codex-memory 3.0.15

use codex_memory::chunking::{SemanticChunker, ChunkingStrategy, BoundaryType};
use std::collections::HashMap;

/// Sentence strategy: every chunk not marked `Fallback` must stop on
/// sentence-ending punctuation, and `chunk_index` must count up from zero.
#[test]
fn test_sentence_chunking_preserves_complete_sentences() {
    // Accepted chunk endings: terminal punctuation, optionally followed by one space.
    const SENTENCE_ENDINGS: [&str; 6] = [". ", ".", "! ", "!", "? ", "?"];

    let chunker = SemanticChunker::new(ChunkingStrategy::Sentence, 100, 20);
    let text = "First complete sentence. Second sentence follows immediately. Third sentence with more details here. Fourth and final sentence.";

    let chunks = chunker.chunk(text).unwrap();

    // The input is longer than the configured chunk size, so it must be split.
    assert!(chunks.len() > 1, "Should create multiple chunks for long content");

    // Fallback chunks are exempt from the sentence-boundary requirement.
    for chunk in chunks
        .iter()
        .filter(|c| c.boundary_type != BoundaryType::Fallback)
    {
        let ends_on_sentence = SENTENCE_ENDINGS
            .iter()
            .any(|&ending| chunk.content.ends_with(ending));
        assert!(
            ends_on_sentence,
            "Sentence chunk should end at sentence boundary: '{}'", chunk.content
        );
    }

    // Indices must mirror each chunk's position in the returned collection.
    for (position, chunk) in chunks.iter().enumerate() {
        assert_eq!(chunk.chunk_index, position, "Chunk indices should be sequential");
    }
}

/// Paragraph strategy: a chunk tagged `Paragraph` may contain at most one
/// blank-line separator (`"\n\n"`).
#[test]
fn test_paragraph_chunking_preserves_paragraph_structure() {
    let chunker = SemanticChunker::new(ChunkingStrategy::Paragraph, 80, 15);
    let text = "First paragraph with multiple sentences. This continues in the same paragraph.\n\nSecond paragraph starts here with new content. It has several sentences too.\n\nThird paragraph is shorter.\n\nFourth paragraph concludes the document with final thoughts.";

    let chunks = chunker.chunk(text).unwrap();

    // Four paragraphs against an 80-unit chunk size must yield several chunks.
    assert!(chunks.len() >= 2, "Should create multiple chunks for multi-paragraph content");

    // Only chunks that claim a paragraph boundary are inspected.
    chunks
        .iter()
        .filter(|c| c.boundary_type == BoundaryType::Paragraph)
        .for_each(|c| {
            let separators = c.content.matches("\n\n").count();
            assert!(
                separators < 2,
                "Paragraph chunk should not split paragraphs: '{}'", c.content
            );
        });
}

/// Hybrid strategy: chunks other than the last must stay within a small
/// buffer of the size limit, and semantic boundaries (sentence, paragraph,
/// topic) must be at least as common as fallback boundaries.
#[test]
fn test_hybrid_chunking_balances_size_and_semantics() {
    let chunker = SemanticChunker::new(ChunkingStrategy::Hybrid, 150, 20);
    let content = "This is a test document with multiple sentences. Each sentence adds important context. Some sentences are longer than others, providing more detailed information. The hybrid approach should balance size constraints with semantic boundaries.\n\nThis is a new paragraph that should be considered as a separate semantic unit. It contains additional sentences that extend the content length. The chunker should prefer semantic boundaries when possible, but fall back to size limits when necessary.";

    let chunks = chunker.chunk(content).unwrap();

    // Should create multiple chunks
    assert!(chunks.len() > 1, "Should create multiple chunks for long content");

    // Every chunk except the final one must respect the limit plus a small buffer.
    // `chunks.len() - 1` cannot underflow: the assertion above guarantees len > 1.
    let last_index = chunks.len() - 1;
    for chunk in chunks.iter().filter(|c| c.chunk_index < last_index) {
        assert!(
            chunk.content.len() <= 170, // Allow small buffer over limit
            "Chunk should respect size limits: {} chars", chunk.content.len()
        );
    }

    // Tally how many chunks ended on each boundary type.
    let mut boundary_types: HashMap<BoundaryType, usize> = HashMap::new();
    for chunk in &chunks {
        *boundary_types.entry(chunk.boundary_type.clone()).or_insert(0) += 1;
    }

    // Return owned `usize` counts: `HashMap::get` yields `Option<&usize>`, and a
    // `usize` total cannot be compared against a `&usize` with `>=`.
    let count_of = |kind: BoundaryType| boundary_types.get(&kind).copied().unwrap_or(0);

    let semantic_count = count_of(BoundaryType::Sentence)
        + count_of(BoundaryType::Paragraph)
        + count_of(BoundaryType::Topic);
    let fallback_count = count_of(BoundaryType::Fallback);

    // Should prefer semantic boundaries over fallbacks
    assert!(
        semantic_count >= fallback_count,
        "Hybrid should prefer semantic boundaries: {} semantic vs {} fallback",
        semantic_count, fallback_count
    );
}

/// Semantic strategy: at least one chunk must be tagged with a paragraph or
/// topic boundary, no chunk may be blank, and non-fallback chunks must end on
/// a period or a newline.
#[test]
fn test_semantic_chunking_combines_multiple_boundary_types() {
    // Endings treated as a "natural" stop for a non-fallback chunk.
    const NATURAL_ENDINGS: [&str; 4] = [". ", ".", "\n\n", "\n"];

    let chunker = SemanticChunker::new(ChunkingStrategy::Semantic, 200, 25);
    let text = "Introduction paragraph sets the context. This paragraph introduces the main topic.\n\nFirst main section begins here. It has multiple sentences with detailed explanations. Each sentence builds upon the previous one.\n\nSecond main section covers different aspects. The content flows logically from one idea to the next.\n\nConclusion paragraph wraps up all points. This provides a comprehensive summary.";

    let chunks = chunker.chunk(text).unwrap();

    assert!(chunks.len() >= 2, "Should create multiple semantic chunks");

    // One paragraph- or topic-tagged chunk is enough to show semantic detection.
    let semantic_kinds = [BoundaryType::Paragraph, BoundaryType::Topic];
    let found_semantic_boundary = chunks
        .iter()
        .any(|c| semantic_kinds.contains(&c.boundary_type));
    assert!(
        found_semantic_boundary,
        "Should detect semantic boundaries (paragraph or topic)"
    );

    for chunk in &chunks {
        assert!(
            !chunk.content.trim().is_empty(),
            "Chunks should not be empty"
        );

        // Fallback chunks are allowed to stop anywhere.
        if chunk.boundary_type == BoundaryType::Fallback {
            continue;
        }

        let ends_properly = NATURAL_ENDINGS
            .iter()
            .any(|&ending| chunk.content.ends_with(ending));
        // On failure only the last 20 bytes of the chunk are shown.
        assert!(
            ends_properly,
            "Semantic chunk should end at natural boundary: '{}'",
            &chunk.content[chunk.content.len().saturating_sub(20)..]
        );
    }
}

/// Content far below the chunk size must come back as one chunk that spans
/// the whole input and is tagged `Section`.
#[test]
fn test_small_content_creates_single_chunk() {
    let content = "This is a short document. It has only two sentences.";
    let chunker = SemanticChunker::new(ChunkingStrategy::Hybrid, 1000, 100);

    let chunks = chunker.chunk(content).unwrap();
    assert_eq!(chunks.len(), 1, "Small content should create single chunk");

    // Safe to index: the length was just asserted to be exactly one.
    let only = &chunks[0];
    assert_eq!(only.content, content, "Single chunk should contain all content");
    assert_eq!(only.start_char, 0, "Single chunk should start at 0");
    assert_eq!(only.end_char, content.len(), "Single chunk should end at content length");
    assert_eq!(only.boundary_type, BoundaryType::Section, "Small content should be marked as section");
    assert_eq!(only.chunk_index, 0, "Single chunk should have index 0");
}

/// With a non-zero overlap setting, each chunk must end after the start of
/// the chunk that follows it.
#[test]
fn test_chunk_overlap_preserves_context() {
    let chunker = SemanticChunker::new(ChunkingStrategy::Sentence, 80, 20);
    let text = "First sentence establishes context. Second sentence builds on first. Third sentence adds more detail. Fourth sentence concludes the thought. Fifth sentence starts new topic.";

    let chunks = chunker.chunk(text).unwrap();

    // Pair each chunk with its successor. With zero or one chunk there are no
    // pairs, so nothing is asserted in that case.
    let adjacent_pairs = chunks.iter().zip(chunks.iter().skip(1));
    for (i, (current_chunk, next_chunk)) in adjacent_pairs.enumerate() {
        assert!(
            current_chunk.end_char > next_chunk.start_char,
            "Chunks should overlap: chunk {} ends at {}, chunk {} starts at {}",
            i, current_chunk.end_char, i + 1, next_chunk.start_char
        );
    }
}

/// Sanity-checks boundary tags: `Sentence` chunks must contain a period, and
/// `Paragraph` chunks must contain a newline or end with a period.
#[test]
fn test_boundary_type_classification_accuracy() {
    let chunker = SemanticChunker::new(ChunkingStrategy::Hybrid, 200, 30);

    // Two inputs through the same chunker: sentence-only, then blank-line separated.
    let sentence_chunks = chunker
        .chunk("First sentence. Second sentence. Third sentence.")
        .unwrap();
    let paragraph_chunks = chunker
        .chunk("First paragraph here.\n\nSecond paragraph starts.\n\nThird paragraph follows.")
        .unwrap();

    for chunk in sentence_chunks
        .iter()
        .filter(|c| c.boundary_type == BoundaryType::Sentence)
    {
        assert!(
            chunk.content.contains('.'),
            "Sentence boundary chunk should contain sentence endings"
        );
    }

    for chunk in paragraph_chunks
        .iter()
        .filter(|c| c.boundary_type == BoundaryType::Paragraph)
    {
        let has_newline = chunk.content.contains('\n');
        let ends_with_period = chunk.content.ends_with('.');
        assert!(
            has_newline || ends_with_period,
            "Paragraph boundary chunk should contain newlines or proper endings"
        );
    }
}

/// Runs the same document through the Sentence, Paragraph and Hybrid
/// strategies and checks that (a) at least two of them disagree on chunk
/// spans and (b) each strategy's concatenated output still contains the
/// first half of the whitespace-normalised original.
#[test]
fn test_chunking_strategies_produce_different_results() {
    let content = "First paragraph with multiple sentences. This continues the paragraph with more content.\n\nSecond paragraph begins here. It also has several sentences that provide context.\n\nThird paragraph is the final one. It concludes the document with summary.";

    let sentence_chunker = SemanticChunker::new(ChunkingStrategy::Sentence, 100, 20);
    let paragraph_chunker = SemanticChunker::new(ChunkingStrategy::Paragraph, 100, 20);
    let hybrid_chunker = SemanticChunker::new(ChunkingStrategy::Hybrid, 100, 20);

    let sentence_chunks = sentence_chunker.chunk(content).unwrap();
    let paragraph_chunks = paragraph_chunker.chunk(content).unwrap();
    let hybrid_chunks = hybrid_chunker.chunk(content).unwrap();

    // Different strategies should produce different chunking results
    let sentence_boundaries: Vec<_> = sentence_chunks.iter().map(|c| (c.start_char, c.end_char)).collect();
    let paragraph_boundaries: Vec<_> = paragraph_chunks.iter().map(|c| (c.start_char, c.end_char)).collect();
    let hybrid_boundaries: Vec<_> = hybrid_chunks.iter().map(|c| (c.start_char, c.end_char)).collect();

    // At least one strategy should produce different boundaries
    assert!(
        sentence_boundaries != paragraph_boundaries ||
        sentence_boundaries != hybrid_boundaries ||
        paragraph_boundaries != hybrid_boundaries,
        "Different chunking strategies should produce different results"
    );

    // Normalise the original once, outside the loop. `split_whitespace`
    // already treats every Unicode whitespace character as a separator, so no
    // separate replace pass is needed.
    let original_normalized = content.split_whitespace().collect::<Vec<_>>().join(" ");
    // The input literal is ASCII, so halving the byte length lands on a char boundary.
    let expected_prefix = &original_normalized[..original_normalized.len() / 2];

    // All strategies should preserve content integrity
    for chunks in [&sentence_chunks, &paragraph_chunks, &hybrid_chunks] {
        // Concatenate with `push_str` rather than `join("")` on collected
        // references: a `Vec` of `&String` does not satisfy the `Borrow<str>`
        // bound that `join` requires.
        let mut concatenated = String::new();
        for chunk in chunks.iter() {
            concatenated.push_str(&chunk.content);
        }
        let reconstructed = concatenated.split_whitespace().collect::<Vec<_>>().join(" ");

        // Content should be preserved (allowing for overlap differences)
        // NOTE(review): if chunks overlap, the concatenation repeats the
        // overlapped text; confirm this containment check tolerates that.
        assert!(
            reconstructed.contains(expected_prefix),
            "Chunking should preserve original content structure"
        );
    }
}

/// An empty input must produce exactly one chunk with empty content and a
/// zero-length span starting at 0.
#[test]
fn test_edge_case_empty_content() {
    let chunker = SemanticChunker::new(ChunkingStrategy::Hybrid, 100, 20);

    let chunks = chunker.chunk("").unwrap();
    assert_eq!(chunks.len(), 1, "Empty content should create single empty chunk");

    // Safe to index: the length was just asserted to be exactly one.
    let only = &chunks[0];
    assert_eq!(only.content, "", "Empty chunk should have empty content");
    assert_eq!(only.start_char, 0, "Empty chunk should start at 0");
    assert_eq!(only.end_char, 0, "Empty chunk should end at 0");
}

/// A single sentence much longer than the chunk size must still be split,
/// with at least one resulting chunk tagged `Fallback`.
#[test]
fn test_edge_case_single_very_long_sentence() {
    let chunker = SemanticChunker::new(ChunkingStrategy::Sentence, 50, 10);
    let long_sentence = "This is an extremely long sentence that exceeds the chunk size limit significantly and should be handled gracefully by the chunker even though it cannot be split at natural sentence boundaries because it is just one very long continuous sentence without any proper ending punctuation until the very end.";

    let chunks = chunker.chunk(long_sentence).unwrap();

    assert!(chunks.len() > 1, "Very long sentence should be chunked");

    // The only sentence terminator is at the very end, so some splits must be fallbacks.
    let fallback_total = chunks
        .iter()
        .filter(|c| c.boundary_type == BoundaryType::Fallback)
        .count();
    assert!(fallback_total > 0, "Should use fallback chunking for oversized sentences");
}

/// Cross-checks each chunk's metadata: index equals position, content length
/// equals `end_char - start_char`, and (when the span lies inside the source)
/// the content equals the matching source slice.
#[test]
fn test_chunk_metadata_consistency() {
    let chunker = SemanticChunker::new(ChunkingStrategy::Hybrid, 120, 25);
    let content = "First section with important content. This section has multiple sentences.\n\nSecond section begins here. It provides additional context and information.\n\nThird section concludes everything. The final thoughts are presented clearly.";

    let chunks = chunker.chunk(content).unwrap();
    let source_len = content.len();

    for (position, chunk) in chunks.iter().enumerate() {
        assert_eq!(chunk.chunk_index, position, "Chunk index should match position");

        // The recorded span must be exactly as long as the stored content.
        let span = chunk.end_char - chunk.start_char;
        assert_eq!(
            chunk.content.len(),
            span,
            "Content length should match char bounds"
        );

        // The slice comparison runs only for spans inside the source; spans
        // outside it are skipped rather than reported.
        let span_in_source = chunk.start_char < source_len && chunk.end_char <= source_len;
        if span_in_source {
            let expected_content = &content[chunk.start_char..chunk.end_char];
            assert_eq!(
                chunk.content, expected_content,
                "Chunk content should match source slice"
            );
        }

        // No runtime effect: the exhaustive match makes the compiler reject
        // this test if a variant is added without being listed here.
        match chunk.boundary_type {
            BoundaryType::Sentence
            | BoundaryType::Paragraph
            | BoundaryType::Section
            | BoundaryType::Topic
            | BoundaryType::Fallback => {}
        }
    }
}

/// Smoke-tests chunking speed and output sanity on a multi-kilobyte input:
/// three paragraphs, each a 74-byte sentence repeated 20 times (4444 bytes).
#[test]
fn test_performance_with_large_content() {
    let chunker = SemanticChunker::new(ChunkingStrategy::Hybrid, 1000, 100);

    // Build the input: the same paragraph three times, separated by blank lines.
    let paragraph = "This is a test paragraph with multiple sentences for performance testing. ".repeat(20);
    let content = format!("{}\n\n{}\n\n{}", paragraph, paragraph, paragraph);

    // Time only the chunking call itself.
    let timer = std::time::Instant::now();
    let chunks = chunker.chunk(&content).unwrap();
    let duration = timer.elapsed();

    // Wall-clock budget: strictly under 100 ms.
    let budget = std::time::Duration::from_millis(100);
    assert!(
        duration < budget,
        "Chunking should be fast: took {:?} for {} bytes", duration, content.len()
    );

    // Between 2 and 19 chunks inclusive.
    assert!(
        (2..20).contains(&chunks.len()),
        "Should produce reasonable number of chunks: {}", chunks.len()
    );

    // Every chunk must carry non-blank content and an ordered span.
    for chunk in &chunks {
        assert!(!chunk.content.trim().is_empty(), "Chunks should not be empty");
        assert!(chunk.start_char <= chunk.end_char, "Chunk bounds should be valid");
    }
}