// reasonkit-core 0.1.8
//
// The Reasoning Engine — Auditable Reasoning for Production AI | Rust-Native | Turn Prompts into Protocols
//! Chunking strategies for document preprocessing.
//!
//! Provides multiple strategies for splitting documents into chunks:
//! - Fixed-size chunking (simple character count)
//! - Semantic chunking (paragraph, sentence boundaries)
//! - Markdown-aware semantic chunking (respects headings, code blocks)

#[cfg(feature = "memory")]
use reasonkit_mem::{Chunk, EmbeddingIds};
use uuid::Uuid;

/// Chunking strategy configuration.
///
/// Sizes and overlaps are expressed in characters (byte counts for the
/// purposes of slicing — see the chunking functions).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ChunkingStrategy {
    /// Fixed-size chunking with overlap
    FixedSize {
        /// Target chunk size in characters
        target_chars: usize,
        /// Overlap between chunks in characters
        overlap_chars: usize,
    },
    /// Semantic chunking based on paragraph boundaries
    Semantic {
        /// Maximum chunk size in characters
        max_chars: usize,
        /// Overlap between chunks in characters
        overlap_chars: usize,
    },
}

impl Default for ChunkingStrategy {
    fn default() -> Self {
        ChunkingStrategy::FixedSize {
            target_chars: 2000,
            overlap_chars: 200,
        }
    }
}

/// Split text into chunks according to the specified strategy.
///
/// Dispatches to the concrete chunker for the chosen [`ChunkingStrategy`].
#[cfg(feature = "memory")]
pub fn chunk_text(text: &str, strategy: ChunkingStrategy) -> Vec<Chunk> {
    use ChunkingStrategy::*;
    match strategy {
        FixedSize { target_chars, overlap_chars } => {
            chunk_fixed_size(text, target_chars, overlap_chars)
        }
        Semantic { max_chars, overlap_chars } => {
            chunk_semantic(text, max_chars, overlap_chars)
        }
    }
}

/// Fixed-size chunking: split by byte count with overlap.
///
/// Offsets (`start_char`/`end_char`) are byte offsets into `text`.
/// Chunk boundaries are snapped *forward* to the nearest UTF-8 character
/// boundary so slicing can never panic on multi-byte text (a chunk may
/// therefore exceed `target_chars` by up to 3 bytes). The advance step is
/// clamped to at least 1 so the loop always terminates, even when
/// `overlap_chars >= target_chars` or `target_chars == 0`.
#[cfg(feature = "memory")]
fn chunk_fixed_size(text: &str, target_chars: usize, overlap_chars: usize) -> Vec<Chunk> {
    let mut chunks = Vec::new();
    let mut start = 0;
    let mut index = 0;

    while start < text.len() {
        // Tentative end, then snap forward to a valid char boundary.
        let mut end = (start + target_chars).min(text.len());
        while end < text.len() && !text.is_char_boundary(end) {
            end += 1;
        }
        let chunk_text = &text[start..end];

        chunks.push(Chunk {
            id: Uuid::new_v4(),
            text: chunk_text.to_string(),
            index,
            start_char: start,
            end_char: end,
            token_count: None, // token count not calculated
            section: None,
            page: None,
            embedding_ids: EmbeddingIds::default(),
        });

        // Move start forward by (target - overlap), but always at least
        // 1 byte to guarantee progress; then snap to a char boundary.
        start += target_chars.saturating_sub(overlap_chars).max(1);
        while start < text.len() && !text.is_char_boundary(start) {
            start += 1;
        }
        index += 1;
    }

    chunks
}

/// Semantic chunking: split by paragraph boundaries (double newline).
///
/// Paragraphs that fit within `max_chars` become one chunk each; larger
/// paragraphs fall back to fixed-size splitting with overlap. Offsets
/// (`start_char`/`end_char`) are byte offsets into the original `text`.
/// Sub-paragraph boundaries are snapped forward to UTF-8 char boundaries
/// to avoid slice panics, and the advance step is clamped to at least 1
/// so the inner loop terminates even when `overlap_chars >= max_chars`.
#[cfg(feature = "memory")]
fn chunk_semantic(text: &str, max_chars: usize, overlap_chars: usize) -> Vec<Chunk> {
    let mut chunks = Vec::new();
    let mut index = 0;
    let mut char_offset = 0;

    for para in text.split("\n\n") {
        if para.is_empty() {
            // Still account for the skipped "\n\n" separator so later
            // chunks keep correct offsets into the original text.
            char_offset += 2;
            continue;
        }
        // If paragraph fits within max_chars, treat as single chunk
        if para.len() <= max_chars {
            chunks.push(Chunk {
                id: Uuid::new_v4(),
                text: para.to_string(),
                index,
                start_char: char_offset,
                end_char: char_offset + para.len(),
                token_count: None,
                section: None,
                page: None,
                embedding_ids: EmbeddingIds::default(),
            });
            index += 1;
        } else {
            // Paragraph too large, fall back to fixed-size within paragraph
            let mut start = 0;
            while start < para.len() {
                // Tentative end, snapped forward to a char boundary.
                let mut end = (start + max_chars).min(para.len());
                while end < para.len() && !para.is_char_boundary(end) {
                    end += 1;
                }
                let chunk_text = &para[start..end];
                chunks.push(Chunk {
                    id: Uuid::new_v4(),
                    text: chunk_text.to_string(),
                    index,
                    start_char: char_offset + start,
                    end_char: char_offset + end,
                    token_count: None,
                    section: None,
                    page: None,
                    embedding_ids: EmbeddingIds::default(),
                });
                // Advance at least 1 byte to guarantee termination, then
                // snap to a char boundary.
                start += max_chars.saturating_sub(overlap_chars).max(1);
                while start < para.len() && !para.is_char_boundary(start) {
                    start += 1;
                }
                index += 1;
            }
        }
        char_offset += para.len() + 2; // +2 for "\n\n"
    }

    chunks
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Fixed-size chunking produces non-empty output with bounded chunks.
    #[test]
    #[cfg(feature = "memory")]
    fn test_fixed_size_chunking() {
        let sample = "This is a sample document. ".repeat(100);
        let result = chunk_fixed_size(&sample, 100, 20);
        assert!(!result.is_empty());
        assert!(result[0].text.len() <= 100);
    }

    /// Semantic chunking yields one chunk per paragraph.
    #[test]
    #[cfg(feature = "memory")]
    fn test_semantic_chunking() {
        let doc = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph.";
        let result = chunk_semantic(doc, 50, 10);
        assert_eq!(result.len(), 3);
        assert!(result.iter().all(|chunk| chunk.text.contains("paragraph")));
    }
}