chonkier 0.0.2 - Docs.rs

/*
    Sentence Chunker chunks the text based on the sentence boundaries,
    which makes it useful for chunking text without sub-token splits.
*/

use std::cmp::{max, min};

use rayon::prelude::*;
use crate::tokenizer::CharacterTokenizer;
use crate::tokenizer::Tokenizer;
use crate::types::{Sentence, SentenceChunk};

/// Determines how delimiters are included in the chunked sentences
pub enum DelimiterHandling {
    /// Include delimiters with the previous sentence
    Previous,
    /// Include delimiters with the next sentence
    Next,
    /// Do not include delimiters in either sentence
    None,
}

impl Default for DelimiterHandling {
    fn default() -> Self {
        DelimiterHandling::Previous
    }
}

/// SentenceChunker splits text into chunks based on sentence boundaries and token limits
pub struct SentenceChunker<T: Tokenizer> {
    /// The tokenizer to use for counting tokens
    pub tokenizer: T,
    /// Maximum number of tokens per chunk
    pub chunk_size: usize,
    /// Number of tokens to overlap between chunks
    pub chunk_overlap: usize,
    /// Minimum number of sentences per chunk
    pub min_sentences_per_chunk: usize,
    /// Minimum number of characters per sentence
    pub min_characters_per_sentence: usize,
    /// Whether to use approximate token counting
    pub approximate: bool,
    /// Delimiters to split sentences on
    pub delimiters: Vec<String>,
    /// How to handle delimiters in sentences
    pub include_delim: DelimiterHandling,
    /// Separator used internally for sentence splitting
    separator: String,
    /// Average characters per token (for estimation)
    chars_per_token: f32,
}

impl<T: Tokenizer> SentenceChunker<T> {
    // Constructor for SentenceChunker
    pub fn new<S: Into<String>>(
        tokenizer: T,
        chunk_size: usize,
        chunk_overlap: usize,
        min_sentences_per_chunk: usize,
        min_characters_per_sentence: usize,
        approximate: bool,
        delimiters: Vec<S>,
        include_delim: DelimiterHandling,
    ) -> Result<Self, &'static str> {
        // Convert the delimiters into a vector of strings
        let delimiters = delimiters
            .into_iter()
            .map(|s| s.into())
            .collect::<Vec<String>>();

        // chunk_size is usize, which ensures it can hold a non-negative integer value.
        // chunk_size must be greater than 0.
        if chunk_size == 0 {
            return Err("chunk_size must be greater than 0");
        }
        // chunk_overlap must be less than chunk_size and greater than or equal to 0.
        if chunk_overlap >= chunk_size {
            return Err(
                "chunk_overlap must be less than chunk_size and greater than or equal to 0",
            );
        }
        if min_sentences_per_chunk == 0 {
            return Err("min_sentences_per_chunk must be greater than 0");
        }
        if min_characters_per_sentence == 0 {
            return Err("min_characters_per_sentence must be greater than 0");
        }
        if delimiters.is_empty() {
            return Err("delimiters must not be empty");
        }

        Ok(Self {
            tokenizer,
            chunk_size,
            chunk_overlap,
            min_sentences_per_chunk,
            min_characters_per_sentence,
            approximate,
            delimiters,
            include_delim,
            separator: "🦛".to_string(),
            chars_per_token: 6.0,
        })
    }

    /// Split text into sentences based on delimiters
    fn split_sentences(&self, text: &str) -> Vec<String> {
        let mut t = text.to_string();

        // Replace delimiters with separator based on include_delim setting
        for delim in &self.delimiters {
            match self.include_delim {
                DelimiterHandling::Previous => {
                    t = t.replace(delim, &format!("{}{}", delim, self.separator));
                }
                DelimiterHandling::Next => {
                    t = t.replace(delim, &format!("{}{}", self.separator, delim));
                }
                DelimiterHandling::None => {
                    t = t.replace(delim, &self.separator);
                }
            }
        }

        // Initial split
        let splits: Vec<String> = t
            .split(&self.separator)
            .filter(|s| !s.is_empty())
            .map(|s| s.to_string())
            .collect();

        // Combine short splits with previous sentence
        let mut current = String::new();
        let mut sentences = Vec::new();

        for s in splits {
            // If the split is short, add to current
            if s.len() < self.min_characters_per_sentence {
                current.push_str(&s);
            } else if !current.is_empty() {
                // If we have accumulated text and the new split is long enough
                current.push_str(&s);
                sentences.push(current.clone());
                current.clear();
            } else {
                // If the split is long enough and we don't have accumulated text
                sentences.push(s);
            }

            // If the current sentence is long enough, add it to sentences
            if current.len() >= self.min_characters_per_sentence {
                sentences.push(current.clone());
                current.clear();
            }
        }

        // If there is remaining text, add it to sentences
        if !current.is_empty() {
            sentences.push(current);
        }

        sentences
    }

    /// Estimate token counts based on character length
    fn estimate_token_counts(&self, texts: &[String]) -> Vec<usize> {
        texts
            .iter()
            .map(|text| max(1, (text.len() as f32 / self.chars_per_token) as usize))
            .collect()
    }

    /// Prepare sentences with token counts and positions
    fn prepare_sentences(&self, text: &str) -> Vec<Sentence> {
        // Split text into sentences
        let sentence_texts = self.split_sentences(text);
        if sentence_texts.is_empty() {
            return Vec::new();
        }

        // Calculate positions
        let mut positions = Vec::with_capacity(sentence_texts.len());
        let mut current_pos = 0;

        for sent in &sentence_texts {
            positions.push(current_pos);
            current_pos += sent.len();
        }

        // Calculate token counts
        let token_counts = if self.approximate {
            self.estimate_token_counts(&sentence_texts)
        } else {
            // For accurate counting, we'd encode each sentence
            sentence_texts
                .iter()
                .map(|sent| self.tokenizer.count_tokens(sent))
                .collect()
        };

        // Create sentence objects
        sentence_texts
            .iter()
            .zip(positions.iter())
            .zip(token_counts.iter())
            .map(|((sent, &pos), &count)| Sentence::new(sent, pos, pos + sent.len(), count))
            .collect()
    }

    /// Calculate feedback value to adjust token estimates
    fn get_feedback(&self, estimate: usize, actual: usize) -> f32 {
        let estimate = max(1, estimate) as f32;
        let actual = max(1, actual) as f32;
        (1.0 - ((estimate - actual) / estimate)).max(0.01)
    }

    /// Create a chunk from a list of sentences
    fn create_chunk(&self, sentences: &[Sentence]) -> SentenceChunk {
        let chunk_text: String = sentences
            .iter()
            .map(|sentence| sentence.text.clone())
            .collect();
        let token_count: usize = sentences.iter().map(|s| s.token_count).sum();

        SentenceChunk::new(
            &chunk_text,
            sentences.first().unwrap().start_index,
            sentences.last().unwrap().end_index,
            token_count,
            Some(sentences.to_vec()),
        )
    }

    // Split text into chunks based on sentences and token limits
    pub fn chunk(&self, text: &str) -> Vec<SentenceChunk> {
        if text.trim().is_empty() {
            return Vec::new();
        }

        // Get prepared sentences with token counts
        let sentences = self.prepare_sentences(text);
        if sentences.is_empty() {
            return Vec::new();
        }

        // Pre-calculate cumulative token counts
        let mut token_sums = Vec::with_capacity(sentences.len() + 1);
        token_sums.push(0);
        let mut sum = 0;
        for sentence in &sentences {
            sum += sentence.token_count;
            token_sums.push(sum);
        }

        let mut chunks = Vec::new();
        let mut feedback = 1.0;
        let mut pos = 0;

        while pos < sentences.len() {
            // Apply feedback to token sums
            let adjusted_token_sums: Vec<usize> = token_sums
                .iter()
                .map(|&sum| (sum as f32 * feedback) as usize)
                .collect();

            // Use binary search to find split point
            let target_tokens = adjusted_token_sums[pos] + self.chunk_size;
            let mut split_idx = match adjusted_token_sums.binary_search(&target_tokens) {
                Ok(idx) => idx,
                Err(idx) => idx.saturating_sub(1),
            };

            // Ensure we include at least one sentence beyond pos
            split_idx = max(split_idx, pos + 1);
            split_idx = min(split_idx, sentences.len());

            // Handle minimum sentences requirement
            if split_idx - pos < self.min_sentences_per_chunk {
                if pos + self.min_sentences_per_chunk <= sentences.len() {
                    split_idx = pos + self.min_sentences_per_chunk;
                } else {
                    eprintln!(
                        "Warning: Minimum sentences per chunk as {} could not be met for all chunks. \
                        Last chunk of the text will have only {} sentences. \
                        Consider increasing the chunk_size or decreasing the min_sentences_per_chunk.",
                        self.min_sentences_per_chunk,
                        sentences.len() - pos
                    );
                    split_idx = sentences.len();
                }
            }

            // Get estimated token count
            let estimate = adjusted_token_sums[split_idx] - adjusted_token_sums[pos];

            // Get candidate sentences and verify actual token count
            let chunk_sentences = &sentences[pos..split_idx];
            let chunk_text: String = chunk_sentences.iter().map(|s| s.text.clone()).collect();
            let actual = if self.approximate {
                self.tokenizer.encode(&chunk_text).len()
            } else {
                // We already have accurate counts
                chunk_sentences.iter().map(|s| s.token_count).sum()
            };

            // Update feedback for the next iteration
            feedback = self.get_feedback(estimate, actual);

            // Back off one sentence at a time if we exceeded chunk size
            let mut final_split_idx = split_idx;
            let mut final_chunk_sentences = chunk_sentences.to_vec();
            let mut final_actual = actual;

            while final_actual > self.chunk_size
                && final_chunk_sentences.len() > self.min_sentences_per_chunk
            {
                final_split_idx -= 1;
                final_chunk_sentences = sentences[pos..final_split_idx].to_vec();
                let final_chunk_text: String = final_chunk_sentences
                    .iter()
                    .map(|s| s.text.clone())
                    .collect();
                final_actual = if self.approximate {
                    self.tokenizer.encode(&final_chunk_text).len()
                } else {
                    final_chunk_sentences.iter().map(|s| s.token_count).sum()
                };
            }

            chunks.push(self.create_chunk(&final_chunk_sentences));

            // Calculate next position with overlap
            if self.chunk_overlap > 0 && final_split_idx < sentences.len() {
                // Calculate how many sentences we need for overlap
                let mut overlap_tokens = 0;
                let mut overlap_idx = final_split_idx - 1;

                while overlap_idx > pos && overlap_tokens < self.chunk_overlap {
                    let sent = &sentences[overlap_idx];
                    let next_tokens = overlap_tokens + sent.token_count + 1; // +1 for space
                    if next_tokens > self.chunk_overlap {
                        break;
                    }
                    overlap_tokens = next_tokens;
                    overlap_idx -= 1;
                }

                // Move position to after the overlap
                pos = overlap_idx + 1;
            } else {
                pos = final_split_idx;
            }
        }

        chunks
    }

    /// Chunk multiple texts in a batch
    pub fn chunk_batch(&self, texts: &Vec<String>) -> Vec<Vec<SentenceChunk>> {
        texts.par_iter().map(|text| self.chunk(text)).collect()
    }
}

impl Default for SentenceChunker<CharacterTokenizer> {
    fn default() -> Self {
        let tokenizer = CharacterTokenizer::new();
        Self::new(
            tokenizer,
            512,                         // chunk_size
            0,                           // chunk_overlap
            1,                           // min_sentences_per_chunk
            12,                          // min_characters_per_sentence
            true,                        // approximate
            vec![".", "!", "?", "\n"],   // delimiters
            DelimiterHandling::Previous, // include_delim
        )
        .expect("Default parameters should be valid")
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::tokenizer::CharacterTokenizer;

    #[test]
    fn test_split_sentences() {
        let tokenizer = CharacterTokenizer::new();
        let chunker = SentenceChunker::new(
            tokenizer,
            20,
            0,
            1,
            5,
            true,
            vec![".", "!", "?", "\n"],
            DelimiterHandling::Previous,
        )
        .unwrap(); // TODO: Figure out a better way to handle this than unwrap

        let text: &str = "Hello, world! This is a test. How are you?";
        let sentences = chunker.split_sentences(text);

        assert_eq!(sentences.len(), 3);
        assert_eq!(sentences[0], "Hello, world!");
        assert_eq!(sentences[1], " This is a test.");
        assert_eq!(sentences[2], " How are you?");
    }

    #[test]
    fn test_chunk() {
        let tokenizer = CharacterTokenizer::new();
        let chunker = SentenceChunker::new(
            tokenizer,
            20,
            0,
            1,
            5,
            true,
            vec![".", "!", "?"],
            DelimiterHandling::Previous,
        )
        .unwrap(); // TODO: Figure out a better way to handle this than unwrap

        let text: &str = "Hello, world! This is a test. How are you? I am fine, thank you!";
        let chunks = chunker.chunk(&text.to_string());

        // With a small chunk size, we should get multiple chunks
        assert!(chunks.len() > 1);

        // Verify the chunks cover the entire text
        let combined: String = chunks.iter().map(|c| c.text.clone()).collect();
        assert_eq!(combined, text);
    }

    #[test]
    fn test_min_sentences_per_chunk() {
        let tokenizer = CharacterTokenizer::new();
        let chunker = SentenceChunker::new(
            tokenizer,
            100,
            0,
            2, // Require at least 2 sentences per chunk
            5,
            true,
            vec![".", "!", "?", "\n"],
            DelimiterHandling::Previous,
        )
        .unwrap();

        let text = "Short. Another short. And another. One more.";
        let chunks = chunker.chunk(&text.to_string());

        // With min_sentences_per_chunk=2, we should get at most 2 chunks from 4 sentences
        assert!(chunks.len() <= 2);

        // Each chunk should have at least 2 sentences
        for chunk in &chunks {
            assert!(chunk.sentences.as_ref().unwrap().len() >= 2);
        }
    }
}