// chonkier 0.0.2 - Docs.rs

/*
    RecursiveChunker chunks the text recursively based on the specified
    user defined rules. Since it is extensible, it can be used to chunk
    text on a variety of different conditions quite simply.
*/
use crate::tokenizer::{CharacterTokenizer, Tokenizer};
use crate::types::{RecursiveChunk, RecursiveLevel, RecursiveRules};
use lru::LruCache;
use std::num::NonZeroUsize;
use std::sync::RwLock;
use rayon::prelude::*;

/// Splits text into chunks by applying a hierarchy of user-defined rules,
/// moving on to the next rule level whenever a piece is still larger than
/// `chunk_size` tokens.
pub struct RecursiveChunker<T: Tokenizer> {
    // The tokenizer used to count the number of tokens in the text.
    pub tokenizer: T,

    // The maximum size of a chunk, measured in tokens.
    pub chunk_size: usize,

    // The rules used to chunk the text.
    pub rules: RecursiveRules,

    // The separator inserted internally to mark the points where the text is split.
    seperator: String,
    // The approximate number of characters per token; used to cheaply
    // estimate whether a text is over `chunk_size` before tokenizing it.
    chars_per_token: f32,
    // Cache of already computed token counts, keyed by the text itself.
    token_count_cache: RwLock<LruCache<String, usize>>,
}

impl<T: Tokenizer> RecursiveChunker<T> {
    /// Builds a `RecursiveChunker` from a tokenizer, a maximum chunk size
    /// (in tokens) and the set of rules to split with.
    ///
    /// The internal separator, the characters-per-token estimate and the
    /// 4096-entry token count cache are initialised to fixed defaults.
    pub fn new(tokenizer: T, chunk_size: usize, rules: RecursiveRules) -> Self {
        // TODO: Figure out a good separator to use
        let seperator = String::from("🦛");
        // Mean of GPT2 tokenizer vocab
        let chars_per_token: f32 = 6.38;
        let cache_capacity = NonZeroUsize::new(4096).unwrap();
        let token_count_cache = RwLock::new(LruCache::new(cache_capacity));

        Self {
            tokenizer,
            chunk_size,
            rules,
            seperator,
            chars_per_token,
            token_count_cache,
        }
    }

    /// Splits `text` into pieces according to a single rule level.
    ///
    /// The strategy is chosen from `level` in this order:
    ///
    /// 1. If the level has delimiters, the text is cut right *after* every
    ///    delimiter occurrence, so each delimiter stays attached to the piece
    ///    that precedes it.
    /// 2. Otherwise, if the level's `whitespace` flag is set, the text is split
    ///    on whitespace (the whitespace itself is discarded).
    /// 3. Otherwise the text is encoded with the tokenizer, cut into windows of
    ///    at most `chunk_size` tokens, and each window is decoded back to text.
    ///
    /// # Arguments
    ///
    /// * `text` - The text to be split into chunks.
    /// * `level` - The rule level to split with.
    ///
    /// # Returns
    ///
    /// A vector of strings, where each string is a piece of the original text.
    /// Empty pieces (e.g. produced by a trailing delimiter) are never returned.
    ///
    /// NOTE: delimiter splitting works by inserting an internal separator
    /// character into the text; input that already contains that separator is
    /// also split there and loses the separator character.
    pub fn split_text(&self, text: &str, level: &RecursiveLevel) -> Vec<String> {
        if !level.delimiters.is_empty() {
            // Mark the end of every delimiter occurrence with the separator...
            let mut marked_text = text.to_owned();
            for delim in level.delimiters.iter() {
                marked_text =
                    marked_text.replace(delim, &format!("{}{}", delim, &self.seperator));
            }

            // ...then cut at the separators. A trailing delimiter or two
            // adjacent delimiters leave an empty piece behind; drop those so
            // they cannot turn into empty chunks downstream.
            marked_text
                .split(&self.seperator)
                .filter(|piece| !piece.is_empty())
                .map(|piece| piece.to_string())
                .collect()
        } else if level.whitespace {
            text.split_whitespace()
                .map(|word| word.to_string())
                .collect()
        } else {
            // The last option is to split via the tokenizer.
            let tokens: Vec<usize> = self.tokenizer.encode(text);

            // A window of zero tokens would never make progress, so use at
            // least one token per window.
            let window_size = self.chunk_size.max(1);

            tokens
                .chunks(window_size)
                .map(|window| self.tokenizer.decode(&window.to_vec()))
                .collect()
        }
    }

    // Builds the running totals of `token_counts`, starting with a leading 0,
    // so the result has one more entry than the input. `extra` is added on top
    // of every individual count (e.g. one token for a joining whitespace).
    fn get_cumulative_token_counts(&self, token_counts: &Vec<usize>, extra: usize) -> Vec<u64> {
        let per_split_overhead = extra as u64;

        let mut running_totals: Vec<u64> = Vec::with_capacity(token_counts.len() + 1);
        running_totals.push(0);
        running_totals.extend(token_counts.iter().scan(0u64, |total, &count| {
            *total += (count as u64) + per_split_overhead;
            Some(*total)
        }));

        running_totals
    }

    /// Greedily merges neighbouring splits so that each merged piece gets as
    /// close to `chunk_size` tokens as possible without exceeding it.
    ///
    /// # Arguments
    ///
    /// * `splits` - The pieces of text to merge, in order.
    /// * `token_counts` - The token count of each entry in `splits`; expected
    ///   to have the same length as `splits`.
    /// * `combine_with_whitespace` - When `true`, merged pieces are joined with
    ///   a single space and one extra token per split is budgeted for it;
    ///   otherwise pieces are concatenated directly.
    ///
    /// # Returns
    ///
    /// The merged pieces together with their token counts. The inputs are
    /// returned untouched when either is empty or when every split is already
    /// larger than `chunk_size`. A single split that is larger than
    /// `chunk_size` is never merged and is emitted on its own.
    ///
    /// # Panics
    ///
    /// May panic with an out-of-bounds index if `splits` and `token_counts`
    /// have different lengths.
    pub fn merge_splits(
        &self,
        splits: Vec<String>,
        token_counts: Vec<usize>,
        combine_with_whitespace: bool,
    ) -> (Vec<String>, Vec<usize>) {
        // Nothing to merge.
        if splits.is_empty() || token_counts.is_empty() {
            return (splits, token_counts);
        }
        // If every split is already over the limit, no merge is possible.
        if token_counts.iter().all(|&count| count > self.chunk_size) {
            return (splits, token_counts);
        }

        // Running totals of the token counts; budget one extra token per split
        // for the joining whitespace when requested.
        let whitespace_overhead = usize::from(combine_with_whitespace);
        let cumcounts: Vec<u64> =
            self.get_cumulative_token_counts(&token_counts, whitespace_overhead);
        let joiner = if combine_with_whitespace { " " } else { "" };
        let budget = self.chunk_size as u64;

        let mut merged_splits: Vec<String> = Vec::new();
        let mut merged_token_counts: Vec<usize> = Vec::new();

        // Looping on `current_index` (rather than on the next boundary)
        // guarantees that every split is emitted, including the case of a
        // single split.
        let mut current_index: usize = 0;
        while current_index < splits.len() {
            let target = cumcounts[current_index] + budget;

            // Last boundary whose running total still fits in the budget.
            // `cumcounts[current_index] <= target`, so the partition point is
            // at least `current_index + 1` and the subtraction cannot underflow.
            // Unlike `binary_search`, this is deterministic when consecutive
            // totals are equal (zero-token splits).
            let last_fitting = cumcounts.partition_point(|&total| total <= target) - 1;

            // Always advance by at least one split so an oversized split is
            // emitted on its own instead of stalling the loop.
            let next_index = last_fitting.max(current_index + 1);

            merged_splits.push(splits[current_index..next_index].join(joiner));
            merged_token_counts
                .push((cumcounts[next_index] - cumcounts[current_index]) as usize);

            current_index = next_index;
        }

        (merged_splits, merged_token_counts)
    }

    // Recursively chunks `text`, starting at rule level `level`.
    //
    // The text is split with the rule at `level`, short neighbouring splits
    // are merged back together, and every merged piece that is still larger
    // than `chunk_size` is chunked again with the next rule level.
    //
    // Returns three parallel vectors: the chunk texts, their token counts and
    // the rule level each chunk was produced at.
    //
    // When there is no rule left at `level`, the text is returned as a single
    // chunk (even if it is still over `chunk_size`) rather than indexing past
    // the end of the rule list.
    fn recursive_chunk(
        &self,
        text: &String,
        level: usize,
    ) -> (Vec<String>, Vec<usize>, Vec<usize>) {
        // Out of rules: stop recursing and hand the text back unchanged,
        // tagged with the last level that was actually applied.
        if level >= self.rules.levels.len() {
            return (
                vec![text.clone()],
                vec![self.count_tokens(text)],
                vec![level.saturating_sub(1)],
            );
        }
        let rule = &self.rules.levels[level];

        // Split with the current rule, then merge the pieces that are too short.
        let splits: Vec<String> = self.split_text(text, rule);
        let token_counts: Vec<usize> = splits.iter().map(|s| self.count_tokens(s)).collect();
        let (splits, token_counts) = self.merge_splits(splits, token_counts, rule.whitespace);

        let mut result_splits: Vec<String> = Vec::with_capacity(splits.len());
        let mut result_token_counts: Vec<usize> = Vec::with_capacity(splits.len());
        let mut result_levels: Vec<usize> = Vec::with_capacity(splits.len());

        for (split, token_count) in splits.into_iter().zip(token_counts) {
            if token_count > self.chunk_size {
                // Still too large: descend to the next rule level.
                let (subsplits, sub_token_counts, sub_levels) =
                    self.recursive_chunk(&split, level + 1);
                result_splits.extend(subsplits);
                result_token_counts.extend(sub_token_counts);
                result_levels.extend(sub_levels);
            } else {
                result_splits.push(split);
                result_token_counts.push(token_count);
                result_levels.push(level);
            }
        }

        (result_splits, result_token_counts, result_levels)
    }

    // Returns the number of tokens in `text`, using the cache when possible.
    //
    // As a shortcut, a text whose byte length divided by `chars_per_token`
    // already exceeds `chunk_size` is not tokenized at all: `chunk_size + 1`
    // is returned, which only signals "too large" and is not an exact count.
    //
    // The cache lock is held only for the lookup and for the insert, never
    // while the tokenizer runs, so concurrent callers (e.g. `chunk_batch`)
    // do not wait on each other's tokenization.
    fn count_tokens(&self, text: &String) -> usize {
        let estimate = (text.len() as f32 / self.chars_per_token) as usize;
        if estimate > self.chunk_size {
            // Is this a good enough estimate?
            return self.chunk_size + 1;
        }

        // `LruCache::get` updates recency and therefore needs the write lock.
        // The guard is a temporary and is released at the end of this statement.
        let cached: Option<usize> = self
            .token_count_cache
            .write()
            .expect("token count cache lock poisoned")
            .get(text)
            .copied();
        if let Some(count) = cached {
            return count;
        }

        // Tokenize and copy the key without holding the lock.
        let count = self.tokenizer.count_tokens(text);
        let key = text.clone();

        // Cache the result for the future. Another thread may have inserted
        // the same key in the meantime; overwriting it with the same value is
        // harmless.
        self.token_count_cache
            .write()
            .expect("token count cache lock poisoned")
            .put(key, count);

        count
    }

    /// Chunks `text` into `RecursiveChunk`s according to the chunker's
    /// `chunk_size` and rules.
    ///
    /// A text that already fits into `chunk_size` tokens is returned as one
    /// chunk at level 0 covering the byte range `0..text.len()`. Otherwise the
    /// text is chunked recursively, starting at rule level 0.
    ///
    /// The start and end index of every chunk are running sums of the chunk
    /// byte lengths, i.e. they assume the chunks are contiguous pieces of the
    /// input.
    pub fn chunk(&self, text: &String) -> Vec<RecursiveChunk> {
        // NOTE: The usize here might cause issues for really long texts
        let total_tokens: usize = self.count_tokens(text);

        // Short enough already: the whole text is the only chunk.
        if total_tokens <= self.chunk_size {
            let whole = RecursiveChunk::new(text.clone(), 0, text.len(), total_tokens, 0);
            return vec![whole];
        }

        // The text is over the limit, so it has to be broken up.
        let (pieces, piece_token_counts, piece_levels) = self.recursive_chunk(text, 0);

        // Walk the pieces in order, handing each one the range that starts
        // where the previous piece ended.
        let mut offset = 0;
        pieces
            .into_iter()
            .zip(piece_token_counts)
            .zip(piece_levels)
            .map(|((piece, tokens), depth)| {
                let begin = offset;
                offset += piece.len();
                RecursiveChunk::new(piece, begin, offset, tokens, depth)
            })
            .collect()
    }

    /// Chunks every text of the batch in parallel (via Rayon) and returns the
    /// chunks of each text in the same order as the input.
    pub fn chunk_batch(&self, texts: &Vec<String>) -> Vec<Vec<RecursiveChunk>> {
        texts
            .par_iter()
            .map(|document| self.chunk(document))
            .collect::<Vec<Vec<RecursiveChunk>>>()
    }

    /// Chunks every text of the batch one after another on the calling thread
    /// and returns the chunks of each text in the same order as the input.
    pub fn chunk_batch_sequential(&self, texts: &Vec<String>) -> Vec<Vec<RecursiveChunk>> {
        let mut batches: Vec<Vec<RecursiveChunk>> = Vec::with_capacity(texts.len());
        for document in texts {
            batches.push(self.chunk(document));
        }
        batches
    }
}

/// The default chunker: a `CharacterTokenizer`, a chunk size of 512 and the
/// default `RecursiveRules`.
impl Default for RecursiveChunker<CharacterTokenizer> {
    fn default() -> Self {
        let tokenizer = CharacterTokenizer::new();
        let rules = RecursiveRules::default();
        Self::new(tokenizer, 512, rules)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // Builds a character-level chunker with a chunk size of 50 and the default rules.
    fn make_chunker() -> RecursiveChunker<CharacterTokenizer> {
        RecursiveChunker::new(CharacterTokenizer::new(), 50, RecursiveRules::default())
    }

    #[test]
    fn test_recursive_chunker_split_text() {
        let chunker = make_chunker();

        // Two identical sentences separated by a paragraph break
        let text: String = String::from(
            "The quick brown fox jumps over the lazy dog.\n\nThe quick brown fox jumps over the lazy dog.",
        );

        // Split the text using the first rule level
        let chunks = chunker.split_text(&text, &chunker.rules.levels[0]);

        // The text is cut once, right after the paragraph break
        assert_eq!(chunks.len(), 2);
        assert_eq!(
            chunks[0],
            "The quick brown fox jumps over the lazy dog.\n\n"
        );
        assert_eq!(chunks[1], "The quick brown fox jumps over the lazy dog.");
    }

    #[test]
    fn test_recursive_chunker_merge_splits() {
        let chunker = make_chunker();

        // Two splits that together (20 + 20 tokens) fit into one chunk of 50
        let splits = vec![
            "The quick brown fox jumps over the lazy dog.\n\n".to_string(),
            "The quick brown fox jumps over the lazy dog.".to_string(),
        ];
        let token_counts = vec![20, 20];

        // Merge the splits without inserting whitespace between them
        let (merged, merged_counts) = chunker.merge_splits(splits, token_counts, false);

        // Both splits end up in a single merged chunk
        assert_eq!(merged.len(), 1);
        assert_eq!(
            merged[0],
            "The quick brown fox jumps over the lazy dog.\n\nThe quick brown fox jumps over the lazy dog."
        );
        assert_eq!(merged_counts, vec![40]);
    }
}