chonkier 0.0.2

🦛 Chonkie, now in Rust 🦀: No-nonsense, ultra-fast, ultra-light chunking library
Documentation
/*
    A simple example of recursive chunking with Chonkier's RecursiveChunker.
*/
use chonkier::CharacterTokenizer;
use chonkier::RecursiveChunker;
use chonkier::RecursiveRules;
use chonkier::Tokenizer;

/// Walks through the individual stages of `RecursiveChunker` on a small sample text.
///
/// The stages, in order:
/// 1. split the text with the first (level 0) rule of `RecursiveRules::default()`,
/// 2. count the tokens of every split with the chunker's tokenizer,
/// 3. merge the splits via `merge_splits`,
/// 4. rebuild the grouping by hand with prefix sums and a binary search,
/// 5. run the full `chunk` pipeline for comparison.
///
/// Everything is printed to stdout; nothing is returned.
fn main() {
    let chunker = RecursiveChunker::new(CharacterTokenizer::new(), 100, RecursiveRules::default());

    let text =
        "The quick brown fox jumps over the lazy dog.\n\nThe quick brown fox jumps over the lazy dog.\n\nThe quick brown fox jumps over the lazy dog.\n\nThe quick brown fox jumps over the lazy dog."
            .to_string();

    // Check the split at level 0
    let lvl0 = RecursiveRules::default().levels[0].clone();
    let splits0: Vec<String> = chunker.split_text(&text, &lvl0);
    println!("Splits at level 0: {:?}", splits0);

    // Token counts of the splits at level 0
    let token_counts0: Vec<usize> = splits0
        .iter()
        .map(|s| chunker.tokenizer.encode(s).len())
        .collect();
    println!("Token counts at level 0: {:?}", token_counts0);

    // Check the merges at level 0.
    // The clones keep `splits0` / `token_counts0` available for the experiments below.
    let (merges0, merged_token_counts0) =
        chunker.merge_splits(splits0.clone(), token_counts0.clone(), false);
    println!(
        "Merges at level 0: {:?}\n{:?}",
        merges0, merged_token_counts0[0]
    );
    // NOTE (observed while debugging): this printed 90, which suggests that the merge is not
    // checking whether the splits are valid, and that the binary search might not be behaving
    // as expected. The experiments below reproduce the search by hand to investigate.

    // Prefix sums of the token counts: entry `i` is the total token count of the first `i`
    // splits, so the vector has one more element than `splits0` and starts at 0.
    let mut accumulated_token_counts: Vec<usize> = Vec::with_capacity(token_counts0.len() + 1);
    let mut running_total: usize = 0;
    accumulated_token_counts.push(running_total);
    for &count in &token_counts0 {
        running_total += count;
        accumulated_token_counts.push(running_total);
    }
    println!(
        "Accumulated token counts at level 0: {:?}",
        accumulated_token_counts
    );

    // Then let's do binary search
    let target: usize = 30;
    let index = floor_index(&accumulated_token_counts, target);
    println!(
        "Index of target: {} ({})",
        index, accumulated_token_counts[index]
    );
    println!("Total number of splits: {}", splits0.len());
    // NOTE: `binary_search` returns `Ok(i)` when the target is present at index `i`, and
    // `Err(i)` otherwise, where `i` is the insertion point that keeps the slice sorted,
    // i.e. the index of the first element greater than the target. Stepping back by one
    // from the `Err` index therefore yields the last entry that does not exceed the target;
    // `floor_index` wraps exactly that.

    // Let's do an experiment: greedily group consecutive splits so that each group stays
    // within `target` tokens whenever that is possible.
    let mut current_index = 0;
    // Driving the loop by `current_index` (instead of a pre-seeded `next_index`) also covers
    // the case of a single split, which would otherwise never be printed.
    while current_index < splits0.len() {
        let current_target = accumulated_token_counts[current_index] + target;
        // Always advance by at least one split, even if that single split is over budget,
        // so the loop is guaranteed to terminate.
        let next_index =
            floor_index(&accumulated_token_counts, current_target).max(current_index + 1);

        // Print out everything!
        println!("Current target: {}", current_target);
        println!("Current index: {}", current_index);
        println!("Next index: {}", next_index);
        println!("{:?}", splits0[current_index..next_index].join(""));
        println!("\n");

        current_index = next_index;
    }

    // Now try the final chunking algorithm over the entire RecursiveChunker
    let chunks = chunker.chunk(&text);
    println!("Chunks: {:?}", chunks);
}

/// Returns the index of an entry in the sorted slice `prefix_sums` that is the greatest
/// value not exceeding `target`.
///
/// If `target` occurs more than once, any of the matching indices may be returned (this is
/// the documented behaviour of `binary_search`). If every entry is greater than `target`,
/// or the slice is empty, `0` is returned instead of underflowing.
fn floor_index(prefix_sums: &[usize], target: usize) -> usize {
    match prefix_sums.binary_search(&target) {
        Ok(index) => index,
        // `index` is the insertion point; the entry just before it is the floor.
        Err(index) => index.saturating_sub(1),
    }
}