use chonkier::CharacterTokenizer;
use chonkier::RecursiveChunker;
use chonkier::RecursiveRules;
use chonkier::Tokenizer;
fn main() {
let chunker = RecursiveChunker::new(CharacterTokenizer::new(), 100, RecursiveRules::default());
let text =
"The quick brown fox jumps over the lazy dog.\n\nThe quick brown fox jumps over the lazy dog.\n\nThe quick brown fox jumps over the lazy dog.\n\nThe quick brown fox jumps over the lazy dog."
.to_string();
let lvl0 = RecursiveRules::default().levels[0].clone();
let splits0: Vec<String> = chunker.split_text(&text, &lvl0);
println!("Splits at level 0: {:?}", splits0);
let token_counts0: Vec<usize> = splits0
.iter()
.map(|s| chunker.tokenizer.encode(s).len())
.collect();
println!("Token counts at level 0: {:?}", token_counts0);
let (merges0, merged_token_counts0) =
chunker.merge_splits(splits0.clone(), token_counts0.clone(), false);
println!(
"Merges at level 0: {:?}\n{:?}",
merges0, merged_token_counts0[0]
);
let mut accumulated_token_counts = Vec::new();
let mut current_count = 0;
accumulated_token_counts.push(current_count);
for count in token_counts0.clone() {
current_count += count;
accumulated_token_counts.push(current_count);
}
println!(
"Accumulated token counts at level 0: {:?}",
accumulated_token_counts
);
let target: usize = 30;
let index: usize = match accumulated_token_counts.binary_search(&target) {
Ok(index) => index,
Err(index) => index - 1,
};
println!(
"Index of target: {} ({})",
index, accumulated_token_counts[index]
);
println!("Total number of splits: {}", splits0.clone().len());
let mut current_target;
let mut current_index = 0;
let mut next_index = 1;
while next_index < splits0.len() {
current_target = accumulated_token_counts[current_index] + 30;
next_index = match accumulated_token_counts.binary_search(¤t_target) {
Ok(index) => index,
Err(index) => index - 1,
};
if next_index <= current_index {
next_index = current_index + 1;
}
println!("Current target: {}", current_target);
println!("Current index: {}", current_index);
println!("Next index: {}", next_index);
println!("{:?}", splits0[current_index..next_index].join(""));
println!("\n");
current_index = next_index;
}
let chunks = chunker.chunk(&text);
println!("Chunks: {:?}", chunks);
}