Expand description
§High-performance text chunking in Rust
A port of umarbutler/semchunk to Rust for splitting text into semantically meaningful chunks.
§Example
use semchunk_rs::Chunker;
let chunker = Chunker::new(4, Box::new(|s: &str| s.len() - s.replace(" ", "").len() + 1));
let text = "The quick brown fox jumps over the lazy dog.";
let chunks = chunker.chunk(text);
assert_eq!(chunks, vec!["The quick brown fox", "jumps over the lazy", "dog."]);
With rust_tokenizers:
use rust_tokenizers::tokenizer::{RobertaTokenizer, Tokenizer};
use semchunk_rs::Chunker;
let tokenizer = RobertaTokenizer::from_file(
"data/roberta-base-vocab.json",
"data/roberta-base-merges.txt",
false,
false,
).expect("Error loading tokenizer");
let token_counter = Box::new(move |s: &str| tokenizer.tokenize(s).len());
let chunker = Chunker::new(4, token_counter);
let text = "The quick brown fox jumps over the lazy dog.";
let chunks = chunker.chunk(text);
assert_eq!(chunks, vec!["The quick brown fox", "jumps over the", "lazy dog."]);