text_splitter/chunk_size/tiktoken.rs
1use tiktoken_rs::CoreBPE;
2
3use crate::ChunkSizer;
4
5impl ChunkSizer for CoreBPE {
6 /// Returns the number of tokens in a given text after tokenization.
7 fn size(&self, chunk: &str) -> usize {
8 self.encode_ordinary(chunk).len()
9 }
10}
11
#[cfg(test)]
mod tests {
    use super::*;

    use tiktoken_rs::cl100k_base;

    /// The sizer reports the number of tokens the tokenizer produces for the
    /// chunk, not its byte or character length.
    #[test]
    fn returns_size() {
        // Failing to construct the bundled encoding is a setup bug, not the
        // behavior under test, so state the invariant instead of a bare unwrap.
        let tokenizer = cl100k_base().expect("cl100k_base encoding should load");
        let size = tokenizer.size("An apple a");
        assert_eq!(size, 3);
    }

    /// Boundary case: an empty chunk contains no tokens.
    #[test]
    fn empty_chunk_has_zero_size() {
        let tokenizer = cl100k_base().expect("cl100k_base encoding should load");
        assert_eq!(tokenizer.size(""), 0);
    }
}
24}