text_splitter/chunk_size/
tiktoken.rs

1use tiktoken_rs::CoreBPE;
2
3use crate::ChunkSizer;
4
5impl ChunkSizer for CoreBPE {
6    /// Returns the number of tokens in a given text after tokenization.
7    fn size(&self, chunk: &str) -> usize {
8        self.encode_ordinary(chunk).len()
9    }
10}
11
#[cfg(test)]
mod tests {
    use super::*;

    use tiktoken_rs::cl100k_base;

    /// Checks that `size` reports the token count of the input when using
    /// the `cl100k_base` encoding.
    #[test]
    fn returns_offsets() {
        let sizer = cl100k_base().unwrap();
        // The assertion pins this short phrase to exactly three tokens.
        assert_eq!(sizer.size("An apple a"), 3);
    }
}