text_splitter/chunk_size/tiktoken.rs

use tiktoken_rs::CoreBPE;

use crate::ChunkSizer;

impl ChunkSizer for &CoreBPE {
    /// Returns the number of tokens in a given text after tokenization.
    fn size(&self, chunk: &str) -> usize {
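        // `encode_ordinary` tokenizes the text without special-token handling.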
        self.encode_ordinary(chunk).len()
    }
}

impl ChunkSizer for CoreBPE {
    /// Returns the number of tokens in a given text after tokenization.
    fn size(&self, chunk: &str) -> usize {
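        // Delegate to the `&CoreBPE` implementation above.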
        (&self).size(chunk)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    use tiktoken_rs::cl100k_base;

    #[test]
    fn returns_size() {
        let tokenizer = cl100k_base().unwrap();
        let size = tokenizer.size("An apple a");
        assert_eq!(size, 3);
    }
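
    #[test]
    fn owned_and_borrowed_sizes_match() {
        // Sanity-check sketch: an owned tokenizer and a shared reference to it
        // report the same count, since the owned impl delegates to `&CoreBPE`.
        // The sample text here is arbitrary.
        let tokenizer = cl100k_base().unwrap();
        let text = "An apple a day keeps the doctor away.";
        assert_eq!(tokenizer.size(text), (&tokenizer).size(text));
    }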
}