// text_splitter/chunk_size/tiktoken.rs
use tiktoken_rs::CoreBPE;

use crate::ChunkSizer;

impl ChunkSizer for &CoreBPE {
    /// Measures a chunk as the length of the token sequence produced by
    /// `encode_ordinary` for that text.
    fn size(&self, chunk: &str) -> usize {
        let tokens = self.encode_ordinary(chunk);
        tokens.len()
    }
}

impl ChunkSizer for CoreBPE {
    /// Measures a chunk in tokens by forwarding to the `&CoreBPE`
    /// implementation of [`ChunkSizer`].
    fn size(&self, chunk: &str) -> usize {
        // Name the target impl explicitly: `&self` is a `&&CoreBPE`, which is
        // exactly the receiver the `&CoreBPE` implementation expects.
        <&CoreBPE as ChunkSizer>::size(&self, chunk)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    use tiktoken_rs::cl100k_base;

    /// An owned tokenizer reports the number of tokens in the chunk.
    #[test]
    fn returns_size() {
        let tokenizer = cl100k_base().unwrap();
        let size = tokenizer.size("An apple a");
        assert_eq!(size, 3);
    }

    /// The `&CoreBPE` implementation reports the same size as the owned one.
    #[test]
    fn reference_impl_matches_owned_impl() {
        let tokenizer = cl100k_base().unwrap();
        let text = "An apple a";
        // Fully-qualified call so the `&CoreBPE` impl is exercised directly
        // rather than whichever impl method resolution would pick.
        let by_ref = <&CoreBPE as ChunkSizer>::size(&&tokenizer, text);
        assert_eq!(by_ref, 3);
        assert_eq!(by_ref, tokenizer.size(text));
    }
}