text_splitter/chunk_size/
tiktoken.rs1use tiktoken_rs::CoreBPE;
2
3use crate::ChunkSizer;
4
5impl ChunkSizer for &CoreBPE {
6 fn size(&self, chunk: &str) -> usize {
8 self.encode_ordinary(chunk).len()
9 }
10}
11
12impl ChunkSizer for CoreBPE {
13 fn size(&self, chunk: &str) -> usize {
15 (&self).size(chunk)
16 }
17}
18
#[cfg(test)]
mod tests {
    use super::*;

    use tiktoken_rs::cl100k_base;

    /// The owned tokenizer reports the number of tokens in the chunk.
    #[test]
    fn returns_token_count() {
        let tokenizer = cl100k_base().expect("cl100k_base tokenizer should load");
        assert_eq!(tokenizer.size("An apple a"), 3);
    }

    /// The `&CoreBPE` implementation reports the same count as the owned one.
    #[test]
    fn reference_impl_returns_token_count() {
        let tokenizer = cl100k_base().expect("cl100k_base tokenizer should load");
        // Fully-qualified call so the `&CoreBPE` impl is the one under test.
        let size = <&CoreBPE as ChunkSizer>::size(&&tokenizer, "An apple a");
        assert_eq!(size, 3);
    }

    /// An empty chunk contains no tokens.
    #[test]
    fn empty_chunk_has_zero_size() {
        let tokenizer = cl100k_base().expect("cl100k_base tokenizer should load");
        assert_eq!(tokenizer.size(""), 0);
    }
}