// text_splitter/chunk_size/huggingface.rs

use tokenizers::{Encoding, Tokenizer};

use crate::ChunkSizer;
/// Count the tokens in an encoding, skipping padding tokens and recursing
/// into any overflowing encodings produced by truncation.
fn num_tokens_with_overflow(encoding: &Encoding, pad_id: Option<u32>) -> usize {
    // Skip any leading padding, then count ids up to the first trailing pad.
    let base = encoding
        .get_ids()
        .iter()
        .skip_while(|&id| pad_id.is_some_and(|pad_id| id == &pad_id))
        .take_while(|&id| pad_id.map_or(true, |pad_id| id != &pad_id))
        .count();

    // If the tokenizer truncates, the tokens that didn't fit are stored in
    // overflowing encodings; count those too so the entire chunk is measured.
    let overflow: usize = encoding
        .get_overflowing()
        .iter()
        .map(|enc| num_tokens_with_overflow(enc, pad_id))
        .sum();

    base + overflow
}
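// Illustration of the counting logic above (the ids are hypothetical): with
// pad_id = Some(0) and ids = [0, 0, 101, 2019, 6207, 102, 0, 0], the two
// leading pads are skipped and counting stops at the first trailing pad,
// giving 4 tokens. Each overflowing encoding is counted the same way and
// added to the total.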
impl ChunkSizer for &Tokenizer {
    /// Return the number of tokens in the chunk for this tokenizer, ignoring
    /// padding and including any overflow produced by truncation.
    fn size(&self, chunk: &str) -> usize {
        let encoding = self
            .encode(chunk, false)
            .unwrap_or_else(|err| panic!("Unable to tokenize the following string: {chunk}\n{err}"));

        let pad_id = self.get_padding().map(|params| params.pad_id);
        num_tokens_with_overflow(&encoding, pad_id)
    }
}
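// How a caller might consume the trait (a hypothetical helper, not part of
// this crate's API), using only the `size` method defined above:
//
//     fn fits_within(sizer: &impl ChunkSizer, chunk: &str, max_tokens: usize) -> bool {
//         sizer.size(chunk) <= max_tokens
//     }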
impl ChunkSizer for Tokenizer {
    fn size(&self, chunk: &str) -> usize {
        // Delegate to the `&Tokenizer` implementation above.
        (&self).size(chunk)
    }
}
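// A minimal usage sketch, mirroring the tests below (`from_pretrained`
// fetches the tokenizer from the Hugging Face Hub):
//
//     let tokenizer = Tokenizer::from_pretrained("bert-base-cased", None).unwrap();
//     assert_eq!(tokenizer.size(" An apple a"), 3);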
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn returns_size() {
        let tokenizer = Tokenizer::from_pretrained("bert-base-cased", None).unwrap();
        let size = tokenizer.size(" An apple a");
        assert_eq!(size, 3);
    }

    #[test]
    fn returns_size_handles_prefix() {
        let tokenizer = Tokenizer::from_file("./tests/tokenizers/huggingface.json").unwrap();

        let size = tokenizer.size("An apple a");
        assert_eq!(size, 3);
    }

    #[test]
    fn handles_padding() {
        let tokenizer = Tokenizer::from_pretrained("thenlper/gte-small", None).unwrap();
        let size = tokenizer.size("An apple a");
        assert_eq!(size, 3);
    }

    #[test]
    fn handles_truncation() {
        let tokenizer = Tokenizer::from_pretrained("sentence-transformers/all-MiniLM-L6-v2", None)
            .expect("Could not load tokenizer 'sentence-transformers/all-MiniLM-L6-v2'");

        // 9 tokens per repetition of the sentence, repeated 100 times; the
        // overflow encodings beyond the model's maximum length must be counted.
        assert_eq!(
            tokenizer.size("An apple a day keeps the doctor away.".repeat(100).as_str()),
            900
        );
    }
}