// synaptic_splitters/token.rs
use crate::TextSplitter;

/// Splits text into whitespace-delimited chunks bounded by an estimated token count.
///
/// Token counts are a byte-length heuristic (roughly four bytes per token), not the
/// output of a real tokenizer.
#[derive(Debug, Clone)]
pub struct TokenTextSplitter {
    /// Target upper bound, in estimated tokens, for each chunk.
    chunk_size: usize,
    /// Estimated tokens of trailing text repeated at the start of the next chunk;
    /// `0` disables overlap.
    chunk_overlap: usize,
}
11
12impl TokenTextSplitter {
13 pub fn new(chunk_size: usize) -> Self {
17 Self {
18 chunk_size,
19 chunk_overlap: 0,
20 }
21 }
22
23 pub fn with_chunk_overlap(mut self, overlap: usize) -> Self {
24 self.chunk_overlap = overlap;
25 self
26 }
27
28 fn estimate_tokens(text: &str) -> usize {
29 (text.len() / 4).max(1)
30 }
31}
32
33impl TextSplitter for TokenTextSplitter {
34 fn split_text(&self, text: &str) -> Vec<String> {
35 let words: Vec<&str> = text.split_whitespace().collect();
36 if words.is_empty() {
37 return vec![];
38 }
39
40 let mut chunks = Vec::new();
41 let mut current_words: Vec<&str> = Vec::new();
42
43 for word in &words {
44 current_words.push(word);
45 let current_text = current_words.join(" ");
46 let tokens = Self::estimate_tokens(¤t_text);
47
48 if tokens > self.chunk_size && current_words.len() > 1 {
49 current_words.pop();
51 let chunk = current_words.join(" ");
52 chunks.push(chunk);
53
54 if self.chunk_overlap > 0 {
56 let overlap_text = current_words.join(" ");
57 let overlap_tokens = Self::estimate_tokens(&overlap_text);
58 while Self::estimate_tokens(¤t_words.join(" ")) > self.chunk_overlap
59 && current_words.len() > 1
60 {
61 current_words.remove(0);
62 }
63 let _ = overlap_tokens; } else {
65 current_words.clear();
66 }
67
68 current_words.push(word);
69 }
70 }
71
72 if !current_words.is_empty() {
73 chunks.push(current_words.join(" "));
74 }
75
76 chunks
77 }
78}