Skip to main content

synaptic_splitters/
token.rs

1use crate::TextSplitter;
2
/// Splits text by estimated token count using a ~4 chars/token heuristic.
///
/// Splits at word boundaries to keep chunks readable. Consistent with
/// the token estimation used in `ConversationTokenBufferMemory`.
#[derive(Debug, Clone)]
pub struct TokenTextSplitter {
    /// Target upper bound for each chunk, measured in estimated tokens.
    chunk_size: usize,
    /// Budget, in estimated tokens, for trailing words of one chunk that are
    /// repeated at the start of the next chunk. `0` disables overlap.
    chunk_overlap: usize,
}
11
12impl TokenTextSplitter {
13    /// Create a new token text splitter.
14    ///
15    /// `chunk_size` is in estimated tokens (not characters).
16    pub fn new(chunk_size: usize) -> Self {
17        Self {
18            chunk_size,
19            chunk_overlap: 0,
20        }
21    }
22
23    pub fn with_chunk_overlap(mut self, overlap: usize) -> Self {
24        self.chunk_overlap = overlap;
25        self
26    }
27
28    fn estimate_tokens(text: &str) -> usize {
29        (text.len() / 4).max(1)
30    }
31}
32
33impl TextSplitter for TokenTextSplitter {
34    fn split_text(&self, text: &str) -> Vec<String> {
35        let words: Vec<&str> = text.split_whitespace().collect();
36        if words.is_empty() {
37            return vec![];
38        }
39
40        let mut chunks = Vec::new();
41        let mut current_words: Vec<&str> = Vec::new();
42
43        for word in &words {
44            current_words.push(word);
45            let current_text = current_words.join(" ");
46            let tokens = Self::estimate_tokens(&current_text);
47
48            if tokens > self.chunk_size && current_words.len() > 1 {
49                // Remove last word, emit chunk
50                current_words.pop();
51                let chunk = current_words.join(" ");
52                chunks.push(chunk);
53
54                // Keep overlap words
55                if self.chunk_overlap > 0 {
56                    let overlap_text = current_words.join(" ");
57                    let overlap_tokens = Self::estimate_tokens(&overlap_text);
58                    while Self::estimate_tokens(&current_words.join(" ")) > self.chunk_overlap
59                        && current_words.len() > 1
60                    {
61                        current_words.remove(0);
62                    }
63                    let _ = overlap_tokens; // just used for logic above
64                } else {
65                    current_words.clear();
66                }
67
68                current_words.push(word);
69            }
70        }
71
72        if !current_words.is_empty() {
73            chunks.push(current_words.join(" "));
74        }
75
76        chunks
77    }
78}