// autoagents_speech/tts/chunker.rs
//! Sentence chunker for TTS input.
//!
//! Splits streaming text into natural sentence boundaries so each chunk can be
//! synthesized independently. Handles decimal numbers and enforces min/max
//! chunk sizes to avoid TTS quality issues.
//!
//! # Design
//!
//! Inspired by production chunkers in LiveKit Agents and Pipecat:
//! - Split at `[.!?]` followed by whitespace + uppercase (or end of input)
//! - Split at `\n\n` (paragraph break) once the leading chunk meets `min_chunk_chars`
//! - Do NOT split on decimal numbers (`$4.50`, `v2.0`)
//! - Force flush when buffer exceeds `max_chunk_chars`
//! - Hold result if it would be shorter than `min_chunk_chars`
/// Configuration for the sentence chunker.
#[derive(Debug, Clone)]
pub struct ChunkerConfig {
    /// Minimum characters before emitting a chunk (default: 20).
    /// Keeps tiny fragments such as "Yes." from being sent to TTS alone.
    pub min_chunk_chars: usize,
    /// Maximum characters before forcing a flush (default: 250).
    /// Guards against TTS timeouts on long runs with no punctuation.
    pub max_chunk_chars: usize,
}

impl Default for ChunkerConfig {
    /// Defaults tuned for conversational TTS: hold chunks under 20 chars,
    /// force a flush past 250.
    fn default() -> Self {
        ChunkerConfig {
            min_chunk_chars: 20,
            max_chunk_chars: 250,
        }
    }
}
35
/// A sentence chunker that accumulates tokens and emits complete sentences.
///
/// Feed tokens via [`push_token`](SentenceChunker::push_token) and collect
/// emitted sentences. When the input stream ends, call
/// [`force_flush`](SentenceChunker::force_flush) to emit any remainder.
#[derive(Debug, Default)]
pub struct SentenceChunker {
    // Accumulated text that has not yet been emitted as a complete sentence.
    buffer: String,
    // Min/max chunk-size thresholds that control when chunks are emitted.
    config: ChunkerConfig,
}
46
47impl SentenceChunker {
48    /// Create a new chunker with the default configuration.
49    pub fn new() -> Self {
50        Self::default()
51    }
52
53    /// Create a new chunker with a custom configuration.
54    pub fn with_config(config: ChunkerConfig) -> Self {
55        Self {
56            buffer: String::default(),
57            config,
58        }
59    }
60
61    /// Push a token into the chunker.
62    ///
63    /// Returns a (possibly empty) list of complete sentences that were detected
64    /// after appending `token`. A single token may produce multiple sentences
65    /// if it spans several sentence boundaries.
66    ///
67    /// Returns an empty `Vec` if still accumulating.
68    pub fn push_token(&mut self, token: &str) -> Vec<String> {
69        self.buffer.push_str(token);
70        self.emit_all()
71    }
72
73    /// Drain all complete sentences currently available in the buffer.
74    fn emit_all(&mut self) -> Vec<String> {
75        let mut results = Vec::new();
76        while let Some(sentence) = self.try_emit() {
77            results.push(sentence);
78        }
79        results
80    }
81
82    /// Force-flush the internal buffer, returning any remaining text.
83    ///
84    /// Call this when the input stream has ended (e.g., LLM finished generating).
85    pub fn force_flush(&mut self) -> Option<String> {
86        if self.buffer.trim().is_empty() {
87            self.buffer.clear();
88            return None;
89        }
90        let text = std::mem::take(&mut self.buffer);
91        Some(text)
92    }
93
94    /// Try to emit a sentence from the buffer.
95    fn try_emit(&mut self) -> Option<String> {
96        // Force flush if buffer exceeds max length
97        if self.buffer.len() > self.config.max_chunk_chars {
98            return self.force_flush_at_best_point();
99        }
100
101        // Check for paragraph break (\n\n)
102        if let Some(pos) = self.buffer.find("\n\n") {
103            let split_pos = pos + 2; // include the \n\n in the first chunk
104            let candidate = self.buffer[..split_pos].trim().to_string();
105            if candidate.is_empty() {
106                // Just whitespace before the break — discard it
107                self.buffer = self.buffer[split_pos..].to_string();
108                return None;
109            }
110            if candidate.len() >= self.config.min_chunk_chars {
111                self.buffer = self.buffer[split_pos..].to_string();
112                return Some(candidate);
113            }
114            // Too short — keep accumulating
115            return None;
116        }
117
118        // Look for sentence-ending punctuation.
119        // Skip boundaries that would produce chunks shorter than min_chunk_chars
120        // by continuing the search from after the rejected boundary.
121        let mut search_from: usize = 0;
122        loop {
123            match self.find_sentence_boundary_from(search_from) {
124                Some((split_pos, _)) => {
125                    let candidate = self.buffer[..split_pos].trim().to_string();
126                    if candidate.len() >= self.config.min_chunk_chars {
127                        self.buffer = self.buffer[split_pos..].to_string();
128                        return Some(candidate);
129                    }
130                    // Candidate too short — continue searching past this boundary
131                    search_from = split_pos;
132                }
133                None => return None,
134            }
135        }
136    }
137
138    /// Find the first sentence boundary in the buffer starting from `from_byte`.
139    ///
140    /// Returns `Some((byte_position_after_punctuation, char))` for the FIRST
141    /// valid boundary found at or after `from_byte`, so we emit the earliest
142    /// complete sentence and keep the remainder for subsequent calls.
143    fn find_sentence_boundary_from(&self, from_byte: usize) -> Option<(usize, char)> {
144        let bytes = self.buffer.as_bytes();
145        let chars: Vec<(usize, char)> = self.buffer.char_indices().collect();
146
147        for (idx, &(byte_pos, ch)) in chars.iter().enumerate() {
148            if byte_pos < from_byte {
149                continue;
150            }
151            if !matches!(ch, '.' | '!' | '?') {
152                continue;
153            }
154
155            // The byte position right after this punctuation character
156            let after_punct = byte_pos + ch.len_utf8();
157
158            // Skip if this is a decimal number: digit.digit
159            if ch == '.' && self.is_decimal_at(byte_pos, &chars, idx) {
160                continue;
161            }
162
163            // Valid boundary if followed by whitespace + uppercase letter,
164            // OR if punctuation is at the very end of the buffer.
165            if after_punct >= bytes.len() {
166                // Punctuation at end of buffer — valid boundary.
167                return Some((after_punct, ch));
168            }
169
170            // Check what follows the punctuation
171            let remainder = &self.buffer[after_punct..];
172            if self.starts_with_whitespace_then_upper(remainder) {
173                return Some((after_punct, ch));
174            }
175        }
176
177        None
178    }
179
180    /// Check if the period at `byte_pos` is a decimal: digit.digit
181    fn is_decimal_at(&self, _byte_pos: usize, chars: &[(usize, char)], char_idx: usize) -> bool {
182        // Need a digit before and after the period
183        if char_idx == 0 {
184            return false;
185        }
186        let prev_char = chars[char_idx - 1].1;
187        if !prev_char.is_ascii_digit() {
188            return false;
189        }
190        // Check char after
191        if char_idx + 1 < chars.len() {
192            let next_char = chars[char_idx + 1].1;
193            return next_char.is_ascii_digit();
194        }
195        // Period at end after a digit — might be a decimal waiting for more
196        // tokens. Treat as decimal to be safe.
197        false
198    }
199
200    /// Check if a string starts with whitespace followed by an uppercase letter.
201    fn starts_with_whitespace_then_upper(&self, s: &str) -> bool {
202        let mut chars = s.chars();
203        match chars.next() {
204            Some(c) if c.is_whitespace() => {}
205            _ => return false,
206        }
207        // Skip additional whitespace
208        for c in chars {
209            if c.is_whitespace() {
210                continue;
211            }
212            return c.is_uppercase();
213        }
214        false
215    }
216
217    /// Force flush at the best available point when buffer exceeds max length.
218    /// Tries to split at the first sentence boundary; falls back to the full buffer.
219    fn force_flush_at_best_point(&mut self) -> Option<String> {
220        // Try to find the first sentence boundary to split at
221        if let Some((split_pos, _)) = self.find_sentence_boundary_from(0) {
222            let candidate = self.buffer[..split_pos].trim().to_string();
223            if !candidate.is_empty() {
224                self.buffer = self.buffer[split_pos..].to_string();
225                return Some(candidate);
226            }
227        }
228
229        // No good boundary — flush the whole thing
230        self.force_flush()
231    }
232}
233
#[cfg(test)]
mod tests {
    use super::*;

    /// Drive a chunker over `tokens` with `config`, collecting every emitted
    /// sentence plus the final flushed remainder.
    fn chunk_text(tokens: &[&str], config: ChunkerConfig) -> Vec<String> {
        let mut chunker = SentenceChunker::with_config(config);
        let mut out: Vec<String> = tokens
            .iter()
            .flat_map(|token| chunker.push_token(token))
            .collect();
        out.extend(chunker.force_flush());
        out
    }

    /// Default config with `min_chunk_chars` lowered to 1 so short test
    /// sentences are emitted immediately.
    fn chunk_text_default(tokens: &[&str]) -> Vec<String> {
        let config = ChunkerConfig {
            min_chunk_chars: 1,
            ..ChunkerConfig::default()
        };
        chunk_text(tokens, config)
    }

    #[test]
    fn test_decimal_no_split() {
        let result = chunk_text_default(&["Price is $4.50. Buy now!"]);
        assert_eq!(result, ["Price is $4.50.", "Buy now!"]);
    }

    #[test]
    fn test_multiple_sentences() {
        let result = chunk_text_default(&["Hello! How are you? Fine."]);
        assert_eq!(result, ["Hello!", "How are you?", "Fine."]);
    }

    #[test]
    fn test_force_flush_long_text() {
        // 300 chars with no punctuation at all — must still come out whole.
        let long_text = "a".repeat(300);
        let config = ChunkerConfig {
            min_chunk_chars: 1,
            max_chunk_chars: 250,
        };
        let result = chunk_text(&[long_text.as_str()], config);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0], long_text);
    }

    #[test]
    fn test_force_flush_remainder() {
        let mut chunker = SentenceChunker::new();
        chunker.push_token("Hello there");
        assert_eq!(chunker.force_flush().as_deref(), Some("Hello there"));
    }

    #[test]
    fn test_force_flush_empty() {
        assert!(SentenceChunker::new().force_flush().is_none());
    }

    #[test]
    fn test_force_flush_whitespace_only() {
        let mut chunker = SentenceChunker::new();
        chunker.push_token("   ");
        assert!(chunker.force_flush().is_none());
    }

    #[test]
    fn test_streaming_tokens() {
        // Token-by-token arrival, as from an LLM stream.
        let result = chunk_text_default(&[
            "Hello", " ", "world", ".", " ", "How", " ", "are", " ", "you", "?",
        ]);
        assert_eq!(result, ["Hello world.", "How are you?"]);
    }

    #[test]
    fn test_paragraph_break() {
        let result = chunk_text_default(&["First paragraph.\n\nSecond paragraph."]);
        assert_eq!(result, ["First paragraph.", "Second paragraph."]);
    }

    #[test]
    fn test_min_chunk_chars_holds() {
        // "Hi." (3 chars) is below min_chunk_chars = 20, so nothing comes out
        // until enough text has accumulated.
        let mut chunker = SentenceChunker::with_config(ChunkerConfig {
            min_chunk_chars: 20,
            max_chunk_chars: 250,
        });
        assert!(chunker.push_token("Hi. ").is_empty());
        // More text completes a chunk of at least 20 chars.
        let result = chunker.push_token("What is the meaning of life? I wonder.");
        assert!(!result.is_empty());
        assert!(result[0].len() >= 20);
    }

    #[test]
    fn test_version_number_no_split() {
        let result = chunk_text_default(&["Use v2.0 for this. It is better."]);
        assert_eq!(result, ["Use v2.0 for this.", "It is better."]);
    }

    #[test]
    fn test_exclamation_and_question() {
        let result = chunk_text_default(&["Wow! Really? Yes."]);
        assert_eq!(result, ["Wow!", "Really?", "Yes."]);
    }

    #[test]
    fn test_max_chunk_with_boundary() {
        let config = ChunkerConfig {
            min_chunk_chars: 1,
            max_chunk_chars: 50,
        };
        // First sentence fits under 50; the second pushes the buffer over.
        let result = chunk_text(
            &["Short sentence here. And then a much longer sentence that pushes over the limit."],
            config,
        );
        assert_eq!(result[0], "Short sentence here.");
        assert!(result.len() >= 2);
    }
}