//! Streaming sentence chunker for text-to-speech (autoagents_speech/tts).
/// Size thresholds controlling when the chunker emits a chunk.
///
/// NOTE: despite the `*_chars` names, every comparison in this file uses
/// `str::len`, i.e. UTF-8 *bytes*, not character counts.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ChunkerConfig {
    /// Minimum chunk length; shorter candidates are held back and merged
    /// with the following text.
    pub min_chunk_chars: usize,
    /// Maximum buffered length before a forced flush is triggered.
    pub max_chunk_chars: usize,
}
26
27impl Default for ChunkerConfig {
28 fn default() -> Self {
29 Self {
30 min_chunk_chars: 20,
31 max_chunk_chars: 250,
32 }
33 }
34}
35
/// Incremental splitter that accumulates streamed text tokens and emits
/// sentence-sized chunks (e.g. for feeding a TTS engine piece by piece).
#[derive(Debug, Default)]
pub struct SentenceChunker {
    // Text received so far that has not yet been emitted as a chunk.
    buffer: String,
    // Size thresholds that decide when a chunk is ready.
    config: ChunkerConfig,
}
46
47impl SentenceChunker {
48 pub fn new() -> Self {
50 Self::default()
51 }
52
53 pub fn with_config(config: ChunkerConfig) -> Self {
55 Self {
56 buffer: String::default(),
57 config,
58 }
59 }
60
61 pub fn push_token(&mut self, token: &str) -> Vec<String> {
69 self.buffer.push_str(token);
70 self.emit_all()
71 }
72
73 fn emit_all(&mut self) -> Vec<String> {
75 let mut results = Vec::new();
76 while let Some(sentence) = self.try_emit() {
77 results.push(sentence);
78 }
79 results
80 }
81
82 pub fn force_flush(&mut self) -> Option<String> {
86 if self.buffer.trim().is_empty() {
87 self.buffer.clear();
88 return None;
89 }
90 let text = std::mem::take(&mut self.buffer);
91 Some(text)
92 }
93
94 fn try_emit(&mut self) -> Option<String> {
96 if self.buffer.len() > self.config.max_chunk_chars {
98 return self.force_flush_at_best_point();
99 }
100
101 if let Some(pos) = self.buffer.find("\n\n") {
103 let split_pos = pos + 2; let candidate = self.buffer[..split_pos].trim().to_string();
105 if candidate.is_empty() {
106 self.buffer = self.buffer[split_pos..].to_string();
108 return None;
109 }
110 if candidate.len() >= self.config.min_chunk_chars {
111 self.buffer = self.buffer[split_pos..].to_string();
112 return Some(candidate);
113 }
114 return None;
116 }
117
118 let mut search_from: usize = 0;
122 loop {
123 match self.find_sentence_boundary_from(search_from) {
124 Some((split_pos, _)) => {
125 let candidate = self.buffer[..split_pos].trim().to_string();
126 if candidate.len() >= self.config.min_chunk_chars {
127 self.buffer = self.buffer[split_pos..].to_string();
128 return Some(candidate);
129 }
130 search_from = split_pos;
132 }
133 None => return None,
134 }
135 }
136 }
137
138 fn find_sentence_boundary_from(&self, from_byte: usize) -> Option<(usize, char)> {
144 let bytes = self.buffer.as_bytes();
145 let chars: Vec<(usize, char)> = self.buffer.char_indices().collect();
146
147 for (idx, &(byte_pos, ch)) in chars.iter().enumerate() {
148 if byte_pos < from_byte {
149 continue;
150 }
151 if !matches!(ch, '.' | '!' | '?') {
152 continue;
153 }
154
155 let after_punct = byte_pos + ch.len_utf8();
157
158 if ch == '.' && self.is_decimal_at(byte_pos, &chars, idx) {
160 continue;
161 }
162
163 if after_punct >= bytes.len() {
166 return Some((after_punct, ch));
168 }
169
170 let remainder = &self.buffer[after_punct..];
172 if self.starts_with_whitespace_then_upper(remainder) {
173 return Some((after_punct, ch));
174 }
175 }
176
177 None
178 }
179
180 fn is_decimal_at(&self, _byte_pos: usize, chars: &[(usize, char)], char_idx: usize) -> bool {
182 if char_idx == 0 {
184 return false;
185 }
186 let prev_char = chars[char_idx - 1].1;
187 if !prev_char.is_ascii_digit() {
188 return false;
189 }
190 if char_idx + 1 < chars.len() {
192 let next_char = chars[char_idx + 1].1;
193 return next_char.is_ascii_digit();
194 }
195 false
198 }
199
200 fn starts_with_whitespace_then_upper(&self, s: &str) -> bool {
202 let mut chars = s.chars();
203 match chars.next() {
204 Some(c) if c.is_whitespace() => {}
205 _ => return false,
206 }
207 for c in chars {
209 if c.is_whitespace() {
210 continue;
211 }
212 return c.is_uppercase();
213 }
214 false
215 }
216
217 fn force_flush_at_best_point(&mut self) -> Option<String> {
220 if let Some((split_pos, _)) = self.find_sentence_boundary_from(0) {
222 let candidate = self.buffer[..split_pos].trim().to_string();
223 if !candidate.is_empty() {
224 self.buffer = self.buffer[split_pos..].to_string();
225 return Some(candidate);
226 }
227 }
228
229 self.force_flush()
231 }
232}
233
#[cfg(test)]
mod tests {
    use super::*;

    // Drives a chunker with `tokens`, collecting all emitted chunks plus
    // the final flushed remainder.
    fn chunk_text(tokens: &[&str], config: ChunkerConfig) -> Vec<String> {
        let mut chunker = SentenceChunker::with_config(config);
        let mut results = Vec::new();
        for token in tokens {
            results.extend(chunker.push_token(token));
        }
        if let Some(remainder) = chunker.force_flush() {
            results.push(remainder);
        }
        results
    }

    // Same as `chunk_text` but with min_chunk_chars lowered to 1 so every
    // sentence is emitted individually.
    fn chunk_text_default(tokens: &[&str]) -> Vec<String> {
        chunk_text(
            tokens,
            ChunkerConfig {
                min_chunk_chars: 1,
                max_chunk_chars: 250,
            },
        )
    }

    // "$4.50" must not be split at the decimal point.
    #[test]
    fn test_decimal_no_split() {
        let tokens = vec!["Price is $4.50. Buy now!"];
        let result = chunk_text_default(&tokens);
        assert_eq!(result, vec!["Price is $4.50.", "Buy now!"]);
    }

    // Each of '.', '!', '?' terminates a sentence.
    #[test]
    fn test_multiple_sentences() {
        let tokens = vec!["Hello! How are you? Fine."];
        let result = chunk_text_default(&tokens);
        assert_eq!(result, vec!["Hello!", "How are you?", "Fine."]);
    }

    // 300 chars with no punctuation: force-flushed as a single chunk.
    #[test]
    fn test_force_flush_long_text() {
        let config = ChunkerConfig {
            min_chunk_chars: 1,
            max_chunk_chars: 250,
        };
        let long_text = "a".repeat(300);
        let tokens = vec![long_text.as_str()];
        let result = chunk_text(&tokens, config);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0], long_text);
    }

    // force_flush returns buffered text even without sentence punctuation.
    #[test]
    fn test_force_flush_remainder() {
        let mut chunker = SentenceChunker::default();
        chunker.push_token("Hello there");
        let flushed = chunker.force_flush();
        assert_eq!(flushed, Some("Hello there".to_string()));
    }

    // Flushing an empty chunker yields nothing.
    #[test]
    fn test_force_flush_empty() {
        let mut chunker = SentenceChunker::default();
        assert_eq!(chunker.force_flush(), None);
    }

    // Whitespace-only content is dropped rather than emitted.
    #[test]
    fn test_force_flush_whitespace_only() {
        let mut chunker = SentenceChunker::default();
        chunker.push_token(" ");
        assert_eq!(chunker.force_flush(), None);
    }

    // Sentences are assembled correctly across many small tokens.
    #[test]
    fn test_streaming_tokens() {
        let tokens = vec![
            "Hello", " ", "world", ".", " ", "How", " ", "are", " ", "you", "?",
        ];
        let result = chunk_text_default(&tokens);
        assert_eq!(result, vec!["Hello world.", "How are you?"]);
    }

    // A blank line ("\n\n") always delimits a chunk.
    #[test]
    fn test_paragraph_break() {
        let tokens = vec!["First paragraph.\n\nSecond paragraph."];
        let result = chunk_text_default(&tokens);
        assert_eq!(result, vec!["First paragraph.", "Second paragraph."]);
    }

    // A sentence shorter than min_chunk_chars is held and merged with the
    // following sentence before being emitted.
    #[test]
    fn test_min_chunk_chars_holds() {
        let config = ChunkerConfig {
            min_chunk_chars: 20,
            max_chunk_chars: 250,
        };
        let mut chunker = SentenceChunker::with_config(config);
        assert!(chunker.push_token("Hi. ").is_empty());
        let result = chunker.push_token("What is the meaning of life? I wonder.");
        assert!(!result.is_empty());
        assert!(result[0].len() >= 20);
    }

    // Version strings like "v2.0" must not be split at the dot.
    #[test]
    fn test_version_number_no_split() {
        let tokens = vec!["Use v2.0 for this. It is better."];
        let result = chunk_text_default(&tokens);
        assert_eq!(result, vec!["Use v2.0 for this.", "It is better."]);
    }

    // '!' and '?' behave like '.' as terminators.
    #[test]
    fn test_exclamation_and_question() {
        let tokens = vec!["Wow! Really? Yes."];
        let result = chunk_text_default(&tokens);
        assert_eq!(result, vec!["Wow!", "Really?", "Yes."]);
    }

    // When the buffer exceeds max_chunk_chars, the split happens at the
    // first sentence boundary rather than mid-sentence.
    #[test]
    fn test_max_chunk_with_boundary() {
        let config = ChunkerConfig {
            min_chunk_chars: 1,
            max_chunk_chars: 50,
        };
        let tokens = vec![
            "Short sentence here. And then a much longer sentence that pushes over the limit.",
        ];
        let result = chunk_text(&tokens, config);
        assert_eq!(result[0], "Short sentence here.");
        assert!(result.len() >= 2);
    }
}