/// A piece of text produced by [`split_chunks`], carrying its position
/// within the overall sequence of chunks.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TextChunk {
    /// The chunk's text content (trailing whitespace trimmed).
    pub text: String,
    /// Zero-based position of this chunk in the output sequence.
    pub index: usize,
    /// True only for the final chunk of the sequence.
    pub is_last: bool,
}
14
/// Configuration for [`split_chunks`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SplitConfig {
    /// Maximum desired chunk length in characters; `0` disables the limit.
    pub max_chars: usize,
    /// When a sentence exceeds `max_chars`, also split it at clause
    /// delimiters (commas, semicolons, colons, and their CJK forms).
    pub split_on_clause: bool,
    /// Pieces shorter than this are merged with a neighbor; `0` disables
    /// merging.
    pub min_chars: usize,
}

impl Default for SplitConfig {
    /// Defaults tuned for sentence-sized chunks: up to 500 characters,
    /// clause splitting enabled, merging of pieces under 10 characters.
    fn default() -> Self {
        Self {
            max_chars: 500,
            split_on_clause: true,
            min_chars: 10,
        }
    }
}
35
/// Common abbreviations that end in a period but do not terminate a sentence.
const ABBREVIATIONS: &[&str] = &[
    "Mr.", "Mrs.", "Ms.", "Dr.", "Prof.", "Jr.", "Sr.", "Inc.", "Ltd.", "Corp.", "Co.", "vs.",
    "etc.", "approx.", "dept.", "est.", "vol.", "no.", "tel.", "fax.", "Jan.", "Feb.", "Mar.",
    "Apr.", "Jun.", "Jul.", "Aug.", "Sep.", "Oct.", "Nov.", "Dec.", "St.", "Ave.", "Blvd.", "Rd.",
    "a.m.", "p.m.", "e.g.", "i.e.",
];

/// Returns true when the period at byte offset `dot_pos` in `text` ends a
/// known abbreviation (exact or all-lowercase form) rather than a sentence.
///
/// A match must start at a word boundary, so an ordinary word that merely
/// ends with an abbreviation (e.g. "piano." vs "no.") still terminates the
/// sentence. Returns false — instead of panicking — when `dot_pos + 1` is
/// not a char boundary of `text`.
fn ends_with_abbreviation(text: &str, dot_pos: usize) -> bool {
    // get() yields None for an out-of-range or non-char-boundary slice,
    // where direct indexing would panic.
    let up_to_dot = match text.get(..=dot_pos) {
        Some(s) => s,
        None => return false,
    };
    for abbr in ABBREVIATIONS {
        let matched =
            up_to_dot.ends_with(abbr) || up_to_dot.ends_with(&abbr.to_lowercase());
        if !matched {
            continue;
        }
        // ABBREVIATIONS are ASCII, so the start of a byte-exact suffix match
        // is always a char boundary; this slice cannot panic.
        let before = &up_to_dot[..up_to_dot.len() - abbr.len()];
        // Word-boundary check: the abbreviation must not be the tail of a
        // longer word. Digits are allowed before (e.g. "5p.m.").
        if before.chars().next_back().map_or(true, |c| !c.is_alphabetic()) {
            return true;
        }
    }
    false
}
61
/// Reports whether `c` is a full-width CJK sentence terminator:
/// ideographic full stop (U+3002), full-width `!` (U+FF01), or
/// full-width `?` (U+FF1F).
fn is_cjk_sentence_end(c: char) -> bool {
    c == '\u{3002}' || c == '\u{FF01}' || c == '\u{FF1F}'
}
67
/// Reports whether `c` is an ASCII sentence terminator: `.`, `!`, or `?`.
fn is_western_sentence_end(c: char) -> bool {
    c == '.' || c == '!' || c == '?'
}
72
73pub fn split_sentences(text: &str) -> Vec<String> {
83 if text.is_empty() || text.chars().all(|c| c.is_whitespace()) {
84 return Vec::new();
85 }
86
87 let mut sentences: Vec<String> = Vec::new();
88 let mut current = String::new();
89 let mut in_quotes = false;
90
91 let indexed: Vec<(usize, char)> = text.char_indices().collect();
94 let len = indexed.len();
95 let mut i = 0;
96
97 while i < len {
98 let (_byte_off, c) = indexed[i];
99
100 if c == '"' || c == '\u{201C}' || c == '\u{201D}' {
102 in_quotes = !in_quotes;
103 current.push(c);
104 i += 1;
105 continue;
106 }
107
108 if c == '\n' {
110 let mut newline_count = 0;
112 let mut j = i;
113 while j < len && (indexed[j].1 == '\n' || indexed[j].1 == '\r') {
114 if indexed[j].1 == '\n' {
115 newline_count += 1;
116 }
117 j += 1;
118 }
119
120 if newline_count >= 2 {
121 let trimmed = current.trim_end().to_string();
123 if !trimmed.is_empty() {
124 sentences.push(trimmed);
125 }
126 current.clear();
127 i = j;
128 continue;
129 } else {
130 current.push(' ');
132 i = j;
133 continue;
134 }
135 }
136
137 if is_cjk_sentence_end(c) {
139 current.push(c);
140 while i + 1 < len
142 && (indexed[i + 1].1 == '\u{300D}' || indexed[i + 1].1 == '\u{300F}' || indexed[i + 1].1 == '\u{FF09}' || indexed[i + 1].1 == '"'
146 || indexed[i + 1].1 == '\u{201D}')
147 {
149 i += 1;
150 current.push(indexed[i].1);
151 }
152 let trimmed = current.trim_end().to_string();
153 if !trimmed.is_empty() {
154 sentences.push(trimmed);
155 }
156 current.clear();
157 i += 1;
158 while i < len && indexed[i].1.is_whitespace() && indexed[i].1 != '\n' {
160 i += 1;
161 }
162 continue;
163 }
164
165 if is_western_sentence_end(c) {
167 current.push(c);
168
169 while i + 1 < len
171 && (is_western_sentence_end(indexed[i + 1].1) || indexed[i + 1].1 == '.')
172 {
173 i += 1;
174 current.push(indexed[i].1);
175 }
176
177 while i + 1 < len
179 && (indexed[i + 1].1 == '"'
180 || indexed[i + 1].1 == '\u{201D}'
181 || indexed[i + 1].1 == '\'')
182 {
183 i += 1;
184 current.push(indexed[i].1);
185 }
186
187 if c == '.' {
189 let byte_pos = indexed[i].0;
191 if ends_with_abbreviation(&text[..=byte_pos], byte_pos) {
192 i += 1;
194 continue;
195 }
196
197 let dot_count = current.chars().rev().take_while(|&ch| ch == '.').count();
199 if dot_count >= 3 {
200 if i + 1 < len && !indexed[i + 1].1.is_whitespace() {
203 i += 1;
204 continue;
205 }
206 }
207 }
208
209 let next_i = i + 1;
211 if next_i >= len {
212 let trimmed = current.trim_end().to_string();
214 if !trimmed.is_empty() {
215 sentences.push(trimmed);
216 }
217 current.clear();
218 i = next_i;
219 continue;
220 }
221
222 if indexed[next_i].1.is_whitespace() || indexed[next_i].1 == '\n' {
223 if in_quotes {
225 i += 1;
226 continue;
227 }
228
229 let trimmed = current.trim_end().to_string();
230 if !trimmed.is_empty() {
231 sentences.push(trimmed);
232 }
233 current.clear();
234 i = next_i;
235 while i < len && indexed[i].1 == ' ' {
237 i += 1;
238 }
239 continue;
240 }
241
242 i += 1;
243 continue;
244 }
245
246 current.push(c);
247 i += 1;
248 }
249
250 let trimmed = current.trim_end().to_string();
252 if !trimmed.is_empty() {
253 sentences.push(trimmed);
254 }
255
256 sentences
257}
258
/// Splits `text` at clause delimiters (`,`, `;`, `:` and the CJK forms
/// 、 , ;), keeping each delimiter attached to the clause it ends and
/// swallowing at most one space that follows it. Clauses have trailing
/// whitespace trimmed. If no split occurs, the whole (end-trimmed) text is
/// returned as one clause; empty input yields an empty `Vec`.
fn split_at_clauses(text: &str) -> Vec<String> {
    const DELIMS: [char; 6] = [',', ';', ':', '\u{3001}', '\u{FF0C}', '\u{FF1B}'];

    let chars: Vec<char> = text.chars().collect();
    let mut out: Vec<String> = Vec::new();
    let mut piece = String::new();
    let mut pos = 0;

    while pos < chars.len() {
        let ch = chars[pos];
        piece.push(ch);
        pos += 1;

        if DELIMS.contains(&ch) {
            // Swallow a single space after the delimiter so the next clause
            // does not begin with it.
            if chars.get(pos) == Some(&' ') {
                pos += 1;
            }
            let clause = piece.trim_end();
            if !clause.is_empty() {
                out.push(clause.to_string());
            }
            piece.clear();
        }
    }

    let tail = piece.trim_end();
    if !tail.is_empty() {
        out.push(tail.to_string());
    }

    if out.len() > 1 {
        return out;
    }

    // Zero or one clause: fall back to the whole text as a single unit.
    let whole = text.trim_end();
    if whole.is_empty() {
        Vec::new()
    } else {
        vec![whole.to_string()]
    }
}
311
312pub fn split_chunks(text: &str, config: &SplitConfig) -> Vec<TextChunk> {
316 let sentences = split_sentences(text);
317 if sentences.is_empty() {
318 return Vec::new();
319 }
320
321 let max = config.max_chars;
322 let min = config.min_chars;
323
324 let mut expanded: Vec<String> = Vec::new();
326 for sentence in sentences {
327 if max > 0 && sentence.len() > max && config.split_on_clause {
328 let clauses = split_at_clauses(&sentence);
329 for clause in clauses {
332 expanded.push(clause);
333 }
334 } else {
335 expanded.push(sentence);
336 }
337 }
338
339 let mut merged: Vec<String> = Vec::new();
341 let mut buffer = String::new();
342
343 for piece in expanded {
344 if buffer.is_empty() {
345 buffer = piece;
346 } else {
347 let combined_len = buffer.len() + 1 + piece.len(); if buffer.len() < min {
350 buffer.push(' ');
352 buffer.push_str(&piece);
353 } else if max > 0 && combined_len <= max && piece.len() < min {
354 buffer.push(' ');
356 buffer.push_str(&piece);
357 } else {
358 merged.push(buffer);
360 buffer = piece;
361 }
362 }
363 }
364 if !buffer.is_empty() {
365 merged.push(buffer);
366 }
367
368 let total = merged.len();
370 merged
371 .into_iter()
372 .enumerate()
373 .map(|(i, text)| TextChunk {
374 text,
375 index: i,
376 is_last: i == total - 1,
377 })
378 .collect()
379}
380
/// Unit tests for sentence splitting, clause splitting, and chunking.
#[cfg(test)]
mod tests {
    use super::*;

    // --- Basic Western sentence splitting ---

    #[test]
    fn test_basic_english_sentences() {
        let result = split_sentences("Hello. World.");
        assert_eq!(result, vec!["Hello.", "World."]);
    }

    #[test]
    fn test_english_multiple_sentences() {
        let result = split_sentences("First sentence. Second sentence. Third one.");
        assert_eq!(
            result,
            vec!["First sentence.", "Second sentence.", "Third one."]
        );
    }

    // --- CJK terminators (。 ! ?) ---

    #[test]
    fn test_japanese_sentences() {
        let result = split_sentences("今日は。明日は。");
        assert_eq!(result, vec!["今日は。", "明日は。"]);
    }

    #[test]
    fn test_japanese_mixed_punctuation() {
        let result = split_sentences("元気ですか?はい、元気です。よかった!");
        assert_eq!(
            result,
            vec!["元気ですか?", "はい、元気です。", "よかった!"]
        );
    }

    // --- Mixed-script input ---

    #[test]
    fn test_mixed_language() {
        let result = split_sentences("Hello. こんにちは。World.");
        assert_eq!(result, vec!["Hello.", "こんにちは。", "World."]);
    }

    #[test]
    fn test_mixed_language_continuous() {
        let result = split_sentences("これはテストです。This is a test. もう一つ。");
        assert!(!result.is_empty());
        assert!(result[0].contains("これはテストです。"));
        assert!(
            result.len() >= 2,
            "expected at least 2 chunks, got {:?}",
            result
        );
    }

    // --- Abbreviations: periods that must NOT end a sentence ---

    #[test]
    fn test_abbreviation_mr() {
        let result = split_sentences("Mr. Smith went to the store. He bought milk.");
        assert_eq!(
            result,
            vec!["Mr. Smith went to the store.", "He bought milk."]
        );
    }

    #[test]
    fn test_abbreviation_dr() {
        let result = split_sentences("Dr. Jones and Prof. Lee are here.");
        assert_eq!(result, vec!["Dr. Jones and Prof. Lee are here."]);
    }

    #[test]
    fn test_abbreviation_etc() {
        let result = split_sentences("Apples, oranges, etc. are fruits. Eat them.");
        assert_eq!(
            result,
            vec!["Apples, oranges, etc. are fruits.", "Eat them."]
        );
    }

    #[test]
    fn test_abbreviation_eg() {
        let result = split_sentences("Use tools e.g. a hammer. Done.");
        assert_eq!(result, vec!["Use tools e.g. a hammer.", "Done."]);
    }

    // --- Quoted speech stays inside one sentence ---

    #[test]
    fn test_quoted_speech_keeps_sentence() {
        let result = split_sentences("He said \"Hello. How are you?\" Then left.");
        assert_eq!(result.len(), 1);
        assert_eq!(result[0], "He said \"Hello. How are you?\" Then left.");
    }

    #[test]
    fn test_quoted_speech_at_end() {
        let result = split_sentences("She whispered \"Goodbye.\"");
        assert_eq!(result, vec!["She whispered \"Goodbye.\""]);
    }

    // --- Clause splitting ---

    #[test]
    fn test_clause_splitting() {
        let clauses = split_at_clauses("first part, second part; third part: fourth part");
        assert_eq!(
            clauses,
            vec!["first part,", "second part;", "third part:", "fourth part",]
        );
    }

    #[test]
    fn test_clause_splitting_no_delimiters() {
        let clauses = split_at_clauses("no delimiters here");
        assert_eq!(clauses, vec!["no delimiters here"]);
    }

    // --- Chunking: max_chars / min_chars behavior ---

    #[test]
    fn test_long_sentence_split_in_chunks() {
        let long = "Alpha bravo charlie, delta echo foxtrot; golf hotel india";
        let config = SplitConfig {
            max_chars: 30,
            split_on_clause: true,
            min_chars: 1,
        };
        let chunks = split_chunks(long, &config);
        assert!(
            chunks.len() > 1,
            "expected multiple chunks, got {}",
            chunks.len()
        );
        for chunk in &chunks {
            assert!(!chunk.text.is_empty());
        }
    }

    #[test]
    fn test_max_chars_splits_long_text() {
        let text = "Short. This is a somewhat longer sentence that has many words in it. End.";
        let config = SplitConfig {
            max_chars: 50,
            split_on_clause: false,
            min_chars: 1,
        };
        let chunks = split_chunks(text, &config);
        assert!(chunks.len() >= 2);
    }

    #[test]
    fn test_max_chars_zero_means_no_limit() {
        let text = "First. Second. Third.";
        let config = SplitConfig {
            max_chars: 0,
            split_on_clause: false,
            min_chars: 0,
        };
        let chunks = split_chunks(text, &config);
        assert!(!chunks.is_empty(), "should produce at least one chunk");
        assert!(
            chunks.len() >= 2,
            "expected at least 2 chunks, got {:?}",
            chunks.iter().map(|c| &c.text).collect::<Vec<_>>()
        );
    }

    #[test]
    fn test_min_chars_merges_short_chunks() {
        let text = "Hi. Go. Now.";
        let config = SplitConfig {
            max_chars: 500,
            split_on_clause: true,
            min_chars: 10,
        };
        let chunks = split_chunks(text, &config);
        assert!(
            chunks.len() < 3,
            "expected merging of short chunks, got {} chunks: {:?}",
            chunks.len(),
            chunks.iter().map(|c| &c.text).collect::<Vec<_>>()
        );
    }

    #[test]
    fn test_min_chars_zero_no_merging() {
        let text = "A. B. C.";
        let config = SplitConfig {
            max_chars: 0,
            split_on_clause: false,
            min_chars: 0,
        };
        let chunks = split_chunks(text, &config);
        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0].text, "A.");
        assert_eq!(chunks[1].text, "B.");
        assert_eq!(chunks[2].text, "C.");
    }

    // --- Degenerate input ---

    #[test]
    fn test_empty_input() {
        assert!(split_sentences("").is_empty());
        assert!(split_chunks("", &SplitConfig::default()).is_empty());
    }

    #[test]
    fn test_whitespace_only() {
        assert!(split_sentences(" ").is_empty());
        assert!(split_sentences("\n\n\n").is_empty());
        assert!(split_chunks(" ", &SplitConfig::default()).is_empty());
    }

    // --- Newline handling: blank line = paragraph, single newline = space ---

    #[test]
    fn test_paragraph_breaks() {
        let text = "First paragraph.\n\nSecond paragraph.";
        let result = split_sentences(text);
        assert_eq!(result, vec!["First paragraph.", "Second paragraph."]);
    }

    #[test]
    fn test_single_newline_no_split() {
        let text = "Line one\nstill same sentence.";
        let result = split_sentences(text);
        assert_eq!(result.len(), 1);
        assert!(result[0].contains("Line one"));
        assert!(result[0].contains("still same sentence."));
    }

    // --- Terminator variants ---

    #[test]
    fn test_exclamation_mark() {
        let result = split_sentences("Wow! Amazing!");
        assert_eq!(result, vec!["Wow!", "Amazing!"]);
    }

    #[test]
    fn test_question_mark() {
        let result = split_sentences("Really? Yes.");
        assert_eq!(result, vec!["Really?", "Yes."]);
    }

    // --- Ellipsis handling ---

    #[test]
    fn test_ellipsis_followed_by_text() {
        let result = split_sentences("Wait... what?");
        assert!(!result.is_empty());
        let joined: String = result.join(" ");
        assert!(
            joined.contains("Wait"),
            "should contain 'Wait': {:?}",
            result
        );
        assert!(
            joined.contains("what?"),
            "should contain 'what?': {:?}",
            result
        );
    }

    #[test]
    fn test_ellipsis_at_end() {
        let result = split_sentences("And then...");
        assert_eq!(result.len(), 1);
        assert_eq!(result[0], "And then...");
    }

    // --- Output hygiene: no trailing whitespace ---

    #[test]
    fn test_no_trailing_whitespace() {
        let result = split_sentences("Hello. World. ");
        for s in &result {
            assert_eq!(s, s.trim_end(), "trailing whitespace found in: {:?}", s);
        }
    }

    #[test]
    fn test_chunks_no_trailing_whitespace() {
        let text = "Hello. World. ";
        let chunks = split_chunks(text, &SplitConfig::default());
        for chunk in &chunks {
            assert_eq!(
                chunk.text,
                chunk.text.trim_end(),
                "trailing whitespace in chunk: {:?}",
                chunk.text
            );
        }
    }

    // --- Chunk metadata (index / is_last) ---

    #[test]
    fn test_chunk_index_and_is_last() {
        let text = "First. Second. Third.";
        let config = SplitConfig {
            max_chars: 0,
            split_on_clause: false,
            min_chars: 0,
        };
        let chunks = split_chunks(text, &config);
        assert!(
            chunks.len() >= 2,
            "expected at least 2 chunks, got {:?}",
            chunks.iter().map(|c| &c.text).collect::<Vec<_>>()
        );
        for (i, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.index, i, "chunk {} index mismatch", i);
        }
        assert!(
            chunks.last().unwrap().is_last,
            "last chunk should have is_last=true"
        );
        for chunk in &chunks[..chunks.len() - 1] {
            assert!(!chunk.is_last, "non-last chunk should have is_last=false");
        }
    }

    #[test]
    fn test_single_chunk_is_last() {
        let config = SplitConfig {
            max_chars: 0,
            split_on_clause: false,
            min_chars: 0,
        };
        let chunks = split_chunks("Only one.", &config);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].index, 0);
        assert!(chunks[0].is_last);
    }

    // --- Single-sentence inputs ---

    #[test]
    fn test_single_sentence_no_split() {
        let result = split_sentences("Just one sentence without ending punctuation");
        assert_eq!(result, vec!["Just one sentence without ending punctuation"]);
    }

    #[test]
    fn test_single_sentence_with_period() {
        let result = split_sentences("Just one sentence.");
        assert_eq!(result, vec!["Just one sentence."]);
    }

    // --- Runs of terminators ("?!", "!!") stay with their sentence ---

    #[test]
    fn test_multiple_punctuation_exclamation_question() {
        let result = split_sentences("Really?! Yes.");
        assert_eq!(result, vec!["Really?!", "Yes."]);
    }

    #[test]
    fn test_multiple_exclamation() {
        let result = split_sentences("No!! Stop.");
        assert_eq!(result, vec!["No!!", "Stop."]);
    }

    // --- Config defaults ---

    #[test]
    fn test_default_config() {
        let config = SplitConfig::default();
        assert_eq!(config.max_chars, 500);
        assert!(config.split_on_clause);
        assert_eq!(config.min_chars, 10);
    }

    // --- Chinese text ---

    #[test]
    fn test_chinese_sentences() {
        let result = split_sentences("你好。再见。");
        assert_eq!(result, vec!["你好。", "再见。"]);
    }

    #[test]
    fn test_chinese_question_and_exclamation() {
        let result = split_sentences("你好吗?很好!");
        assert_eq!(result, vec!["你好吗?", "很好!"]);
    }

    #[test]
    fn test_japanese_clause_splitting() {
        let clauses = split_at_clauses("最初の部分、二番目の部分、三番目");
        assert_eq!(clauses.len(), 3);
    }

    // --- Edge cases ---

    #[test]
    fn test_only_punctuation() {
        let result = split_sentences("...");
        assert_eq!(result.len(), 1);
        assert_eq!(result[0], "...");
    }

    #[test]
    fn test_split_chunks_preserves_all_text() {
        let text = "First sentence. Second sentence. Third sentence.";
        let config = SplitConfig {
            max_chars: 0,
            split_on_clause: false,
            min_chars: 0,
        };
        let chunks = split_chunks(text, &config);
        let rejoined: String = chunks
            .iter()
            .map(|c| c.text.as_str())
            .collect::<Vec<_>>()
            .join(" ");
        assert_eq!(rejoined, "First sentence. Second sentence. Third sentence.");
    }

    #[test]
    fn test_period_not_followed_by_space() {
        let result = split_sentences("Visit example.com today.");
        assert_eq!(result.len(), 1);
    }

    #[test]
    fn test_chunks_with_merging_and_splitting() {
        let text =
            "A. B. This is a long sentence with many words, and some clauses; and more text here.";
        let config = SplitConfig {
            max_chars: 40,
            split_on_clause: true,
            min_chars: 5,
        };
        let chunks = split_chunks(text, &config);
        assert!(!chunks.is_empty());
        for (i, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.index, i);
        }
        assert!(chunks.last().unwrap().is_last);
    }

    #[test]
    fn test_split_sentences_nested_quotes() {
        let text = "He said \"she said 'hello'\" then left.";
        let result = split_sentences(text);
        assert_eq!(
            result.len(),
            1,
            "nested quotes should not cause extra splits: {:?}",
            result
        );
        assert_eq!(result[0], text);
    }

    #[test]
    fn test_split_sentences_only_cjk_punctuation() {
        let result = split_sentences("\u{3002}\u{FF01}\u{FF1F}");
        assert!(
            !result.is_empty(),
            "CJK-only punctuation should produce output"
        );
        for s in &result {
            assert!(!s.is_empty(), "no empty sentences should be emitted");
        }
    }

    // Invalid configuration (max < min) must still round-trip all text.
    #[test]
    fn test_split_chunks_max_less_than_min() {
        let text = "Hello world. Goodbye world.";
        let config = SplitConfig {
            max_chars: 5,
            split_on_clause: true,
            min_chars: 50,
        };
        let chunks = split_chunks(text, &config);
        assert!(
            !chunks.is_empty(),
            "should produce chunks even with invalid config"
        );
        let rejoined: String = chunks
            .iter()
            .map(|c| c.text.as_str())
            .collect::<Vec<_>>()
            .join(" ");
        assert!(
            rejoined.contains("Hello"),
            "text should survive: {rejoined}"
        );
        assert!(
            rejoined.contains("Goodbye"),
            "text should survive: {rejoined}"
        );
    }

    #[test]
    fn test_split_sentences_consecutive_terminators() {
        let result = split_sentences("Really?! Yes.");
        assert_eq!(result, vec!["Really?!", "Yes."]);
    }

    #[test]
    fn test_split_sentences_abbreviation_at_start() {
        let result = split_sentences("Dr. Smith is here.");
        assert_eq!(
            result.len(),
            1,
            "abbreviation at start should not split: {:?}",
            result
        );
        assert_eq!(result[0], "Dr. Smith is here.");
    }

    // CRLF: single CRLF joins lines; double CRLF is a paragraph break.
    #[test]
    fn test_split_sentences_crlf_line_endings() {
        let result_single = split_sentences("Hello.\r\nWorld.");
        assert!(
            !result_single.is_empty(),
            "CRLF input should produce output"
        );

        let result_double = split_sentences("Hello.\r\n\r\nWorld.");
        assert!(
            result_double.len() >= 2,
            "double CRLF should cause paragraph split: {:?}",
            result_double
        );
        assert!(
            result_double[0].contains("Hello."),
            "first chunk: {:?}",
            result_double
        );
        assert!(
            result_double.last().unwrap().contains("World."),
            "last chunk: {:?}",
            result_double
        );
    }
}