piper-plus 0.2.0

High-quality neural text-to-speech engine with 8-language support
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
//! Text splitting for streaming synthesis.
//!
//! Splits input text into sentence-sized chunks at natural boundaries
//! (sentence-ending punctuation, paragraph breaks) while respecting
//! maximum chunk size limits.

/// Text chunk with metadata
#[derive(Debug, Clone)]
pub struct TextChunk {
    pub text: String,
    pub index: usize,
    pub is_last: bool,
}

/// Split configuration
#[derive(Debug, Clone)]
pub struct SplitConfig {
    /// Maximum characters per chunk (0 = no limit)
    pub max_chars: usize,
    /// Whether to split on commas and semicolons (for very long sentences)
    pub split_on_clause: bool,
    /// Minimum chunk size (avoid very short chunks)
    pub min_chars: usize,
}

impl Default for SplitConfig {
    fn default() -> Self {
        Self {
            max_chars: 500,
            split_on_clause: true,
            min_chars: 10,
        }
    }
}

/// Common English abbreviations that should not trigger sentence splitting.
const ABBREVIATIONS: &[&str] = &[
    "Mr.", "Mrs.", "Ms.", "Dr.", "Prof.", "Jr.", "Sr.", "Inc.", "Ltd.", "Corp.", "Co.", "vs.",
    "etc.", "approx.", "dept.", "est.", "vol.", "no.", "tel.", "fax.", "Jan.", "Feb.", "Mar.",
    "Apr.", "Jun.", "Jul.", "Aug.", "Sep.", "Oct.", "Nov.", "Dec.", "St.", "Ave.", "Blvd.", "Rd.",
    "a.m.", "p.m.", "e.g.", "i.e.",
];

/// Check whether the text immediately before a period position ends with
/// a known abbreviation. `dot_pos` is the byte index of the '.' character.
fn ends_with_abbreviation(text: &str, dot_pos: usize) -> bool {
    // The slice up to and including the dot
    let up_to_dot = &text[..=dot_pos];
    for abbr in ABBREVIATIONS {
        if up_to_dot.ends_with(abbr) {
            return true;
        }
        // Case-insensitive: also check lowercase version of first char
        let lower = abbr.to_lowercase();
        if up_to_dot.ends_with(&lower) {
            return true;
        }
    }
    false
}

/// Returns true if the character is a CJK sentence-ending punctuation mark.
fn is_cjk_sentence_end(c: char) -> bool {
    matches!(c, '\u{3002}' | '\u{FF01}' | '\u{FF1F}')
    // 。(U+3002)  !(U+FF01)  ?(U+FF1F)
}

/// Returns true if the character is a Western sentence-ending punctuation mark.
fn is_western_sentence_end(c: char) -> bool {
    matches!(c, '.' | '!' | '?')
}

/// Split text into sentences.
///
/// Handles:
/// - Western punctuation: . ! ? (followed by space or end)
/// - Japanese punctuation: 。!?
/// - Chinese punctuation: 。!?
/// - Newlines/paragraph breaks
/// - Quoted speech ("Hello." he said)
/// - Abbreviations (Mr. Dr. etc. - don't split these)
pub fn split_sentences(text: &str) -> Vec<String> {
    if text.is_empty() || text.chars().all(|c| c.is_whitespace()) {
        return Vec::new();
    }

    let mut sentences: Vec<String> = Vec::new();
    let mut current = String::new();
    let mut in_quotes = false;

    // Build a (byte_offset, char) index from char_indices() so we can track
    // byte positions incrementally instead of collecting into Vec<char>.
    let indexed: Vec<(usize, char)> = text.char_indices().collect();
    let len = indexed.len();
    let mut i = 0;

    while i < len {
        let (_byte_off, c) = indexed[i];

        // Track quote state
        if c == '"' || c == '\u{201C}' || c == '\u{201D}' {
            in_quotes = !in_quotes;
            current.push(c);
            i += 1;
            continue;
        }

        // Handle paragraph breaks (two or more newlines)
        if c == '\n' {
            // Count consecutive newlines
            let mut newline_count = 0;
            let mut j = i;
            while j < len && (indexed[j].1 == '\n' || indexed[j].1 == '\r') {
                if indexed[j].1 == '\n' {
                    newline_count += 1;
                }
                j += 1;
            }

            if newline_count >= 2 {
                // Paragraph break: flush current sentence
                let trimmed = current.trim_end().to_string();
                if !trimmed.is_empty() {
                    sentences.push(trimmed);
                }
                current.clear();
                i = j;
                continue;
            } else {
                // Single newline: treat as space
                current.push(' ');
                i = j;
                continue;
            }
        }

        // CJK sentence-ending punctuation: always split (no abbreviation ambiguity)
        if is_cjk_sentence_end(c) {
            current.push(c);
            // Consume any trailing CJK punctuation or closing quotes/brackets
            while i + 1 < len
                && (indexed[i + 1].1 == '\u{300D}' //                || indexed[i + 1].1 == '\u{300F}'                 //                || indexed[i + 1].1 == '\u{FF09}'                 //                || indexed[i + 1].1 == '"'
                || indexed[i + 1].1 == '\u{201D}')
            // "
            {
                i += 1;
                current.push(indexed[i].1);
            }
            let trimmed = current.trim_end().to_string();
            if !trimmed.is_empty() {
                sentences.push(trimmed);
            }
            current.clear();
            i += 1;
            // Skip whitespace after CJK sentence end
            while i < len && indexed[i].1.is_whitespace() && indexed[i].1 != '\n' {
                i += 1;
            }
            continue;
        }

        // Western sentence-ending punctuation
        if is_western_sentence_end(c) {
            current.push(c);

            // Handle multiple consecutive punctuation: !? ?! !! ...
            while i + 1 < len
                && (is_western_sentence_end(indexed[i + 1].1) || indexed[i + 1].1 == '.')
            {
                i += 1;
                current.push(indexed[i].1);
            }

            // Consume closing quotes after punctuation
            while i + 1 < len
                && (indexed[i + 1].1 == '"'
                    || indexed[i + 1].1 == '\u{201D}'
                    || indexed[i + 1].1 == '\'')
            {
                i += 1;
                current.push(indexed[i].1);
            }

            // Check if this is an abbreviation (only for periods)
            if c == '.' {
                // Use byte offset directly from char_indices
                let byte_pos = indexed[i].0;
                if ends_with_abbreviation(&text[..=byte_pos], byte_pos) {
                    // Don't split at abbreviations
                    i += 1;
                    continue;
                }

                // Check for ellipsis: three or more dots
                let dot_count = current.chars().rev().take_while(|&ch| ch == '.').count();
                if dot_count >= 3 {
                    // Ellipsis: don't split here, continue to see if next
                    // char is whitespace + capital or more punctuation
                    if i + 1 < len && !indexed[i + 1].1.is_whitespace() {
                        i += 1;
                        continue;
                    }
                }
            }

            // Check if followed by whitespace or end of string
            let next_i = i + 1;
            if next_i >= len {
                // End of string: flush
                let trimmed = current.trim_end().to_string();
                if !trimmed.is_empty() {
                    sentences.push(trimmed);
                }
                current.clear();
                i = next_i;
                continue;
            }

            if indexed[next_i].1.is_whitespace() || indexed[next_i].1 == '\n' {
                // Don't split if we're inside quotes
                if in_quotes {
                    i += 1;
                    continue;
                }

                let trimmed = current.trim_end().to_string();
                if !trimmed.is_empty() {
                    sentences.push(trimmed);
                }
                current.clear();
                i = next_i;
                // Skip whitespace between sentences
                while i < len && indexed[i].1 == ' ' {
                    i += 1;
                }
                continue;
            }

            i += 1;
            continue;
        }

        current.push(c);
        i += 1;
    }

    // Flush remaining text
    let trimmed = current.trim_end().to_string();
    if !trimmed.is_empty() {
        sentences.push(trimmed);
    }

    sentences
}

/// Split a single long sentence at clause boundaries (commas, semicolons, colons).
fn split_at_clauses(text: &str) -> Vec<String> {
    let clause_delimiters: &[char] = &[
        ',', ';', ':', '\u{3001}', // 、(Japanese comma)
        '\u{FF0C}', // ,(fullwidth comma)
        '\u{FF1B}', // ;(fullwidth semicolon)
    ];

    let mut clauses: Vec<String> = Vec::new();
    let mut current = String::new();

    let indexed: Vec<(usize, char)> = text.char_indices().collect();
    let len = indexed.len();
    let mut i = 0;

    while i < len {
        let (_byte_off, c) = indexed[i];
        current.push(c);

        if clause_delimiters.contains(&c) {
            // Include trailing space if present
            if i + 1 < len && indexed[i + 1].1 == ' ' {
                i += 1;
                current.push(indexed[i].1);
            }
            let trimmed = current.trim_end().to_string();
            if !trimmed.is_empty() {
                clauses.push(trimmed);
            }
            current.clear();
        }

        i += 1;
    }

    // Flush remaining
    let trimmed = current.trim_end().to_string();
    if !trimmed.is_empty() {
        clauses.push(trimmed);
    }

    // If we got no splits (no clause delimiters found), return the original
    if clauses.len() <= 1 {
        let trimmed = text.trim_end().to_string();
        if trimmed.is_empty() {
            return Vec::new();
        }
        return vec![trimmed];
    }

    clauses
}

/// Split text into chunks with configuration.
///
/// First splits into sentences, then merges/splits to respect size limits.
pub fn split_chunks(text: &str, config: &SplitConfig) -> Vec<TextChunk> {
    let sentences = split_sentences(text);
    if sentences.is_empty() {
        return Vec::new();
    }

    let max = config.max_chars;
    let min = config.min_chars;

    // Phase 1: Expand sentences that exceed max_chars via clause splitting
    let mut expanded: Vec<String> = Vec::new();
    for sentence in sentences {
        if max > 0 && sentence.len() > max && config.split_on_clause {
            let clauses = split_at_clauses(&sentence);
            // Merge clauses that are still too long -- just keep them as-is
            // (we cannot split further without breaking words)
            for clause in clauses {
                expanded.push(clause);
            }
        } else {
            expanded.push(sentence);
        }
    }

    // Phase 2: Merge short fragments so they meet min_chars
    let mut merged: Vec<String> = Vec::new();
    let mut buffer = String::new();

    for piece in expanded {
        if buffer.is_empty() {
            buffer = piece;
        } else {
            // Try merging
            let combined_len = buffer.len() + 1 + piece.len(); // +1 for space
            if buffer.len() < min {
                // Current buffer is too short, merge
                buffer.push(' ');
                buffer.push_str(&piece);
            } else if max > 0 && combined_len <= max && piece.len() < min {
                // Next piece is too short, merge it into current
                buffer.push(' ');
                buffer.push_str(&piece);
            } else {
                // Flush buffer
                merged.push(buffer);
                buffer = piece;
            }
        }
    }
    if !buffer.is_empty() {
        merged.push(buffer);
    }

    // Phase 3: Build TextChunks with index and is_last
    let total = merged.len();
    merged
        .into_iter()
        .enumerate()
        .map(|(i, text)| TextChunk {
            text,
            index: i,
            is_last: i == total - 1,
        })
        .collect()
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    // -----------------------------------------------------------------------
    // 1. Basic English sentence splitting
    // -----------------------------------------------------------------------
    #[test]
    fn test_basic_english_sentences() {
        let result = split_sentences("Hello. World.");
        assert_eq!(result, vec!["Hello.", "World."]);
    }

    #[test]
    fn test_english_multiple_sentences() {
        let result = split_sentences("First sentence. Second sentence. Third one.");
        assert_eq!(
            result,
            vec!["First sentence.", "Second sentence.", "Third one."]
        );
    }

    // -----------------------------------------------------------------------
    // 2. Japanese sentence splitting
    // -----------------------------------------------------------------------
    #[test]
    fn test_japanese_sentences() {
        let result = split_sentences("今日は。明日は。");
        assert_eq!(result, vec!["今日は。", "明日は。"]);
    }

    #[test]
    fn test_japanese_mixed_punctuation() {
        let result = split_sentences("元気ですか?はい、元気です。よかった!");
        assert_eq!(
            result,
            vec!["元気ですか?", "はい、元気です。", "よかった!"]
        );
    }

    // -----------------------------------------------------------------------
    // 3. Mixed language splitting
    // -----------------------------------------------------------------------
    #[test]
    fn test_mixed_language() {
        let result = split_sentences("Hello. こんにちは。World.");
        assert_eq!(result, vec!["Hello.", "こんにちは。", "World."]);
    }

    #[test]
    fn test_mixed_language_continuous() {
        let result = split_sentences("これはテストです。This is a test. もう一つ。");
        // At minimum, CJK sentence ender splits correctly
        assert!(!result.is_empty());
        assert!(result[0].contains("これはテストです。"));
        assert!(
            result.len() >= 2,
            "expected at least 2 chunks, got {:?}",
            result
        );
    }

    // -----------------------------------------------------------------------
    // 4. Abbreviation handling
    // -----------------------------------------------------------------------
    #[test]
    fn test_abbreviation_mr() {
        let result = split_sentences("Mr. Smith went to the store. He bought milk.");
        assert_eq!(
            result,
            vec!["Mr. Smith went to the store.", "He bought milk."]
        );
    }

    #[test]
    fn test_abbreviation_dr() {
        let result = split_sentences("Dr. Jones and Prof. Lee are here.");
        assert_eq!(result, vec!["Dr. Jones and Prof. Lee are here."]);
    }

    #[test]
    fn test_abbreviation_etc() {
        let result = split_sentences("Apples, oranges, etc. are fruits. Eat them.");
        assert_eq!(
            result,
            vec!["Apples, oranges, etc. are fruits.", "Eat them."]
        );
    }

    #[test]
    fn test_abbreviation_eg() {
        let result = split_sentences("Use tools e.g. a hammer. Done.");
        assert_eq!(result, vec!["Use tools e.g. a hammer.", "Done."]);
    }

    // -----------------------------------------------------------------------
    // 5. Quoted speech
    // -----------------------------------------------------------------------
    #[test]
    fn test_quoted_speech_keeps_sentence() {
        let result = split_sentences("He said \"Hello. How are you?\" Then left.");
        // The period and question mark inside quotes should not split
        assert_eq!(result.len(), 1);
        assert_eq!(result[0], "He said \"Hello. How are you?\" Then left.");
    }

    #[test]
    fn test_quoted_speech_at_end() {
        let result = split_sentences("She whispered \"Goodbye.\"");
        assert_eq!(result, vec!["She whispered \"Goodbye.\""]);
    }

    // -----------------------------------------------------------------------
    // 6. Very long sentences with clause splitting
    // -----------------------------------------------------------------------
    #[test]
    fn test_clause_splitting() {
        let clauses = split_at_clauses("first part, second part; third part: fourth part");
        assert_eq!(
            clauses,
            vec!["first part,", "second part;", "third part:", "fourth part",]
        );
    }

    #[test]
    fn test_clause_splitting_no_delimiters() {
        let clauses = split_at_clauses("no delimiters here");
        assert_eq!(clauses, vec!["no delimiters here"]);
    }

    #[test]
    fn test_long_sentence_split_in_chunks() {
        // Create a sentence that exceeds max_chars
        let long = "Alpha bravo charlie, delta echo foxtrot; golf hotel india";
        let config = SplitConfig {
            max_chars: 30,
            split_on_clause: true,
            min_chars: 1,
        };
        let chunks = split_chunks(long, &config);
        // Should have been clause-split since the whole thing > 30 chars
        assert!(
            chunks.len() > 1,
            "expected multiple chunks, got {}",
            chunks.len()
        );
        // Each clause should be within limits (or at least attempted)
        for chunk in &chunks {
            assert!(!chunk.text.is_empty());
        }
    }

    // -----------------------------------------------------------------------
    // 7. max_chars enforcement
    // -----------------------------------------------------------------------
    #[test]
    fn test_max_chars_splits_long_text() {
        let text = "Short. This is a somewhat longer sentence that has many words in it. End.";
        let config = SplitConfig {
            max_chars: 50,
            split_on_clause: false,
            min_chars: 1,
        };
        let chunks = split_chunks(text, &config);
        assert!(chunks.len() >= 2);
    }

    #[test]
    fn test_max_chars_zero_means_no_limit() {
        let text = "First. Second. Third.";
        let config = SplitConfig {
            max_chars: 0,
            split_on_clause: false,
            min_chars: 0,
        };
        let chunks = split_chunks(text, &config);
        // max_chars=0 means no limit, sentences split normally
        assert!(!chunks.is_empty(), "should produce at least one chunk");
        assert!(
            chunks.len() >= 2,
            "expected at least 2 chunks, got {:?}",
            chunks.iter().map(|c| &c.text).collect::<Vec<_>>()
        );
    }

    // -----------------------------------------------------------------------
    // 8. min_chars merging
    // -----------------------------------------------------------------------
    #[test]
    fn test_min_chars_merges_short_chunks() {
        let text = "Hi. Go. Now.";
        let config = SplitConfig {
            max_chars: 500,
            split_on_clause: true,
            min_chars: 10,
        };
        let chunks = split_chunks(text, &config);
        // "Hi." (3 chars) is < 10, so it should be merged with next
        assert!(
            chunks.len() < 3,
            "expected merging of short chunks, got {} chunks: {:?}",
            chunks.len(),
            chunks.iter().map(|c| &c.text).collect::<Vec<_>>()
        );
    }

    #[test]
    fn test_min_chars_zero_no_merging() {
        let text = "A. B. C.";
        let config = SplitConfig {
            max_chars: 0,
            split_on_clause: false,
            min_chars: 0,
        };
        let chunks = split_chunks(text, &config);
        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0].text, "A.");
        assert_eq!(chunks[1].text, "B.");
        assert_eq!(chunks[2].text, "C.");
    }

    // -----------------------------------------------------------------------
    // 9. Empty / whitespace input
    // -----------------------------------------------------------------------
    #[test]
    fn test_empty_input() {
        assert!(split_sentences("").is_empty());
        assert!(split_chunks("", &SplitConfig::default()).is_empty());
    }

    #[test]
    fn test_whitespace_only() {
        assert!(split_sentences("   ").is_empty());
        assert!(split_sentences("\n\n\n").is_empty());
        assert!(split_chunks("   ", &SplitConfig::default()).is_empty());
    }

    // -----------------------------------------------------------------------
    // 10. Paragraph breaks (multiple newlines)
    // -----------------------------------------------------------------------
    #[test]
    fn test_paragraph_breaks() {
        let text = "First paragraph.\n\nSecond paragraph.";
        let result = split_sentences(text);
        assert_eq!(result, vec!["First paragraph.", "Second paragraph."]);
    }

    #[test]
    fn test_single_newline_no_split() {
        let text = "Line one\nstill same sentence.";
        let result = split_sentences(text);
        assert_eq!(result.len(), 1);
        // Single newline is treated as a space
        assert!(result[0].contains("Line one"));
        assert!(result[0].contains("still same sentence."));
    }

    // -----------------------------------------------------------------------
    // 11. Exclamation and question marks
    // -----------------------------------------------------------------------
    #[test]
    fn test_exclamation_mark() {
        let result = split_sentences("Wow! Amazing!");
        assert_eq!(result, vec!["Wow!", "Amazing!"]);
    }

    #[test]
    fn test_question_mark() {
        let result = split_sentences("Really? Yes.");
        assert_eq!(result, vec!["Really?", "Yes."]);
    }

    // -----------------------------------------------------------------------
    // 12. Ellipsis handling
    // -----------------------------------------------------------------------
    #[test]
    fn test_ellipsis_followed_by_text() {
        let result = split_sentences("Wait... what?");
        // The sentence splitter may split at the period in "..." - that's acceptable
        // The key is that it doesn't panic and produces non-empty results
        assert!(!result.is_empty());
        let joined: String = result.join(" ");
        assert!(
            joined.contains("Wait"),
            "should contain 'Wait': {:?}",
            result
        );
        assert!(
            joined.contains("what?"),
            "should contain 'what?': {:?}",
            result
        );
    }

    #[test]
    fn test_ellipsis_at_end() {
        let result = split_sentences("And then...");
        assert_eq!(result.len(), 1);
        assert_eq!(result[0], "And then...");
    }

    // -----------------------------------------------------------------------
    // 13. No trailing whitespace in chunks
    // -----------------------------------------------------------------------
    #[test]
    fn test_no_trailing_whitespace() {
        let result = split_sentences("Hello.   World.   ");
        for s in &result {
            assert_eq!(s, s.trim_end(), "trailing whitespace found in: {:?}", s);
        }
    }

    #[test]
    fn test_chunks_no_trailing_whitespace() {
        let text = "Hello.   World.   ";
        let chunks = split_chunks(text, &SplitConfig::default());
        for chunk in &chunks {
            assert_eq!(
                chunk.text,
                chunk.text.trim_end(),
                "trailing whitespace in chunk: {:?}",
                chunk.text
            );
        }
    }

    // -----------------------------------------------------------------------
    // 14. TextChunk index and is_last correctness
    // -----------------------------------------------------------------------
    #[test]
    fn test_chunk_index_and_is_last() {
        let text = "First. Second. Third.";
        let config = SplitConfig {
            max_chars: 0,
            split_on_clause: false,
            min_chars: 0,
        };
        let chunks = split_chunks(text, &config);
        assert!(
            chunks.len() >= 2,
            "expected at least 2 chunks, got {:?}",
            chunks.iter().map(|c| &c.text).collect::<Vec<_>>()
        );
        // Verify indices are sequential
        for (i, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.index, i, "chunk {} index mismatch", i);
        }
        // Last chunk must have is_last=true
        assert!(
            chunks.last().unwrap().is_last,
            "last chunk should have is_last=true"
        );
        // Non-last chunks must have is_last=false
        for chunk in &chunks[..chunks.len() - 1] {
            assert!(!chunk.is_last, "non-last chunk should have is_last=false");
        }
    }

    #[test]
    fn test_single_chunk_is_last() {
        let config = SplitConfig {
            max_chars: 0,
            split_on_clause: false,
            min_chars: 0,
        };
        let chunks = split_chunks("Only one.", &config);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].index, 0);
        assert!(chunks[0].is_last);
    }

    // -----------------------------------------------------------------------
    // 15. Single sentence (no split needed)
    // -----------------------------------------------------------------------
    #[test]
    fn test_single_sentence_no_split() {
        let result = split_sentences("Just one sentence without ending punctuation");
        assert_eq!(result, vec!["Just one sentence without ending punctuation"]);
    }

    #[test]
    fn test_single_sentence_with_period() {
        let result = split_sentences("Just one sentence.");
        assert_eq!(result, vec!["Just one sentence."]);
    }

    // -----------------------------------------------------------------------
    // 16. Multiple consecutive punctuation
    // -----------------------------------------------------------------------
    #[test]
    fn test_multiple_punctuation_exclamation_question() {
        let result = split_sentences("Really?! Yes.");
        assert_eq!(result, vec!["Really?!", "Yes."]);
    }

    #[test]
    fn test_multiple_exclamation() {
        let result = split_sentences("No!! Stop.");
        assert_eq!(result, vec!["No!!", "Stop."]);
    }

    // -----------------------------------------------------------------------
    // 17. Default SplitConfig values
    // -----------------------------------------------------------------------
    #[test]
    fn test_default_config() {
        let config = SplitConfig::default();
        assert_eq!(config.max_chars, 500);
        assert!(config.split_on_clause);
        assert_eq!(config.min_chars, 10);
    }

    // -----------------------------------------------------------------------
    // 18. Chinese sentence splitting
    // -----------------------------------------------------------------------
    #[test]
    fn test_chinese_sentences() {
        let result = split_sentences("你好。再见。");
        assert_eq!(result, vec!["你好。", "再见。"]);
    }

    #[test]
    fn test_chinese_question_and_exclamation() {
        let result = split_sentences("你好吗?很好!");
        assert_eq!(result, vec!["你好吗?", "很好!"]);
    }

    // -----------------------------------------------------------------------
    // 19. Japanese clause delimiters
    // -----------------------------------------------------------------------
    #[test]
    fn test_japanese_clause_splitting() {
        let clauses = split_at_clauses("最初の部分、二番目の部分、三番目");
        assert_eq!(clauses.len(), 3);
    }

    // -----------------------------------------------------------------------
    // 20. Edge cases
    // -----------------------------------------------------------------------
    #[test]
    fn test_only_punctuation() {
        let result = split_sentences("...");
        assert_eq!(result.len(), 1);
        assert_eq!(result[0], "...");
    }

    #[test]
    fn test_split_chunks_preserves_all_text() {
        let text = "First sentence. Second sentence. Third sentence.";
        let config = SplitConfig {
            max_chars: 0,
            split_on_clause: false,
            min_chars: 0,
        };
        let chunks = split_chunks(text, &config);
        // Rejoin and compare (ignoring whitespace differences)
        let rejoined: String = chunks
            .iter()
            .map(|c| c.text.as_str())
            .collect::<Vec<_>>()
            .join(" ");
        assert_eq!(rejoined, "First sentence. Second sentence. Third sentence.");
    }

    #[test]
    fn test_period_not_followed_by_space() {
        // Period mid-word (e.g. a URL or filename) should not split
        let result = split_sentences("Visit example.com today.");
        assert_eq!(result.len(), 1);
    }

    #[test]
    fn test_chunks_with_merging_and_splitting() {
        let text =
            "A. B. This is a long sentence with many words, and some clauses; and more text here.";
        let config = SplitConfig {
            max_chars: 40,
            split_on_clause: true,
            min_chars: 5,
        };
        let chunks = split_chunks(text, &config);
        assert!(!chunks.is_empty());
        // Verify index continuity
        for (i, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.index, i);
        }
        // Last chunk must have is_last = true
        assert!(chunks.last().unwrap().is_last);
    }

    // -----------------------------------------------------------------------
    // 21. Nested quotes — inner quotes should not break the outer sentence
    // -----------------------------------------------------------------------
    #[test]
    fn test_split_sentences_nested_quotes() {
        let text = "He said \"she said 'hello'\" then left.";
        let result = split_sentences(text);
        assert_eq!(
            result.len(),
            1,
            "nested quotes should not cause extra splits: {:?}",
            result
        );
        assert_eq!(result[0], text);
    }

    // -----------------------------------------------------------------------
    // 22. Only CJK punctuation (no actual text content)
    // -----------------------------------------------------------------------
    #[test]
    fn test_split_sentences_only_cjk_punctuation() {
        // Input is three CJK sentence-ending marks with no surrounding text
        let result = split_sentences("\u{3002}\u{FF01}\u{FF1F}");
        // Each mark should be treated as its own sentence (non-empty)
        assert!(
            !result.is_empty(),
            "CJK-only punctuation should produce output"
        );
        for s in &result {
            assert!(!s.is_empty(), "no empty sentences should be emitted");
        }
    }

    // -----------------------------------------------------------------------
    // 23. split_chunks where max_chars < min_chars — should still not panic
    // -----------------------------------------------------------------------
    #[test]
    fn test_split_chunks_max_less_than_min() {
        let text = "Hello world. Goodbye world.";
        // Intentionally contradictory: max < min
        let config = SplitConfig {
            max_chars: 5,
            split_on_clause: true,
            min_chars: 50,
        };
        let chunks = split_chunks(text, &config);
        // Must not panic; should still produce at least one chunk
        assert!(
            !chunks.is_empty(),
            "should produce chunks even with invalid config"
        );
        // All text should survive the round-trip
        let rejoined: String = chunks
            .iter()
            .map(|c| c.text.as_str())
            .collect::<Vec<_>>()
            .join(" ");
        assert!(
            rejoined.contains("Hello"),
            "text should survive: {rejoined}"
        );
        assert!(
            rejoined.contains("Goodbye"),
            "text should survive: {rejoined}"
        );
    }

    // -----------------------------------------------------------------------
    // 24. Consecutive terminators — "Really?! Yes." should give 2 chunks
    // -----------------------------------------------------------------------
    #[test]
    fn test_split_sentences_consecutive_terminators() {
        let result = split_sentences("Really?! Yes.");
        assert_eq!(result, vec!["Really?!", "Yes."]);
    }

    // -----------------------------------------------------------------------
    // 25. Abbreviation at start of sentence — "Dr. Smith is here."
    // -----------------------------------------------------------------------
    #[test]
    fn test_split_sentences_abbreviation_at_start() {
        let result = split_sentences("Dr. Smith is here.");
        // "Dr." is an abbreviation and must not split
        assert_eq!(
            result.len(),
            1,
            "abbreviation at start should not split: {:?}",
            result
        );
        assert_eq!(result[0], "Dr. Smith is here.");
    }

    // -----------------------------------------------------------------------
    // 26. CRLF line endings — "\r\n" should be treated like "\n"
    // -----------------------------------------------------------------------
    #[test]
    fn test_split_sentences_crlf_line_endings() {
        // Single CRLF: treated as single newline (space), no split
        let result_single = split_sentences("Hello.\r\nWorld.");
        assert!(
            !result_single.is_empty(),
            "CRLF input should produce output"
        );

        // Double CRLF: treated as paragraph break, should split
        let result_double = split_sentences("Hello.\r\n\r\nWorld.");
        assert!(
            result_double.len() >= 2,
            "double CRLF should cause paragraph split: {:?}",
            result_double
        );
        // First chunk should contain "Hello." and last should contain "World."
        assert!(
            result_double[0].contains("Hello."),
            "first chunk: {:?}",
            result_double
        );
        assert!(
            result_double.last().unwrap().contains("World."),
            "last chunk: {:?}",
            result_double
        );
    }
}