/// A piece of text produced by [`split_chunks`], carrying its position
/// within the overall sequence of chunks.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TextChunk {
    /// The chunk's text content (trailing whitespace trimmed).
    pub text: String,
    /// Zero-based position of this chunk in the output sequence.
    pub index: usize,
    /// True only for the final chunk of the sequence.
    pub is_last: bool,
}
14
/// Configuration for [`split_chunks`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SplitConfig {
    /// Maximum desired chunk length in characters; `0` disables the limit.
    pub max_chars: usize,
    /// When a sentence exceeds `max_chars`, also split it at clause
    /// delimiters (commas, semicolons, colons, and their CJK forms).
    pub split_on_clause: bool,
    /// Pieces shorter than this are merged with a neighbor; `0` disables
    /// merging.
    pub min_chars: usize,
}

impl Default for SplitConfig {
    /// Defaults tuned for sentence-sized chunks: up to 500 characters,
    /// clause splitting enabled, merging of pieces under 10 characters.
    fn default() -> Self {
        Self {
            max_chars: 500,
            split_on_clause: true,
            min_chars: 10,
        }
    }
}
35
/// Common abbreviations that end in a period but do not terminate a sentence.
const ABBREVIATIONS: &[&str] = &[
    "Mr.", "Mrs.", "Ms.", "Dr.", "Prof.", "Jr.", "Sr.", "Inc.", "Ltd.", "Corp.", "Co.", "vs.",
    "etc.", "approx.", "dept.", "est.", "vol.", "no.", "tel.", "fax.", "Jan.", "Feb.", "Mar.",
    "Apr.", "Jun.", "Jul.", "Aug.", "Sep.", "Oct.", "Nov.", "Dec.", "St.", "Ave.", "Blvd.", "Rd.",
    "a.m.", "p.m.", "e.g.", "i.e.",
];

/// Returns true when the period at byte offset `dot_pos` in `text` ends a
/// known abbreviation (exact or all-lowercase form) rather than a sentence.
///
/// A match must start at a word boundary, so an ordinary word that merely
/// ends with an abbreviation (e.g. "piano." vs "no.") still terminates the
/// sentence. Returns false — instead of panicking — when `dot_pos + 1` is
/// not a char boundary of `text`.
fn ends_with_abbreviation(text: &str, dot_pos: usize) -> bool {
    // get() yields None for an out-of-range or non-char-boundary slice,
    // where direct indexing would panic.
    let up_to_dot = match text.get(..=dot_pos) {
        Some(s) => s,
        None => return false,
    };
    for abbr in ABBREVIATIONS {
        let matched =
            up_to_dot.ends_with(abbr) || up_to_dot.ends_with(&abbr.to_lowercase());
        if !matched {
            continue;
        }
        // ABBREVIATIONS are ASCII, so the start of a byte-exact suffix match
        // is always a char boundary; this slice cannot panic.
        let before = &up_to_dot[..up_to_dot.len() - abbr.len()];
        // Word-boundary check: the abbreviation must not be the tail of a
        // longer word. Digits are allowed before (e.g. "5p.m.").
        if before.chars().next_back().map_or(true, |c| !c.is_alphabetic()) {
            return true;
        }
    }
    false
}
61
/// Reports whether `c` is a full-width CJK sentence terminator:
/// ideographic full stop (U+3002), full-width `!` (U+FF01), or
/// full-width `?` (U+FF1F).
fn is_cjk_sentence_end(c: char) -> bool {
    c == '\u{3002}' || c == '\u{FF01}' || c == '\u{FF1F}'
}
67
/// Reports whether `c` is an ASCII sentence terminator: `.`, `!`, or `?`.
fn is_western_sentence_end(c: char) -> bool {
    c == '.' || c == '!' || c == '?'
}
72
73pub fn split_sentences(text: &str) -> Vec<String> {
83 if text.is_empty() || text.chars().all(|c| c.is_whitespace()) {
84 return Vec::new();
85 }
86
87 let mut sentences: Vec<String> = Vec::new();
88 let mut current = String::new();
89 let mut in_quotes = false;
90
91 let indexed: Vec<(usize, char)> = text.char_indices().collect();
94 let len = indexed.len();
95 let mut i = 0;
96
97 while i < len {
98 let (_byte_off, c) = indexed[i];
99
100 if c == '"' || c == '\u{201C}' || c == '\u{201D}' {
102 in_quotes = !in_quotes;
103 current.push(c);
104 i += 1;
105 continue;
106 }
107
108 if c == '\n' {
110 let mut newline_count = 0;
112 let mut j = i;
113 while j < len && (indexed[j].1 == '\n' || indexed[j].1 == '\r') {
114 if indexed[j].1 == '\n' {
115 newline_count += 1;
116 }
117 j += 1;
118 }
119
120 if newline_count >= 2 {
121 let trimmed = current.trim_end().to_string();
123 if !trimmed.is_empty() {
124 sentences.push(trimmed);
125 }
126 current.clear();
127 i = j;
128 continue;
129 } else {
130 current.push(' ');
132 i = j;
133 continue;
134 }
135 }
136
137 if is_cjk_sentence_end(c) {
139 current.push(c);
140 while i + 1 < len
142 && (indexed[i + 1].1 == '\u{300D}' || indexed[i + 1].1 == '\u{300F}' || indexed[i + 1].1 == '\u{FF09}' || indexed[i + 1].1 == '"'
146 || indexed[i + 1].1 == '\u{201D}')
147 {
149 i += 1;
150 current.push(indexed[i].1);
151 }
152 let trimmed = current.trim_end().to_string();
153 if !trimmed.is_empty() {
154 sentences.push(trimmed);
155 }
156 current.clear();
157 i += 1;
158 while i < len && indexed[i].1.is_whitespace() && indexed[i].1 != '\n' {
160 i += 1;
161 }
162 continue;
163 }
164
165 if is_western_sentence_end(c) {
167 current.push(c);
168
169 while i + 1 < len
171 && (is_western_sentence_end(indexed[i + 1].1) || indexed[i + 1].1 == '.')
172 {
173 i += 1;
174 current.push(indexed[i].1);
175 }
176
177 while i + 1 < len
179 && (indexed[i + 1].1 == '"'
180 || indexed[i + 1].1 == '\u{201D}'
181 || indexed[i + 1].1 == '\'')
182 {
183 i += 1;
184 current.push(indexed[i].1);
185 }
186
187 if c == '.' {
189 let byte_pos = indexed[i].0;
191 if ends_with_abbreviation(&text[..=byte_pos], byte_pos) {
192 i += 1;
194 continue;
195 }
196
197 let dot_count = current.chars().rev().take_while(|&ch| ch == '.').count();
199 if dot_count >= 3 {
200 if i + 1 < len && !indexed[i + 1].1.is_whitespace() {
203 i += 1;
204 continue;
205 }
206 }
207 }
208
209 let next_i = i + 1;
211 if next_i >= len {
212 let trimmed = current.trim_end().to_string();
214 if !trimmed.is_empty() {
215 sentences.push(trimmed);
216 }
217 current.clear();
218 i = next_i;
219 continue;
220 }
221
222 if indexed[next_i].1.is_whitespace() || indexed[next_i].1 == '\n' {
223 if in_quotes {
225 i += 1;
226 continue;
227 }
228
229 let trimmed = current.trim_end().to_string();
230 if !trimmed.is_empty() {
231 sentences.push(trimmed);
232 }
233 current.clear();
234 i = next_i;
235 while i < len && indexed[i].1 == ' ' {
237 i += 1;
238 }
239 continue;
240 }
241
242 i += 1;
243 continue;
244 }
245
246 current.push(c);
247 i += 1;
248 }
249
250 let trimmed = current.trim_end().to_string();
252 if !trimmed.is_empty() {
253 sentences.push(trimmed);
254 }
255
256 sentences
257}
258
/// Splits `text` at clause delimiters (`,`, `;`, `:` and the CJK forms
/// 、 , ;), keeping each delimiter attached to the clause it ends and
/// swallowing at most one space that follows it. Clauses have trailing
/// whitespace trimmed. If no split occurs, the whole (end-trimmed) text is
/// returned as one clause; empty input yields an empty `Vec`.
fn split_at_clauses(text: &str) -> Vec<String> {
    const DELIMS: [char; 6] = [',', ';', ':', '\u{3001}', '\u{FF0C}', '\u{FF1B}'];

    let chars: Vec<char> = text.chars().collect();
    let mut out: Vec<String> = Vec::new();
    let mut piece = String::new();
    let mut pos = 0;

    while pos < chars.len() {
        let ch = chars[pos];
        piece.push(ch);
        pos += 1;

        if DELIMS.contains(&ch) {
            // Swallow a single space after the delimiter so the next clause
            // does not begin with it.
            if chars.get(pos) == Some(&' ') {
                pos += 1;
            }
            let clause = piece.trim_end();
            if !clause.is_empty() {
                out.push(clause.to_string());
            }
            piece.clear();
        }
    }

    let tail = piece.trim_end();
    if !tail.is_empty() {
        out.push(tail.to_string());
    }

    if out.len() > 1 {
        return out;
    }

    // Zero or one clause: fall back to the whole text as a single unit.
    let whole = text.trim_end();
    if whole.is_empty() {
        Vec::new()
    } else {
        vec![whole.to_string()]
    }
}
311
312pub fn split_chunks(text: &str, config: &SplitConfig) -> Vec<TextChunk> {
316 let sentences = split_sentences(text);
317 if sentences.is_empty() {
318 return Vec::new();
319 }
320
321 let max = config.max_chars;
322 let min = config.min_chars;
323
324 let mut expanded: Vec<String> = Vec::new();
326 for sentence in sentences {
327 if max > 0 && sentence.len() > max && config.split_on_clause {
328 let clauses = split_at_clauses(&sentence);
329 for clause in clauses {
332 expanded.push(clause);
333 }
334 } else {
335 expanded.push(sentence);
336 }
337 }
338
339 let mut merged: Vec<String> = Vec::new();
341 let mut buffer = String::new();
342
343 for piece in expanded {
344 if buffer.is_empty() {
345 buffer = piece;
346 } else {
347 let combined_len = buffer.len() + 1 + piece.len(); if buffer.len() < min {
350 buffer.push(' ');
352 buffer.push_str(&piece);
353 } else if max > 0 && combined_len <= max && piece.len() < min {
354 buffer.push(' ');
356 buffer.push_str(&piece);
357 } else {
358 merged.push(buffer);
360 buffer = piece;
361 }
362 }
363 }
364 if !buffer.is_empty() {
365 merged.push(buffer);
366 }
367
368 let total = merged.len();
370 merged
371 .into_iter()
372 .enumerate()
373 .map(|(i, text)| TextChunk {
374 text,
375 index: i,
376 is_last: i == total - 1,
377 })
378 .collect()
379}
380
/// Unit tests for sentence splitting, clause splitting, and chunking.
#[cfg(test)]
mod tests {
    use super::*;

    // --- Basic Western sentence splitting ---

    #[test]
    fn test_basic_english_sentences() {
        let result = split_sentences("Hello. World.");
        assert_eq!(result, vec!["Hello.", "World."]);
    }

    #[test]
    fn test_english_multiple_sentences() {
        let result = split_sentences("First sentence. Second sentence. Third one.");
        assert_eq!(
            result,
            vec!["First sentence.", "Second sentence.", "Third one."]
        );
    }

    // --- CJK terminators (。 ! ?) ---

    #[test]
    fn test_japanese_sentences() {
        let result = split_sentences("今日は。明日は。");
        assert_eq!(result, vec!["今日は。", "明日は。"]);
    }

    #[test]
    fn test_japanese_mixed_punctuation() {
        let result = split_sentences("元気ですか?はい、元気です。よかった!");
        assert_eq!(
            result,
            vec!["元気ですか?", "はい、元気です。", "よかった!"]
        );
    }

    // --- Mixed-script input ---

    #[test]
    fn test_mixed_language() {
        let result = split_sentences("Hello. こんにちは。World.");
        assert_eq!(result, vec!["Hello.", "こんにちは。", "World."]);
    }

    #[test]
    fn test_mixed_language_continuous() {
        let result = split_sentences("これはテストです。This is a test. もう一つ。");
        assert!(!result.is_empty());
        assert!(result[0].contains("これはテストです。"));
        assert!(
            result.len() >= 2,
            "expected at least 2 chunks, got {:?}",
            result
        );
    }

    // --- Abbreviations: periods that must NOT end a sentence ---

    #[test]
    fn test_abbreviation_mr() {
        let result = split_sentences("Mr. Smith went to the store. He bought milk.");
        assert_eq!(
            result,
            vec!["Mr. Smith went to the store.", "He bought milk."]
        );
    }

    #[test]
    fn test_abbreviation_dr() {
        let result = split_sentences("Dr. Jones and Prof. Lee are here.");
        assert_eq!(result, vec!["Dr. Jones and Prof. Lee are here."]);
    }

    #[test]
    fn test_abbreviation_etc() {
        let result = split_sentences("Apples, oranges, etc. are fruits. Eat them.");
        assert_eq!(
            result,
            vec!["Apples, oranges, etc. are fruits.", "Eat them."]
        );
    }

    #[test]
    fn test_abbreviation_eg() {
        let result = split_sentences("Use tools e.g. a hammer. Done.");
        assert_eq!(result, vec!["Use tools e.g. a hammer.", "Done."]);
    }

    // --- Quoted speech stays inside one sentence ---

    #[test]
    fn test_quoted_speech_keeps_sentence() {
        let result = split_sentences("He said \"Hello. How are you?\" Then left.");
        assert_eq!(result.len(), 1);
        assert_eq!(result[0], "He said \"Hello. How are you?\" Then left.");
    }

    #[test]
    fn test_quoted_speech_at_end() {
        let result = split_sentences("She whispered \"Goodbye.\"");
        assert_eq!(result, vec!["She whispered \"Goodbye.\""]);
    }

    // --- Clause splitting ---

    #[test]
    fn test_clause_splitting() {
        let clauses = split_at_clauses("first part, second part; third part: fourth part");
        assert_eq!(
            clauses,
            vec!["first part,", "second part;", "third part:", "fourth part",]
        );
    }

    #[test]
    fn test_clause_splitting_no_delimiters() {
        let clauses = split_at_clauses("no delimiters here");
        assert_eq!(clauses, vec!["no delimiters here"]);
    }

    // --- Chunking: max_chars / min_chars behavior ---

    #[test]
    fn test_long_sentence_split_in_chunks() {
        let long = "Alpha bravo charlie, delta echo foxtrot; golf hotel india";
        let config = SplitConfig {
            max_chars: 30,
            split_on_clause: true,
            min_chars: 1,
        };
        let chunks = split_chunks(long, &config);
        assert!(
            chunks.len() > 1,
            "expected multiple chunks, got {}",
            chunks.len()
        );
        for chunk in &chunks {
            assert!(!chunk.text.is_empty());
        }
    }

    #[test]
    fn test_max_chars_splits_long_text() {
        let text = "Short. This is a somewhat longer sentence that has many words in it. End.";
        let config = SplitConfig {
            max_chars: 50,
            split_on_clause: false,
            min_chars: 1,
        };
        let chunks = split_chunks(text, &config);
        assert!(chunks.len() >= 2);
    }

    #[test]
    fn test_max_chars_zero_means_no_limit() {
        let text = "First. Second. Third.";
        let config = SplitConfig {
            max_chars: 0,
            split_on_clause: false,
            min_chars: 0,
        };
        let chunks = split_chunks(text, &config);
        assert!(!chunks.is_empty(), "should produce at least one chunk");
        assert!(
            chunks.len() >= 2,
            "expected at least 2 chunks, got {:?}",
            chunks.iter().map(|c| &c.text).collect::<Vec<_>>()
        );
    }

    #[test]
    fn test_min_chars_merges_short_chunks() {
        let text = "Hi. Go. Now.";
        let config = SplitConfig {
            max_chars: 500,
            split_on_clause: true,
            min_chars: 10,
        };
        let chunks = split_chunks(text, &config);
        assert!(
            chunks.len() < 3,
            "expected merging of short chunks, got {} chunks: {:?}",
            chunks.len(),
            chunks.iter().map(|c| &c.text).collect::<Vec<_>>()
        );
    }

    #[test]
    fn test_min_chars_zero_no_merging() {
        let text = "A. B. C.";
        let config = SplitConfig {
            max_chars: 0,
            split_on_clause: false,
            min_chars: 0,
        };
        let chunks = split_chunks(text, &config);
        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0].text, "A.");
        assert_eq!(chunks[1].text, "B.");
        assert_eq!(chunks[2].text, "C.");
    }

    // --- Degenerate input ---

    #[test]
    fn test_empty_input() {
        assert!(split_sentences("").is_empty());
        assert!(split_chunks("", &SplitConfig::default()).is_empty());
    }

    #[test]
    fn test_whitespace_only() {
        assert!(split_sentences(" ").is_empty());
        assert!(split_sentences("\n\n\n").is_empty());
        assert!(split_chunks(" ", &SplitConfig::default()).is_empty());
    }

    // --- Newline handling: blank line = paragraph, single newline = space ---

    #[test]
    fn test_paragraph_breaks() {
        let text = "First paragraph.\n\nSecond paragraph.";
        let result = split_sentences(text);
        assert_eq!(result, vec!["First paragraph.", "Second paragraph."]);
    }

    #[test]
    fn test_single_newline_no_split() {
        let text = "Line one\nstill same sentence.";
        let result = split_sentences(text);
        assert_eq!(result.len(), 1);
        assert!(result[0].contains("Line one"));
        assert!(result[0].contains("still same sentence."));
    }

    // --- Terminator variants ---

    #[test]
    fn test_exclamation_mark() {
        let result = split_sentences("Wow! Amazing!");
        assert_eq!(result, vec!["Wow!", "Amazing!"]);
    }

    #[test]
    fn test_question_mark() {
        let result = split_sentences("Really? Yes.");
        assert_eq!(result, vec!["Really?", "Yes."]);
    }

    // --- Ellipsis handling ---

    #[test]
    fn test_ellipsis_followed_by_text() {
        let result = split_sentences("Wait... what?");
        assert!(!result.is_empty());
        let joined: String = result.join(" ");
        assert!(
            joined.contains("Wait"),
            "should contain 'Wait': {:?}",
            result
        );
        assert!(
            joined.contains("what?"),
            "should contain 'what?': {:?}",
            result
        );
    }

    #[test]
    fn test_ellipsis_at_end() {
        let result = split_sentences("And then...");
        assert_eq!(result.len(), 1);
        assert_eq!(result[0], "And then...");
    }

    // --- Output hygiene: no trailing whitespace ---

    #[test]
    fn test_no_trailing_whitespace() {
        let result = split_sentences("Hello. World. ");
        for s in &result {
            assert_eq!(s, s.trim_end(), "trailing whitespace found in: {:?}", s);
        }
    }

    #[test]
    fn test_chunks_no_trailing_whitespace() {
        let text = "Hello. World. ";
        let chunks = split_chunks(text, &SplitConfig::default());
        for chunk in &chunks {
            assert_eq!(
                chunk.text,
                chunk.text.trim_end(),
                "trailing whitespace in chunk: {:?}",
                chunk.text
            );
        }
    }

    // --- Chunk metadata (index / is_last) ---

    #[test]
    fn test_chunk_index_and_is_last() {
        let text = "First. Second. Third.";
        let config = SplitConfig {
            max_chars: 0,
            split_on_clause: false,
            min_chars: 0,
        };
        let chunks = split_chunks(text, &config);
        assert!(
            chunks.len() >= 2,
            "expected at least 2 chunks, got {:?}",
            chunks.iter().map(|c| &c.text).collect::<Vec<_>>()
        );
        for (i, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.index, i, "chunk {} index mismatch", i);
        }
        assert!(
            chunks.last().unwrap().is_last,
            "last chunk should have is_last=true"
        );
        for chunk in &chunks[..chunks.len() - 1] {
            assert!(!chunk.is_last, "non-last chunk should have is_last=false");
        }
    }

    #[test]
    fn test_single_chunk_is_last() {
        let config = SplitConfig {
            max_chars: 0,
            split_on_clause: false,
            min_chars: 0,
        };
        let chunks = split_chunks("Only one.", &config);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].index, 0);
        assert!(chunks[0].is_last);
    }

    // --- Single-sentence inputs ---

    #[test]
    fn test_single_sentence_no_split() {
        let result = split_sentences("Just one sentence without ending punctuation");
        assert_eq!(result, vec!["Just one sentence without ending punctuation"]);
    }

    #[test]
    fn test_single_sentence_with_period() {
        let result = split_sentences("Just one sentence.");
        assert_eq!(result, vec!["Just one sentence."]);
    }

    // --- Runs of terminators ("?!", "!!") stay with their sentence ---

    #[test]
    fn test_multiple_punctuation_exclamation_question() {
        let result = split_sentences("Really?! Yes.");
        assert_eq!(result, vec!["Really?!", "Yes."]);
    }

    #[test]
    fn test_multiple_exclamation() {
        let result = split_sentences("No!! Stop.");
        assert_eq!(result, vec!["No!!", "Stop."]);
    }

    // --- Config defaults ---

    #[test]
    fn test_default_config() {
        let config = SplitConfig::default();
        assert_eq!(config.max_chars, 500);
        assert!(config.split_on_clause);
        assert_eq!(config.min_chars, 10);
    }

    // --- Chinese text ---

    #[test]
    fn test_chinese_sentences() {
        let result = split_sentences("你好。再见。");
        assert_eq!(result, vec!["你好。", "再见。"]);
    }

    #[test]
    fn test_chinese_question_and_exclamation() {
        let result = split_sentences("你好吗?很好!");
        assert_eq!(result, vec!["你好吗?", "很好!"]);
    }

    #[test]
    fn test_japanese_clause_splitting() {
        let clauses = split_at_clauses("最初の部分、二番目の部分、三番目");
        assert_eq!(clauses.len(), 3);
    }

    // --- Edge cases ---

    #[test]
    fn test_only_punctuation() {
        let result = split_sentences("...");
        assert_eq!(result.len(), 1);
        assert_eq!(result[0], "...");
    }

    #[test]
    fn test_split_chunks_preserves_all_text() {
        let text = "First sentence. Second sentence. Third sentence.";
        let config = SplitConfig {
            max_chars: 0,
            split_on_clause: false,
            min_chars: 0,
        };
        let chunks = split_chunks(text, &config);
        let rejoined: String = chunks
            .iter()
            .map(|c| c.text.as_str())
            .collect::<Vec<_>>()
            .join(" ");
        assert_eq!(rejoined, "First sentence. Second sentence. Third sentence.");
    }

    #[test]
    fn test_period_not_followed_by_space() {
        let result = split_sentences("Visit example.com today.");
        assert_eq!(result.len(), 1);
    }

    #[test]
    fn test_chunks_with_merging_and_splitting() {
        let text =
            "A. B. This is a long sentence with many words, and some clauses; and more text here.";
        let config = SplitConfig {
            max_chars: 40,
            split_on_clause: true,
            min_chars: 5,
        };
        let chunks = split_chunks(text, &config);
        assert!(!chunks.is_empty());
        for (i, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.index, i);
        }
        assert!(chunks.last().unwrap().is_last);
    }

    #[test]
    fn test_split_sentences_nested_quotes() {
        let text = "He said \"she said 'hello'\" then left.";
        let result = split_sentences(text);
        assert_eq!(
            result.len(),
            1,
            "nested quotes should not cause extra splits: {:?}",
            result
        );
        assert_eq!(result[0], text);
    }

    #[test]
    fn test_split_sentences_only_cjk_punctuation() {
        let result = split_sentences("\u{3002}\u{FF01}\u{FF1F}");
        assert!(
            !result.is_empty(),
            "CJK-only punctuation should produce output"
        );
        for s in &result {
            assert!(!s.is_empty(), "no empty sentences should be emitted");
        }
    }

    // Invalid configuration (max < min) must still round-trip all text.
    #[test]
    fn test_split_chunks_max_less_than_min() {
        let text = "Hello world. Goodbye world.";
        let config = SplitConfig {
            max_chars: 5,
            split_on_clause: true,
            min_chars: 50,
        };
        let chunks = split_chunks(text, &config);
        assert!(
            !chunks.is_empty(),
            "should produce chunks even with invalid config"
        );
        let rejoined: String = chunks
            .iter()
            .map(|c| c.text.as_str())
            .collect::<Vec<_>>()
            .join(" ");
        assert!(
            rejoined.contains("Hello"),
            "text should survive: {rejoined}"
        );
        assert!(
            rejoined.contains("Goodbye"),
            "text should survive: {rejoined}"
        );
    }

    #[test]
    fn test_split_sentences_consecutive_terminators() {
        let result = split_sentences("Really?! Yes.");
        assert_eq!(result, vec!["Really?!", "Yes."]);
    }

    #[test]
    fn test_split_sentences_abbreviation_at_start() {
        let result = split_sentences("Dr. Smith is here.");
        assert_eq!(
            result.len(),
            1,
            "abbreviation at start should not split: {:?}",
            result
        );
        assert_eq!(result[0], "Dr. Smith is here.");
    }

    // CRLF: single CRLF joins lines; double CRLF is a paragraph break.
    #[test]
    fn test_split_sentences_crlf_line_endings() {
        let result_single = split_sentences("Hello.\r\nWorld.");
        assert!(
            !result_single.is_empty(),
            "CRLF input should produce output"
        );

        let result_double = split_sentences("Hello.\r\n\r\nWorld.");
        assert!(
            result_double.len() >= 2,
            "double CRLF should cause paragraph split: {:?}",
            result_double
        );
        assert!(
            result_double[0].contains("Hello."),
            "first chunk: {:?}",
            result_double
        );
        assert!(
            result_double.last().unwrap().contains("World."),
            "last chunk: {:?}",
            result_double
        );
    }
}