// voicepeak_cli/text_splitter.rs

/// Maximum number of characters (Unicode scalar values, not bytes) allowed in a
/// single chunk of text.
pub const MAX_CHARS: usize = 140;
2
3pub fn check_text_length(text: &str) -> bool {
4 text.chars().count() <= MAX_CHARS
5}
6
7pub fn split_text(text: &str) -> Vec<String> {
8 if text.chars().count() <= MAX_CHARS {
9 return vec![text.to_string()];
10 }
11
12 let mut chunks = Vec::new();
13 let mut current_chunk = String::new();
14 let mut chars_count = 0;
15
16 let sentences = split_into_sentences(text);
17
18 for sentence in sentences {
19 let sentence_len = sentence.chars().count();
20
21 if chars_count + sentence_len <= MAX_CHARS {
22 current_chunk.push_str(&sentence);
23 chars_count += sentence_len;
24 } else {
25 if !current_chunk.is_empty() {
26 chunks.push(current_chunk.trim().to_string());
27 current_chunk = String::new();
28 chars_count = 0;
29 }
30
31 if sentence_len <= MAX_CHARS {
32 current_chunk.push_str(&sentence);
33 chars_count = sentence_len;
34 } else {
35 let sub_chunks = split_long_sentence(&sentence);
36 for (i, sub_chunk) in sub_chunks.iter().enumerate() {
37 if i == sub_chunks.len() - 1 {
38 current_chunk.push_str(sub_chunk);
39 chars_count = sub_chunk.chars().count();
40 } else {
41 chunks.push(sub_chunk.trim().to_string());
42 }
43 }
44 }
45 }
46 }
47
48 if !current_chunk.trim().is_empty() {
49 chunks.push(current_chunk.trim().to_string());
50 }
51
52 chunks
53}
54
/// Splits `text` into sentences, keeping each terminator attached to the
/// sentence it ends.
///
/// A sentence ends at an ideographic full stop (`。`), or at an exclamation
/// mark, question mark or period; both the ASCII and the full-width forms of
/// `!` and `?` are recognized. Text following the last terminator is returned
/// as a final sentence unless it consists only of whitespace. No trimming is
/// applied to the returned sentences.
fn split_into_sentences(text: &str) -> Vec<String> {
    const SENTENCE_ENDINGS: [char; 6] = [
        '。',
        '\u{FF01}', // full-width exclamation mark
        '\u{FF1F}', // full-width question mark
        '.',
        '!',
        '?',
    ];

    let mut sentences = Vec::new();
    let mut current_sentence = String::new();

    for ch in text.chars() {
        current_sentence.push(ch);

        if SENTENCE_ENDINGS.contains(&ch) {
            // Move the finished sentence out and continue with an empty buffer.
            sentences.push(std::mem::take(&mut current_sentence));
        }
    }

    if !current_sentence.trim().is_empty() {
        sentences.push(current_sentence);
    }

    sentences
}
75
76fn split_long_sentence(sentence: &str) -> Vec<String> {
77 let break_points = ['、', ',', ',', ' ', ' '];
78 let mut chunks = Vec::new();
79 let mut current_chunk = String::new();
80 let mut chars_count = 0;
81
82 for ch in sentence.chars() {
83 current_chunk.push(ch);
84 chars_count += 1;
85
86 if chars_count >= MAX_CHARS {
87 if break_points.contains(&ch) {
88 chunks.push(current_chunk.clone());
89 current_chunk.clear();
90 chars_count = 0;
91 } else {
92 let last_break = find_last_break_point(¤t_chunk, &break_points);
93 if let Some(char_pos) = last_break {
94 let chars: Vec<char> = current_chunk.chars().collect();
95 let first_part: String = chars[..=char_pos].iter().collect();
96 let second_part: String = chars[char_pos + 1..].iter().collect();
97 chunks.push(first_part);
98 current_chunk = second_part;
99 chars_count = current_chunk.chars().count();
100 } else {
101 chunks.push(current_chunk.clone());
102 current_chunk.clear();
103 chars_count = 0;
104 }
105 }
106 }
107 }
108
109 if !current_chunk.trim().is_empty() {
110 chunks.push(current_chunk);
111 }
112
113 chunks
114}
115
/// Returns the position of the last character of `text` that is one of
/// `break_points`, or `None` when no such character occurs.
///
/// The position is a zero-based character index, not a byte offset.
fn find_last_break_point(text: &str, break_points: &[char]) -> Option<usize> {
    let mut last_seen = None;
    for (idx, ch) in text.chars().enumerate() {
        // Later matches overwrite earlier ones, so the final value is the last match.
        if break_points.contains(&ch) {
            last_seen = Some(idx);
        }
    }
    last_seen
}