Skip to main content

voicepeak_cli/
text_splitter.rs

/// Maximum number of characters allowed in a single chunk of text.
///
/// The limit is compared against `str::chars().count()`, so it counts Unicode
/// scalar values rather than bytes.
pub const MAX_CHARS: usize = 140;
2
3pub fn check_text_length(text: &str) -> bool {
4    text.chars().count() <= MAX_CHARS
5}
6
7pub fn split_text(text: &str) -> Vec<String> {
8    if text.chars().count() <= MAX_CHARS {
9        return vec![text.to_string()];
10    }
11
12    let mut chunks = Vec::new();
13    let mut current_chunk = String::new();
14    let mut chars_count = 0;
15
16    let sentences = split_into_sentences(text);
17
18    for sentence in sentences {
19        let sentence_len = sentence.chars().count();
20
21        if chars_count + sentence_len <= MAX_CHARS {
22            current_chunk.push_str(&sentence);
23            chars_count += sentence_len;
24        } else {
25            if !current_chunk.is_empty() {
26                chunks.push(current_chunk.trim().to_string());
27                current_chunk = String::new();
28                chars_count = 0;
29            }
30
31            if sentence_len <= MAX_CHARS {
32                current_chunk.push_str(&sentence);
33                chars_count = sentence_len;
34            } else {
35                let sub_chunks = split_long_sentence(&sentence);
36                for (i, sub_chunk) in sub_chunks.iter().enumerate() {
37                    if i == sub_chunks.len() - 1 {
38                        current_chunk.push_str(sub_chunk);
39                        chars_count = sub_chunk.chars().count();
40                    } else {
41                        chunks.push(sub_chunk.trim().to_string());
42                    }
43                }
44            }
45        }
46    }
47
48    if !current_chunk.trim().is_empty() {
49        chunks.push(current_chunk.trim().to_string());
50    }
51
52    chunks
53}
54
/// Cuts `text` into sentences, keeping each terminator attached to the
/// sentence it ends.
///
/// A sentence ends at the ideographic full stop `。`, at the full-width `!`
/// (U+FF01) and `?` (U+FF1F), or at ASCII `.`, `!` and `?`. Text after the
/// last terminator is returned as a final sentence unless it consists only of
/// whitespace. Sentences are not trimmed.
fn split_into_sentences(text: &str) -> Vec<String> {
    // The full-width forms are written as escapes so that they cannot be
    // silently folded into their ASCII look-alikes by Unicode normalisation.
    const SENTENCE_ENDINGS: [char; 6] = ['。', '\u{FF01}', '\u{FF1F}', '.', '!', '?'];

    text.split_inclusive(&SENTENCE_ENDINGS[..])
        // Only the unterminated tail can be whitespace-only; a terminated
        // piece always contains its (non-whitespace) terminator.
        .filter(|piece| !piece.trim().is_empty())
        .map(String::from)
        .collect()
}
75
76fn split_long_sentence(sentence: &str) -> Vec<String> {
77    let break_points = ['、', ',', ',', ' ', ' '];
78    let mut chunks = Vec::new();
79    let mut current_chunk = String::new();
80    let mut chars_count = 0;
81
82    for ch in sentence.chars() {
83        current_chunk.push(ch);
84        chars_count += 1;
85
86        if chars_count >= MAX_CHARS {
87            if break_points.contains(&ch) {
88                chunks.push(current_chunk.clone());
89                current_chunk.clear();
90                chars_count = 0;
91            } else {
92                let last_break = find_last_break_point(&current_chunk, &break_points);
93                if let Some(char_pos) = last_break {
94                    let chars: Vec<char> = current_chunk.chars().collect();
95                    let first_part: String = chars[..=char_pos].iter().collect();
96                    let second_part: String = chars[char_pos + 1..].iter().collect();
97                    chunks.push(first_part);
98                    current_chunk = second_part;
99                    chars_count = current_chunk.chars().count();
100                } else {
101                    chunks.push(current_chunk.clone());
102                    current_chunk.clear();
103                    chars_count = 0;
104                }
105            }
106        }
107    }
108
109    if !current_chunk.trim().is_empty() {
110        chunks.push(current_chunk);
111    }
112
113    chunks
114}
115
/// Returns the character index (not the byte offset) of the last character in
/// `text` that is one of `break_points`, or `None` if there is none.
fn find_last_break_point(text: &str, break_points: &[char]) -> Option<usize> {
    // `rfind` reports a byte offset; the number of characters before that
    // offset is the character index of the match.
    let byte_pos = text.rfind(break_points)?;
    Some(text[..byte_pos].chars().count())
}