// oxihuman_core/sentence_splitter.rs
#![allow(dead_code)]
4
/// One sentence cut out of a larger text, together with the span it came from.
#[derive(Debug, Clone, PartialEq)]
pub struct Sentence {
    /// The sentence content.
    pub text: String,
    /// Byte offset into the source text at which this sentence's span begins.
    pub start: usize,
    /// Byte offset into the source text at which this sentence's span ends (exclusive).
    pub end: usize,
}

impl Sentence {
    /// Size of the `start..end` span in bytes.
    ///
    /// Yields `0` instead of underflowing when `end` is smaller than `start`.
    pub fn byte_len(&self) -> usize {
        let Self { start, end, .. } = self;
        end.saturating_sub(*start)
    }

    /// Rough word count: the number of whitespace-separated tokens in `text`.
    pub fn word_count_est(&self) -> usize {
        let tokens = self.text.split_whitespace();
        tokens.count()
    }
}
27
/// Options controlling how text is cut into sentences.
#[derive(Debug, Clone)]
pub struct SentenceSplitterConfig {
    /// Characters that are allowed to end a sentence.
    pub terminals: Vec<char>,
    /// When `true`, a terminal that directly follows a known abbreviation
    /// is not treated as a sentence end.
    pub abbreviation_guard: bool,
}

impl Default for SentenceSplitterConfig {
    /// Uses `.`, `!` and `?` as terminals and turns the abbreviation guard on.
    fn default() -> Self {
        let terminals = ['.', '!', '?'].to_vec();
        SentenceSplitterConfig {
            terminals,
            abbreviation_guard: true,
        }
    }
}
45
/// Words after which a terminal character does not end a sentence when the
/// abbreviation guard is enabled. Entries are written without the trailing
/// period; the splitter compares them ASCII-case-insensitively.
static ABBREVIATIONS: &[&str] = &[
    // Titles and name suffixes.
    "Mr", "Mrs", "Ms", "Dr", "Prof", "Sr", "Jr",
    // Connectives.
    "vs", "etc",
    // Addresses and organisations.
    "St", "Ave", "Blvd", "Dept",
    // Miscellaneous.
    "est",
];
49
50pub fn split_sentences(text: &str, cfg: &SentenceSplitterConfig) -> Vec<Sentence> {
52 let mut sentences: Vec<Sentence> = Vec::new();
53 let mut start = 0usize;
54 let chars: Vec<(usize, char)> = text.char_indices().collect();
55 let n = chars.len();
56 let mut i = 0;
57
58 while i < n {
59 let (byte_pos, ch) = chars[i];
60 if cfg.terminals.contains(&ch) {
61 let is_abbrev = if cfg.abbreviation_guard {
63 let before = &text[start..byte_pos];
65 let last_word = before.split_whitespace().last().unwrap_or("");
66 ABBREVIATIONS
67 .iter()
68 .any(|a| last_word.eq_ignore_ascii_case(a))
69 } else {
70 false
71 };
72
73 let next_upper = (i + 1..n)
75 .find(|&j| {
76 let (_, nc) = chars[j];
77 !nc.is_whitespace()
78 })
79 .map(|j| chars[j].1.is_uppercase())
80 .unwrap_or(false);
81
82 if !is_abbrev && (next_upper || i + 1 == n) {
83 let end = byte_pos + ch.len_utf8();
84 let sentence_text = text[start..end].trim().to_string();
85 if !sentence_text.is_empty() {
86 sentences.push(Sentence {
87 text: sentence_text,
88 start,
89 end,
90 });
91 }
92 let mut j = i + 1;
94 while j < n && chars[j].1.is_whitespace() {
95 j += 1;
96 }
97 start = if j < n { chars[j].0 } else { text.len() };
98 i = j;
99 continue;
100 }
101 }
102 i += 1;
103 }
104
105 if start < text.len() {
107 let remainder = text[start..].trim().to_string();
108 if !remainder.is_empty() {
109 sentences.push(Sentence {
110 text: remainder,
111 start,
112 end: text.len(),
113 });
114 }
115 }
116
117 sentences
118}
119
120pub fn sentence_count(text: &str) -> usize {
122 let cfg = SentenceSplitterConfig::default();
123 split_sentences(text, &cfg).len()
124}
125
126pub fn avg_words_per_sentence(text: &str) -> f64 {
128 let cfg = SentenceSplitterConfig::default();
129 let sents = split_sentences(text, &cfg);
130 if sents.is_empty() {
131 return 0.0;
132 }
133 let total: usize = sents.iter().map(|s| s.word_count_est()).sum();
134 total as f64 / sents.len() as f64
135}
136
137pub fn longest_sentence(sentences: &[Sentence]) -> Option<&Sentence> {
139 sentences.iter().max_by_key(|s| s.text.len())
140}
141
142pub fn filter_short_sentences(sentences: Vec<Sentence>, min_words: usize) -> Vec<Sentence> {
144 sentences
145 .into_iter()
146 .filter(|s| s.word_count_est() >= min_words)
147 .collect()
148}
149
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `Sentence` whose span starts at `start` and is exactly as long as `text`.
    fn sentence(text: &str, start: usize) -> Sentence {
        Sentence {
            text: text.into(),
            start,
            end: start + text.len(),
        }
    }

    /// Splits with the default configuration and returns only the sentence texts.
    fn split_texts(text: &str) -> Vec<String> {
        split_sentences(text, &SentenceSplitterConfig::default())
            .into_iter()
            .map(|s| s.text)
            .collect()
    }

    #[test]
    fn test_simple_split() {
        let texts = split_texts("Hello world. How are you? I am fine!");
        assert_eq!(texts, ["Hello world.", "How are you?", "I am fine!"]);
    }

    #[test]
    fn test_sentence_count() {
        assert_eq!(sentence_count("First. Second. Third."), 3);
    }

    #[test]
    fn test_byte_len() {
        assert_eq!(sentence("Hi.", 0).byte_len(), 3);
    }

    #[test]
    fn test_byte_len_saturates() {
        // An inverted span must not underflow.
        let s = Sentence {
            text: String::new(),
            start: 5,
            end: 2,
        };
        assert_eq!(s.byte_len(), 0);
    }

    #[test]
    fn test_word_count_est() {
        assert_eq!(sentence("One two three.", 0).word_count_est(), 3);
    }

    #[test]
    fn test_avg_words_per_sentence() {
        // Two sentences with 2 and 3 words.
        let avg = avg_words_per_sentence("One two. Three four five.");
        assert_eq!(avg, 2.5);
    }

    #[test]
    fn test_avg_words_empty_text() {
        assert_eq!(avg_words_per_sentence(""), 0.0);
    }

    #[test]
    fn test_longest_sentence() {
        let sents = vec![sentence("Hi.", 0), sentence("Hello world friend.", 4)];
        let longest = longest_sentence(&sents).expect("should succeed");
        assert_eq!(longest.text, "Hello world friend.");
    }

    #[test]
    fn test_longest_sentence_empty() {
        assert!(longest_sentence(&[]).is_none());
    }

    #[test]
    fn test_filter_short() {
        let sents = vec![sentence("Hi.", 0), sentence("Hello there world.", 4)];
        let filtered = filter_short_sentences(sents, 2);
        assert_eq!(filtered.len(), 1);
        assert_eq!(filtered[0].text, "Hello there world.");
    }

    #[test]
    fn test_empty_text() {
        assert_eq!(sentence_count(""), 0);
    }

    #[test]
    fn test_no_terminal_is_one_sentence() {
        let text = "this has no terminal punctuation";
        let sents = split_sentences(text, &SentenceSplitterConfig::default());
        assert_eq!(sents.len(), 1);
        assert_eq!(sents[0].text, text);
    }

    #[test]
    fn test_abbreviation_guard() {
        let text = "Dr. Smith arrived. He sat.";
        assert_eq!(split_texts(text), ["Dr. Smith arrived.", "He sat."]);

        // With the guard off, the period after "Dr" is an ordinary boundary.
        let cfg = SentenceSplitterConfig {
            abbreviation_guard: false,
            ..SentenceSplitterConfig::default()
        };
        assert_eq!(split_sentences(text, &cfg).len(), 3);
    }

    #[test]
    fn test_lowercase_continuation_is_not_split() {
        // A terminal followed by a lowercase word does not end the sentence.
        assert_eq!(sentence_count("It costs 3.5 euros. ok"), 1);
    }

    #[test]
    fn test_offsets_index_source_text() {
        let text = "First. Second. Third.";
        let sents = split_sentences(text, &SentenceSplitterConfig::default());
        assert_eq!(sents.len(), 3);
        for s in &sents {
            assert_eq!(&text[s.start..s.end], s.text);
        }
    }
}