oxihuman_core/
sentence_splitter.rs

1// Copyright (C) 2026 COOLJAPAN OU (Team KitaSan)
2// SPDX-License-Identifier: Apache-2.0
3#![allow(dead_code)]
4
5//! Sentence boundary detector stub.
6//!
7//! Splits text into sentences using a simplified heuristic approach based on
8//! terminal punctuation (`.`, `!`, `?`) followed by whitespace and an uppercase letter.
9
/// One sentence found in a larger text, together with its byte span.
#[derive(Debug, Clone, PartialEq)]
pub struct Sentence {
    /// The sentence text.
    pub text: String,
    /// Byte offset at which the span begins.
    pub start: usize,
    /// Byte offset at which the span ends.
    pub end: usize,
}

impl Sentence {
    /// Length of the span in bytes (`end - start`).
    ///
    /// Yields `0` instead of underflowing when `end` lies before `start`.
    pub fn byte_len(&self) -> usize {
        let Self { start, end, .. } = self;
        end.saturating_sub(*start)
    }

    /// Estimated word count: the number of whitespace-separated tokens in `text`.
    pub fn word_count_est(&self) -> usize {
        let tokens = self.text.split_whitespace();
        tokens.count()
    }
}
27
/// Options controlling how text is split into sentences.
#[derive(Debug, Clone)]
pub struct SentenceSplitterConfig {
    /// Characters that may terminate a sentence.
    pub terminals: Vec<char>,
    /// When `true`, a terminal that follows a known abbreviation
    /// (Mr., Dr., etc.) is not treated as a sentence end.
    pub abbreviation_guard: bool,
}

impl Default for SentenceSplitterConfig {
    /// Uses `.`, `!` and `?` as terminals and enables the abbreviation guard.
    fn default() -> Self {
        let terminals = vec!['.', '!', '?'];
        SentenceSplitterConfig {
            terminals,
            abbreviation_guard: true,
        }
    }
}
45
/// Abbreviations after which a terminal character is not treated as a
/// sentence end when the abbreviation guard is enabled.
///
/// Entries are stored without the trailing dot; `split_sentences` compares
/// them ASCII-case-insensitively against the last word before the terminal.
static ABBREVIATIONS: &[&str] = &[
    "Mr", "Mrs", "Ms", "Dr", "Prof", "Sr", "Jr", "vs", "etc", "St", "Ave", "Blvd", "Dept", "est",
];
49
50/// Split text into sentences.
51pub fn split_sentences(text: &str, cfg: &SentenceSplitterConfig) -> Vec<Sentence> {
52    let mut sentences: Vec<Sentence> = Vec::new();
53    let mut start = 0usize;
54    let chars: Vec<(usize, char)> = text.char_indices().collect();
55    let n = chars.len();
56    let mut i = 0;
57
58    while i < n {
59        let (byte_pos, ch) = chars[i];
60        if cfg.terminals.contains(&ch) {
61            /* Check for abbreviation guard */
62            let is_abbrev = if cfg.abbreviation_guard {
63                /* Look back at the word before the dot */
64                let before = &text[start..byte_pos];
65                let last_word = before.split_whitespace().last().unwrap_or("");
66                ABBREVIATIONS
67                    .iter()
68                    .any(|a| last_word.eq_ignore_ascii_case(a))
69            } else {
70                false
71            };
72
73            /* Look ahead for whitespace + uppercase to confirm sentence end */
74            let next_upper = (i + 1..n)
75                .find(|&j| {
76                    let (_, nc) = chars[j];
77                    !nc.is_whitespace()
78                })
79                .map(|j| chars[j].1.is_uppercase())
80                .unwrap_or(false);
81
82            if !is_abbrev && (next_upper || i + 1 == n) {
83                let end = byte_pos + ch.len_utf8();
84                let sentence_text = text[start..end].trim().to_string();
85                if !sentence_text.is_empty() {
86                    sentences.push(Sentence {
87                        text: sentence_text,
88                        start,
89                        end,
90                    });
91                }
92                /* Skip whitespace */
93                let mut j = i + 1;
94                while j < n && chars[j].1.is_whitespace() {
95                    j += 1;
96                }
97                start = if j < n { chars[j].0 } else { text.len() };
98                i = j;
99                continue;
100            }
101        }
102        i += 1;
103    }
104
105    /* Remaining text */
106    if start < text.len() {
107        let remainder = text[start..].trim().to_string();
108        if !remainder.is_empty() {
109            sentences.push(Sentence {
110                text: remainder,
111                start,
112                end: text.len(),
113            });
114        }
115    }
116
117    sentences
118}
119
120/// Count the number of sentences in text.
121pub fn sentence_count(text: &str) -> usize {
122    let cfg = SentenceSplitterConfig::default();
123    split_sentences(text, &cfg).len()
124}
125
126/// Return the average word count per sentence.
127pub fn avg_words_per_sentence(text: &str) -> f64 {
128    let cfg = SentenceSplitterConfig::default();
129    let sents = split_sentences(text, &cfg);
130    if sents.is_empty() {
131        return 0.0;
132    }
133    let total: usize = sents.iter().map(|s| s.word_count_est()).sum();
134    total as f64 / sents.len() as f64
135}
136
137/// Find the longest sentence by character count.
138pub fn longest_sentence(sentences: &[Sentence]) -> Option<&Sentence> {
139    sentences.iter().max_by_key(|s| s.text.len())
140}
141
142/// Filter sentences shorter than `min_words` words.
143pub fn filter_short_sentences(sentences: Vec<Sentence>, min_words: usize) -> Vec<Sentence> {
144    sentences
145        .into_iter()
146        .filter(|s| s.word_count_est() >= min_words)
147        .collect()
148}
149
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `Sentence` that starts at `start` and whose span covers `text` exactly.
    fn sentence_at(text: &str, start: usize) -> Sentence {
        Sentence {
            text: text.into(),
            start,
            end: start + text.len(),
        }
    }

    /// Collects the texts of `sents` for compact comparisons.
    fn texts(sents: &[Sentence]) -> Vec<&str> {
        sents.iter().map(|s| s.text.as_str()).collect()
    }

    #[test]
    fn test_simple_split() {
        let text = "Hello world. How are you? I am fine!";
        let sents = split_sentences(text, &SentenceSplitterConfig::default());
        assert_eq!(texts(&sents), ["Hello world.", "How are you?", "I am fine!"]);
    }

    #[test]
    fn test_abbreviation_guard() {
        // The dot after "Dr" must not end the first sentence.
        let text = "Dr. Smith arrived. He sat.";
        let sents = split_sentences(text, &SentenceSplitterConfig::default());
        assert_eq!(texts(&sents), ["Dr. Smith arrived.", "He sat."]);
    }

    #[test]
    fn test_sentence_count() {
        let text = "First. Second. Third.";
        assert_eq!(sentence_count(text), 3);
    }

    #[test]
    fn test_byte_len() {
        let s = sentence_at("Hi.", 0);
        assert_eq!(s.byte_len(), 3);
    }

    #[test]
    fn test_word_count_est() {
        let s = sentence_at("One two three.", 0);
        assert_eq!(s.word_count_est(), 3);
    }

    #[test]
    fn test_avg_words_per_sentence() {
        // Two sentences with 2 and 3 words respectively.
        let text = "One two. Three four five.";
        let avg = avg_words_per_sentence(text);
        assert!((avg - 2.5).abs() < f64::EPSILON);
    }

    #[test]
    fn test_longest_sentence() {
        let sents = vec![sentence_at("Hi.", 0), sentence_at("Hello world friend.", 4)];
        let longest = longest_sentence(&sents).expect("should succeed");
        assert_eq!(longest.text, "Hello world friend.");
    }

    #[test]
    fn test_filter_short() {
        let sents = vec![sentence_at("Hi.", 0), sentence_at("Hello there world.", 4)];
        let filtered = filter_short_sentences(sents, 2);
        assert_eq!(texts(&filtered), ["Hello there world."]);
    }

    #[test]
    fn test_empty_text() {
        assert_eq!(sentence_count(""), 0);
    }

    #[test]
    fn test_no_terminal_is_one_sentence() {
        let text = "this has no terminal punctuation";
        let sents = split_sentences(text, &SentenceSplitterConfig::default());
        assert_eq!(texts(&sents), [text]);
    }
}
oxihuman_core/sentence_splitter.rs

oxihuman_core/
sentence_splitter.rs