#![allow(dead_code)]
/// One sentence produced by the splitter, together with the span it was taken from.
#[derive(Debug, Clone, PartialEq)]
pub struct Sentence {
    /// The sentence's text.
    pub text: String,
    /// Byte offset at which the sentence's span begins.
    pub start: usize,
    /// Byte offset at which the sentence's span ends.
    pub end: usize,
}

impl Sentence {
    /// Returns the width of the `start..end` span in bytes.
    ///
    /// An inverted span (`end` before `start`) yields `0` instead of underflowing.
    pub fn byte_len(&self) -> usize {
        // Clamping `start` to `end` keeps the subtraction from wrapping below zero.
        self.end - self.start.min(self.end)
    }

    /// Estimates the word count as the number of whitespace-separated tokens in `text`.
    pub fn word_count_est(&self) -> usize {
        let tokens = self.text.split_whitespace();
        tokens.count()
    }
}
/// Settings that control how sentence boundaries are detected.
#[derive(Debug, Clone)]
pub struct SentenceSplitterConfig {
    /// Characters that may end a sentence.
    pub terminals: Vec<char>,
    /// When `true`, a terminal whose preceding word is a known abbreviation
    /// does not end a sentence.
    pub abbreviation_guard: bool,
}

impl Default for SentenceSplitterConfig {
    /// Uses `.`, `!` and `?` as terminals and turns the abbreviation guard on.
    fn default() -> Self {
        let terminals = ['.', '!', '?'].to_vec();
        SentenceSplitterConfig {
            terminals,
            abbreviation_guard: true,
        }
    }
}
/// Abbreviations, written without trailing punctuation, that `split_sentences`
/// consults when its abbreviation guard is enabled: a terminal character whose
/// preceding word matches one of these entries (ignoring ASCII case) is not
/// treated as a sentence boundary.
static ABBREVIATIONS: &[&str] = &[
"Mr", "Mrs", "Ms", "Dr", "Prof", "Sr", "Jr", "vs", "etc", "St", "Ave", "Blvd", "Dept", "est",
];
/// Splits `text` into sentences according to `cfg`.
///
/// A character listed in `cfg.terminals` ends a sentence when the next
/// non-whitespace character is uppercase, or when only whitespace (or nothing)
/// follows it. If `cfg.abbreviation_guard` is set, a terminal whose preceding
/// whitespace-delimited word matches an entry of `ABBREVIATIONS` (ignoring
/// ASCII case) is not a boundary. Text left over after the last boundary
/// becomes a final sentence.
///
/// Each returned [`Sentence`] holds whitespace-trimmed text, and its
/// `start`/`end` byte offsets bracket exactly that text, so
/// `&text[s.start..s.end] == s.text`. Whitespace-only spans are dropped, which
/// makes the result empty for empty or blank input.
pub fn split_sentences(text: &str, cfg: &SentenceSplitterConfig) -> Vec<Sentence> {
    let mut sentences = Vec::new();
    // Byte offset of the first character of the sentence currently being scanned.
    let mut start = 0usize;

    for (pos, ch) in text.char_indices() {
        // `pos < start` only holds inside the whitespace run skipped after a boundary.
        if pos < start || !cfg.terminals.contains(&ch) {
            continue;
        }
        let end = pos + ch.len_utf8();
        let rest = &text[end..];

        // End of input (possibly after trailing whitespace) also closes a sentence.
        let at_boundary = rest
            .chars()
            .find(|c| !c.is_whitespace())
            .map_or(true, char::is_uppercase);
        if !at_boundary {
            continue;
        }
        if cfg.abbreviation_guard && ends_with_abbreviation(&text[start..pos]) {
            continue;
        }

        push_trimmed_sentence(&mut sentences, text, start, end);
        // The next sentence starts at the first non-whitespace character after the terminal.
        start = end + (rest.len() - rest.trim_start().len());
    }

    // Whatever follows the last boundary (if anything) forms the final sentence.
    push_trimmed_sentence(&mut sentences, text, start, text.len());
    sentences
}

/// Reports whether the last whitespace-delimited word of `prefix` is a known abbreviation.
fn ends_with_abbreviation(prefix: &str) -> bool {
    // `next_back` inspects only the final word instead of walking the whole prefix.
    prefix.split_whitespace().next_back().map_or(false, |word| {
        ABBREVIATIONS
            .iter()
            .any(|abbr| word.eq_ignore_ascii_case(abbr))
    })
}

/// Appends the trimmed content of `text[start..end]` to `out`, skipping blank spans.
fn push_trimmed_sentence(out: &mut Vec<Sentence>, text: &str, start: usize, end: usize) {
    let raw = &text[start..end];
    let body = raw.trim();
    if body.is_empty() {
        return;
    }
    // Move the span past leading whitespace so the offsets describe `body` exactly.
    let begin = start + (raw.len() - raw.trim_start().len());
    out.push(Sentence {
        text: body.to_string(),
        start: begin,
        end: begin + body.len(),
    });
}
/// Counts the sentences that `split_sentences` finds in `text` under the
/// default `SentenceSplitterConfig`.
pub fn sentence_count(text: &str) -> usize {
    let sentences = split_sentences(text, &SentenceSplitterConfig::default());
    sentences.len()
}
pub fn avg_words_per_sentence(text: &str) -> f64 {
let cfg = SentenceSplitterConfig::default();
let sents = split_sentences(text, &cfg);
if sents.is_empty() {
return 0.0;
}
let total: usize = sents.iter().map(|s| s.word_count_est()).sum();
total as f64 / sents.len() as f64
}
/// Returns the sentence whose `text` is longest in bytes, or `None` for an empty slice.
///
/// When several sentences share the maximum length, the last of them is returned.
pub fn longest_sentence(sentences: &[Sentence]) -> Option<&Sentence> {
    sentences
        .iter()
        .max_by(|left, right| left.text.len().cmp(&right.text.len()))
}
/// Drops every sentence whose estimated word count is below `min_words`.
///
/// The surviving sentences are returned in their original order.
pub fn filter_short_sentences(sentences: Vec<Sentence>, min_words: usize) -> Vec<Sentence> {
    let mut kept = sentences;
    kept.retain(|sentence| sentence.word_count_est() >= min_words);
    kept
}
#[cfg(test)]
mod tests {
    //! Unit tests for the sentence splitter and its helper functions.
    //!
    //! Assertions pin exact results so that a regression in boundary detection
    //! is caught rather than hidden behind a loose lower bound.

    use super::*;

    /// Collects the sentence texts for compact comparisons.
    fn texts(sents: &[Sentence]) -> Vec<&str> {
        sents.iter().map(|s| s.text.as_str()).collect()
    }

    #[test]
    fn test_simple_split() {
        let text = "Hello world. How are you? I am fine!";
        let sents = split_sentences(text, &SentenceSplitterConfig::default());
        assert_eq!(texts(&sents), ["Hello world.", "How are you?", "I am fine!"]);
        // With no leading or trailing whitespace the spans map straight back to the input.
        for s in &sents {
            assert_eq!(&text[s.start..s.end], s.text);
        }
    }

    #[test]
    fn test_sentence_count() {
        let text = "First. Second. Third.";
        assert_eq!(sentence_count(text), 3);
    }

    #[test]
    fn test_byte_len() {
        let s = Sentence {
            text: "Hi.".into(),
            start: 0,
            end: 3,
        };
        assert_eq!(s.byte_len(), 3);
    }

    #[test]
    fn test_word_count_est() {
        let s = Sentence {
            text: "One two three.".into(),
            start: 0,
            end: 14,
        };
        assert_eq!(s.word_count_est(), 3);
    }

    #[test]
    fn test_avg_words_per_sentence() {
        // Two sentences with 2 and 3 words respectively.
        let text = "One two. Three four five.";
        let avg = avg_words_per_sentence(text);
        assert!((avg - 2.5).abs() < 1e-9);
    }

    #[test]
    fn test_longest_sentence() {
        let sents = vec![
            Sentence {
                text: "Hi.".into(),
                start: 0,
                end: 3,
            },
            Sentence {
                text: "Hello world friend.".into(),
                start: 4,
                end: 23,
            },
        ];
        let longest = longest_sentence(&sents).expect("should succeed");
        assert_eq!(longest.text, "Hello world friend.");
    }

    #[test]
    fn test_filter_short() {
        let sents = vec![
            Sentence {
                text: "Hi.".into(),
                start: 0,
                end: 3,
            },
            Sentence {
                text: "Hello there world.".into(),
                start: 4,
                end: 22,
            },
        ];
        let filtered = filter_short_sentences(sents, 2);
        assert_eq!(texts(&filtered), ["Hello there world."]);
    }

    #[test]
    fn test_empty_text() {
        assert_eq!(sentence_count(""), 0);
        assert_eq!(avg_words_per_sentence(""), 0.0);
    }

    #[test]
    fn test_no_terminal_is_one_sentence() {
        let text = "this has no terminal punctuation";
        let sents = split_sentences(text, &SentenceSplitterConfig::default());
        assert_eq!(texts(&sents), [text]);
    }

    #[test]
    fn test_abbreviation_guard() {
        let text = "Dr. Smith arrived. He sat down.";
        let guarded = split_sentences(text, &SentenceSplitterConfig::default());
        assert_eq!(texts(&guarded), ["Dr. Smith arrived.", "He sat down."]);

        let cfg = SentenceSplitterConfig {
            abbreviation_guard: false,
            ..SentenceSplitterConfig::default()
        };
        let unguarded = split_sentences(text, &cfg);
        assert_eq!(texts(&unguarded), ["Dr.", "Smith arrived.", "He sat down."]);
    }
}