/// Splits `text` into sentences, returned in order and trimmed of surrounding whitespace.
///
/// A sentence ends at:
/// - a `.` for which `is_sentence_boundary` returns `true`, or
/// - a `!` or `?` that is not immediately followed by another terminator
///   (`.`, `!` or `?`), so runs such as `?!`, `!!` or `!...` stay attached to
///   the sentence they close instead of becoming one-character "sentences".
///
/// Any non-blank text left after the last terminator is returned as a final
/// sentence. Empty or whitespace-only input yields an empty vector.
pub fn split_sentences(text: &str) -> Vec<String> {
    // Random access is needed for the look-behind/look-ahead done on each `.`.
    let chars: Vec<char> = text.chars().collect();
    let mut sentences = Vec::new();
    let mut current = String::new();

    for (i, &ch) in chars.iter().enumerate() {
        current.push(ch);

        let ends_sentence = match ch {
            '.' => is_sentence_boundary(&chars, i),
            // Only the last mark of a terminator run closes the sentence.
            '!' | '?' => !matches!(chars.get(i + 1).copied(), Some('.' | '!' | '?')),
            _ => false,
        };

        if ends_sentence {
            // `current` holds at least the terminator, so it is never blank here.
            sentences.push(current.trim().to_string());
            current.clear();
        }
    }

    // Keep trailing text that has no terminator of its own.
    let rest = current.trim();
    if !rest.is_empty() {
        sentences.push(rest.to_string());
    }
    sentences
}
/// Decides whether the `.` at `chars[pos]` ends a sentence.
///
/// Rules, checked in order:
/// 1. A period directly followed by an ASCII digit is not a boundary
///    (decimal numbers such as `3.14`).
/// 2. A period directly after a single ASCII uppercase letter that is itself
///    at the start of the text or preceded by a non-alphanumeric character is
///    not a boundary (initials such as `J. K.`). This also means a sentence
///    that really ends in a lone capital (`plan B.`) is not split there.
/// 3. Otherwise the period is a boundary when it is the last character or is
///    followed by whitespace. Newlines count as whitespace, and the case of
///    the following word is not considered.
fn is_sentence_boundary(chars: &[char], pos: usize) -> bool {
    let next = chars.get(pos + 1).copied();

    // Rule 1: decimal point.
    if next.map_or(false, |c| c.is_ascii_digit()) {
        return false;
    }

    // Rule 2: single-letter initial.
    let prev = pos.checked_sub(1).and_then(|p| chars.get(p));
    let before_prev = pos.checked_sub(2).and_then(|p| chars.get(p));
    let follows_initial = prev.map_or(false, |c| c.is_ascii_uppercase())
        && before_prev.map_or(true, |c| !c.is_alphanumeric());
    if follows_initial {
        return false;
    }

    // Rule 3: end of text, or whitespace after the period.
    next.map_or(true, char::is_whitespace)
}
#[cfg(test)]
mod tests {
    use super::*;

    // Period-terminated sentences separated by single spaces.
    #[test]
    fn test_split_basic_sentences() {
        let text = "Hello world. This is a test. Another sentence here.";
        let sentences = split_sentences(text);
        assert_eq!(
            sentences,
            ["Hello world.", "This is a test.", "Another sentence here."]
        );
    }

    // `!` and `?` terminate sentences just like `.`; check content, not only the count.
    #[test]
    fn test_split_with_exclamation() {
        let text = "Wow! That is amazing. Really?";
        let sentences = split_sentences(text);
        assert_eq!(sentences, ["Wow!", "That is amazing.", "Really?"]);
    }

    // A period followed by a digit is a decimal point, not a sentence end.
    #[test]
    fn test_decimal_not_split() {
        let text = "The value is 3.14 and it matters.";
        let sentences = split_sentences(text);
        assert_eq!(sentences, ["The value is 3.14 and it matters."]);
    }

    #[test]
    fn test_empty_text() {
        let sentences = split_sentences("");
        assert!(sentences.is_empty());
    }

    // Input containing only whitespace produces no sentences.
    #[test]
    fn test_whitespace_only_text() {
        assert!(split_sentences("  \n\t ").is_empty());
    }

    // Text after the last terminator (or with none at all) is still returned.
    #[test]
    fn test_trailing_text_without_terminator() {
        assert_eq!(split_sentences("No terminator here"), ["No terminator here"]);
        assert_eq!(split_sentences("Done. And then"), ["Done.", "And then"]);
    }

    // Periods after single-letter initials do not split the sentence.
    #[test]
    fn test_initials_not_split() {
        let sentences = split_sentences("J. K. Rowling wrote it. It sold well.");
        assert_eq!(sentences, ["J. K. Rowling wrote it.", "It sold well."]);
    }

    // Leading and trailing whitespace is trimmed from every sentence.
    #[test]
    fn test_surrounding_whitespace_trimmed() {
        let sentences = split_sentences("  Leading. Trailing.  ");
        assert_eq!(sentences, ["Leading.", "Trailing."]);
    }
}