/// Splits `text` into sentences.
///
/// A sentence ends at `.`, `!` or `?`:
///
/// * `.` ends a sentence only when `is_sentence_boundary_bytes` accepts it,
///   which keeps decimals (`3.14`), single-letter initials (`J. Smith`) and
///   the inner dots of an ellipsis inside the current sentence.
/// * `!` and `?` end a sentence unless the very next character is itself
///   `.`, `!` or `?`, so runs such as `?!`, `!!` or `?...` stay attached to
///   the sentence they close instead of becoming fragments of their own.
///
/// Each returned sentence is trimmed of surrounding whitespace and keeps its
/// terminating punctuation. Fragments that are empty after trimming are
/// skipped. Text after the last boundary is returned as a final sentence even
/// if it has no terminator. Empty input yields an empty vector.
pub fn split_sentences(text: &str) -> Vec<String> {
    let bytes = text.as_bytes();
    let mut sentences = Vec::new();
    let mut start = 0;

    for (byte_pos, ch) in text.char_indices() {
        let is_boundary = match ch {
            '.' => is_sentence_boundary_bytes(bytes, byte_pos),
            // All three terminators are single-byte ASCII, so looking at the
            // next raw byte can never land inside a multi-byte character.
            '!' | '?' => !matches!(
                bytes.get(byte_pos + 1).copied(),
                Some(b'.' | b'!' | b'?')
            ),
            _ => false,
        };
        if !is_boundary {
            continue;
        }

        // The terminator is one byte wide, so `end` is a valid char boundary.
        let end = byte_pos + 1;
        let sentence = text[start..end].trim();
        if !sentence.is_empty() {
            sentences.push(sentence.to_string());
        }
        start = end;
    }

    // Whatever follows the last boundary (possibly the whole input).
    let rest = text[start..].trim();
    if !rest.is_empty() {
        sentences.push(rest.to_string());
    }
    sentences
}
/// Decides whether the period at byte offset `pos` of `bytes` ends a sentence.
///
/// The checks run in this order:
///
/// 1. A period immediately followed by an ASCII digit is not a boundary
///    (for example the dot in `3.14`).
/// 2. A period immediately after a lone ASCII uppercase letter is not a
///    boundary (for example `J. Smith`). "Lone" means the letter is the first
///    byte of the input or the byte before it is not ASCII alphanumeric.
/// 3. Otherwise the period is a boundary exactly when it is the last byte or
///    the next byte is ASCII whitespace (space, tab, newline, ...).
///
/// Only ASCII properties are inspected; non-ASCII neighbours count as
/// "not a digit", "not uppercase", "not alphanumeric" and "not whitespace".
fn is_sentence_boundary_bytes(bytes: &[u8], pos: usize) -> bool {
    let next = bytes.get(pos + 1).copied();
    let prev = pos.checked_sub(1).and_then(|i| bytes.get(i)).copied();
    let before_prev = pos.checked_sub(2).and_then(|i| bytes.get(i)).copied();

    // Decimal point inside a number.
    if matches!(next, Some(b) if b.is_ascii_digit()) {
        return false;
    }

    // Single-letter initial: an uppercase letter that is not the tail of a
    // longer alphanumeric token.
    let prev_is_upper = matches!(prev, Some(b) if b.is_ascii_uppercase());
    let prev_is_word_tail = matches!(before_prev, Some(b) if b.is_ascii_alphanumeric());
    if prev_is_upper && !prev_is_word_tail {
        return false;
    }

    // End of input or any ASCII whitespace (newline included) closes the
    // sentence; anything else keeps the period inside it.
    match next {
        None => true,
        Some(b) => b.is_ascii_whitespace(),
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Three plain period-terminated sentences are returned trimmed, in order.
    #[test]
    fn test_split_basic_sentences() {
        let sentences = split_sentences("Hello world. This is a test. Another sentence here.");
        assert_eq!(
            sentences,
            ["Hello world.", "This is a test.", "Another sentence here."]
        );
    }

    /// `!`, `.` and `?` each close a sentence.
    #[test]
    fn test_split_with_exclamation() {
        assert_eq!(split_sentences("Wow! That is amazing. Really?").len(), 3);
    }

    /// The dot inside a decimal number does not split the sentence.
    #[test]
    fn test_decimal_not_split() {
        assert_eq!(split_sentences("The value is 3.14 and it matters.").len(), 1);
    }

    /// Empty input produces no sentences.
    #[test]
    fn test_empty_text() {
        assert_eq!(split_sentences("").len(), 0);
    }
}