/// Splits `text` into sentences using a deliberately simple rule.
///
/// A sentence ends at a `.`, `!` or `?` that is immediately followed by at
/// least one whitespace character. The terminator stays with the sentence and
/// the whitespace run after it is discarded. A terminator that is not followed
/// by whitespace (as inside `U.S.A.` or `3.14`, or at the very end of the
/// input) does not split. Whatever remains after the last split is returned as
/// the final sentence.
///
/// Whitespace is decided per `char` via [`char::is_whitespace`], so Unicode
/// spaces such as U+00A0 (no-break space) or U+3000 (ideographic space) after
/// a terminator split the same way an ASCII space does. This matches the
/// Unicode-aware trimming applied to the input as a whole.
///
/// No abbreviation handling is attempted: `"U.S.A. is here."` yields
/// `["U.S.A.", "is here."]`.
///
/// # Returns
///
/// The sentences in input order, each without leading or trailing whitespace.
/// Empty or whitespace-only input yields an empty vector.
pub fn naive_sentences(text: &str) -> Vec<String> {
    let mut sentences = Vec::new();
    // Invariant: `rest` never starts with whitespace, so every slice pushed
    // below is already trimmed on both sides.
    let mut rest = text.trim();

    while !rest.is_empty() {
        // Byte offset just past the first terminator that is followed by
        // whitespace, if there is one.
        let boundary = rest.char_indices().find_map(|(idx, ch)| {
            if !matches!(ch, '.' | '!' | '?') {
                return None;
            }
            let end = idx + ch.len_utf8();
            rest[end..]
                .chars()
                .next()
                .filter(|next| next.is_whitespace())
                .map(|_| end)
        });

        match boundary {
            Some(end) => {
                sentences.push(rest[..end].to_string());
                // Drop the whole whitespace run separating the sentences.
                rest = rest[end..].trim_start();
            }
            None => {
                // No further split point: the remainder is the last sentence.
                sentences.push(rest.to_string());
                break;
            }
        }
    }

    sentences
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty and whitespace-only inputs produce no sentences.
    #[test]
    fn empty_returns_empty() {
        for input in ["", " \n "] {
            assert!(naive_sentences(input).is_empty());
        }
    }

    /// Text without any terminator comes back as a single sentence.
    #[test]
    fn single_sentence_no_terminator() {
        let got = naive_sentences("just words");
        assert_eq!(got, ["just words"]);
    }

    /// Each terminator followed by a space ends a sentence.
    #[test]
    fn three_sentences_split() {
        let got = naive_sentences("Hello world. This is two. And three!");
        assert_eq!(got, ["Hello world.", "This is two.", "And three!"]);
    }

    /// Dots inside an abbreviation do not split, but the final dot followed
    /// by a space does — there is no abbreviation awareness.
    #[test]
    fn abbreviation_with_trailing_space_does_split() {
        let got = naive_sentences("U.S.A. is here.");
        assert_eq!(got, ["U.S.A.", "is here."]);
    }

    /// Non-terminator punctuation such as commas stays inside the sentence.
    #[test]
    fn preserves_internal_punctuation() {
        let got = naive_sentences("Hello, world! How are you?");
        assert_eq!(got, ["Hello, world!", "How are you?"]);
    }
}