#[must_use]
pub fn split_sentences(text: &str) -> Vec<String> {
let mut sentences: Vec<String> = Vec::new();
let mut current = String::new();
for c in text.chars() {
current.push(c);
if is_sentence_boundary(c) {
let trimmed = current.trim().to_owned();
if !trimmed.is_empty() {
sentences.push(trimmed);
}
current.clear();
}
}
let tail = current.trim().to_owned();
if !tail.is_empty() {
sentences.push(tail);
}
sentences
}
#[inline]
fn is_sentence_boundary(c: char) -> bool {
matches!(c, '.' | '!' | '؟' | '؛')
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn basic_split() {
let r = split_sentences("سلام. چطوری؟ خوبم!");
assert_eq!(r, vec!["سلام.", "چطوری؟", "خوبم!"]);
}
#[test]
fn no_delimiter() {
assert_eq!(
split_sentences("یک جمله بدون نقطه"),
vec!["یک جمله بدون نقطه"]
);
}
#[test]
fn trailing_whitespace_trimmed() {
let r = split_sentences(" سلام. خداحافظ؟ ");
assert_eq!(r, vec!["سلام.", "خداحافظ؟"]);
}
#[test]
fn empty_input() {
assert!(split_sentences("").is_empty());
}
#[test]
fn zwnj_not_a_boundary() {
let r = split_sentences("می\u{200C}روم به خانه.");
assert_eq!(r.len(), 1);
assert!(r[0].contains('\u{200C}'));
}
#[test]
fn arabic_semicolon_splits() {
let r = split_sentences("اول؛ دوم.");
assert_eq!(r.len(), 2);
}
}