use std::borrow::Cow;
use memchr::memchr3;
const ABBREVIATIONS: &[&str] = &[
"mr", "mrs", "ms", "dr", "prof", "sr", "jr", "st", "ave", "blvd", "gen", "gov", "sgt", "cpl", "pvt", "capt", "lt",
"col", "maj", "cmdr", "adm", "dept", "univ", "assn", "bros", "inc", "ltd", "co", "corp", "vs", "al", "approx",
"appt", "apt", "dept", "dpt", "est", "etc", "fig", "figs", "ft", "hr", "hrs", "min", "mins", "misc", "mt", "no",
"nos", "nr", "oz", "ph", "pp", "sec", "vol", "rev", "jan", "feb", "mar", "apr", "jun", "jul", "aug", "sep", "oct",
"nov", "dec", "mon", "tue", "wed", "thu", "fri", "sat", "sun",
];
pub(crate) fn split_into_sentences(text: &str) -> Vec<Cow<'_, str>> {
if text.is_empty() {
return Vec::new();
}
let bytes = text.as_bytes();
let mut sentences: Vec<Cow<'_, str>> = Vec::new();
let mut start = 0;
while start < bytes.len() {
while start < bytes.len() && bytes[start].is_ascii_whitespace() {
start += 1;
}
if start >= bytes.len() {
break;
}
match find_sentence_end(text, start) {
Some(end) => {
let s = text[start..end].trim();
if !s.is_empty() {
sentences.push(Cow::Borrowed(s));
}
start = end;
}
None => {
let s = text[start..].trim();
if !s.is_empty() {
sentences.push(Cow::Borrowed(s));
}
break;
}
}
}
sentences
}
fn find_sentence_end(text: &str, from: usize) -> Option<usize> {
let bytes = text.as_bytes();
let mut pos = from;
while pos < bytes.len() {
if bytes[pos] == b'\n' {
let nl_start = pos;
pos += 1;
let mut found_second_nl = false;
let mut scan = pos;
while scan < bytes.len() {
if bytes[scan] == b'\n' {
found_second_nl = true;
scan += 1;
break;
} else if bytes[scan] == b' ' || bytes[scan] == b'\t' || bytes[scan] == b'\r' {
scan += 1;
} else {
break;
}
}
if found_second_nl {
return Some(scan);
}
pos = nl_start + 1;
continue;
}
match memchr3(b'.', b'!', b'?', &bytes[pos..]) {
None => return None,
Some(offset) => {
let terminal_pos = pos + offset;
let mut end = terminal_pos + 1;
while end < bytes.len() && (bytes[end] == b'.' || bytes[end] == b'!' || bytes[end] == b'?') {
end += 1;
}
while end < bytes.len() && matches!(bytes[end], b'"' | b'\'' | b')' | b']' | b'}') {
end += 1;
}
if is_sentence_boundary(text, terminal_pos, end) {
return Some(end);
}
pos = end;
}
}
}
None
}
fn is_sentence_boundary(text: &str, terminal_pos: usize, after_terminal: usize) -> bool {
let bytes = text.as_bytes();
if bytes[terminal_pos] != b'.' {
return has_content_after(bytes, after_terminal);
}
if is_abbreviation(text, terminal_pos) {
return false;
}
if terminal_pos >= 1
&& bytes[terminal_pos - 1].is_ascii_alphabetic()
&& (terminal_pos < 2 || !bytes[terminal_pos - 2].is_ascii_alphabetic())
{
return false;
}
let mut next = after_terminal;
while next < bytes.len() && bytes[next].is_ascii_whitespace() {
next += 1;
}
if next >= bytes.len() {
return false; }
if bytes[next].is_ascii_lowercase() {
return false;
}
true
}
fn has_content_after(bytes: &[u8], pos: usize) -> bool {
let mut i = pos;
while i < bytes.len() && bytes[i].is_ascii_whitespace() {
i += 1;
}
i < bytes.len()
}
fn is_abbreviation(text: &str, dot_pos: usize) -> bool {
let bytes = text.as_bytes();
let mut word_start = dot_pos;
while word_start > 0 && bytes[word_start - 1].is_ascii_alphabetic() {
word_start -= 1;
}
if word_start == dot_pos {
return false; }
let word = &text[word_start..dot_pos];
let lower: Cow<'_, str> = if word.bytes().any(|b| b.is_ascii_uppercase()) {
Cow::Owned(word.to_ascii_lowercase())
} else {
Cow::Borrowed(word)
};
ABBREVIATIONS.contains(&lower.as_ref())
}
pub(crate) fn split_into_words(text: &str) -> Vec<String> {
let mut words: Vec<String> = Vec::new();
let bytes = text.as_bytes();
let len = bytes.len();
let mut i = 0;
while i < len {
let b = bytes[i];
if b.is_ascii_whitespace() {
i += 1;
continue;
}
if is_punctuation_byte(b) && b != b'\'' && b != b'-' {
words.push(String::from(b as char));
i += 1;
continue;
}
let word_start = i;
while i < len {
let c = bytes[i];
if c.is_ascii_alphanumeric() || c > 127 {
i += 1;
while i < len && bytes[i] > 127 && bytes[i] < 192 {
i += 1;
}
} else if c == b'-' && i + 1 < len && (bytes[i + 1].is_ascii_alphanumeric() || bytes[i + 1] > 127) {
i += 1;
} else if c == b'\'' && i > word_start && i + 1 < len && bytes[i + 1].is_ascii_alphabetic() {
let before = &text[word_start..i];
if !before.is_empty() {
words.push(before.to_string());
}
let cont_start = i;
i += 1; while i < len && bytes[i].is_ascii_alphabetic() {
i += 1;
}
let contraction = &text[cont_start..i];
if contraction.len() > 1 && !contraction.starts_with("'") || contraction.len() > 1 {
}
continue;
} else {
break;
}
}
if i > word_start {
let word = &text[word_start..i];
if !word.is_empty() {
words.push(word.to_string());
}
} else {
if i < len {
let ch_len = text[i..].chars().next().map_or(1, |c| c.len_utf8());
words.push(text[i..i + ch_len].to_string());
i += ch_len;
}
}
}
words
}
#[inline]
fn is_punctuation_byte(b: u8) -> bool {
matches!(
b,
b'!' | b'"'
| b'#'
| b'$'
| b'%'
| b'&'
| b'\''
| b'('
| b')'
| b'*'
| b'+'
| b','
| b'-'
| b'.'
| b'/'
| b':'
| b';'
| b'<'
| b'='
| b'>'
| b'?'
| b'@'
| b'['
| b'\\'
| b']'
| b'^'
| b'_'
| b'`'
| b'{'
| b'|'
| b'}'
| b'~'
)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn split_simple_sentences() {
let text = "One smartwatch. One phone. Many phones.";
let result = split_into_sentences(text);
assert_eq!(result.len(), 3);
assert_eq!(result[0], "One smartwatch.");
assert_eq!(result[1], "One phone.");
assert_eq!(result[2], "Many phones.");
}
#[test]
fn split_exclamation_sentences() {
let text = "This is your weekly newsletter! Hundreds of great deals - everything from men's fashion to high-tech drones!";
let result = split_into_sentences(text);
assert_eq!(result.len(), 2);
assert_eq!(result[0], "This is your weekly newsletter!");
}
#[test]
fn split_paragraph_boundary() {
let text = "First paragraph.\n\nSecond paragraph.";
let result = split_into_sentences(text);
assert_eq!(result.len(), 2);
assert_eq!(result[0], "First paragraph.");
assert_eq!(result[1], "Second paragraph.");
}
#[test]
fn abbreviation_no_split() {
let text = "Dr. Smith went to Washington.";
let result = split_into_sentences(text);
assert_eq!(result.len(), 1);
}
#[test]
fn split_hyphenated_words() {
let text = "Truly high-tech!";
let words = split_into_words(text);
assert_eq!(words, vec!["Truly", "high-tech", "!"]);
}
#[test]
fn empty_text() {
assert!(split_into_sentences("").is_empty());
assert!(split_into_words("").is_empty());
}
#[test]
fn large_input_no_panic() {
let paragraph = "This is a test sentence with some words. ";
let large_text = paragraph.repeat(250_000); let sentences = split_into_sentences(&large_text);
assert!(!sentences.is_empty());
}
}