// yake-rust 1.0.3

use segtok::segmenter::split_multi;
use segtok::tokenizer::{split_contractions, web_tokenizer};

/// Splits `text` into word tokens.
///
/// The text is run through `web_tokenizer` and the resulting tokens through
/// `split_contractions`. From that output, two kinds of tokens are discarded:
///
/// * empty tokens, and
/// * tokens longer than one byte that begin with an apostrophe
///   (presumably detached contraction suffixes such as `'s` — confirm against
///   the `segtok` documentation).
///
/// A lone `'` token is kept. The remaining tokens are returned in their
/// original order.
pub fn split_into_words(text: &str) -> Vec<String> {
    let tokens = split_contractions(web_tokenizer(text));

    tokens
        .into_iter()
        .filter(|token| {
            // `len()` counts bytes, so a single `'` (one byte) is not a fragment.
            let apostrophe_fragment = token.len() > 1 && token.starts_with('\'');
            !token.is_empty() && !apostrophe_fragment
        })
        .collect()
}

/// Splits `text` into sentence spans.
///
/// Delegates to `split_multi` with its default configuration and returns the
/// produced spans in order, leaving out any span that is empty.
pub fn split_into_sentences(text: &str) -> Vec<String> {
    let mut sentences = Vec::new();
    for span in split_multi(text, Default::default()) {
        if !span.is_empty() {
            sentences.push(span);
        }
    }
    sentences
}

#[cfg(test)]
mod tests {
    use super::*;

    /// A hyphenated compound stays a single token, while the trailing
    /// punctuation becomes a token of its own.
    #[test]
    fn split_hyphenated_sentence_into_words() {
        let words = split_into_words("Truly high-tech!");
        assert_eq!(words, ["Truly", "high-tech", "!"]);
    }

    /// Three period-terminated sentences are returned as three spans, each
    /// keeping its terminating period.
    #[test]
    fn split_sentences() {
        let sentences = split_into_sentences("One smartwatch. One phone. Many phones.");
        assert_eq!(sentences, ["One smartwatch.", "One phone.", "Many phones."]);
    }

    /// A literal spread over several indented source lines is split into two
    /// sentences.
    #[test]
    fn split_tabbed_multiline_text_into_sentences() {
        // NOTE: each trailing `\` removes the line break and the indentation
        // that follows it, so the value handed to the splitter is one line of
        // text separated by single spaces — it contains no newlines or tabs.
        let input = "This is your weekly newsletter! \
            Hundreds of great deals - everything from men's fashion \
            to high-tech drones!";

        let sentences = split_into_sentences(input);

        assert_eq!(
            sentences,
            [
                "This is your weekly newsletter!",
                "Hundreds of great deals - everything from men's fashion to high-tech drones!",
            ]
        );
    }
}