use rust_stemmers::{Algorithm, Stemmer}; // for stemming single words
use porter_stemmer::stem as pstem; // classic Porter stemmer for single words

/// ### porter_stems
/// Return a Vec<String> with the Porter stems of the words in a &str.
///
/// #### EXAMPLE:
/// ```
/// use zoea::nlp;
/// let port_stems = nlp::porter_stems("Totally dude!");
/// assert_eq!(port_stems[0], "total");
/// ```
pub fn porter_stems(text: &str) -> Vec<String> {
    // return the Porter stem of each word
    let mut token: String;
    let mut tokens = Vec::new();

    // strip punctuation, convert to lower case, and iterate over words
    let text_clean = text.replace(&['(', ')', ',', '\"', '.', ';', ':', '\'', '!', '?'][..], "");
    let text_lower = text_clean.to_lowercase();
    let words = text_lower.split_whitespace();
    for gram in words {
        // find the word stem
        token = pstem(&gram);

        // append this token to the results
        tokens.push(token);
    }
    // return the vector of stems
    tokens
}

/// ### text_tokens
/// This function takes a string slice and produces a vector of stemmed words.
/// Here is what happens under the hood:
/// 1. "funky" non-alphanumeric characters are removed
/// 2. everything is converted to lower case
/// 3. the string slice is split into words (split on whitespace)
/// 4. the "stem" of each word is taken using rust_stemmers
///
/// #### EXAMPLE:
/// ```
/// use zoea::nlp::text_tokens;
/// let sentence = String::from("I walked to San Diego slowly today!");
/// let tokens = text_tokens(&sentence);
/// println!("Sentence = {}", sentence);
/// for token in tokens {
///     println!("token = {}", token)
/// }
/// ```
pub fn text_tokens(text: &str) -> Vec<String> {
    // string goes in, list of tokens comes out

    // declare some variables and bring them into context
    let en_stemmer = Stemmer::create(Algorithm::English); // English-language stemmer, one word at a time
    let mut stem: std::borrow::Cow<str>; // copy-on-write pointer to a word stem
    let mut token: String;
    let mut tokens = Vec::new();

    // strip punctuation, convert to lower case, and iterate over words
    let text_clean = text.replace(&['(', ')', ',', '\"', '.', ';', ':', '\'', '!', '?'][..], "");
    let text_lower = text_clean.to_lowercase();
    let words = text_lower.split_whitespace();
    for gram in words {
        // find the word stem
        stem = en_stemmer.stem(&gram);
        token = stem.to_string();

        // append this token to the results
        tokens.push(token);
    }
    // return the vector of tokens
    tokens
}
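
// A quick sanity check mirroring the doc example above; the expectation that
// "walked" stems to "walk" is an assumption about the English Snowball stemmer.
#[test]
fn text_tokens_sanity() {
    let tokens = text_tokens("I walked to San Diego slowly today!");
    // seven words survive punctuation stripping and whitespace splitting
    assert_eq!(tokens.len(), 7);
    // "walked" is the second word, and should be reduced to its stem
    assert_eq!(tokens[1], "walk");
}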

/// ### text_token_bigrams
/// This function takes a string slice and produces a vector of bigrams as strings.
/// Here is what happens under the hood:
/// 1. "funky" non-alphanumeric characters are removed
/// 2. everything is converted to lower case
/// 3. the string slice is split into words (split on whitespace)
/// 4. the "stem" of each word is taken using rust_stemmers
/// 5. a window of two "stems" moves along the list, producing bigrams
///
/// #### EXAMPLE:
/// ```
/// use zoea::nlp::text_token_bigrams;
/// let sentence = String::from("I walked to San Diego slowly today!");
/// let bigrams = text_token_bigrams(&sentence);
/// println!("Sentence = {}", sentence);
/// for gram in bigrams {
///     println!("bigram = {}", gram)
/// }
/// ```
pub fn text_token_bigrams(text: &str) -> Vec<String> {
    // string goes in, tokenized bigrams come out

    // declare some variables and bring them into context
    let en_stemmer = Stemmer::create(Algorithm::English); // English-language stemmer, one word at a time
    let mut stem: std::borrow::Cow<str>; // copy-on-write pointer to a word stem
    let mut bigram = String::from("!NewDoc"); // rolling bigram, seeded with a start-of-document sentinel
    let mut bigrams = Vec::new();

    // strip punctuation, convert to lower case, and iterate over words
    let text_clean = text.replace(&['(', ')', ',', '\"', '.', ';', ':', '\'', '!', '?'][..], "");
    let text_lower = text_clean.to_lowercase();
    let words = text_lower.split_whitespace();
    for gram in words {
        // find the word stem and pair it with the previous stem
        stem = en_stemmer.stem(&gram);
        bigram.push(' ');
        bigram.push_str(&stem);

        // append this bigram to the results
        bigrams.push(bigram);
        // the current stem becomes the "previous gram" for the next loop
        bigram = stem.to_string();
    }
    // return the vector of bigrams
    bigrams
}
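
// A small sketch of a typical downstream use: the returned bigrams can be fed
// straight into a frequency count with a standard HashMap. The sentence and the
// expected count for "the cat" are illustrative assumptions (each word here is
// already its own stem).
#[test]
fn bigram_frequency_sketch() {
    use std::collections::HashMap;

    let bigrams = text_token_bigrams("The cat sat on the mat. The cat!");
    let mut counts: HashMap<String, u32> = HashMap::new();
    for bigram in bigrams {
        *counts.entry(bigram).or_insert(0) += 1;
    }
    // "the cat" occurs twice in the sentence above
    assert_eq!(counts.get("the cat"), Some(&2));
}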

#[test]
fn demo() {
    let sentence = String::from("Today I walked slowly to the garden in San Diego.");
    let tokenized_bigrams = text_token_bigrams(&sentence);
    assert_eq!(tokenized_bigrams[0], "!NewDoc today".to_string());
    let port_stems = porter_stems("Totally dude!");
    assert_eq!(port_stems[0], "total");
}
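
// An illustrative cross-check, assuming both stemmers reduce "running" to "run";
// the two algorithms are not guaranteed to agree on every word.
#[test]
fn stemmers_agree_on_running() {
    assert_eq!(porter_stems("running")[0], "run");
    assert_eq!(text_tokens("running")[0], "run");
}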