// rnltk/token.rs

1//! Module containing functions used to tokenize strings and get term frequencies.
2
3use std::collections::BTreeMap;
4
5use regex::Regex;
6
7use crate::stem;
8
/// Returns the default list of English stop words.
///
/// These are very common words (articles, pronouns, auxiliaries, contractions,
/// etc.) that carry little topical meaning and are usually filtered out before
/// computing term frequencies.
pub fn get_stop_words() -> Vec<String> {
    ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "you're", "you've", "you'll", "you'd", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "she's", "her", "hers", "herself", "it", "it's", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "that'll", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "don't", "should", "should've", "now", "d", "ll", "m", "o", "re", "ve", "y", "ain", "aren", "aren't", "couldn", "couldn't", "didn", "didn't", "doesn", "doesn't", "hadn", "hadn't", "hasn", "hasn't", "haven", "haven't", "isn", "isn't", "ma", "mightn", "mightn't", "mustn", "mustn't", "needn", "needn't", "shan", "shan't", "shouldn", "shouldn't", "wasn", "wasn't", "weren", "weren't", "won", "won't", "wouldn", "wouldn't"]
        .iter()
        .map(|word| word.to_string())
        .collect()
}
14
/// Configuration for the `*_configurable` tokenization and term-frequency helpers.
#[derive(Debug, Clone)]
pub struct TokenConfig {
    /// Whether tokens should be stemmed (e.g. "general" -> "gener").
    pub stem: bool,
    /// Whether stop words should be removed from the output.
    pub remove_stop_words: bool,
    /// The stop-word list consulted when `remove_stop_words` is `true`.
    pub stop_words: Vec<String>,
}
21
impl Default for TokenConfig {
    /// Defaults to stemming and stop-word removal, using the stop-word list
    /// from [`get_stop_words`].
    fn default() -> Self {
        Self {
            stem: true,
            remove_stop_words: true,
            stop_words: get_stop_words(),
        }
    }
}
31
32/// Converts a `document` to sentence vector.
33///
34/// # Examples
35///
36/// ```
37/// use rnltk::token;
38/// 
39/// let text = "Why hello there. General Kenobi!";
40/// let tokens = vec!["Why hello there", "General Kenobi"];
41/// let tokenized_text = token::tokenize_into_sentences(text);
42///
43/// assert_eq!(tokens, tokenized_text);
44/// ```
45pub fn tokenize_into_sentences(document: &str) -> Vec<String> {
46    let quote_regex = Regex::new(r#"[\.!\?]""#).expect("Invalid regex");
47    let updated_document: &str = &quote_regex.replace_all(document, "\"");
48
49    let separator = Regex::new(r#"[\.!\?] *"#).expect("Invalid regex");
50    let mut full_sentences: Vec<String> = separator.split(updated_document).map(|s| s.to_string()).collect();
51    full_sentences.retain(|sentence| !sentence.is_empty());
52
53    full_sentences
54}
55
56/// Converts `sentence` to token vector.
57///
58/// # Examples
59///
60/// ```
61/// use rnltk::token;
62/// 
63/// let text = "Why hello there. General Kenobi!";
64/// let tokens = vec!["why", "hello", "there", "general", "kenobi"];
65/// let tokenized_text = token::tokenize_sentence(text);
66///
67/// assert_eq!(tokens, tokenized_text);
68/// ```
69pub fn tokenize_sentence(sentence: &str) -> Vec<String> {
70    let punctuation = Regex::new(r#"[!"\#$%&'()*+,-./:;<=>?@\[\]^_`{|}~]+"#).expect("Invalid regex");
71    let updated_sentence: &str = &punctuation.replace_all(sentence, "");
72
73    let mut tokens: Vec<String> = updated_sentence
74        .split(' ')
75        .map(|s| s.trim().to_ascii_lowercase())
76        .collect();
77    tokens.retain(|token| !token.is_empty());
78
79    tokens
80}
81
82/// Converts `sentence` to token vector without stop words.
83///
84/// # Examples
85///
86/// ```
87/// use rnltk::token;
88/// 
89/// let text = "Why hello there. General Kenobi!";
90/// let tokens = vec!["hello", "general", "kenobi"];
91/// let stop_words = token::get_stop_words();
92/// let tokenized_text = token::tokenize_sentence_without_stop_words(text, stop_words);
93///
94/// assert_eq!(tokens, tokenized_text);
95/// ```
96pub fn tokenize_sentence_without_stop_words(sentence: &str, stop_words: Vec<String>) -> Vec<String> {
97    let punctuation = Regex::new(r#"[!"\#$%&'()*+,-./:;<=>?@\[\]^_`{|}~]+"#).expect("Invalid regex");
98    let updated_sentence: &str = &punctuation.replace_all(sentence, "");
99
100    let mut tokens: Vec<String> = tokenize_sentence(updated_sentence);
101    tokens.retain(|token| !stop_words.contains(token));
102
103    tokens
104}
105
106/// Converts `sentence` to stemmed token vector.
107///
108/// # Examples
109///
110/// ```
111/// use rnltk::token;
112/// 
113/// let text = "Why hello there. General Kenobi!";
114/// let tokens = vec!["why", "hello", "there", "gener", "kenobi"];
115/// let tokenized_text = token::tokenize_stemmed_sentence(text);
116///
117/// assert_eq!(tokens, tokenized_text);
118/// ```
119pub fn tokenize_stemmed_sentence(sentence: &str) -> Vec<String> {
120    let punctuation = Regex::new(r#"[!"\#$%&'()*+,-./:;<=>?@\[\]^_`{|}~]+"#).expect("Invalid regex");
121    let updated_sentence: &str = &punctuation.replace_all(sentence, "");
122
123    let tokens: Vec<String> = updated_sentence
124        .split(' ')
125        .map(|s| s.trim())
126        .filter(|s| !s.is_empty())
127        .map(|s| stem::get(s).unwrap_or_else(|_| s.to_string()))
128        .collect();
129    
130    tokens
131}
132
133/// Converts `sentence` to stemmed token vector without stop words.
134///
135/// # Examples
136///
137/// ```
138/// use rnltk::token;
139/// 
140/// let text = "Why hello there. General Kenobi!";
141/// let tokens = vec!["hello", "gener", "kenobi"];
142/// let stop_words = token::get_stop_words();
143/// let tokenized_text = token::tokenize_stemmed_sentence_without_stop_words(text, stop_words);
144///
145/// assert_eq!(tokens, tokenized_text);
146/// ```
147pub fn tokenize_stemmed_sentence_without_stop_words(sentence: &str, stop_words: Vec<String>) -> Vec<String> {
148    let punctuation = Regex::new(r#"[!"\#$%&'()*+,-./:;<=>?@\[\]^_`{|}~]+"#).expect("Invalid regex");
149    let updated_sentence: &str = &punctuation.replace_all(sentence, "");
150
151    let tokens: Vec<String> = updated_sentence
152        .split(' ')
153        .map(|token| token.trim().to_ascii_lowercase())
154        .filter(|token| !token.is_empty() && !stop_words.contains(&token.to_string()))
155        .map(|token| stem::get(&token).unwrap_or_else(|_| token.to_string()))
156        .collect();
157
158    tokens
159}
160
161/// Tokenize sentence based on a given configuration.
162/// 
163/// This function will be deprecated in the future once `rnltk` hits version 1.0
164/// and functionality will be moved to `tokenize_sentence`.
165/// 
166/// # Examples
167///
168/// ```
169/// use rnltk::token;
170/// 
171/// let token_config = token::TokenConfig::default();
172/// let text = "Why hello there. General Kenobi!";
173/// let tokens = vec!["hello", "gener", "kenobi"];
174/// let tokenized_text = token::tokenize_sentence_configurable(text, token_config);
175///
176/// assert_eq!(tokens, tokenized_text);
177/// ```
178pub fn tokenize_sentence_configurable(sentence: &str, config: TokenConfig) -> Vec<String> {
179    if config.remove_stop_words && config.stem {
180        tokenize_stemmed_sentence_without_stop_words(sentence, config.stop_words)
181    } else if config.remove_stop_words {
182        tokenize_sentence_without_stop_words(sentence, config.stop_words)
183    } else if config.stem {
184        tokenize_stemmed_sentence(sentence)
185    } else {
186        tokenize_sentence(sentence)
187    }
188}
189
/// Gets a count of all words from a vector of `word_tokens`.
///
/// # Examples
///
/// ```
/// use std::collections::BTreeMap;
/// use rnltk::token;
/// 
/// let arg = vec!["fear", "leads", "to", "anger", "anger", "leads", "to", "hatred", "hatred", "leads", "to", "conflict", "conflict", "leads", "to", "suffering"];
/// let word_counts = BTreeMap::from([("fear".to_string(), 1.), ("leads".to_string(), 4.), ("to".to_string(), 4.), ("anger".to_string(), 2.), ("hatred".to_string(), 2.), ("conflict".to_string(), 2.), ("suffering".to_string(), 1.)]);
/// let term_frequencies = token::get_term_frequencies_from_word_vector(arg);
///
/// assert_eq!(word_counts, term_frequencies);
/// ```
pub fn get_term_frequencies_from_word_vector(word_tokens: Vec<&str>) -> BTreeMap<String, f64> {
    // Accumulate counts by folding the tokens into a map.
    word_tokens
        .into_iter()
        .fold(BTreeMap::new(), |mut counts, token| {
            *counts.entry(token.to_string()).or_insert(0.) += 1.;
            counts
        })
}
212
/// Gets a count of all words from a vector of `word_tokens` without stop words.
///
/// # Examples
///
/// ```
/// use std::collections::BTreeMap;
/// use rnltk::token;
/// 
/// let arg = vec!["fear", "leads", "to", "anger", "anger", "leads", "to", "hatred", "hatred", "leads", "to", "conflict", "conflict", "leads", "to", "suffering"];
/// let word_counts = BTreeMap::from([("fear".to_string(), 1.), ("leads".to_string(), 4.), ("anger".to_string(), 2.), ("hatred".to_string(), 2.), ("conflict".to_string(), 2.), ("suffering".to_string(), 1.)]);
/// let stop_words = token::get_stop_words();
/// let term_frequencies = token::get_term_frequencies_from_word_vector_without_stop_words(arg, stop_words);
///
/// assert_eq!(word_counts, term_frequencies);
/// ```
pub fn get_term_frequencies_from_word_vector_without_stop_words(word_tokens: Vec<&str>, stop_words: Vec<String>) -> BTreeMap<String, f64> {
    // Hash-set lookup avoids both the O(len(stop_words)) linear scan and the
    // per-token String allocation (`&word.to_string()`) the original paid for
    // every single token.
    let stop_set: std::collections::HashSet<&str> =
        stop_words.iter().map(|word| word.as_str()).collect();
    let mut word_counts: BTreeMap<String, f64> = BTreeMap::new();
    for word in word_tokens {
        if !stop_set.contains(word) {
            *word_counts.entry(word.to_string()).or_insert(0.) += 1.;
        }
    }
    word_counts
}
238
239/// Gets a count of all stemmed words from a vector of `word_tokens`.
240/// 
241/// If a word cannot be stemmed, it will get a frequency of the original word.
242///
243/// # Examples
244///
245/// ```
246/// use std::collections::BTreeMap;
247/// use rnltk::token;
248/// 
249/// let arg = vec!["fear", "leads", "to", "anger", "anger", "leads", "to", "hatred", "hatred", "leads", "to", "conflict", "conflict", "leads", "to", "suffering"];
250/// let word_counts = BTreeMap::from([("fear".to_string(), 1.), ("lead".to_string(), 4.), ("to".to_string(), 4.), ("anger".to_string(), 2.), ("hatr".to_string(), 2.), ("conflict".to_string(), 2.), ("suffer".to_string(), 1.)]);
251/// let term_frequencies = token::get_stemmed_term_frequencies_from_word_vector(arg);
252///
253/// assert_eq!(word_counts, term_frequencies);
254/// ```
255pub fn get_stemmed_term_frequencies_from_word_vector(word_tokens: Vec<&str>) -> BTreeMap<String, f64> {
256    let mut word_counts: BTreeMap<String, f64> = BTreeMap::new();
257    for word in word_tokens {
258        let count = word_counts.entry(stem::get(word).unwrap_or_else(|_| word.to_string())).or_insert(0.);
259        *count += 1.;
260    }
261    word_counts
262}
263
264/// Gets a count of all stemmed words from a vector of `word_tokens` without stop words.
265/// 
266/// If a word cannot be stemmed, it will get a frequency of the original word.
267///
268/// # Examples
269///
270/// ```
271/// use std::collections::BTreeMap;
272/// use rnltk::token;
273/// 
274/// let arg = vec!["fear", "leads", "to", "anger", "anger", "leads", "to", "hatred", "hatred", "leads", "to", "conflict", "conflict", "leads", "to", "suffering"];
275/// let word_counts = BTreeMap::from([("fear".to_string(), 1.), ("lead".to_string(), 4.), ("anger".to_string(), 2.), ("hatr".to_string(), 2.), ("conflict".to_string(), 2.), ("suffer".to_string(), 1.)]);
276/// let stop_words = token::get_stop_words();
277/// let term_frequencies = token::get_stemmed_term_frequencies_from_word_vector_without_stop_words(arg, stop_words);
278///
279/// assert_eq!(word_counts, term_frequencies);
280/// ```
281pub fn get_stemmed_term_frequencies_from_word_vector_without_stop_words(word_tokens: Vec<&str>, stop_words: Vec<String>) -> BTreeMap<String, f64> {
282    let mut word_counts: BTreeMap<String, f64> = BTreeMap::new();
283    for word in word_tokens {
284        if !stop_words.contains(&word.to_string()) {
285            let count = word_counts.entry(stem::get(word).unwrap_or_else(|_| word.to_string())).or_insert(0.);
286            *count += 1.;
287        }
288    }
289    word_counts
290}
291
292/// Gets a count of all words from a vector of `word_tokens` based on a given configuration.
293/// 
294/// This function will be deprecated in the future once `rnltk` hits version 1.0
295/// and functionality will be moved to `get_term_frequencies_from_word_vector`.
296/// 
297/// # Examples
298///
299/// ```
300/// use std::collections::BTreeMap;
301/// use rnltk::token;
302/// 
303/// let token_config = token::TokenConfig::default();
304/// let arg = vec!["fear", "leads", "to", "anger", "anger", "leads", "to", "hatred", "hatred", "leads", "to", "conflict", "conflict", "leads", "to", "suffering"];
305/// let word_counts = BTreeMap::from([("fear".to_string(), 1.), ("lead".to_string(), 4.), ("anger".to_string(), 2.), ("hatr".to_string(), 2.), ("conflict".to_string(), 2.), ("suffer".to_string(), 1.)]);
306/// let term_frequencies = token::get_term_frequencies_from_word_vector_configurable(arg, token_config);
307///
308/// assert_eq!(word_counts, term_frequencies);
309/// ```
310pub fn get_term_frequencies_from_word_vector_configurable(word_tokens: Vec<&str>, config: TokenConfig) -> BTreeMap<String, f64> {
311    if config.remove_stop_words && config.stem {
312        get_stemmed_term_frequencies_from_word_vector_without_stop_words(word_tokens, config.stop_words)
313    } else if config.remove_stop_words {
314        get_term_frequencies_from_word_vector_without_stop_words(word_tokens, config.stop_words)
315    } else if config.stem {
316        get_stemmed_term_frequencies_from_word_vector(word_tokens)
317    } else {
318        get_term_frequencies_from_word_vector(word_tokens)
319    }
320}
321
322/// Gets a count of all words from a `sentence`.
323///
324/// # Examples
325///
326/// ```
327/// use std::collections::BTreeMap;
328/// use rnltk::token;
329/// 
330/// let sentence = "fear leads to anger, anger leads to hatred, hatred leads to conflict, conflict leads to suffering.";
331/// let word_counts = BTreeMap::from([("fear".to_string(), 1.), ("leads".to_string(), 4.), ("to".to_string(), 4.), ("anger".to_string(), 2.), ("hatred".to_string(), 2.), ("conflict".to_string(), 2.), ("suffering".to_string(), 1.)]);
332/// let term_frequencies = token::get_term_frequencies_from_sentence(sentence);
333///
334/// assert_eq!(word_counts, term_frequencies);
335/// ```
336pub fn get_term_frequencies_from_sentence(sentence: &str) -> BTreeMap<String, f64> {
337    let sentence_tokens = tokenize_sentence(sentence);
338    let sentence_tokens: Vec<&str> = sentence_tokens.iter().map(|s| s.as_str()).collect();
339    get_term_frequencies_from_word_vector(sentence_tokens)
340}
341
342/// Gets a count of all words from a `sentence` without `stop_words`.
343///
344/// # Examples
345///
346/// ```
347/// use std::collections::BTreeMap;
348/// use rnltk::token;
349/// 
350/// let sentence = "fear leads to anger, anger leads to hatred, hatred leads to conflict, conflict leads to suffering.";
351/// let word_counts = BTreeMap::from([("fear".to_string(), 1.), ("leads".to_string(), 4.), ("anger".to_string(), 2.), ("hatred".to_string(), 2.), ("conflict".to_string(), 2.), ("suffering".to_string(), 1.)]);
352/// let stop_words = token::get_stop_words();
353/// let term_frequencies = token::get_term_frequencies_from_sentence_without_stop_words(sentence, stop_words);
354///
355/// assert_eq!(word_counts, term_frequencies);
356/// ```
357pub fn get_term_frequencies_from_sentence_without_stop_words(sentence: &str, stop_words: Vec<String>) -> BTreeMap<String, f64> {
358    let sentence_tokens = tokenize_sentence(sentence);
359    let sentence_tokens: Vec<&str> = sentence_tokens.iter().map(|s| s.as_str()).collect();
360    get_term_frequencies_from_word_vector_without_stop_words(sentence_tokens, stop_words)
361}
362
363/// Gets a count of all stemmed words from a `sentence`.
364///
365/// # Examples
366///
367/// ```
368/// use std::collections::BTreeMap;
369/// use rnltk::token;
370/// 
371/// let sentence = "fear leads to anger, anger leads to hatred, hatred leads to conflict, conflict leads to suffering.";
372/// let word_counts = BTreeMap::from([("fear".to_string(), 1.), ("lead".to_string(), 4.), ("to".to_string(), 4.), ("anger".to_string(), 2.), ("hatr".to_string(), 2.), ("conflict".to_string(), 2.), ("suffer".to_string(), 1.)]);
373/// let term_frequencies = token::get_stemmed_term_frequencies_from_sentence(sentence);
374///
375/// assert_eq!(word_counts, term_frequencies);
376/// ```
377pub fn get_stemmed_term_frequencies_from_sentence(sentence: &str) -> BTreeMap<String, f64> {
378    let sentence_tokens = tokenize_sentence(sentence);
379    let sentence_tokens: Vec<&str> = sentence_tokens.iter().map(|s| s.as_str()).collect();
380    get_stemmed_term_frequencies_from_word_vector(sentence_tokens)
381}
382
383/// Gets a count of all stemmed words from a `sentence` without `stop_words`.
384///
385/// # Examples
386///
387/// ```
388/// use std::collections::BTreeMap;
389/// use rnltk::token;
390/// 
391/// let sentence = "fear leads to anger, anger leads to hatred, hatred leads to conflict, conflict leads to suffering.";
392/// let word_counts = BTreeMap::from([("fear".to_string(), 1.), ("lead".to_string(), 4.), ("anger".to_string(), 2.), ("hatr".to_string(), 2.), ("conflict".to_string(), 2.), ("suffer".to_string(), 1.)]);
393/// let stop_words = token::get_stop_words();
394/// let term_frequencies = token::get_stemmed_term_frequencies_from_sentence_without_stop_words(sentence, stop_words);
395///
396/// assert_eq!(word_counts, term_frequencies);
397/// ```
398pub fn get_stemmed_term_frequencies_from_sentence_without_stop_words(sentence: &str, stop_words: Vec<String>) -> BTreeMap<String, f64> {
399    let sentence_tokens = tokenize_sentence(sentence);
400    let sentence_tokens: Vec<&str> = sentence_tokens.iter().map(|s| s.as_str()).collect();
401    get_stemmed_term_frequencies_from_word_vector_without_stop_words(sentence_tokens, stop_words)
402}
403
404/// Gets a count of all words from a `sentence` based on a given configuration.
405/// 
406/// This function will be deprecated in the future once `rnltk` hits version 1.0
407/// and functionality will be moved to `get_term_frequencies_from_sentence`.
408/// 
409/// # Examples
410///
411/// ```
412/// use std::collections::BTreeMap;
413/// use rnltk::token;
414/// 
415/// let token_config = token::TokenConfig::default();
416/// let sentence = "fear leads to anger, anger leads to hatred, hatred leads to conflict, conflict leads to suffering.";
417/// let word_counts = BTreeMap::from([("fear".to_string(), 1.), ("lead".to_string(), 4.), ("anger".to_string(), 2.), ("hatr".to_string(), 2.), ("conflict".to_string(), 2.), ("suffer".to_string(), 1.)]);
418/// let term_frequencies = token::get_term_frequencies_from_sentence_configurable(sentence, token_config);
419///
420/// assert_eq!(word_counts, term_frequencies);
421/// ```
422pub fn get_term_frequencies_from_sentence_configurable(sentence: &str, config: TokenConfig) -> BTreeMap<String, f64> {
423    if config.remove_stop_words && config.stem {
424        get_stemmed_term_frequencies_from_sentence_without_stop_words(sentence, config.stop_words)
425    } else if config.remove_stop_words {
426        get_term_frequencies_from_sentence_without_stop_words(sentence, config.stop_words)
427    } else if config.stem {
428        get_stemmed_term_frequencies_from_sentence(sentence)
429    } else {
430        get_term_frequencies_from_sentence(sentence)
431    }
432}
433
434/// Gets a count of all words from a vector of `sentence`s.
435///
436/// # Examples
437///
438/// ```
439/// use std::collections::BTreeMap;
440/// use rnltk::token;
441/// 
442/// let sentences = vec!["fear leads to anger", "anger leads to hatred", "hatred leads to conflict", "conflict leads to suffering."];
443/// let word_counts1 = BTreeMap::from([
444///     ("fear".to_string(), 1.), ("leads".to_string(), 1.), ("to".to_string(), 1.), ("anger".to_string(), 1.), ("hatred".to_string(), 0.), ("conflict".to_string(), 0.), ("suffering".to_string(), 0.)
445/// ]);
446/// let word_counts2 = BTreeMap::from([
447///     ("fear".to_string(), 0.), ("leads".to_string(), 1.), ("to".to_string(), 1.), ("anger".to_string(), 1.), ("hatred".to_string(), 1.), ("conflict".to_string(), 0.), ("suffering".to_string(), 0.)
448/// ]);
449/// let word_counts3 = BTreeMap::from([
450///     ("fear".to_string(), 0.), ("leads".to_string(), 1.), ("to".to_string(), 1.), ("anger".to_string(), 0.), ("hatred".to_string(), 1.), ("conflict".to_string(),1.), ("suffering".to_string(), 0.)
451/// ]);
452/// let word_counts4 = BTreeMap::from([
453///     ("fear".to_string(), 0.), ("leads".to_string(), 1.), ("to".to_string(), 1.), ("anger".to_string(), 0.), ("hatred".to_string(), 0.), ("conflict".to_string(), 1.), ("suffering".to_string(), 1.)
454/// ]);
455/// let term_frequencies = token::get_term_frequencies_from_sentences(&sentences);
456///
457/// assert_eq!(vec![word_counts1, word_counts2, word_counts3, word_counts4], term_frequencies);
458/// ```
459pub fn get_term_frequencies_from_sentences(sentences: &[&str]) -> Vec<BTreeMap<String, f64>> {
460    let mut total_terms: Vec<String> = vec![];
461    let mut term_frequencies: Vec<BTreeMap<String, f64>> = sentences.iter().map(|sentence| {
462        let frequencies = get_term_frequencies_from_sentence(sentence);
463        total_terms.extend(frequencies.keys().cloned().collect::<Vec<String>>());
464        frequencies
465    }).collect();
466    for frequency_counts in &mut term_frequencies {
467        for term in &total_terms {
468            if !frequency_counts.contains_key(term) {
469                frequency_counts.insert(term.to_string(), 0.);
470            }
471        }
472    }
473    term_frequencies
474}
475
476/// Gets a count of all words from a vector of `sentence`s without `stop_words`.
477///
478/// # Examples
479///
480/// ```
481/// use std::collections::BTreeMap;
482/// use rnltk::token;
483/// 
484/// let sentences = vec!["fear leads to anger", "anger leads to hatred", "hatred leads to conflict", "conflict leads to suffering."];
485/// let stop_words = token::get_stop_words();
486/// let word_counts1 = BTreeMap::from([
487///     ("fear".to_string(), 1.), ("leads".to_string(), 1.), ("anger".to_string(), 1.), ("hatred".to_string(), 0.), ("conflict".to_string(), 0.), ("suffering".to_string(), 0.)
488/// ]);
489/// let word_counts2 = BTreeMap::from([
490///     ("fear".to_string(), 0.), ("leads".to_string(), 1.), ("anger".to_string(), 1.), ("hatred".to_string(), 1.), ("conflict".to_string(), 0.), ("suffering".to_string(), 0.)
491/// ]);
492/// let word_counts3 = BTreeMap::from([
493///     ("fear".to_string(), 0.), ("leads".to_string(), 1.), ("anger".to_string(), 0.), ("hatred".to_string(), 1.), ("conflict".to_string(),1.), ("suffering".to_string(), 0.)
494/// ]);
495/// let word_counts4 = BTreeMap::from([
496///     ("fear".to_string(), 0.), ("leads".to_string(), 1.), ("anger".to_string(), 0.), ("hatred".to_string(), 0.), ("conflict".to_string(), 1.), ("suffering".to_string(), 1.)
497/// ]);
498/// let term_frequencies = token::get_term_frequencies_from_sentences_without_stop_words(&sentences, stop_words);
499///
500/// assert_eq!(vec![word_counts1, word_counts2, word_counts3, word_counts4], term_frequencies);
501/// ```
502pub fn get_term_frequencies_from_sentences_without_stop_words(sentences: &[&str], stop_words: Vec<String>) -> Vec<BTreeMap<String, f64>> {
503    let mut total_terms: Vec<String> = vec![];
504    let mut term_frequencies: Vec<BTreeMap<String, f64>> = sentences.iter().map(|sentence| {
505        let frequencies = get_term_frequencies_from_sentence_without_stop_words(sentence, stop_words.clone());
506        total_terms.extend(frequencies.keys().cloned().collect::<Vec<String>>());
507        frequencies
508    }).collect();
509    for frequency_counts in &mut term_frequencies {
510        for term in &total_terms {
511            if !frequency_counts.contains_key(term) {
512                frequency_counts.insert(term.to_string(), 0.);
513            }
514        }
515    }
516    term_frequencies
517}
518
519/// Gets a count of all stemmed words from a vector of `sentence`s.
520///
521/// # Examples
522///
523/// ```
524/// use std::collections::BTreeMap;
525/// use rnltk::token;
526/// 
527/// let sentences = vec!["fear leads to anger", "anger leads to hatred", "hatred leads to conflict", "conflict leads to suffering."];
528/// let word_counts1 = BTreeMap::from([
529///     ("fear".to_string(), 1.), ("lead".to_string(), 1.), ("to".to_string(), 1.), ("anger".to_string(), 1.), ("hatr".to_string(), 0.), ("conflict".to_string(), 0.), ("suffer".to_string(), 0.)
530/// ]);
531/// let word_counts2 = BTreeMap::from([
532///     ("fear".to_string(), 0.), ("lead".to_string(), 1.), ("to".to_string(), 1.), ("anger".to_string(), 1.), ("hatr".to_string(), 1.), ("conflict".to_string(), 0.), ("suffer".to_string(), 0.)
533/// ]);
534/// let word_counts3 = BTreeMap::from([
535///     ("fear".to_string(), 0.), ("lead".to_string(), 1.), ("to".to_string(), 1.), ("anger".to_string(), 0.), ("hatr".to_string(), 1.), ("conflict".to_string(),1.), ("suffer".to_string(), 0.)
536/// ]);
537/// let word_counts4 = BTreeMap::from([
538///     ("fear".to_string(), 0.), ("lead".to_string(), 1.), ("to".to_string(), 1.), ("anger".to_string(), 0.), ("hatr".to_string(), 0.), ("conflict".to_string(), 1.), ("suffer".to_string(), 1.)
539/// ]);
540/// let term_frequencies = token::get_stemmed_term_frequencies_from_sentences(&sentences);
541///
542/// assert_eq!(vec![word_counts1, word_counts2, word_counts3, word_counts4], term_frequencies);
543/// ```
544pub fn get_stemmed_term_frequencies_from_sentences(sentences: &[&str]) -> Vec<BTreeMap<String, f64>> {
545    let mut total_terms: Vec<String> = vec![];
546    let mut term_frequencies: Vec<BTreeMap<String, f64>> = sentences.iter().map(|sentence| {
547        let frequencies = get_stemmed_term_frequencies_from_sentence(sentence);
548        total_terms.extend(frequencies.keys().cloned().collect::<Vec<String>>());
549        frequencies
550    }).collect();
551    for frequency_counts in &mut term_frequencies {
552        for term in &total_terms {
553            if !frequency_counts.contains_key(term) {
554                frequency_counts.insert(term.to_string(), 0.);
555            }
556        }
557    }
558    term_frequencies
559}
560
561
562/// Gets a count of all stemmed words from a vector of `sentence`s without `stop_words`.
563///
564/// # Examples
565///
566/// ```
567/// use std::collections::BTreeMap;
568/// use rnltk::token;
569/// 
570/// let sentences = vec!["fear leads to anger", "anger leads to hatred", "hatred leads to conflict", "conflict leads to suffering."];
571/// let stop_words = token::get_stop_words();
572/// let word_counts1 = BTreeMap::from([
573///     ("fear".to_string(), 1.), ("lead".to_string(), 1.), ("to".to_string(), 1.), ("anger".to_string(), 1.), ("hatr".to_string(), 0.), ("conflict".to_string(), 0.), ("suffer".to_string(), 0.)
574/// ]);
575/// let word_counts2 = BTreeMap::from([
576///     ("fear".to_string(), 0.), ("lead".to_string(), 1.), ("to".to_string(), 1.), ("anger".to_string(), 1.), ("hatr".to_string(), 1.), ("conflict".to_string(), 0.), ("suffer".to_string(), 0.)
577/// ]);
578/// let word_counts3 = BTreeMap::from([
579///     ("fear".to_string(), 0.), ("lead".to_string(), 1.), ("to".to_string(), 1.), ("anger".to_string(), 0.), ("hatr".to_string(), 1.), ("conflict".to_string(),1.), ("suffer".to_string(), 0.)
580/// ]);
581/// let word_counts4 = BTreeMap::from([
582///     ("fear".to_string(), 0.), ("lead".to_string(), 1.), ("to".to_string(), 1.), ("anger".to_string(), 0.), ("hatr".to_string(), 0.), ("conflict".to_string(), 1.), ("suffer".to_string(), 1.)
583/// ]);
584/// let term_frequencies = token::get_stemmed_term_frequencies_from_sentences(&sentences);
585///
586/// assert_eq!(vec![word_counts1, word_counts2, word_counts3, word_counts4], term_frequencies);
587/// ```
588pub fn get_stemmed_term_frequencies_from_sentences_without_stop_words(sentences: &[&str], stop_words: Vec<String>) -> Vec<BTreeMap<String, f64>> {
589    let mut total_terms: Vec<String> = vec![];
590    let mut term_frequencies: Vec<BTreeMap<String, f64>> = sentences.iter().map(|sentence| {
591        let frequencies = get_stemmed_term_frequencies_from_sentence_without_stop_words(sentence, stop_words.clone());
592        total_terms.extend(frequencies.keys().cloned().collect::<Vec<String>>());
593        frequencies
594    }).collect();
595    for frequency_counts in &mut term_frequencies {
596        for term in &total_terms {
597            if !frequency_counts.contains_key(term) {
598                frequency_counts.insert(term.to_string(), 0.);
599            }
600        }
601    }
602    term_frequencies
603}
604
/// Gets a count of all words from a vector of `sentences` based on a given configuration.
/// 
/// This function will be deprecated in the future once `rnltk` hits version 1.0
/// and functionality will be moved to `get_term_frequencies_from_sentences`.
/// 
/// # Examples
///
/// ```
/// use std::collections::BTreeMap;
/// use rnltk::token;
/// 
/// let token_config = token::TokenConfig::default();
/// let sentences = vec!["fear leads to anger", "anger leads to hatred", "hatred leads to conflict", "conflict leads to suffering."];
/// let word_counts1 = BTreeMap::from([
///     ("fear".to_string(), 1.), ("lead".to_string(), 1.), ("anger".to_string(), 1.), ("hatr".to_string(), 0.), ("conflict".to_string(), 0.), ("suffer".to_string(), 0.)
/// ]);
/// let word_counts2 = BTreeMap::from([
///     ("fear".to_string(), 0.), ("lead".to_string(), 1.), ("anger".to_string(), 1.), ("hatr".to_string(), 1.), ("conflict".to_string(), 0.), ("suffer".to_string(), 0.)
/// ]);
/// let word_counts3 = BTreeMap::from([
///     ("fear".to_string(), 0.), ("lead".to_string(), 1.), ("anger".to_string(), 0.), ("hatr".to_string(), 1.), ("conflict".to_string(),1.), ("suffer".to_string(), 0.)
/// ]);
/// let word_counts4 = BTreeMap::from([
///     ("fear".to_string(), 0.), ("lead".to_string(), 1.), ("anger".to_string(), 0.), ("hatr".to_string(), 0.), ("conflict".to_string(), 1.), ("suffer".to_string(), 1.)
/// ]);
/// let term_frequencies = token::get_term_frequencies_from_sentences_configurable(&sentences, token_config);
///
/// assert_eq!(vec![word_counts1, word_counts2, word_counts3, word_counts4], term_frequencies);
/// ```
pub fn get_term_frequencies_from_sentences_configurable(sentences: &[&str], config: TokenConfig) -> Vec<BTreeMap<String, f64>> {
    if config.remove_stop_words && config.stem {
        get_stemmed_term_frequencies_from_sentences_without_stop_words(sentences, config.stop_words)
    } else if config.remove_stop_words {
        get_term_frequencies_from_sentences_without_stop_words(sentences, config.stop_words)
    } else if config.stem {
        get_stemmed_term_frequencies_from_sentences(sentences)
    } else {
        get_term_frequencies_from_sentences(sentences)
    }
}
645
#[cfg(test)]
mod tests {
    use super::*;

    /// Shared fixture: the word vector for the "fear leads to ..." quote.
    fn fear_tokens() -> Vec<&'static str> {
        vec!["fear", "leads", "to", "anger", "anger", "leads", "to", "hatred", "hatred", "leads", "to", "conflict", "conflict", "leads", "to", "suffering"]
    }

    /// Shared fixture: the four "fear leads to ..." sentences.
    fn fear_sentences() -> Vec<&'static str> {
        vec!["fear leads to anger", "anger leads to hatred", "hatred leads to conflict", "conflict leads to suffering."]
    }

    /// Builds a `BTreeMap<String, f64>` from borrowed (word, count) pairs.
    fn counts(pairs: &[(&str, f64)]) -> BTreeMap<String, f64> {
        pairs.iter().map(|&(word, n)| (word.to_string(), n)).collect()
    }

    #[test]
    fn test_document_tokenization() {
        let result = tokenize_into_sentences("Why hello there. General Kenobi!");
        assert_eq!(vec!["Why hello there", "General Kenobi"], result);
    }

    #[test]
    fn test_sentence_tokenization() {
        let result = tokenize_sentence("Why hello there. General Kenobi!");
        assert_eq!(vec!["why", "hello", "there", "general", "kenobi"], result);
    }

    #[test]
    fn test_sentence_tokenization_without_stop_words() {
        let result = tokenize_sentence_without_stop_words("Why hello there. General Kenobi!", get_stop_words());
        assert_eq!(vec!["hello", "general", "kenobi"], result);
    }

    #[test]
    fn test_sentence_tokenization_with_stemming() {
        let result = tokenize_stemmed_sentence("Why hello there. General Kenobi!");
        assert_eq!(vec!["why", "hello", "there", "gener", "kenobi"], result);
    }

    #[test]
    fn test_sentence_tokenization_with_stemming_without_stop_words() {
        let result = tokenize_stemmed_sentence_without_stop_words("Why hello there. General Kenobi!", get_stop_words());
        assert_eq!(vec!["hello", "gener", "kenobi"], result);
    }

    #[test]
    fn test_sentence_tokenization_configurable() {
        // Default config stems AND removes stop words.
        let result = tokenize_sentence_configurable("Why hello there. General Kenobi!", TokenConfig::default());
        assert_eq!(vec!["hello", "gener", "kenobi"], result);
    }

    #[test]
    fn test_term_frequencies_from_str_vector() {
        let expected = counts(&[("fear", 1.), ("leads", 4.), ("to", 4.), ("anger", 2.), ("hatred", 2.), ("conflict", 2.), ("suffering", 1.)]);
        assert_eq!(expected, get_term_frequencies_from_word_vector(fear_tokens()));
    }

    #[test]
    fn test_term_frequencies_from_str_vector_without_stop_words() {
        let expected = counts(&[("fear", 1.), ("leads", 4.), ("anger", 2.), ("hatred", 2.), ("conflict", 2.), ("suffering", 1.)]);
        assert_eq!(expected, get_term_frequencies_from_word_vector_without_stop_words(fear_tokens(), get_stop_words()));
    }

    #[test]
    fn test_term_frequencies_from_str_vector_with_stemming() {
        let expected = counts(&[("fear", 1.), ("lead", 4.), ("to", 4.), ("anger", 2.), ("hatr", 2.), ("conflict", 2.), ("suffer", 1.)]);
        assert_eq!(expected, get_stemmed_term_frequencies_from_word_vector(fear_tokens()));
    }

    #[test]
    fn test_term_frequencies_from_str_vector_with_stemming_without_stop_words() {
        let expected = counts(&[("fear", 1.), ("lead", 4.), ("anger", 2.), ("hatr", 2.), ("conflict", 2.), ("suffer", 1.)]);
        assert_eq!(expected, get_stemmed_term_frequencies_from_word_vector_without_stop_words(fear_tokens(), get_stop_words()));
    }

    #[test]
    fn test_term_frequencies_from_str_vector_configurable() {
        // Default config stems AND removes stop words.
        let expected = counts(&[("fear", 1.), ("lead", 4.), ("anger", 2.), ("hatr", 2.), ("conflict", 2.), ("suffer", 1.)]);
        assert_eq!(expected, get_term_frequencies_from_word_vector_configurable(fear_tokens(), TokenConfig::default()));
    }

    #[test]
    fn test_term_frequencies_from_sentences() {
        // One map per sentence; vocabulary is shared so absent words count 0.
        let expected = vec![
            counts(&[("fear", 1.), ("leads", 1.), ("to", 1.), ("anger", 1.), ("hatred", 0.), ("conflict", 0.), ("suffering", 0.)]),
            counts(&[("fear", 0.), ("leads", 1.), ("to", 1.), ("anger", 1.), ("hatred", 1.), ("conflict", 0.), ("suffering", 0.)]),
            counts(&[("fear", 0.), ("leads", 1.), ("to", 1.), ("anger", 0.), ("hatred", 1.), ("conflict", 1.), ("suffering", 0.)]),
            counts(&[("fear", 0.), ("leads", 1.), ("to", 1.), ("anger", 0.), ("hatred", 0.), ("conflict", 1.), ("suffering", 1.)]),
        ];
        assert_eq!(expected, get_term_frequencies_from_sentences(&fear_sentences()));
    }

    #[test]
    fn test_term_frequencies_from_sentences_without_stop_words() {
        let expected = vec![
            counts(&[("fear", 1.), ("leads", 1.), ("anger", 1.), ("hatred", 0.), ("conflict", 0.), ("suffering", 0.)]),
            counts(&[("fear", 0.), ("leads", 1.), ("anger", 1.), ("hatred", 1.), ("conflict", 0.), ("suffering", 0.)]),
            counts(&[("fear", 0.), ("leads", 1.), ("anger", 0.), ("hatred", 1.), ("conflict", 1.), ("suffering", 0.)]),
            counts(&[("fear", 0.), ("leads", 1.), ("anger", 0.), ("hatred", 0.), ("conflict", 1.), ("suffering", 1.)]),
        ];
        assert_eq!(expected, get_term_frequencies_from_sentences_without_stop_words(&fear_sentences(), get_stop_words()));
    }

    #[test]
    fn test_term_frequencies_from_sentences_with_stemming() {
        let expected = vec![
            counts(&[("fear", 1.), ("lead", 1.), ("to", 1.), ("anger", 1.), ("hatr", 0.), ("conflict", 0.), ("suffer", 0.)]),
            counts(&[("fear", 0.), ("lead", 1.), ("to", 1.), ("anger", 1.), ("hatr", 1.), ("conflict", 0.), ("suffer", 0.)]),
            counts(&[("fear", 0.), ("lead", 1.), ("to", 1.), ("anger", 0.), ("hatr", 1.), ("conflict", 1.), ("suffer", 0.)]),
            counts(&[("fear", 0.), ("lead", 1.), ("to", 1.), ("anger", 0.), ("hatr", 0.), ("conflict", 1.), ("suffer", 1.)]),
        ];
        assert_eq!(expected, get_stemmed_term_frequencies_from_sentences(&fear_sentences()));
    }

    #[test]
    fn test_term_frequencies_from_sentences_with_stemming_without_stop_words() {
        let expected = vec![
            counts(&[("fear", 1.), ("lead", 1.), ("anger", 1.), ("hatr", 0.), ("conflict", 0.), ("suffer", 0.)]),
            counts(&[("fear", 0.), ("lead", 1.), ("anger", 1.), ("hatr", 1.), ("conflict", 0.), ("suffer", 0.)]),
            counts(&[("fear", 0.), ("lead", 1.), ("anger", 0.), ("hatr", 1.), ("conflict", 1.), ("suffer", 0.)]),
            counts(&[("fear", 0.), ("lead", 1.), ("anger", 0.), ("hatr", 0.), ("conflict", 1.), ("suffer", 1.)]),
        ];
        assert_eq!(expected, get_stemmed_term_frequencies_from_sentences_without_stop_words(&fear_sentences(), get_stop_words()));
    }

    #[test]
    fn test_term_frequencies_from_sentences_configurable() {
        // Default config stems AND removes stop words.
        let expected = vec![
            counts(&[("fear", 1.), ("lead", 1.), ("anger", 1.), ("hatr", 0.), ("conflict", 0.), ("suffer", 0.)]),
            counts(&[("fear", 0.), ("lead", 1.), ("anger", 1.), ("hatr", 1.), ("conflict", 0.), ("suffer", 0.)]),
            counts(&[("fear", 0.), ("lead", 1.), ("anger", 0.), ("hatr", 1.), ("conflict", 1.), ("suffer", 0.)]),
            counts(&[("fear", 0.), ("lead", 1.), ("anger", 0.), ("hatr", 0.), ("conflict", 1.), ("suffer", 1.)]),
        ];
        assert_eq!(expected, get_term_frequencies_from_sentences_configurable(&fear_sentences(), TokenConfig::default()));
    }
}