use std::collections::{BTreeMap, BTreeSet, HashSet};

use regex::Regex;

use crate::stem;
8
/// Returns the built-in English stop-word list (NLTK-style) as owned strings,
/// preserving the original ordering.
pub fn get_stop_words() -> Vec<String> {
    ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "you're", "you've", "you'll", "you'd", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "she's", "her", "hers", "herself", "it", "it's", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "that'll", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "don't", "should", "should've", "now", "d", "ll", "m", "o", "re", "ve", "y", "ain", "aren", "aren't", "couldn", "couldn't", "didn", "didn't", "doesn", "doesn't", "hadn", "hadn't", "hasn", "hasn't", "haven", "haven't", "isn", "isn't", "ma", "mightn", "mightn't", "mustn", "mustn't", "needn", "needn't", "shan", "shan't", "shouldn", "shouldn't", "wasn", "wasn't", "weren", "weren't", "won", "won't", "wouldn", "wouldn't"]
        .iter()
        .map(|word| word.to_string())
        .collect()
}
14
/// Options controlling how tokens are produced by the `*_configurable`
/// functions in this module.
#[derive(Debug, Clone)]
pub struct TokenConfig {
    // When true, each token is reduced to its word stem via `crate::stem`.
    pub stem: bool,
    // When true, tokens appearing in `stop_words` are discarded.
    pub remove_stop_words: bool,
    // The stop-word list consulted when `remove_stop_words` is set.
    pub stop_words: Vec<String>,
}
21
22impl Default for TokenConfig {
23 fn default() -> Self {
24 Self {
25 stem: true,
26 remove_stop_words: true,
27 stop_words: get_stop_words(),
28 }
29 }
30}
31
32pub fn tokenize_into_sentences(document: &str) -> Vec<String> {
46 let quote_regex = Regex::new(r#"[\.!\?]""#).expect("Invalid regex");
47 let updated_document: &str = "e_regex.replace_all(document, "\"");
48
49 let separator = Regex::new(r#"[\.!\?] *"#).expect("Invalid regex");
50 let mut full_sentences: Vec<String> = separator.split(updated_document).map(|s| s.to_string()).collect();
51 full_sentences.retain(|sentence| !sentence.is_empty());
52
53 full_sentences
54}
55
56pub fn tokenize_sentence(sentence: &str) -> Vec<String> {
70 let punctuation = Regex::new(r#"[!"\#$%&'()*+,-./:;<=>?@\[\]^_`{|}~]+"#).expect("Invalid regex");
71 let updated_sentence: &str = &punctuation.replace_all(sentence, "");
72
73 let mut tokens: Vec<String> = updated_sentence
74 .split(' ')
75 .map(|s| s.trim().to_ascii_lowercase())
76 .collect();
77 tokens.retain(|token| !token.is_empty());
78
79 tokens
80}
81
82pub fn tokenize_sentence_without_stop_words(sentence: &str, stop_words: Vec<String>) -> Vec<String> {
97 let punctuation = Regex::new(r#"[!"\#$%&'()*+,-./:;<=>?@\[\]^_`{|}~]+"#).expect("Invalid regex");
98 let updated_sentence: &str = &punctuation.replace_all(sentence, "");
99
100 let mut tokens: Vec<String> = tokenize_sentence(updated_sentence);
101 tokens.retain(|token| !stop_words.contains(token));
102
103 tokens
104}
105
106pub fn tokenize_stemmed_sentence(sentence: &str) -> Vec<String> {
120 let punctuation = Regex::new(r#"[!"\#$%&'()*+,-./:;<=>?@\[\]^_`{|}~]+"#).expect("Invalid regex");
121 let updated_sentence: &str = &punctuation.replace_all(sentence, "");
122
123 let tokens: Vec<String> = updated_sentence
124 .split(' ')
125 .map(|s| s.trim())
126 .filter(|s| !s.is_empty())
127 .map(|s| stem::get(s).unwrap_or_else(|_| s.to_string()))
128 .collect();
129
130 tokens
131}
132
133pub fn tokenize_stemmed_sentence_without_stop_words(sentence: &str, stop_words: Vec<String>) -> Vec<String> {
148 let punctuation = Regex::new(r#"[!"\#$%&'()*+,-./:;<=>?@\[\]^_`{|}~]+"#).expect("Invalid regex");
149 let updated_sentence: &str = &punctuation.replace_all(sentence, "");
150
151 let tokens: Vec<String> = updated_sentence
152 .split(' ')
153 .map(|token| token.trim().to_ascii_lowercase())
154 .filter(|token| !token.is_empty() && !stop_words.contains(&token.to_string()))
155 .map(|token| stem::get(&token).unwrap_or_else(|_| token.to_string()))
156 .collect();
157
158 tokens
159}
160
161pub fn tokenize_sentence_configurable(sentence: &str, config: TokenConfig) -> Vec<String> {
179 if config.remove_stop_words && config.stem {
180 tokenize_stemmed_sentence_without_stop_words(sentence, config.stop_words)
181 } else if config.remove_stop_words {
182 tokenize_sentence_without_stop_words(sentence, config.stop_words)
183 } else if config.stem {
184 tokenize_stemmed_sentence(sentence)
185 } else {
186 tokenize_sentence(sentence)
187 }
188}
189
/// Counts how many times each token appears, returning a term → count map.
/// Counts are `f64` so downstream TF-IDF style math needs no casts.
pub fn get_term_frequencies_from_word_vector(word_tokens: Vec<&str>) -> BTreeMap<String, f64> {
    word_tokens
        .into_iter()
        .fold(BTreeMap::new(), |mut counts, token| {
            *counts.entry(token.to_string()).or_insert(0.) += 1.;
            counts
        })
}
212
/// Counts token occurrences, skipping any token present in `stop_words`.
///
/// Builds a `HashSet` once so each membership test is O(1); the original
/// scanned the `Vec` (and allocated a fresh `String`) for every token,
/// an accidental O(n·m) pattern.
pub fn get_term_frequencies_from_word_vector_without_stop_words(word_tokens: Vec<&str>, stop_words: Vec<String>) -> BTreeMap<String, f64> {
    let stop_set: HashSet<&str> = stop_words.iter().map(String::as_str).collect();
    let mut word_counts: BTreeMap<String, f64> = BTreeMap::new();
    for word in word_tokens {
        if !stop_set.contains(word) {
            *word_counts.entry(word.to_string()).or_insert(0.) += 1.;
        }
    }
    word_counts
}
238
239pub fn get_stemmed_term_frequencies_from_word_vector(word_tokens: Vec<&str>) -> BTreeMap<String, f64> {
256 let mut word_counts: BTreeMap<String, f64> = BTreeMap::new();
257 for word in word_tokens {
258 let count = word_counts.entry(stem::get(word).unwrap_or_else(|_| word.to_string())).or_insert(0.);
259 *count += 1.;
260 }
261 word_counts
262}
263
264pub fn get_stemmed_term_frequencies_from_word_vector_without_stop_words(word_tokens: Vec<&str>, stop_words: Vec<String>) -> BTreeMap<String, f64> {
282 let mut word_counts: BTreeMap<String, f64> = BTreeMap::new();
283 for word in word_tokens {
284 if !stop_words.contains(&word.to_string()) {
285 let count = word_counts.entry(stem::get(word).unwrap_or_else(|_| word.to_string())).or_insert(0.);
286 *count += 1.;
287 }
288 }
289 word_counts
290}
291
292pub fn get_term_frequencies_from_word_vector_configurable(word_tokens: Vec<&str>, config: TokenConfig) -> BTreeMap<String, f64> {
311 if config.remove_stop_words && config.stem {
312 get_stemmed_term_frequencies_from_word_vector_without_stop_words(word_tokens, config.stop_words)
313 } else if config.remove_stop_words {
314 get_term_frequencies_from_word_vector_without_stop_words(word_tokens, config.stop_words)
315 } else if config.stem {
316 get_stemmed_term_frequencies_from_word_vector(word_tokens)
317 } else {
318 get_term_frequencies_from_word_vector(word_tokens)
319 }
320}
321
322pub fn get_term_frequencies_from_sentence(sentence: &str) -> BTreeMap<String, f64> {
337 let sentence_tokens = tokenize_sentence(sentence);
338 let sentence_tokens: Vec<&str> = sentence_tokens.iter().map(|s| s.as_str()).collect();
339 get_term_frequencies_from_word_vector(sentence_tokens)
340}
341
342pub fn get_term_frequencies_from_sentence_without_stop_words(sentence: &str, stop_words: Vec<String>) -> BTreeMap<String, f64> {
358 let sentence_tokens = tokenize_sentence(sentence);
359 let sentence_tokens: Vec<&str> = sentence_tokens.iter().map(|s| s.as_str()).collect();
360 get_term_frequencies_from_word_vector_without_stop_words(sentence_tokens, stop_words)
361}
362
363pub fn get_stemmed_term_frequencies_from_sentence(sentence: &str) -> BTreeMap<String, f64> {
378 let sentence_tokens = tokenize_sentence(sentence);
379 let sentence_tokens: Vec<&str> = sentence_tokens.iter().map(|s| s.as_str()).collect();
380 get_stemmed_term_frequencies_from_word_vector(sentence_tokens)
381}
382
383pub fn get_stemmed_term_frequencies_from_sentence_without_stop_words(sentence: &str, stop_words: Vec<String>) -> BTreeMap<String, f64> {
399 let sentence_tokens = tokenize_sentence(sentence);
400 let sentence_tokens: Vec<&str> = sentence_tokens.iter().map(|s| s.as_str()).collect();
401 get_stemmed_term_frequencies_from_word_vector_without_stop_words(sentence_tokens, stop_words)
402}
403
404pub fn get_term_frequencies_from_sentence_configurable(sentence: &str, config: TokenConfig) -> BTreeMap<String, f64> {
423 if config.remove_stop_words && config.stem {
424 get_stemmed_term_frequencies_from_sentence_without_stop_words(sentence, config.stop_words)
425 } else if config.remove_stop_words {
426 get_term_frequencies_from_sentence_without_stop_words(sentence, config.stop_words)
427 } else if config.stem {
428 get_stemmed_term_frequencies_from_sentence(sentence)
429 } else {
430 get_term_frequencies_from_sentence(sentence)
431 }
432}
433
434pub fn get_term_frequencies_from_sentences(sentences: &[&str]) -> Vec<BTreeMap<String, f64>> {
460 let mut total_terms: Vec<String> = vec![];
461 let mut term_frequencies: Vec<BTreeMap<String, f64>> = sentences.iter().map(|sentence| {
462 let frequencies = get_term_frequencies_from_sentence(sentence);
463 total_terms.extend(frequencies.keys().cloned().collect::<Vec<String>>());
464 frequencies
465 }).collect();
466 for frequency_counts in &mut term_frequencies {
467 for term in &total_terms {
468 if !frequency_counts.contains_key(term) {
469 frequency_counts.insert(term.to_string(), 0.);
470 }
471 }
472 }
473 term_frequencies
474}
475
476pub fn get_term_frequencies_from_sentences_without_stop_words(sentences: &[&str], stop_words: Vec<String>) -> Vec<BTreeMap<String, f64>> {
503 let mut total_terms: Vec<String> = vec![];
504 let mut term_frequencies: Vec<BTreeMap<String, f64>> = sentences.iter().map(|sentence| {
505 let frequencies = get_term_frequencies_from_sentence_without_stop_words(sentence, stop_words.clone());
506 total_terms.extend(frequencies.keys().cloned().collect::<Vec<String>>());
507 frequencies
508 }).collect();
509 for frequency_counts in &mut term_frequencies {
510 for term in &total_terms {
511 if !frequency_counts.contains_key(term) {
512 frequency_counts.insert(term.to_string(), 0.);
513 }
514 }
515 }
516 term_frequencies
517}
518
519pub fn get_stemmed_term_frequencies_from_sentences(sentences: &[&str]) -> Vec<BTreeMap<String, f64>> {
545 let mut total_terms: Vec<String> = vec![];
546 let mut term_frequencies: Vec<BTreeMap<String, f64>> = sentences.iter().map(|sentence| {
547 let frequencies = get_stemmed_term_frequencies_from_sentence(sentence);
548 total_terms.extend(frequencies.keys().cloned().collect::<Vec<String>>());
549 frequencies
550 }).collect();
551 for frequency_counts in &mut term_frequencies {
552 for term in &total_terms {
553 if !frequency_counts.contains_key(term) {
554 frequency_counts.insert(term.to_string(), 0.);
555 }
556 }
557 }
558 term_frequencies
559}
560
561
562pub fn get_stemmed_term_frequencies_from_sentences_without_stop_words(sentences: &[&str], stop_words: Vec<String>) -> Vec<BTreeMap<String, f64>> {
589 let mut total_terms: Vec<String> = vec![];
590 let mut term_frequencies: Vec<BTreeMap<String, f64>> = sentences.iter().map(|sentence| {
591 let frequencies = get_stemmed_term_frequencies_from_sentence_without_stop_words(sentence, stop_words.clone());
592 total_terms.extend(frequencies.keys().cloned().collect::<Vec<String>>());
593 frequencies
594 }).collect();
595 for frequency_counts in &mut term_frequencies {
596 for term in &total_terms {
597 if !frequency_counts.contains_key(term) {
598 frequency_counts.insert(term.to_string(), 0.);
599 }
600 }
601 }
602 term_frequencies
603}
604
605pub fn get_term_frequencies_from_sentences_configurable(sentences: &[&str], config: TokenConfig) -> Vec<BTreeMap<String, f64>> {
635 if config.remove_stop_words && config.stem {
636 get_stemmed_term_frequencies_from_sentences_without_stop_words(sentences, config.stop_words)
637 } else if config.remove_stop_words {
638 get_term_frequencies_from_sentences_without_stop_words(sentences, config.stop_words)
639 } else if config.stem {
640 get_stemmed_term_frequencies_from_sentences(sentences)
641 } else {
642 get_term_frequencies_from_sentences(sentences)
643 }
644}
645
// Unit tests covering sentence splitting, token filtering/stemming, and
// term-frequency construction (including the zero-padded multi-sentence
// variants and the TokenConfig-driven dispatchers).
#[cfg(test)]
mod tests {
    use super::*;

    // Document splits on sentence-ending punctuation; terminators are dropped.
    #[test]
    fn test_document_tokenization() {
        let text = "Why hello there. General Kenobi!";
        let tokens = vec!["Why hello there", "General Kenobi"];
        let tokenized_text = tokenize_into_sentences(text);
        assert_eq!(tokens, tokenized_text);
    }

    // Punctuation is stripped and tokens lowercased.
    #[test]
    fn test_sentence_tokenization() {
        let text = "Why hello there. General Kenobi!";
        let tokens = vec!["why", "hello", "there", "general", "kenobi"];
        let tokenized_text = tokenize_sentence(text);
        assert_eq!(tokens, tokenized_text);
    }

    // Stop words ("why", "there") are removed from the token stream.
    #[test]
    fn test_sentence_tokenization_without_stop_words() {
        let stop_words = get_stop_words();
        let text = "Why hello there. General Kenobi!";
        let tokens = vec!["hello", "general", "kenobi"];
        let tokenized_text = tokenize_sentence_without_stop_words(text, stop_words);
        assert_eq!(tokens, tokenized_text);
    }

    // "general" stems to "gener"; output is lowercase.
    #[test]
    fn test_sentence_tokenization_with_stemming() {
        let text = "Why hello there. General Kenobi!";
        let tokens = vec!["why", "hello", "there", "gener", "kenobi"];
        let tokenized_text = tokenize_stemmed_sentence(text);
        assert_eq!(tokens, tokenized_text);
    }

    // Stop-word removal happens before stemming.
    #[test]
    fn test_sentence_tokenization_with_stemming_without_stop_words() {
        let stop_words = get_stop_words();
        let text = "Why hello there. General Kenobi!";
        let tokens = vec!["hello", "gener", "kenobi"];
        let tokenized_text = tokenize_stemmed_sentence_without_stop_words(text, stop_words);
        assert_eq!(tokens, tokenized_text);
    }

    // Default config stems and removes stop words.
    #[test]
    fn test_sentence_tokenization_configurable() {
        let token_config = TokenConfig::default();
        let text = "Why hello there. General Kenobi!";
        let tokens = vec!["hello", "gener", "kenobi"];
        let tokenized_text = tokenize_sentence_configurable(text, token_config);
        assert_eq!(tokens, tokenized_text);
    }

    // Raw counts from a pre-tokenized vector.
    #[test]
    fn test_term_frequencies_from_str_vector() {
        let tokens = vec!["fear", "leads", "to", "anger", "anger", "leads", "to", "hatred", "hatred", "leads", "to", "conflict", "conflict", "leads", "to", "suffering"];
        let word_counts = BTreeMap::from([("fear".to_string(), 1.), ("leads".to_string(), 4.), ("to".to_string(), 4.), ("anger".to_string(), 2.), ("hatred".to_string(), 2.), ("conflict".to_string(), 2.), ("suffering".to_string(), 1.)]);
        let term_frequencies = get_term_frequencies_from_word_vector(tokens);
        assert_eq!(word_counts, term_frequencies);
    }

    // "to" is a stop word and is excluded entirely (no zero entry).
    #[test]
    fn test_term_frequencies_from_str_vector_without_stop_words() {
        let stop_words = get_stop_words();
        let tokens = vec!["fear", "leads", "to", "anger", "anger", "leads", "to", "hatred", "hatred", "leads", "to", "conflict", "conflict", "leads", "to", "suffering"];
        let word_counts = BTreeMap::from([("fear".to_string(), 1.), ("leads".to_string(), 4.), ("anger".to_string(), 2.), ("hatred".to_string(), 2.), ("conflict".to_string(), 2.), ("suffering".to_string(), 1.)]);
        let term_frequencies = get_term_frequencies_from_word_vector_without_stop_words(tokens, stop_words);
        assert_eq!(word_counts, term_frequencies);
    }

    // Counts are keyed by stems ("leads" -> "lead", "hatred" -> "hatr").
    #[test]
    fn test_term_frequencies_from_str_vector_with_stemming() {
        let tokens = vec!["fear", "leads", "to", "anger", "anger", "leads", "to", "hatred", "hatred", "leads", "to", "conflict", "conflict", "leads", "to", "suffering"];
        let word_counts = BTreeMap::from([("fear".to_string(), 1.), ("lead".to_string(), 4.), ("to".to_string(), 4.), ("anger".to_string(), 2.), ("hatr".to_string(), 2.), ("conflict".to_string(), 2.), ("suffer".to_string(), 1.)]);
        let term_frequencies = get_stemmed_term_frequencies_from_word_vector(tokens);
        assert_eq!(word_counts, term_frequencies);
    }

    // Stop-word filtering plus stem-keyed counts.
    #[test]
    fn test_term_frequencies_from_str_vector_with_stemming_without_stop_words() {
        let stop_words = get_stop_words();
        let tokens = vec!["fear", "leads", "to", "anger", "anger", "leads", "to", "hatred", "hatred", "leads", "to", "conflict", "conflict", "leads", "to", "suffering"];
        let word_counts = BTreeMap::from([("fear".to_string(), 1.), ("lead".to_string(), 4.), ("anger".to_string(), 2.), ("hatr".to_string(), 2.), ("conflict".to_string(), 2.), ("suffer".to_string(), 1.)]);
        let term_frequencies = get_stemmed_term_frequencies_from_word_vector_without_stop_words(tokens, stop_words);
        assert_eq!(word_counts, term_frequencies);
    }

    // Default config on a token vector: stemmed, stop words removed.
    #[test]
    fn test_term_frequencies_from_str_vector_configurable() {
        let token_config = TokenConfig::default();
        let tokens = vec!["fear", "leads", "to", "anger", "anger", "leads", "to", "hatred", "hatred", "leads", "to", "conflict", "conflict", "leads", "to", "suffering"];
        let word_counts = BTreeMap::from([("fear".to_string(), 1.), ("lead".to_string(), 4.), ("anger".to_string(), 2.), ("hatr".to_string(), 2.), ("conflict".to_string(), 2.), ("suffer".to_string(), 1.)]);
        let term_frequencies = get_term_frequencies_from_word_vector_configurable(tokens, token_config);
        assert_eq!(word_counts, term_frequencies);
    }

    // Each sentence gets a map over the full shared vocabulary, with 0.0 for
    // terms the sentence does not contain.
    #[test]
    fn test_term_frequencies_from_sentences() {
        let sentences = vec!["fear leads to anger", "anger leads to hatred", "hatred leads to conflict", "conflict leads to suffering."];
        let word_counts1 = BTreeMap::from([
            ("fear".to_string(), 1.), ("leads".to_string(), 1.), ("to".to_string(), 1.), ("anger".to_string(), 1.), ("hatred".to_string(), 0.), ("conflict".to_string(), 0.), ("suffering".to_string(), 0.)
        ]);
        let word_counts2 = BTreeMap::from([
            ("fear".to_string(), 0.), ("leads".to_string(), 1.), ("to".to_string(), 1.), ("anger".to_string(), 1.), ("hatred".to_string(), 1.), ("conflict".to_string(), 0.), ("suffering".to_string(), 0.)
        ]);
        let word_counts3 = BTreeMap::from([
            ("fear".to_string(), 0.), ("leads".to_string(), 1.), ("to".to_string(), 1.), ("anger".to_string(), 0.), ("hatred".to_string(), 1.), ("conflict".to_string(), 1.), ("suffering".to_string(), 0.)
        ]);
        let word_counts4 = BTreeMap::from([
            ("fear".to_string(), 0.), ("leads".to_string(), 1.), ("to".to_string(), 1.), ("anger".to_string(), 0.), ("hatred".to_string(), 0.), ("conflict".to_string(), 1.), ("suffering".to_string(), 1.)
        ]);
        let term_frequencies = get_term_frequencies_from_sentences(&sentences);

        assert_eq!(vec![word_counts1, word_counts2, word_counts3, word_counts4], term_frequencies);
    }

    // "to" never appears in any map; remaining vocabulary is still shared.
    #[test]
    fn test_term_frequencies_from_sentences_without_stop_words() {
        let stop_words = get_stop_words();
        let sentences = vec!["fear leads to anger", "anger leads to hatred", "hatred leads to conflict", "conflict leads to suffering."];
        let word_counts1 = BTreeMap::from([
            ("fear".to_string(), 1.), ("leads".to_string(), 1.), ("anger".to_string(), 1.), ("hatred".to_string(), 0.), ("conflict".to_string(), 0.), ("suffering".to_string(), 0.)
        ]);
        let word_counts2 = BTreeMap::from([
            ("fear".to_string(), 0.), ("leads".to_string(), 1.), ("anger".to_string(), 1.), ("hatred".to_string(), 1.), ("conflict".to_string(), 0.), ("suffering".to_string(), 0.)
        ]);
        let word_counts3 = BTreeMap::from([
            ("fear".to_string(), 0.), ("leads".to_string(), 1.), ("anger".to_string(), 0.), ("hatred".to_string(), 1.), ("conflict".to_string(), 1.), ("suffering".to_string(), 0.)
        ]);
        let word_counts4 = BTreeMap::from([
            ("fear".to_string(), 0.), ("leads".to_string(), 1.), ("anger".to_string(), 0.), ("hatred".to_string(), 0.), ("conflict".to_string(), 1.), ("suffering".to_string(), 1.)
        ]);
        let term_frequencies = get_term_frequencies_from_sentences_without_stop_words(&sentences, stop_words);

        assert_eq!(vec![word_counts1, word_counts2, word_counts3, word_counts4], term_frequencies);
    }

    // Shared vocabulary is made of stems.
    #[test]
    fn test_term_frequencies_from_sentences_with_stemming() {
        let sentences = vec!["fear leads to anger", "anger leads to hatred", "hatred leads to conflict", "conflict leads to suffering."];
        let word_counts1 = BTreeMap::from([
            ("fear".to_string(), 1.), ("lead".to_string(), 1.), ("to".to_string(), 1.), ("anger".to_string(), 1.), ("hatr".to_string(), 0.), ("conflict".to_string(), 0.), ("suffer".to_string(), 0.)
        ]);
        let word_counts2 = BTreeMap::from([
            ("fear".to_string(), 0.), ("lead".to_string(), 1.), ("to".to_string(), 1.), ("anger".to_string(), 1.), ("hatr".to_string(), 1.), ("conflict".to_string(), 0.), ("suffer".to_string(), 0.)
        ]);
        let word_counts3 = BTreeMap::from([
            ("fear".to_string(), 0.), ("lead".to_string(), 1.), ("to".to_string(), 1.), ("anger".to_string(), 0.), ("hatr".to_string(), 1.), ("conflict".to_string(),1.), ("suffer".to_string(), 0.)
        ]);
        let word_counts4 = BTreeMap::from([
            ("fear".to_string(), 0.), ("lead".to_string(), 1.), ("to".to_string(), 1.), ("anger".to_string(), 0.), ("hatr".to_string(), 0.), ("conflict".to_string(), 1.), ("suffer".to_string(), 1.)
        ]);
        let term_frequencies = get_stemmed_term_frequencies_from_sentences(&sentences);

        assert_eq!(vec![word_counts1, word_counts2, word_counts3, word_counts4], term_frequencies);
    }

    // Stemmed vocabulary with stop words excluded.
    #[test]
    fn test_term_frequencies_from_sentences_with_stemming_without_stop_words() {
        let stop_words = get_stop_words();
        let sentences = vec!["fear leads to anger", "anger leads to hatred", "hatred leads to conflict", "conflict leads to suffering."];
        let word_counts1 = BTreeMap::from([
            ("fear".to_string(), 1.), ("lead".to_string(), 1.), ("anger".to_string(), 1.), ("hatr".to_string(), 0.), ("conflict".to_string(), 0.), ("suffer".to_string(), 0.)
        ]);
        let word_counts2 = BTreeMap::from([
            ("fear".to_string(), 0.), ("lead".to_string(), 1.), ("anger".to_string(), 1.), ("hatr".to_string(), 1.), ("conflict".to_string(), 0.), ("suffer".to_string(), 0.)
        ]);
        let word_counts3 = BTreeMap::from([
            ("fear".to_string(), 0.), ("lead".to_string(), 1.), ("anger".to_string(), 0.), ("hatr".to_string(), 1.), ("conflict".to_string(), 1.), ("suffer".to_string(), 0.)
        ]);
        let word_counts4 = BTreeMap::from([
            ("fear".to_string(), 0.), ("lead".to_string(), 1.), ("anger".to_string(), 0.), ("hatr".to_string(), 0.), ("conflict".to_string(), 1.), ("suffer".to_string(), 1.)
        ]);
        let term_frequencies = get_stemmed_term_frequencies_from_sentences_without_stop_words(&sentences, stop_words);

        assert_eq!(vec![word_counts1, word_counts2, word_counts3, word_counts4], term_frequencies);
    }

    // Default config over sentences: stemmed, stop words removed, zero-padded.
    #[test]
    fn test_term_frequencies_from_sentences_configurable() {
        let token_config = TokenConfig::default();
        let sentences = vec!["fear leads to anger", "anger leads to hatred", "hatred leads to conflict", "conflict leads to suffering."];
        let word_counts1 = BTreeMap::from([
            ("fear".to_string(), 1.), ("lead".to_string(), 1.), ("anger".to_string(), 1.), ("hatr".to_string(), 0.), ("conflict".to_string(), 0.), ("suffer".to_string(), 0.)
        ]);
        let word_counts2 = BTreeMap::from([
            ("fear".to_string(), 0.), ("lead".to_string(), 1.), ("anger".to_string(), 1.), ("hatr".to_string(), 1.), ("conflict".to_string(), 0.), ("suffer".to_string(), 0.)
        ]);
        let word_counts3 = BTreeMap::from([
            ("fear".to_string(), 0.), ("lead".to_string(), 1.), ("anger".to_string(), 0.), ("hatr".to_string(), 1.), ("conflict".to_string(), 1.), ("suffer".to_string(), 0.)
        ]);
        let word_counts4 = BTreeMap::from([
            ("fear".to_string(), 0.), ("lead".to_string(), 1.), ("anger".to_string(), 0.), ("hatr".to_string(), 0.), ("conflict".to_string(), 1.), ("suffer".to_string(), 1.)
        ]);
        let term_frequencies = get_term_frequencies_from_sentences_configurable(&sentences, token_config);

        assert_eq!(vec![word_counts1, word_counts2, word_counts3, word_counts4], term_frequencies);
    }
}