1extern crate stemmer;
2
3use self::stemmer::Stemmer;
4use std::collections::{HashMap};
5
/// Returns a copy of `doc` with a fixed set of punctuation characters removed.
///
/// The characters dropped are `. ? ! ' , ( ) [ ] ; { } :` together with the
/// curly double quotes `”` and `“`. Every other character (letters, digits,
/// whitespace, other symbols) is kept, in its original order.
pub fn filter_unwanted_chars(doc: &String) -> String {
    // Characters that must not survive into the cleaned document.
    const UNWANTED: &str = ".?!',()[];{}:”“";

    let mut kept = String::new();
    for ch in doc.chars() {
        if !UNWANTED.contains(ch) {
            kept.push(ch);
        }
    }
    kept
}
10
11pub fn filter_stop_words(document: &String) -> Vec<&str> {
12 static STOP_WORDS: &'static str = include_str!("stopwords.txt");
13
14 let stop_words = STOP_WORDS.split_whitespace().collect::<Vec<&str>>();
16
17 let mut term_set = document.split_whitespace().collect::<Vec<&str>>();
19
20 term_set.retain(|&term| !stop_words.contains(&term));
22
23 term_set
24}
25
26pub fn clean_article(content: &String, lang: &str) -> Vec<String> {
29 let language = match lang {
31 "fr" => "french".to_string(),
32 _ => "english".to_string(),
33 };
34
35 let mut stemmer = Stemmer::new(&language).unwrap();
36
37 let lowercase_doc = &content.to_lowercase();
40
41 let clean_doc = filter_unwanted_chars(&lowercase_doc);
45
46 let filtered_doc = filter_stop_words(&clean_doc);
51
52 let mut stemmed_doc = Vec::new();
55 for word in filtered_doc.into_iter() {
56 if word != "" {
57 stemmed_doc.push(stemmer.stem(&word));
58 }
59 }
60
61 stemmed_doc
62}
63
#[cfg(test)]
mod tests {
    use super::{filter_stop_words, filter_unwanted_chars};

    /// Punctuation is stripped while letters and spaces are left untouched.
    #[test]
    fn it_filters_unwanted_chars() {
        let input = String::from("this is, awesome!");

        let cleaned = filter_unwanted_chars(&input);

        assert_eq!(cleaned, "this is awesome");
    }

    /// Only the non-stop-word term is expected to remain; this expects
    /// `stopwords.txt` to list both "this" and "is".
    #[test]
    fn it_removes_stop_words() {
        let input = String::from("this is awesome");

        let remaining = filter_stop_words(&input);

        assert_eq!(remaining, vec!["awesome"]);
    }
}
85