bleach/
lib.rs

1extern crate stemmer;
2
3use self::stemmer::Stemmer;
4use std::collections::{HashMap};
5
/// Returns a copy of `doc` with punctuation and bracket characters removed.
///
/// The characters dropped are `. ? ! ' , ( ) [ ] ; { } :` and the curly
/// double quotes `”` / `“`. Every other character, including whitespace and
/// the straight double quote, is kept in its original order.
///
/// Takes `&str` so both string literals and `&String` values can be passed.
pub fn filter_unwanted_chars(doc: &str) -> String {
    // Kept as a constant so no lookup table is rebuilt on each call.
    const UNWANTED_CHARS: &'static str = ".?!',()[];{}:”“";

    doc.chars()
        .filter(|&c| !UNWANTED_CHARS.contains(c))
        .collect()
}
10
11pub fn filter_stop_words(document: &String) -> Vec<&str> {
12    static STOP_WORDS: &'static str = include_str!("stopwords.txt");
13
14    // From the "file string", we construct a Set of stopwords
15    let stop_words = STOP_WORDS.split_whitespace().collect::<Vec<&str>>();
16
17    // Build the Set of words in the document
18    let mut term_set = document.split_whitespace().collect::<Vec<&str>>();
19
20    // Build the final Set of term in the document, filtered of all the stopwords
21    term_set.retain(|&term| !stop_words.contains(&term));
22
23    term_set
24}
25
26// Make it the more pluggeable possible, so that everyone can choose
27// what operation to operate
28pub fn clean_article(content: &String, lang: &str) -> Vec<String> {
29    // Handle gracefully unknown languages
30    let language = match lang {
31        "fr" => "french".to_string(),
32        _ => "english".to_string(),
33    };
34
35    let mut stemmer = Stemmer::new(&language).unwrap();
36
37    // TODO
38    // Benchmark lowercase a big string vs map.lowercase Vec
39    let lowercase_doc = &content.to_lowercase();
40
41    // Filter unwanted Chars
42    // TODO
43    // Benchmark filter a big string vs map.lowercase Vec
44    let clean_doc = filter_unwanted_chars(&lowercase_doc);
45
46    // Filter Stop Words
47    // TODO
48    // Add stop word list for other languages
49    // benchmark actual process versus operations on Vec
50    let filtered_doc = filter_stop_words(&clean_doc);
51
52    // Stemmification
53    // Look for better stemmers ?
54    let mut stemmed_doc = Vec::new();
55    for word in filtered_doc.into_iter() {
56        if word != "" {
57            stemmed_doc.push(stemmer.stem(&word));
58        }
59    }
60
61    stemmed_doc
62}
63
#[cfg(test)]
mod tests {
    use super::{filter_stop_words, filter_unwanted_chars};

    // Punctuation is removed while letters and spaces are left untouched.
    #[test]
    fn it_filters_unwanted_chars() {
        let input = String::from("this is, awesome!");

        assert_eq!(filter_unwanted_chars(&input), "this is awesome");
    }

    // Relies on "this" and "is" being listed in stopwords.txt.
    #[test]
    fn it_removes_stop_words() {
        let input = String::from("this is awesome");
        let remaining = filter_stop_words(&input);

        assert_eq!(remaining, vec!["awesome"]);
    }
}
85