1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
extern crate stemmer;

use self::stemmer::Stemmer;
use std::collections::{HashMap};

/// Removes punctuation and bracket characters from `doc`.
///
/// The characters `. ? ! ' , ( ) [ ] ; { } :` and the curly double quotes
/// `”` / `“` are dropped; every other character (whitespace included) is kept
/// in its original order.
///
/// Takes a `&str` so that both string slices and `&String` (through deref
/// coercion) are accepted.
pub fn filter_unwanted_chars(doc: &str) -> String {
    // The set is fixed, so a constant slice avoids building a `Vec` per call.
    const UNWANTED_CHARS: &[char] = &[
        '.', '?', '!', '\'', ',', '(', ')', '[', ']', ';', '{', '}', ':', '”', '“',
    ];

    doc.chars()
        .filter(|c| !UNWANTED_CHARS.contains(c))
        .collect()
}

/// Splits `document` on whitespace and drops every stop word.
///
/// The stop-word list is the whitespace-separated content of `stopwords.txt`,
/// embedded at compile time. A term is removed only when it equals a stop
/// word exactly (the comparison is case-sensitive).
///
/// Returns the remaining terms in document order, duplicates included, as
/// slices borrowed from `document`.
///
/// Takes a `&str` so that both string slices and `&String` (through deref
/// coercion) are accepted.
pub fn filter_stop_words(document: &str) -> Vec<&str> {
    use std::collections::HashSet;

    static STOP_WORDS: &str = include_str!("stopwords.txt");

    // A hash set makes each membership test O(1) instead of a linear scan of
    // the whole stop-word list for every term of the document.
    let stop_words: HashSet<&str> = STOP_WORDS.split_whitespace().collect();

    document
        .split_whitespace()
        .filter(|term| !stop_words.contains(*term))
        .collect()
}

/// Normalises an article into a list of stemmed terms.
///
/// The pipeline is:
/// 1. lowercase `content`;
/// 2. strip punctuation with `filter_unwanted_chars`;
/// 3. split on whitespace and drop stop words with `filter_stop_words`;
/// 4. stem every remaining word.
///
/// `lang` selects the stemming algorithm: `"fr"` maps to `"french"`, any other
/// value falls back to `"english"`.
///
/// TODO: make the pipeline as pluggable as possible, so that callers can
/// choose which operations are applied.
///
/// # Panics
///
/// Panics if the stemmer cannot be created for the selected algorithm name.
pub fn clean_article(content: &str, lang: &str) -> Vec<String> {
    // Handle unknown languages gracefully by falling back to English.
    let algorithm = match lang {
        "fr" => "french",
        _ => "english",
    };

    let mut stemmer = Stemmer::new(algorithm)
        .expect("the stemmer should provide the `french` and `english` algorithms");

    // TODO
    // Benchmark lowercasing one big string vs. lowercasing each word of a Vec
    let lowercase_doc = content.to_lowercase();

    // Filter unwanted chars
    // TODO
    // Benchmark filtering one big string vs. filtering each word of a Vec
    let clean_doc = filter_unwanted_chars(&lowercase_doc);

    // Filter stop words, then stem what is left
    // TODO
    // Add stop word lists for other languages
    // Benchmark the current process versus operations on a Vec
    // Look for better stemmers?
    filter_stop_words(&clean_doc)
        .into_iter()
        // Defensive: never hand an empty word to the stemmer.
        .filter(|word| !word.is_empty())
        .map(|word| stemmer.stem(word))
        .collect()
}

#[cfg(test)]
mod tests {
    use super::{filter_stop_words, filter_unwanted_chars};

    /// The comma and the exclamation mark are removed; letters and spaces stay.
    #[test]
    fn it_filters_unwanted_chars() {
        let input = String::from("this is, awesome!");

        assert_eq!(filter_unwanted_chars(&input), "this is awesome");
    }

    /// Expects `this` and `is` to be treated as stop words, leaving `awesome`.
    #[test]
    fn it_removes_stop_words() {
        let input = String::from("this is awesome");
        let remaining = filter_stop_words(&input);

        assert_eq!(remaining, vec!["awesome"]);
    }
}