summarizer/tokenizer.rs

use std::collections::HashMap;
use punkt::{SentenceTokenizer, TrainingData};
use punkt::params::Standard;

static STOPWORDS: [&str; 127] = [
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you",
    "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself",
    "she", "her", "hers", "herself", "it", "its", "itself", "they", "them",
    "their", "theirs", "themselves", "what", "which", "who", "whom", "this",
    "that", "these", "those", "am", "is", "are", "was", "were", "be", "been",
    "being", "have", "has", "had", "having", "do", "does", "did", "doing",
    "a", "an", "the", "and", "but", "if", "or", "because", "as", "until",
    "while", "of", "at", "by", "for", "with", "about", "against", "between",
    "into", "through", "during", "before", "after", "above", "below", "to",
    "from", "up", "down", "in", "out", "on", "off", "over", "under", "again",
    "further", "then", "once", "here", "there", "when", "where", "why", "how",
    "all", "any", "both", "each", "few", "more", "most", "other", "some",
    "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too",
    "very", "s", "t", "can", "will", "just", "don", "should", "now",
];

pub struct Tokenizer {}

impl Tokenizer {
    /// Transforms `text` into a list of sentences.
    /// Uses the popular Punkt sentence tokenizer via a Rust port:
    /// <https://github.com/ferristseng/rust-punkt>
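    ///
    /// A minimal usage sketch; the `summarizer::tokenizer` module path is an
    /// assumption about this crate's layout, hence the ignored doc-test:
    ///
    /// ```ignore
    /// use summarizer::tokenizer::Tokenizer;
    ///
    /// let sentences = Tokenizer::text_to_sentences("Rust is fast. It is also safe.");
    /// assert_eq!(sentences.len(), 2);
    /// ```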
    pub fn text_to_sentences(text: &str) -> Vec<String> {
        let english = TrainingData::english();
        SentenceTokenizer::<Standard>::new(text, &english)
            .map(|s| s.to_owned())
            .collect()
    }

    /// Transforms a sentence into a list of words (tokens),
    /// eliminating stopwords while doing so.
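    ///
    /// A sketch of the expected behavior (ignored doc-test; same module-path
    /// assumption as in `text_to_sentences`):
    ///
    /// ```ignore
    /// let tokens = Tokenizer::sentence_to_tokens("the quick brown fox");
    /// // "the" is a stopword and is dropped.
    /// assert_eq!(tokens, vec!["quick", "brown", "fox"]);
    /// ```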
    pub fn sentence_to_tokens(sentence: &str) -> Vec<&str> {
        sentence
            .split_ascii_whitespace()
            .filter(|token| !STOPWORDS.contains(&token.to_lowercase().as_str()))
            .collect()
    }

    /// Given a list of words, builds a frequency map
    /// where keys are words and values are their frequencies.
    /// This method is used to compute the term frequency of each word
    /// present in a sentence.
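    ///
    /// A sketch of the counting behavior (ignored doc-test; same module-path
    /// assumption as above):
    ///
    /// ```ignore
    /// let words = vec!["rust", "fast", "rust"];
    /// let freq = Tokenizer::get_freq_map(&words);
    /// assert_eq!(freq["rust"], 2);
    /// assert_eq!(freq["fast"], 1);
    /// ```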
    pub fn get_freq_map<'a>(words: &'a [&'a str]) -> HashMap<&'a str, usize> {
        let mut freq_map: HashMap<&str, usize> = HashMap::new();
        for &word in words {
            // `entry` with `or_insert(0)` covers both the first occurrence
            // and subsequent increments in a single map lookup.
            *freq_map.entry(word).or_insert(0) += 1;
        }
        freq_map
    }
}
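
// A small sanity-check test module: a sketch of how the three helpers compose
// into a term-frequency pipeline. The sentence-count assertion assumes the
// default Punkt English training data splits the two sentences as expected.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn tokenizes_and_counts() {
        let text = "Rust is fast. Rust is memory safe.";
        let sentences = Tokenizer::text_to_sentences(text);
        assert_eq!(sentences.len(), 2);

        // "is" is a stopword and is dropped; punctuation stays attached to
        // tokens because splitting happens on whitespace only.
        let tokens = Tokenizer::sentence_to_tokens(&sentences[0]);
        assert_eq!(tokens, vec!["Rust", "fast."]);

        let freq = Tokenizer::get_freq_map(&tokens);
        assert_eq!(freq["Rust"], 1);
        assert_eq!(freq["fast."], 1);
    }
}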