vader_sentiment 0.1.0

Bindings for Rust from the original Python VaderSentiment analysis tool.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
/**
 * If you use the VADER sentiment analysis tools, please cite:
 * Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for
 * Sentiment Analysis of Social Media Text. Eighth International Conference on
 * Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
 **/


#[macro_use] extern crate maplit;
#[macro_use] extern crate lazy_static;
extern crate regex;
extern crate unicase;

use std::cmp::min;
use std::collections::{HashMap, HashSet};
use regex::Regex;
use unicase::UniCase;

#[cfg(test)]
mod tests;

//empirically derived constants for scaling/amplifying sentiments
const B_INCR: f64 =  0.293;
const B_DECR: f64 = -0.293;

const C_INCR:   f64 =  0.733;
const NEGATION_SCALAR: f64 = -0.740;

//sentiment increases for text with question or exclamation marks
const QMARK_INCR: f64 = 0.180;
const EMARK_INCR: f64 = 0.292;

//Maximum amount of question or question marks before their contribution to sentiment is
//disregarded
const MAX_EMARK: i32 = 4;
const MAX_QMARK: i32 = 3;
const MAX_QMARK_INCR: f64 = 0.96;

const NORMALIZATION_ALPHA: f64 = 15.0;

static RAW_LEXICON: &'static str = include_str!("resources/vader_lexicon.txt");
static RAW_EMOJI_LEXICON: &'static str = include_str!("resources/emoji_utf8_lexicon.txt");

lazy_static! {

    static ref NEGATION_TOKENS: HashSet<UniCase<&'static str>> = convert_args!(hashset!(
        "aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt",
        "ain't", "aren't", "can't", "couldn't", "daren't", "didn't", "doesn't",
        "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt", "neither",
        "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't",
        "neednt", "needn't", "never", "none", "nope", "nor", "not", "nothing", "nowhere",
        "oughtnt", "shant", "shouldnt", "uhuh", "wasnt", "werent",
        "oughtn't", "shan't", "shouldn't", "uh-uh", "wasn't", "weren't",
        "without", "wont", "wouldnt", "won't", "wouldn't", "rarely", "seldom", "despite"));

    static ref BOOSTER_DICT: HashMap<UniCase<&'static str>, f64> =  convert_args!(hashmap!(
         "absolutely"=> B_INCR, "amazingly"=> B_INCR, "awfully"=> B_INCR,
          "completely"=> B_INCR, "considerable"=> B_INCR, "considerably"=> B_INCR,
          "decidedly"=> B_INCR, "deeply"=> B_INCR, "effing"=> B_INCR, "enormous"=> B_INCR, "enormously"=> B_INCR,
          "entirely"=> B_INCR, "especially"=> B_INCR, "exceptional"=> B_INCR, "exceptionally"=> B_INCR,
          "extreme"=> B_INCR, "extremely"=> B_INCR,
          "fabulously"=> B_INCR, "flipping"=> B_INCR, "flippin"=> B_INCR, "frackin"=> B_INCR, "fracking"=> B_INCR,
          "fricking"=> B_INCR, "frickin"=> B_INCR, "frigging"=> B_INCR, "friggin"=> B_INCR, "fully"=> B_INCR,
          "fuckin"=> B_INCR, "fucking"=> B_INCR, "fuggin"=> B_INCR, "fugging"=> B_INCR,
          "greatly"=> B_INCR, "hella"=> B_INCR, "highly"=> B_INCR, "hugely"=> B_INCR,
          "incredible"=> B_INCR, "incredibly"=> B_INCR, "intensely"=> B_INCR,
          "major"=> B_INCR, "majorly"=> B_INCR, "more"=> B_INCR, "most"=> B_INCR, "particularly"=> B_INCR,
          "purely"=> B_INCR, "quite"=> B_INCR, "really"=> B_INCR, "remarkably"=> B_INCR,
          "so"=> B_INCR, "substantially"=> B_INCR,
          "thoroughly"=> B_INCR, "total"=> B_INCR, "totally"=> B_INCR, "tremendous"=> B_INCR, "tremendously"=> B_INCR,
          "uber"=> B_INCR, "unbelievably"=> B_INCR, "unusually"=> B_INCR, "utter"=> B_INCR, "utterly"=> B_INCR,
          "very"=> B_INCR,
          "almost"=> B_DECR, "barely"=> B_DECR, "hardly"=> B_DECR, "just enough"=> B_DECR,
          "kind of"=> B_DECR, "kinda"=> B_DECR, "kindof"=> B_DECR, "kind-of"=> B_DECR,
          "less"=> B_DECR, "little"=> B_DECR, "marginal"=> B_DECR, "marginally"=> B_DECR,
          "occasional"=> B_DECR, "occasionally"=> B_DECR, "partly"=> B_DECR,
          "scarce"=> B_DECR, "scarcely"=> B_DECR, "slight"=> B_DECR, "slightly"=> B_DECR, "somewhat"=> B_DECR,
          "sort of"=> B_DECR, "sorta"=> B_DECR, "sortof"=> B_DECR, "sort-of"=> B_DECR
));

    /**
     * These dicts were used in some WIP or planned features in the original
     * I may implement them later if I can understand how they're intended to work
     **/

    // // check for sentiment laden idioms that do not contain lexicon words (future work, not yet implemented)
    // static ref SENTIMENT_LADEN_IDIOMS: HashMap<&'static str, f64> = hashmap![
    //      "cut the mustard" => 2.0, "hand to mouth" => -2.0,
    //      "back handed" => -2.0, "blow smoke" => -2.0, "blowing smoke" => -2.0,
    //      "upper hand" => 1.0, "break a leg" => 2.0,
    //      "cooking with gas" => 2.0, "in the black" => 2.0, "in the red" => -2.0,
    //      "on the ball" => 2.0, "under the weather" => -2.0];


    // check for special case idioms containing lexicon words
    static ref SPECIAL_CASE_IDIOMS: HashMap<UniCase<&'static str>, f64> = convert_args!(hashmap!(
         "the shit" => 3.0, "the bomb" => 3.0, "bad ass" => 1.5, "badass" => 1.5, "yeah right" => -2.0,
         "kiss of death" => -1.5, "to die for" => 3.0));

    static ref ALL_CAPS_RE: Regex = Regex::new(r"^[A-Z\W]+$").unwrap();

    static ref PUNCTUATION: &'static str = "[!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~]";

    pub static ref LEXICON: HashMap<UniCase<&'static str>, f64> = parse_raw_lexicon(RAW_LEXICON);
    pub static ref EMOJI_LEXICON: HashMap<&'static str, &'static str> = parse_raw_emoji_lexicon(RAW_EMOJI_LEXICON);

    static ref STATIC_BUT: UniCase<&'static str> = UniCase::new("but");
    static ref STATIC_THIS: UniCase<&'static str> = UniCase::new("this");
    static ref STATIC_AT: UniCase<&'static str> = UniCase::new("at");
    static ref STATIC_LEAST: UniCase<&'static str> = UniCase::new("least");
    static ref STATIC_VERY: UniCase<&'static str> = UniCase::new("very");
    static ref STATIC_WITHOUT: UniCase<&'static str> = UniCase::new("without");
    static ref STATIC_DOUBT: UniCase<&'static str> = UniCase::new("doubt");
    static ref STATIC_SO: UniCase<&'static str> = UniCase::new("so");
    static ref STATIC_NEVER: UniCase<&'static str> = UniCase::new("never");
    static ref STATIC_KIND: UniCase<&'static str> = UniCase::new("kind");
    static ref STATIC_OF: UniCase<&'static str> = UniCase::new("of");


}


/**
 * Takes the raw text of the lexicon files and creates HashMaps
 **/
pub fn parse_raw_lexicon(raw_lexicon: &str) -> HashMap<UniCase<&str>, f64> {
    let lines = raw_lexicon.trim_end_matches("\n").split("\n");
    let mut lex_dict = HashMap::new();
    for line in lines {
        if line.is_empty() {
          continue;
        }
        let mut split_line = line.split('\t');
        let word = split_line.next().unwrap();
        let val  = split_line.next().unwrap();
        lex_dict.insert(UniCase::new(word), val.parse().unwrap());
    }
    lex_dict
}

pub fn parse_raw_emoji_lexicon(raw_emoji_lexicon: &str) -> HashMap<&str, &str> {
    let lines = raw_emoji_lexicon.trim_end_matches("\n").split("\n");
    let mut emoji_dict = HashMap::new();
    for line in lines {
        if line.is_empty() {
          continue;
        }
        let mut split_line = line.split('\t');
        let word = split_line.next().unwrap();
        let desc  = split_line.next().unwrap();
        emoji_dict.insert(word, desc);
    }
    emoji_dict
}

/**
 *  Stores tokens and useful info about text
 **/
struct ParsedText<'a> {
    tokens: Vec<UniCase<&'a str>>,
    has_mixed_caps: bool,
    punc_amplifier: f64,
}

impl<'a> ParsedText<'a> {
    //Tokenizes and extracts useful properties of input text
    fn from_text(text: &'a str) -> ParsedText {
        let _tokens = ParsedText::tokenize(text);
        let _has_mixed_caps = ParsedText::has_mixed_caps(&_tokens);
        let _punc_amplifier = ParsedText::get_punctuation_emphasis(text);
        ParsedText {
            tokens: _tokens,
            has_mixed_caps: _has_mixed_caps,
            punc_amplifier: _punc_amplifier,
         }
    }

    fn tokenize(text: &str) -> Vec<UniCase<&str>> {
        let tokens = text.split_whitespace()
                                    .filter(|s| s.len() > 1)
                                    .map(|s| ParsedText::strip_punc_if_word(s))
                                    .map(UniCase::new)
                                    .collect();
        tokens
    }

    // Removes punctuation from words, ie "hello!!!" -> "hello" and ",don't??" -> "don't"
    // Keeps most emoticons, ie ":^)" -> ":^)"\
    fn strip_punc_if_word(token: &str) -> &str {
        let stripped = token.trim_matches(|c| PUNCTUATION.contains(c));
        if stripped.len() <= 1 {
            return token;
        }
        stripped
    }

    // Determines if message has a mix of both all caps and non all caps words
    fn has_mixed_caps<S: AsRef<str>>(tokens: &[S]) -> bool {
        let (mut has_caps, mut has_non_caps) = (false, false);
        for token in tokens.iter() {
            if is_all_caps(token.as_ref()) {
                has_caps = true;
            } else {
                has_non_caps = true;
            }
            if has_non_caps && has_caps {
                return true;
            }
        }
        false
    }

    //uses empirical values to determine how the use of '?' and '!' contribute to sentiment
    fn get_punctuation_emphasis(text: &str) -> f64 {
       let emark_count: i32 = text.as_bytes().iter().filter(|b| **b == b'!').count() as i32;
       let qmark_count: i32 = text.as_bytes().iter().filter(|b| **b == b'?').count() as i32;

       let emark_emph = min(emark_count, MAX_EMARK) as f64 * EMARK_INCR;
       let mut qmark_emph = (qmark_count as f64) * QMARK_INCR;
       if qmark_count > MAX_QMARK {
           qmark_emph = MAX_QMARK_INCR;
       }
       qmark_emph + emark_emph
    }
}

//Checks if all letters in token are capitalized
fn is_all_caps<S: AsRef<str>>(token: S) -> bool {
    let token_ref = token.as_ref();
    ALL_CAPS_RE.is_match(token_ref) && token_ref.len() > 1
}

//Checks if token is in the list of NEGATION_SCALAR
fn is_negated(token: &UniCase<&str>) -> bool {
    if NEGATION_TOKENS.contains(token) {
        return true;
    }
    token.contains("n't")
}

//Normalizes score between -1.0 and 1.0. Alpha value is expected upper limit for a score
fn normalize_score(score: f64) -> f64 {
    let norm_score = score / (score * score + NORMALIZATION_ALPHA).sqrt();
    if norm_score < -1.0 {
        return -1.0;
    } else if norm_score > 1.0 {
        return 1.0;
    }
    norm_score
}

//Checks how previous tokens affect the valence of the current token
fn scalar_inc_dec(token: &UniCase<&str>, valence: f64, has_mixed_caps: bool) -> f64 {
    let mut scalar = 0.0;
    if BOOSTER_DICT.contains_key(token) {
        scalar = *BOOSTER_DICT.get(token).unwrap();
        if valence < 0.0 {
            scalar *= -1.0;
        }
        if is_all_caps(token) && has_mixed_caps {
            if valence > 0.0 {
                scalar += C_INCR;
            } else {
                scalar -= C_INCR;
            }
        }
    }
    scalar
}

fn sum_sentiment_scores(scores: Vec<f64>) -> (f64, f64, u32) {
    let (mut pos_sum, mut neg_sum, mut neu_count) = (0f64, 0f64, 0);
    for score in scores {
        if score > 0f64 {
            pos_sum += score + 1.0;
        } else if score < 0f64 {
            neg_sum += score - 1.0;
        } else {
            neu_count += 1;
        }
    }
    (pos_sum, neg_sum, neu_count)
}

pub struct SentimentIntensityAnalyzer<'a> {
    lexicon: &'a HashMap<UniCase<&'a str>, f64>,
    emoji_lexicon: &'a HashMap<&'a str, &'a str>,
}

impl<'a> SentimentIntensityAnalyzer<'a> {
    pub fn new() -> SentimentIntensityAnalyzer<'static>{
        SentimentIntensityAnalyzer {
            lexicon: &LEXICON,
            emoji_lexicon: &EMOJI_LEXICON,
        }
    }

    pub fn from_lexicon<'b>(_lexicon: &'b HashMap<UniCase<&str>, f64>) ->
                                        SentimentIntensityAnalyzer<'b> {
        SentimentIntensityAnalyzer {
            lexicon: _lexicon,
            emoji_lexicon: &EMOJI_LEXICON,
        }
    }

    fn get_total_sentiment(&self, sentiments: Vec<f64>, punct_emph_amplifier: f64) -> HashMap<&str, f64> {
        let (mut neg, mut neu, mut pos, mut compound) = (0f64, 0f64, 0f64, 0f64);
        if sentiments.len() > 0 {
            let mut total_sentiment: f64 = sentiments.iter().sum();
            if total_sentiment > 0f64 {
                total_sentiment += punct_emph_amplifier;
            } else {
                total_sentiment -= punct_emph_amplifier;
            }
            compound = normalize_score(total_sentiment);

            let (mut pos_sum, mut neg_sum, neu_count) = sum_sentiment_scores(sentiments);

            if pos_sum > neg_sum.abs() {
                pos_sum += punct_emph_amplifier;
            } else if pos_sum < neg_sum.abs() {
                neg_sum -= punct_emph_amplifier;
            }

            let total = pos_sum + neg_sum.abs() + (neu_count as f64);
            pos = (pos_sum / total).abs();
            neg = (neg_sum / total).abs();
            neu = (neu_count as f64 / total).abs();
        }
        let sentiment_dict = hashmap!["neg" => neg,
                                      "neu" => neu,
                                      "pos" => pos,
                                      "compound" => compound];
        sentiment_dict
    }

    pub fn polarity_scores(&self, text: &str) -> HashMap<&str, f64>{
        let text = self.append_emoji_descriptions(text);
        let parsedtext = ParsedText::from_text(&text);
        println!("{:#?}", parsedtext.tokens);
        let mut sentiments = Vec::new();
        let tokens = &parsedtext.tokens;

        for (i, word) in tokens.iter().enumerate() {
            if BOOSTER_DICT.contains_key(word) {
                sentiments.push(0f64);
            } else if i < tokens.len() - 1 && word == &*STATIC_KIND
                                  && tokens[i + 1] == *STATIC_OF {
                sentiments.push(0f64);
            } else {
                sentiments.push(self.sentiment_valence(&parsedtext, word, i));
            }
        }
        but_check(tokens, &mut sentiments);
        self.get_total_sentiment(sentiments, parsedtext.punc_amplifier)
    }

    //Removes emoji and appends their description to the end the input text
    fn append_emoji_descriptions(&self, text: &str) -> String {
        let mut result = String::new();
        let mut prev_space = true;
        for chr in text.chars() {
            let chr_string = chr.to_string();
            if let Some(chr_replacement) = self.emoji_lexicon.get(chr_string.as_str()) {
                if !prev_space {
                    result.push(' ');
                }
                result.push_str(chr_replacement);
                prev_space = false;
            } else {
                prev_space = chr == ' ';
                result.push(chr);
            }
        }
        println!("{}", result);
        result
    }

    fn sentiment_valence(&self, parsed: &ParsedText, word: &UniCase<&str>, i: usize) -> f64 {
        let mut valence = 0f64;
        let tokens = &parsed.tokens;
        if let Some(word_valence) = self.lexicon.get(word) {
            valence = *word_valence;
            if is_all_caps(word) && parsed.has_mixed_caps {
                if valence > 0f64 {
                    valence += C_INCR;
                } else {
                    valence -= C_INCR
                }
            }
            for start_i in 0..3 {
                if i > start_i && !self.lexicon.contains_key(
                                &tokens[i - start_i - 1]) {
                    let mut s = scalar_inc_dec(&tokens[i - start_i - 1], valence, parsed.has_mixed_caps);
                    if start_i == 1 {
                        s *= 0.95;
                    } else if start_i == 2 {
                        s *= 0.9
                    }
                    valence += s;
                    valence = negation_check(valence, tokens, start_i, i);
                    if start_i == 2 {
                        valence = special_idioms_check(valence, tokens, i);
                    }
                }
            }
            valence = least_check(valence, tokens, i);
        }
        valence
    }
}

/**
 * Check for specific patterns or tokens, and modify sentiment as needed
 **/
fn negation_check(valence: f64, tokens: &[UniCase<&str>], start_i: usize, i: usize) -> f64 {
   let mut valence = valence;
   if start_i == 0 {
       if is_negated(&tokens[i - start_i - 1]) {
           valence *= NEGATION_SCALAR;
       }
   } else if start_i == 1 {
       if tokens[i - 2] == *STATIC_NEVER &&
         (tokens[i - 1] == *STATIC_SO ||
          tokens[i - 1] == *STATIC_THIS) {
           valence *= 1.25
       } else if tokens[i - 2] == *STATIC_WITHOUT && tokens[i - 1] == *STATIC_DOUBT {
           valence *= 1.0
       } else if is_negated(&tokens[i - start_i - 1]) {
           valence *= NEGATION_SCALAR;
       }
   } else if start_i == 2 {
       if tokens[i - 3] == *STATIC_NEVER &&
          tokens[i - 2] == *STATIC_SO || tokens[i - 2] == *STATIC_THIS||
          tokens[i - 1] == *STATIC_SO || tokens[i - 1] == *STATIC_THIS {
           valence *= 1.25
       } else if tokens[i - 3] == *STATIC_WITHOUT &&
                 tokens[i - 2] == *STATIC_DOUBT ||
                 tokens[i - 1] == *STATIC_DOUBT {
           valence *= 1.0;
       } else if is_negated(&tokens[i - start_i - 1]) {
           valence *= NEGATION_SCALAR;
       }
   }
   valence
}

// If "but" is in the tokens, scales down the sentiment of words before "but" and
// adds more emphasis to the words after
fn but_check(tokens: &[UniCase<&str>], sentiments: &mut Vec<f64>) {
    match tokens.iter().position(|&s| s == *STATIC_BUT) {
        Some(but_index) => {
            for i in 0..sentiments.len() {
                if i < but_index {
                    sentiments[i] *= 0.5;
                } else if i > but_index {
                    sentiments[i] *= 1.5;
                }
            }
        },
        None => return,
    }
}

fn least_check(_valence: f64, tokens: &[UniCase<&str>], i: usize) -> f64 {
    let mut valence = _valence;
    if i > 1 && tokens[i - 1] == *STATIC_LEAST
             && tokens[i - 2] == *STATIC_AT
             && tokens[i - 2] == *STATIC_VERY {
        valence *= NEGATION_SCALAR;
    } else if i > 0 && tokens[i - 1] == *STATIC_LEAST {
        valence *= NEGATION_SCALAR;
    }
    valence
}

// //This was present in the original python implementation, but unused
// fn idioms_check(valence: f64, text: &str) -> f64 {
//     let mut total_valence = 0f64;
//     let mut count = 0;
//     for (idiom, val) in SENTIMENT_LADEN_IDIOMS.iter() {
//         if text.contains(idiom) {
//             total_valence += val;
//             count += 1;
//         }
//     }
//     if count > 0 {
//         return total_valence / count as f64;
//     }
//     0f64
// }

fn special_idioms_check(_valence: f64, tokens: &[UniCase<&str>], i: usize) -> f64 {
    assert_eq!(i > 2, true);
    let mut valence = _valence;
    let mut end_i = i + 1;

    //if i isn't the last index
    if tokens.len() - 1 > i {
        //make the end of the window 2 words ahead, or until the end of the tokens
        end_i = min(i + 3, tokens.len());
    }

    // TODO: We can do this faster by comparing splits?
    let target_window = tokens[(i - 3)..end_i].iter().map(|u| u.as_ref()).collect::<Vec<&str>>().join(" ").to_lowercase();

    println!("{}", target_window);
    for (key, val) in SPECIAL_CASE_IDIOMS.iter() {
        if target_window.contains(key.as_ref()) {
            valence = *val;
            break;
        }
    }
    let prev_three = tokens[(i - 3)..i].iter().map(|u| u.as_ref()).collect::<Vec<&str>>().join(" ").to_lowercase();
    for (key, val) in BOOSTER_DICT.iter() {
        if prev_three.contains(key.as_ref()) {
            valence += *val;
        }
    }
    valence
}

pub mod demo;