scirs2_text/
sentiment.rs

1//! Sentiment analysis functionality
2//!
3//! This module provides sentiment analysis capabilities including
4//! lexicon-based and rule-based sentiment analysis.
5
6use crate::error::Result;
7use crate::tokenize::{Tokenizer, WordTokenizer};
8use std::collections::HashMap;
9
/// Sentiment polarity assigned to a piece of text.
///
/// The enum is a small `Copy` value. It implements `Eq` and `Hash` so it can
/// be used directly as a key in a `HashMap` or `HashSet`, for example to
/// tally analysis results per polarity.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Sentiment {
    /// Positive sentiment
    Positive,
    /// Negative sentiment
    Negative,
    /// Neutral sentiment
    Neutral,
}

impl std::fmt::Display for Sentiment {
    /// Writes the variant name: `Positive`, `Negative` or `Neutral`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Sentiment::Positive => "Positive",
            Sentiment::Negative => "Negative",
            Sentiment::Neutral => "Neutral",
        };
        f.write_str(label)
    }
}

impl Sentiment {
    /// Convert sentiment to a numerical score
    ///
    /// Returns `1.0` for `Positive`, `0.0` for `Neutral` and `-1.0` for
    /// `Negative`.
    pub fn to_score(&self) -> f64 {
        match self {
            Sentiment::Positive => 1.0,
            Sentiment::Neutral => 0.0,
            Sentiment::Negative => -1.0,
        }
    }

    /// Convert a numerical score to sentiment
    ///
    /// Only the sign of `score` matters: any value above zero is `Positive`,
    /// any value below zero is `Negative`. Zero (of either sign) and NaN fall
    /// through both comparisons and are classified as `Neutral`.
    pub fn from_score(score: f64) -> Self {
        if score > 0.0 {
            Sentiment::Positive
        } else if score < 0.0 {
            Sentiment::Negative
        } else {
            Sentiment::Neutral
        }
    }
}
52
/// Result of sentiment analysis
///
/// Produced by the `analyze` methods of the analyzers in this module.
#[derive(Debug, Clone)]
pub struct SentimentResult {
    /// The overall sentiment; the analyzers in this module derive it from
    /// the sign of `score`
    pub sentiment: Sentiment,
    /// The raw sentiment score: the sum of the per-token scores
    pub score: f64,
    /// Confidence level (0-1); the analyzers in this module compute it as
    /// the fraction of tokens that ended up with a non-zero score
    pub confidence: f64,
    /// Breakdown of positive, negative and neutral word counts
    pub word_counts: SentimentWordCounts,
}
65
/// Word counts for sentiment analysis
///
/// All counters start at zero via `Default`. The type compares by value
/// (`PartialEq`/`Eq`) so whole breakdowns can be checked with `assert_eq!`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct SentimentWordCounts {
    /// Number of words whose final score was above zero
    pub positive_words: usize,
    /// Number of words whose final score was below zero
    pub negative_words: usize,
    /// Number of words whose final score was neither above nor below zero
    pub neutral_words: usize,
    /// Total number of words analyzed
    pub total_words: usize,
}
78
/// A sentiment lexicon mapping words to sentiment scores
///
/// `add_word` lowercases the key before storing it and every lookup
/// lowercases the query, so matching is case-insensitive.
#[derive(Debug, Clone)]
pub struct SentimentLexicon {
    /// Word to sentiment score mapping
    lexicon: HashMap<String, f64>,
    /// Score reported for words that are not in `lexicon`
    default_score: f64,
}

impl SentimentLexicon {
    /// Create an empty lexicon whose unknown-word score is `0.0`
    pub fn new() -> Self {
        SentimentLexicon {
            lexicon: HashMap::new(),
            default_score: 0.0,
        }
    }

    /// Create a basic sentiment lexicon with common English words
    ///
    /// Positive words carry scores from `1.0` to `3.0`, negative words from
    /// `-1.0` to `-3.0`. Unknown words score `0.0`.
    pub fn with_basiclexicon() -> Self {
        // Positive words
        const POSITIVE: &[(&str, f64)] = &[
            ("good", 1.0),
            ("great", 2.0),
            ("excellent", 3.0),
            ("amazing", 3.0),
            ("wonderful", 2.5),
            ("fantastic", 2.5),
            ("love", 2.0),
            ("like", 1.0),
            ("happy", 2.0),
            ("joy", 2.0),
            ("pleased", 1.5),
            ("satisfied", 1.0),
            ("positive", 1.0),
            ("perfect", 3.0),
            ("best", 2.5),
            ("awesome", 2.5),
            ("beautiful", 2.0),
            ("brilliant", 2.5),
            ("superb", 2.5),
            ("nice", 1.0),
        ];

        // Negative words
        const NEGATIVE: &[(&str, f64)] = &[
            ("bad", -1.0),
            ("terrible", -2.5),
            ("awful", -2.5),
            ("horrible", -3.0),
            ("hate", -2.5),
            ("dislike", -1.5),
            ("sad", -2.0),
            ("unhappy", -2.0),
            ("disappointed", -2.0),
            ("negative", -1.0),
            ("worst", -3.0),
            ("poor", -1.5),
            ("disgusting", -3.0),
            ("ugly", -2.0),
            ("nasty", -2.5),
            ("stupid", -2.0),
            ("pathetic", -2.5),
            ("failure", -2.0),
            ("fail", -2.0),
            ("sucks", -2.0),
        ];

        // Positive entries first, then negative, matching insertion order.
        let entries: HashMap<String, f64> = POSITIVE
            .iter()
            .chain(NEGATIVE.iter())
            .map(|&(word, score)| (word.to_string(), score))
            .collect();

        SentimentLexicon {
            lexicon: entries,
            default_score: 0.0,
        }
    }

    /// Add a word to the lexicon
    ///
    /// The word is lowercased before insertion; an existing entry for the
    /// same lowercased word is overwritten.
    pub fn add_word(&mut self, word: String, score: f64) {
        let key = word.to_lowercase();
        self.lexicon.insert(key, score);
    }

    /// Get the sentiment score for a word
    ///
    /// The lookup is case-insensitive. Words that are not in the lexicon
    /// yield the default score.
    pub fn get_score(&self, word: &str) -> f64 {
        let key = word.to_lowercase();
        match self.lexicon.get(&key) {
            Some(&score) => score,
            None => self.default_score,
        }
    }

    /// Check if a word is in the lexicon (case-insensitive)
    pub fn contains(&self, word: &str) -> bool {
        let key = word.to_lowercase();
        self.lexicon.contains_key(&key)
    }

    /// Get the number of words in the lexicon
    pub fn len(&self) -> usize {
        self.lexicon.len()
    }

    /// Check if the lexicon has no words
    pub fn is_empty(&self) -> bool {
        self.lexicon.is_empty()
    }
}

impl Default for SentimentLexicon {
    /// Same as [`SentimentLexicon::new`]: an empty lexicon.
    fn default() -> Self {
        SentimentLexicon::new()
    }
}
197
198/// Lexicon-based sentiment analyzer
199pub struct LexiconSentimentAnalyzer {
200    /// The sentiment lexicon
201    lexicon: SentimentLexicon,
202    /// The tokenizer to use
203    tokenizer: Box<dyn Tokenizer + Send + Sync>,
204    /// Negation words that reverse sentiment
205    negation_words: Vec<String>,
206    /// Window size for negation detection
207    negation_window: usize,
208}
209
210impl LexiconSentimentAnalyzer {
211    /// Create a new lexicon-based sentiment analyzer
212    pub fn new(lexicon: SentimentLexicon) -> Self {
213        let negation_words = vec![
214            "not".to_string(),
215            "no".to_string(),
216            "never".to_string(),
217            "neither".to_string(),
218            "nobody".to_string(),
219            "nothing".to_string(),
220            "nowhere".to_string(),
221            "n't".to_string(),
222            "cannot".to_string(),
223            "without".to_string(),
224        ];
225
226        Self {
227            lexicon,
228            tokenizer: Box::new(WordTokenizer::default()),
229            negation_words,
230            negation_window: 3,
231        }
232    }
233
234    /// Create an analyzer with a basic lexicon
235    pub fn with_basiclexicon() -> Self {
236        Self::new(SentimentLexicon::with_basiclexicon())
237    }
238
239    /// Set a custom tokenizer
240    pub fn with_tokenizer(mut self, tokenizer: Box<dyn Tokenizer + Send + Sync>) -> Self {
241        self.tokenizer = tokenizer;
242        self
243    }
244
245    /// Analyze the sentiment of a text
246    pub fn analyze(&self, text: &str) -> Result<SentimentResult> {
247        let tokens = self.tokenizer.tokenize(text)?;
248
249        if tokens.is_empty() {
250            return Ok(SentimentResult {
251                sentiment: Sentiment::Neutral,
252                score: 0.0,
253                confidence: 0.0,
254                word_counts: SentimentWordCounts {
255                    positive_words: 0,
256                    negative_words: 0,
257                    neutral_words: 0,
258                    total_words: 0,
259                },
260            });
261        }
262
263        let mut total_score = 0.0;
264        let mut positive_count = 0;
265        let mut negative_count = 0;
266        let mut neutral_count = 0;
267
268        // Analyze each token
269        for (i, token) in tokens.iter().enumerate() {
270            let token_lower = token.to_lowercase();
271            let mut score = self.lexicon.get_score(&token_lower);
272
273            // Check for negation
274            if score != 0.0 {
275                for j in 1..=self.negation_window.min(i) {
276                    let prev_token = &tokens[i - j].to_lowercase();
277                    if self.negation_words.contains(prev_token) {
278                        score *= -1.0;
279                        break;
280                    }
281                }
282            }
283
284            total_score += score;
285
286            if score > 0.0 {
287                positive_count += 1;
288            } else if score < 0.0 {
289                negative_count += 1;
290            } else {
291                neutral_count += 1;
292            }
293        }
294
295        let total_words = tokens.len();
296        let sentiment = Sentiment::from_score(total_score);
297
298        // Calculate confidence based on the proportion of sentiment-bearing words
299        let sentiment_words = positive_count + negative_count;
300        let confidence = if total_words > 0 {
301            (sentiment_words as f64 / total_words as f64).min(1.0)
302        } else {
303            0.0
304        };
305
306        Ok(SentimentResult {
307            sentiment,
308            score: total_score,
309            confidence,
310            word_counts: SentimentWordCounts {
311                positive_words: positive_count,
312                negative_words: negative_count,
313                neutral_words: neutral_count,
314                total_words,
315            },
316        })
317    }
318
319    /// Analyze sentiment for multiple texts
320    pub fn analyze_batch(&self, texts: &[&str]) -> Result<Vec<SentimentResult>> {
321        texts.iter().map(|&text| self.analyze(text)).collect()
322    }
323}
324
/// Rule-based sentiment modifications
///
/// Holds multipliers for intensifier and diminisher terms. A term may be a
/// single word (`"very"`) or a two-word phrase (`"a little"`).
#[derive(Debug, Clone)]
pub struct SentimentRules {
    /// Intensifier words that increase sentiment magnitude
    intensifiers: HashMap<String, f64>,
    /// Diminisher words that decrease sentiment magnitude
    diminishers: HashMap<String, f64>,
}

impl Default for SentimentRules {
    /// Build the default English rule set.
    fn default() -> Self {
        let to_table = |entries: &[(&str, f64)]| -> HashMap<String, f64> {
            entries
                .iter()
                .map(|&(term, factor)| (term.to_string(), factor))
                .collect()
        };

        Self {
            intensifiers: to_table(&[
                ("very", 1.5),
                ("extremely", 2.0),
                ("incredibly", 2.0),
                ("really", 1.3),
                ("so", 1.3),
                ("absolutely", 2.0),
            ]),
            diminishers: to_table(&[
                ("somewhat", 0.5),
                ("slightly", 0.5),
                ("barely", 0.3),
                ("hardly", 0.3),
                ("a little", 0.5),
            ]),
        }
    }
}

impl SentimentRules {
    /// Apply rules to modify a sentiment score
    ///
    /// Returns one score per entry of `basescores`. A zero score is passed
    /// through untouched. A non-zero score at position `i` is multiplied by
    /// the first modifier found among, in this order:
    ///
    /// 1. the token directly before it,
    /// 2. the two-word phrase formed by the two tokens before it,
    /// 3. the token two positions before it.
    ///
    /// At most one multiplier is applied per score. Token comparison is
    /// case-insensitive.
    ///
    /// `tokens` and `basescores` are expected to be parallel slices. If
    /// `tokens` is shorter, positions without a token are simply treated as
    /// having no modifier instead of panicking.
    pub fn apply(&self, tokens: &[String], basescores: &[f64]) -> Vec<f64> {
        basescores
            .iter()
            .enumerate()
            .map(|(index, &score)| {
                if score == 0.0 {
                    return score;
                }
                match self.preceding_multiplier(tokens, index) {
                    Some(factor) => score * factor,
                    None => score,
                }
            })
            .collect()
    }

    /// Look `term` up as an intensifier first, then as a diminisher.
    fn multiplier_for(&self, term: &str) -> Option<f64> {
        self.intensifiers
            .get(term)
            .or_else(|| self.diminishers.get(term))
            .copied()
    }

    /// Find the multiplier contributed by the tokens just before `index`.
    fn preceding_multiplier(&self, tokens: &[String], index: usize) -> Option<f64> {
        // Checked arithmetic plus `get` keeps this safe at the start of the
        // text and when `tokens` is shorter than the score slice.
        let lowered_at = |offset: usize| -> Option<String> {
            index
                .checked_sub(offset)
                .and_then(|position| tokens.get(position))
                .map(|token| token.to_lowercase())
        };
        let nearest = lowered_at(1);
        let second = lowered_at(2);

        if let Some(factor) = nearest.as_deref().and_then(|term| self.multiplier_for(term)) {
            return Some(factor);
        }

        // Two-word modifiers such as "a little" span both preceding tokens
        // and can never match a single-token lookup.
        if let (Some(first), Some(last)) = (second.as_deref(), nearest.as_deref()) {
            let phrase = format!("{} {}", first, last);
            if let Some(factor) = self.multiplier_for(&phrase) {
                return Some(factor);
            }
        }

        second.as_deref().and_then(|term| self.multiplier_for(term))
    }
}
385
386/// Advanced rule-based sentiment analyzer
387pub struct RuleBasedSentimentAnalyzer {
388    /// The base analyzer
389    base_analyzer: LexiconSentimentAnalyzer,
390    /// Sentiment modification rules
391    rules: SentimentRules,
392}
393
394impl RuleBasedSentimentAnalyzer {
395    /// Create a new rule-based sentiment analyzer
396    pub fn new(lexicon: SentimentLexicon) -> Self {
397        Self {
398            base_analyzer: LexiconSentimentAnalyzer::new(lexicon),
399            rules: SentimentRules::default(),
400        }
401    }
402
403    /// Create an analyzer with a basic lexicon
404    pub fn with_basiclexicon() -> Self {
405        Self::new(SentimentLexicon::with_basiclexicon())
406    }
407
408    /// Analyze sentiment with rule modifications
409    pub fn analyze(&self, text: &str) -> Result<SentimentResult> {
410        let tokens = self.base_analyzer.tokenizer.tokenize(text)?;
411
412        if tokens.is_empty() {
413            return self.base_analyzer.analyze(text);
414        }
415
416        // Get base scores for each token
417        let basescores: Vec<f64> = tokens
418            .iter()
419            .map(|token| self.base_analyzer.lexicon.get_score(token))
420            .collect();
421
422        // Apply rules to modify scores
423        let modified_scores = self.rules.apply(&tokens, &basescores);
424
425        // Calculate final sentiment
426        let total_score: f64 = modified_scores.iter().sum();
427        let sentiment = Sentiment::from_score(total_score);
428
429        // Count sentiment words
430        let mut positive_count = 0;
431        let mut negative_count = 0;
432        let mut neutral_count = 0;
433
434        for &score in &modified_scores {
435            if score > 0.0 {
436                positive_count += 1;
437            } else if score < 0.0 {
438                negative_count += 1;
439            } else {
440                neutral_count += 1;
441            }
442        }
443
444        let total_words = tokens.len();
445        let sentiment_words = positive_count + negative_count;
446        let confidence = if total_words > 0 {
447            (sentiment_words as f64 / total_words as f64).min(1.0)
448        } else {
449            0.0
450        };
451
452        Ok(SentimentResult {
453            sentiment,
454            score: total_score,
455            confidence,
456            word_counts: SentimentWordCounts {
457                positive_words: positive_count,
458                negative_words: negative_count,
459                neutral_words: neutral_count,
460                total_words,
461            },
462        })
463    }
464}
465
#[cfg(test)]
mod tests {
    use super::*;

    /// Words added to a lexicon are retrievable, lookups ignore case, and
    /// unknown words fall back to the default score.
    #[test]
    fn test_sentimentlexicon() {
        let mut lexicon = SentimentLexicon::new();
        lexicon.add_word("happy".to_string(), 2.0);
        lexicon.add_word("sad".to_string(), -2.0);

        assert_eq!(lexicon.get_score("happy"), 2.0);
        assert_eq!(lexicon.get_score("sad"), -2.0);
        assert_eq!(lexicon.get_score("unknown"), 0.0);

        // Lookups are case-insensitive.
        assert_eq!(lexicon.get_score("HAPPY"), 2.0);
        assert!(lexicon.contains("Sad"));
        assert!(!lexicon.contains("unknown"));
        assert_eq!(lexicon.len(), 2);
    }

    /// Positive, negative and neutral sentences are classified by the sign
    /// of their summed score.
    #[test]
    fn test_basic_sentiment_analysis() {
        let analyzer = LexiconSentimentAnalyzer::with_basiclexicon();

        let positive_result = analyzer.analyze("This is a wonderful day!").unwrap();
        assert_eq!(positive_result.sentiment, Sentiment::Positive);
        assert!(positive_result.score > 0.0);

        let negative_result = analyzer.analyze("This is terrible and awful").unwrap();
        assert_eq!(negative_result.sentiment, Sentiment::Negative);
        assert!(negative_result.score < 0.0);

        let neutral_result = analyzer.analyze("This is a book").unwrap();
        assert_eq!(neutral_result.sentiment, Sentiment::Neutral);
        assert_eq!(neutral_result.score, 0.0);
    }

    /// A negation word shortly before a positive word flips it to negative.
    #[test]
    fn test_negation_handling() {
        let analyzer = LexiconSentimentAnalyzer::with_basiclexicon();

        let negated_result = analyzer.analyze("This is not good").unwrap();
        assert_eq!(negated_result.sentiment, Sentiment::Negative);
        assert!(negated_result.score < 0.0);
    }

    /// An intensifier raises the score above that of the plain sentence.
    #[test]
    fn test_rule_based_sentiment() {
        let analyzer = RuleBasedSentimentAnalyzer::with_basiclexicon();

        let intensified_result = analyzer.analyze("This is very good").unwrap();
        let normal_result = analyzer.analyze("This is good").unwrap();

        assert!(intensified_result.score > normal_result.score);
    }

    /// Batch analysis returns one result per input, in input order.
    #[test]
    fn test_sentiment_batch_analysis() {
        let analyzer = LexiconSentimentAnalyzer::with_basiclexicon();
        let texts = vec!["I love this", "I hate this", "This is okay"];

        let results = analyzer.analyze_batch(&texts).unwrap();
        assert_eq!(results.len(), 3);
        assert_eq!(results[0].sentiment, Sentiment::Positive);
        assert_eq!(results[1].sentiment, Sentiment::Negative);
        // None of "this", "is", "okay" is in the basic lexicon.
        assert_eq!(results[2].sentiment, Sentiment::Neutral);
        assert_eq!(results[2].score, 0.0);
    }
}
525        assert_eq!(results[1].sentiment, Sentiment::Negative);
526    }
527}