// scirs2_text/sentiment.rs

//! Sentiment analysis functionality
//!
//! This module provides comprehensive sentiment analysis capabilities:
//!
//! - **Lexicon-based**: Score text using word-level sentiment dictionaries
//! - **Rule-based (VADER-inspired)**: Handle negation, intensifiers, but-clauses,
//!   capitalization emphasis, and punctuation heuristics
//! - **Naive Bayes classifier**: Train a probabilistic sentiment classifier from labeled data
//! - **Aspect-based**: Extract sentiment for specific aspects/entities in text
//! - **Document aggregation**: Aggregate sentiment across multiple texts/paragraphs
//!
//! ## Quick Start
//!
//! ```rust
//! use scirs2_text::sentiment::{LexiconSentimentAnalyzer, VaderSentimentAnalyzer, Sentiment};
//!
//! // Basic lexicon-based analysis
//! let analyzer = LexiconSentimentAnalyzer::with_basiclexicon();
//! let result = analyzer.analyze("I love this product!").unwrap();
//! assert_eq!(result.sentiment, Sentiment::Positive);
//!
//! // VADER-style analysis with intensifiers and negation
//! let vader = VaderSentimentAnalyzer::new();
//! let result = vader.analyze("This movie is not just good, it is ABSOLUTELY amazing!").unwrap();
//! assert_eq!(result.sentiment, Sentiment::Positive);
//! ```
27
28use crate::error::{Result, TextError};
29use crate::tokenize::{Tokenizer, WordTokenizer};
30use std::collections::HashMap;
31
/// Sentiment polarity
// Eq + Hash added: the enum is fieldless and already Copy + PartialEq, so the
// extra derives are free and let Sentiment be used as a map/set key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Sentiment {
    /// Positive sentiment
    Positive,
    /// Negative sentiment
    Negative,
    /// Neutral sentiment
    Neutral,
}

impl std::fmt::Display for Sentiment {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // User-facing label matches the variant name.
        let label = match self {
            Sentiment::Positive => "Positive",
            Sentiment::Negative => "Negative",
            Sentiment::Neutral => "Neutral",
        };
        write!(f, "{}", label)
    }
}

impl Sentiment {
    /// Convert sentiment to a numerical score (+1.0 / 0.0 / -1.0).
    pub fn to_score(&self) -> f64 {
        match self {
            Sentiment::Positive => 1.0,
            Sentiment::Neutral => 0.0,
            Sentiment::Negative => -1.0,
        }
    }

    /// Convert a numerical score to a sentiment label.
    ///
    /// Scores in the dead-zone [-0.05, 0.05] are treated as neutral — the
    /// same thresholds used elsewhere in this module for compound scores.
    pub fn from_score(score: f64) -> Self {
        if score > 0.05 {
            Sentiment::Positive
        } else if score < -0.05 {
            Sentiment::Negative
        } else {
            Sentiment::Neutral
        }
    }
}
74
/// Result of sentiment analysis
#[derive(Debug, Clone)]
pub struct SentimentResult {
    /// The overall sentiment label derived from `score`
    pub sentiment: Sentiment,
    /// The raw sentiment score (unnormalized sum of per-word scores)
    pub score: f64,
    /// Confidence level (0-1): the fraction of analyzed words that carried sentiment
    pub confidence: f64,
    /// Breakdown of positive and negative word counts
    pub word_counts: SentimentWordCounts,
}
87
/// Word counts for sentiment analysis
// PartialEq + Eq added: all fields are usize, so equality is free and makes
// the struct directly assertable in tests.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct SentimentWordCounts {
    /// Number of positive words
    pub positive_words: usize,
    /// Number of negative words
    pub negative_words: usize,
    /// Number of neutral words
    pub neutral_words: usize,
    /// Total number of words analyzed
    pub total_words: usize,
}
100
101// ─── Sentiment Lexicon ───────────────────────────────────────────────────────
102
/// A sentiment lexicon mapping words to sentiment scores
#[derive(Debug, Clone)]
pub struct SentimentLexicon {
    /// Word (stored lowercase) to sentiment score mapping
    lexicon: HashMap<String, f64>,
    /// Score returned for words not present in the lexicon
    default_score: f64,
}

impl SentimentLexicon {
    /// Create a new, empty sentiment lexicon.
    pub fn new() -> Self {
        Self {
            lexicon: HashMap::new(),
            default_score: 0.0,
        }
    }

    /// Create a basic sentiment lexicon with common English words.
    ///
    /// Scores follow an AFINN-style scale, roughly -3.0 (most negative)
    /// to +3.0 (most positive).
    pub fn with_basiclexicon() -> Self {
        // Positive words (AFINN-style scores)
        const POSITIVE_WORDS: [(&str, f64); 30] = [
            ("good", 1.0),
            ("great", 2.0),
            ("excellent", 3.0),
            ("amazing", 3.0),
            ("wonderful", 2.5),
            ("fantastic", 2.5),
            ("love", 2.0),
            ("like", 1.0),
            ("happy", 2.0),
            ("joy", 2.0),
            ("pleased", 1.5),
            ("satisfied", 1.0),
            ("positive", 1.0),
            ("perfect", 3.0),
            ("best", 2.5),
            ("awesome", 2.5),
            ("beautiful", 2.0),
            ("brilliant", 2.5),
            ("superb", 2.5),
            ("nice", 1.0),
            ("outstanding", 3.0),
            ("exceptional", 3.0),
            ("remarkable", 2.0),
            ("delightful", 2.5),
            ("impressive", 2.0),
            ("enjoy", 1.5),
            ("recommend", 1.5),
            ("better", 1.0),
            ("superior", 2.0),
            ("exciting", 2.0),
        ];

        // Negative words ("terrible" was previously listed twice; the
        // duplicate has been removed — same resulting map either way)
        const NEGATIVE_WORDS: [(&str, f64); 29] = [
            ("bad", -1.0),
            ("terrible", -2.5),
            ("awful", -2.5),
            ("horrible", -3.0),
            ("hate", -2.5),
            ("dislike", -1.5),
            ("sad", -2.0),
            ("unhappy", -2.0),
            ("disappointed", -2.0),
            ("negative", -1.0),
            ("worst", -3.0),
            ("poor", -1.5),
            ("disgusting", -3.0),
            ("ugly", -2.0),
            ("nasty", -2.5),
            ("stupid", -2.0),
            ("pathetic", -2.5),
            ("failure", -2.0),
            ("fail", -2.0),
            ("sucks", -2.0),
            ("boring", -1.5),
            ("mediocre", -1.0),
            ("inferior", -2.0),
            ("lousy", -2.0),
            ("dreadful", -2.5),
            ("annoying", -1.5),
            ("frustrating", -2.0),
            ("disappointing", -2.0),
            ("useless", -2.0),
        ];

        // Build the map in a single pass over both lists.
        let lexicon = POSITIVE_WORDS
            .iter()
            .chain(NEGATIVE_WORDS.iter())
            .map(|&(word, score)| (word.to_string(), score))
            .collect();

        Self {
            lexicon,
            default_score: 0.0,
        }
    }

    /// Add a word to the lexicon (stored lowercase).
    pub fn add_word(&mut self, word: String, score: f64) {
        self.lexicon.insert(word.to_lowercase(), score);
    }

    /// Get the sentiment score for a word (case-insensitive).
    ///
    /// Unknown words return the default score (0.0).
    pub fn get_score(&self, word: &str) -> f64 {
        self.lexicon
            .get(&word.to_lowercase())
            .copied()
            .unwrap_or(self.default_score)
    }

    /// Check if a word is in the lexicon (case-insensitive).
    pub fn contains(&self, word: &str) -> bool {
        self.lexicon.contains_key(&word.to_lowercase())
    }

    /// Get the number of entries in the lexicon.
    pub fn len(&self) -> usize {
        self.lexicon.len()
    }

    /// Check if the lexicon is empty.
    pub fn is_empty(&self) -> bool {
        self.lexicon.is_empty()
    }

    /// Get all words and their scores.
    pub fn entries(&self) -> &HashMap<String, f64> {
        &self.lexicon
    }
}

impl Default for SentimentLexicon {
    /// The default lexicon is empty; use
    /// [`SentimentLexicon::with_basiclexicon`] for a ready-made word list.
    fn default() -> Self {
        Self::new()
    }
}
246
247// ─── Lexicon-based Sentiment Analyzer ────────────────────────────────────────
248
/// Lexicon-based sentiment analyzer
///
/// Scores a text by summing per-word lexicon scores; a word's score is
/// sign-flipped when a negation word appears within a small preceding window.
pub struct LexiconSentimentAnalyzer {
    /// The sentiment lexicon
    lexicon: SentimentLexicon,
    /// The tokenizer used to split input text into words
    tokenizer: Box<dyn Tokenizer + Send + Sync>,
    /// Negation words that reverse the sign of a following sentiment word
    negation_words: Vec<String>,
    /// How many preceding tokens to scan for a negation word
    negation_window: usize,
}
260
261impl LexiconSentimentAnalyzer {
262    /// Create a new lexicon-based sentiment analyzer
263    pub fn new(lexicon: SentimentLexicon) -> Self {
264        let negation_words = vec![
265            "not".to_string(),
266            "no".to_string(),
267            "never".to_string(),
268            "neither".to_string(),
269            "nobody".to_string(),
270            "nothing".to_string(),
271            "nowhere".to_string(),
272            "n't".to_string(),
273            "cannot".to_string(),
274            "without".to_string(),
275        ];
276
277        Self {
278            lexicon,
279            tokenizer: Box::new(WordTokenizer::default()),
280            negation_words,
281            negation_window: 3,
282        }
283    }
284
285    /// Create an analyzer with a basic lexicon
286    pub fn with_basiclexicon() -> Self {
287        Self::new(SentimentLexicon::with_basiclexicon())
288    }
289
290    /// Set a custom tokenizer
291    pub fn with_tokenizer(mut self, tokenizer: Box<dyn Tokenizer + Send + Sync>) -> Self {
292        self.tokenizer = tokenizer;
293        self
294    }
295
296    /// Analyze the sentiment of a text
297    pub fn analyze(&self, text: &str) -> Result<SentimentResult> {
298        let tokens = self.tokenizer.tokenize(text)?;
299
300        if tokens.is_empty() {
301            return Ok(SentimentResult {
302                sentiment: Sentiment::Neutral,
303                score: 0.0,
304                confidence: 0.0,
305                word_counts: SentimentWordCounts {
306                    positive_words: 0,
307                    negative_words: 0,
308                    neutral_words: 0,
309                    total_words: 0,
310                },
311            });
312        }
313
314        let mut total_score = 0.0;
315        let mut positive_count = 0;
316        let mut negative_count = 0;
317        let mut neutral_count = 0;
318
319        // Analyze each token
320        for (i, token) in tokens.iter().enumerate() {
321            let token_lower = token.to_lowercase();
322            let mut score = self.lexicon.get_score(&token_lower);
323
324            // Check for negation
325            if score != 0.0 {
326                for j in 1..=self.negation_window.min(i) {
327                    let prev_token = &tokens[i - j].to_lowercase();
328                    if self.negation_words.contains(prev_token) {
329                        score *= -1.0;
330                        break;
331                    }
332                }
333            }
334
335            total_score += score;
336
337            if score > 0.0 {
338                positive_count += 1;
339            } else if score < 0.0 {
340                negative_count += 1;
341            } else {
342                neutral_count += 1;
343            }
344        }
345
346        let total_words = tokens.len();
347        let sentiment = Sentiment::from_score(total_score);
348
349        // Calculate confidence based on the proportion of sentiment-bearing words
350        let sentiment_words = positive_count + negative_count;
351        let confidence = if total_words > 0 {
352            (sentiment_words as f64 / total_words as f64).min(1.0)
353        } else {
354            0.0
355        };
356
357        Ok(SentimentResult {
358            sentiment,
359            score: total_score,
360            confidence,
361            word_counts: SentimentWordCounts {
362                positive_words: positive_count,
363                negative_words: negative_count,
364                neutral_words: neutral_count,
365                total_words,
366            },
367        })
368    }
369
370    /// Analyze sentiment for multiple texts
371    pub fn analyze_batch(&self, texts: &[&str]) -> Result<Vec<SentimentResult>> {
372        texts.iter().map(|&text| self.analyze(text)).collect()
373    }
374}
375
376// ─── Rule-based Sentiment (intensifiers, diminishers) ────────────────────────
377
/// Rule-based sentiment modifications
///
/// Multiplies a word's base sentiment score when an intensifier
/// (e.g. "very") or diminisher (e.g. "slightly") precedes it.
#[derive(Debug, Clone)]
pub struct SentimentRules {
    /// Intensifier words/phrases that increase sentiment magnitude
    intensifiers: HashMap<String, f64>,
    /// Diminisher words/phrases that decrease sentiment magnitude
    diminishers: HashMap<String, f64>,
}

impl Default for SentimentRules {
    fn default() -> Self {
        let mut intensifiers = HashMap::new();
        intensifiers.insert("very".to_string(), 1.5);
        intensifiers.insert("extremely".to_string(), 2.0);
        intensifiers.insert("incredibly".to_string(), 2.0);
        intensifiers.insert("really".to_string(), 1.3);
        intensifiers.insert("so".to_string(), 1.3);
        intensifiers.insert("absolutely".to_string(), 2.0);
        intensifiers.insert("truly".to_string(), 1.5);
        intensifiers.insert("totally".to_string(), 1.5);
        intensifiers.insert("utterly".to_string(), 1.8);
        intensifiers.insert("remarkably".to_string(), 1.5);

        let mut diminishers = HashMap::new();
        diminishers.insert("somewhat".to_string(), 0.5);
        diminishers.insert("slightly".to_string(), 0.5);
        diminishers.insert("barely".to_string(), 0.3);
        diminishers.insert("hardly".to_string(), 0.3);
        // Multi-word entries are matched via the bigram check in `apply`.
        diminishers.insert("a little".to_string(), 0.5);
        diminishers.insert("kind of".to_string(), 0.5);
        diminishers.insert("sort of".to_string(), 0.5);
        diminishers.insert("marginally".to_string(), 0.4);

        Self {
            intensifiers,
            diminishers,
        }
    }
}

impl SentimentRules {
    /// Apply rules to modify per-token sentiment scores.
    ///
    /// For each token with a non-zero base score, the two-token phrase
    /// directly before it is checked first (so multi-word entries such as
    /// "kind of" and "a little" can actually match — they previously never
    /// could, since only single tokens were looked up), then the two
    /// preceding single tokens. The first match wins.
    pub fn apply(&self, tokens: &[String], basescores: &[f64]) -> Vec<f64> {
        let mut modified_scores = basescores.to_vec();

        for (i, score) in modified_scores.iter_mut().enumerate() {
            if *score == 0.0 {
                continue;
            }

            if let Some(multiplier) = self.modifier_before(tokens, i) {
                *score *= multiplier;
            }
        }

        modified_scores
    }

    /// Find the intensifier/diminisher multiplier (if any) applying to the
    /// token at `i`, preferring the bigram phrase immediately before it.
    fn modifier_before(&self, tokens: &[String], i: usize) -> Option<f64> {
        // Bigram check: "kind of good" -> phrase "kind of".
        if i >= 2 {
            let phrase = format!(
                "{} {}",
                tokens[i - 2].to_lowercase(),
                tokens[i - 1].to_lowercase()
            );
            if let Some(&m) = self
                .intensifiers
                .get(&phrase)
                .or_else(|| self.diminishers.get(&phrase))
            {
                return Some(m);
            }
        }

        // Single-token check over the two preceding words.
        for j in 1..=2usize.min(i) {
            let prev = tokens[i - j].to_lowercase();
            if let Some(&m) = self
                .intensifiers
                .get(&prev)
                .or_else(|| self.diminishers.get(&prev))
            {
                return Some(m);
            }
        }

        None
    }
}
445
/// Advanced rule-based sentiment analyzer
///
/// Combines raw lexicon scores with intensifier/diminisher rule adjustments.
pub struct RuleBasedSentimentAnalyzer {
    /// The base analyzer (supplies the lexicon and tokenizer)
    base_analyzer: LexiconSentimentAnalyzer,
    /// Sentiment modification rules (intensifiers/diminishers)
    rules: SentimentRules,
}
453
454impl RuleBasedSentimentAnalyzer {
455    /// Create a new rule-based sentiment analyzer
456    pub fn new(lexicon: SentimentLexicon) -> Self {
457        Self {
458            base_analyzer: LexiconSentimentAnalyzer::new(lexicon),
459            rules: SentimentRules::default(),
460        }
461    }
462
463    /// Create an analyzer with a basic lexicon
464    pub fn with_basiclexicon() -> Self {
465        Self::new(SentimentLexicon::with_basiclexicon())
466    }
467
468    /// Analyze sentiment with rule modifications
469    pub fn analyze(&self, text: &str) -> Result<SentimentResult> {
470        let tokens = self.base_analyzer.tokenizer.tokenize(text)?;
471
472        if tokens.is_empty() {
473            return self.base_analyzer.analyze(text);
474        }
475
476        // Get base scores for each token
477        let basescores: Vec<f64> = tokens
478            .iter()
479            .map(|token| self.base_analyzer.lexicon.get_score(token))
480            .collect();
481
482        // Apply rules to modify scores
483        let modified_scores = self.rules.apply(&tokens, &basescores);
484
485        // Calculate final sentiment
486        let total_score: f64 = modified_scores.iter().sum();
487        let sentiment = Sentiment::from_score(total_score);
488
489        // Count sentiment words
490        let mut positive_count = 0;
491        let mut negative_count = 0;
492        let mut neutral_count = 0;
493
494        for &score in &modified_scores {
495            if score > 0.0 {
496                positive_count += 1;
497            } else if score < 0.0 {
498                negative_count += 1;
499            } else {
500                neutral_count += 1;
501            }
502        }
503
504        let total_words = tokens.len();
505        let sentiment_words = positive_count + negative_count;
506        let confidence = if total_words > 0 {
507            (sentiment_words as f64 / total_words as f64).min(1.0)
508        } else {
509            0.0
510        };
511
512        Ok(SentimentResult {
513            sentiment,
514            score: total_score,
515            confidence,
516            word_counts: SentimentWordCounts {
517                positive_words: positive_count,
518                negative_words: negative_count,
519                neutral_words: neutral_count,
520                total_words,
521            },
522        })
523    }
524}
525
526// ─── VADER-Inspired Sentiment Analyzer ───────────────────────────────────────
527
/// VADER (Valence Aware Dictionary and sEntiment Reasoner) inspired sentiment result
#[derive(Debug, Clone)]
pub struct VaderResult {
    /// Positive proportion (0-1): share of positive valence mass
    pub positive: f64,
    /// Negative proportion (0-1): share of negative valence mass
    pub negative: f64,
    /// Neutral proportion (0-1): share of tokens carrying no valence
    pub neutral: f64,
    /// Compound score (-1 to +1), normalized using the formula from the VADER paper
    pub compound: f64,
    /// Overall sentiment label derived from `compound` (±0.05 thresholds)
    pub sentiment: Sentiment,
}
542
/// VADER-inspired sentiment analyzer
///
/// Implements key heuristics from VADER (Hutto & Gilbert, 2014):
/// - Negation handling (flips sentiment polarity)
/// - Intensifier words (boost magnitude)
/// - But-clause handling (weight sentiment after "but" more heavily)
/// - ALL CAPS emphasis (boosts sentiment of capitalized words)
/// - Exclamation marks (boost in the direction of the current sentiment)
/// - Question marks at end (reduce sentiment)
pub struct VaderSentimentAnalyzer {
    /// Sentiment lexicon
    lexicon: SentimentLexicon,
    /// Tokenizer
    tokenizer: Box<dyn Tokenizer + Send + Sync>,
    /// Negation words (including common -n't contractions)
    negation_words: Vec<String>,
    /// Intensifier additive boosts, applied toward the score's sign
    intensifiers: HashMap<String, f64>,
    /// Diminisher additive boosts (negative values, reduce magnitude)
    diminishers: HashMap<String, f64>,
    /// But-clause weight (how much more weight to give sentiment after "but")
    but_weight: f64,
    /// Caps emphasis: additive boost applied to ALL-CAPS sentiment words
    caps_multiplier: f64,
    /// Exclamation boost per mark (up to 4)
    exclamation_boost: f64,
    /// Question reduction factor
    question_reduction: f64,
}
572
573impl VaderSentimentAnalyzer {
574    /// Create a new VADER-style analyzer with default settings
575    pub fn new() -> Self {
576        let mut intensifiers = HashMap::new();
577        intensifiers.insert("very".to_string(), 0.293);
578        intensifiers.insert("extremely".to_string(), 0.293);
579        intensifiers.insert("absolutely".to_string(), 0.293);
580        intensifiers.insert("incredibly".to_string(), 0.293);
581        intensifiers.insert("really".to_string(), 0.18);
582        intensifiers.insert("so".to_string(), 0.18);
583        intensifiers.insert("truly".to_string(), 0.18);
584        intensifiers.insert("totally".to_string(), 0.18);
585        intensifiers.insert("quite".to_string(), 0.1);
586
587        let mut diminishers = HashMap::new();
588        diminishers.insert("somewhat".to_string(), -0.1);
589        diminishers.insert("barely".to_string(), -0.2);
590        diminishers.insert("hardly".to_string(), -0.2);
591        diminishers.insert("slightly".to_string(), -0.1);
592        diminishers.insert("kind of".to_string(), -0.1);
593        diminishers.insert("sort of".to_string(), -0.1);
594
595        let negation_words = vec![
596            "not".to_string(),
597            "no".to_string(),
598            "never".to_string(),
599            "neither".to_string(),
600            "nobody".to_string(),
601            "nothing".to_string(),
602            "nowhere".to_string(),
603            "cannot".to_string(),
604            "without".to_string(),
605            "don't".to_string(),
606            "doesn't".to_string(),
607            "didn't".to_string(),
608            "isn't".to_string(),
609            "wasn't".to_string(),
610            "won't".to_string(),
611            "wouldn't".to_string(),
612            "shouldn't".to_string(),
613            "couldn't".to_string(),
614            "aren't".to_string(),
615            "weren't".to_string(),
616        ];
617
618        Self {
619            lexicon: SentimentLexicon::with_basiclexicon(),
620            tokenizer: Box::new(WordTokenizer::default()),
621            negation_words,
622            intensifiers,
623            diminishers,
624            but_weight: 0.5,
625            caps_multiplier: 0.733,
626            exclamation_boost: 0.292,
627            question_reduction: 0.18,
628        }
629    }
630
631    /// Create with a custom lexicon
632    pub fn with_lexicon(mut self, lexicon: SentimentLexicon) -> Self {
633        self.lexicon = lexicon;
634        self
635    }
636
637    /// Analyze text and return VADER-style compound scores
638    pub fn analyze(&self, text: &str) -> Result<VaderResult> {
639        let tokens = self.tokenizer.tokenize(text)?;
640
641        if tokens.is_empty() {
642            return Ok(VaderResult {
643                positive: 0.0,
644                negative: 0.0,
645                neutral: 1.0,
646                compound: 0.0,
647                sentiment: Sentiment::Neutral,
648            });
649        }
650
651        // Get raw sentiment scores for each token
652        let mut sentiments: Vec<f64> = Vec::with_capacity(tokens.len());
653
654        for (i, token) in tokens.iter().enumerate() {
655            let lower = token.to_lowercase();
656            let mut score = self.lexicon.get_score(&lower);
657
658            if score == 0.0 {
659                sentiments.push(0.0);
660                continue;
661            }
662
663            // Check for ALL CAPS emphasis (token must be > 1 char and all uppercase)
664            if token.len() > 1 && token.chars().all(|c| c.is_uppercase()) {
665                if score > 0.0 {
666                    score += self.caps_multiplier;
667                } else {
668                    score -= self.caps_multiplier;
669                }
670            }
671
672            // Check preceding words for intensifiers/diminishers
673            for j in 1..=3.min(i) {
674                let prev = tokens[i - j].to_lowercase();
675                if let Some(&boost) = self.intensifiers.get(&prev) {
676                    if score > 0.0 {
677                        score += boost;
678                    } else {
679                        score -= boost;
680                    }
681                    break;
682                } else if let Some(&reduce) = self.diminishers.get(&prev) {
683                    if score > 0.0 {
684                        score += reduce; // reduce is negative
685                    } else {
686                        score -= reduce;
687                    }
688                    break;
689                }
690            }
691
692            // Check for negation in preceding words
693            let mut negated = false;
694            for j in 1..=3.min(i) {
695                let prev = tokens[i - j].to_lowercase();
696                if self.negation_words.contains(&prev) {
697                    negated = true;
698                    break;
699                }
700            }
701
702            if negated {
703                score *= -0.74; // VADER uses a constant negation multiplier
704            }
705
706            sentiments.push(score);
707        }
708
709        // But-clause handling: weight sentiment after "but" more heavily
710        let mut but_idx = None;
711        for (i, token) in tokens.iter().enumerate() {
712            if token.to_lowercase() == "but" || token.to_lowercase() == "however" {
713                but_idx = Some(i);
714            }
715        }
716
717        if let Some(idx) = but_idx {
718            // Reduce weight of sentiment before "but", increase after
719            for (i, score) in sentiments.iter_mut().enumerate() {
720                if i < idx {
721                    *score *= 1.0 - self.but_weight;
722                } else if i > idx {
723                    *score *= 1.0 + self.but_weight;
724                }
725            }
726        }
727
728        // Sum all sentiments
729        let mut sum_scores: f64 = sentiments.iter().sum();
730
731        // Exclamation mark boost (count in original text, up to 4)
732        let excl_count = text.chars().filter(|&c| c == '!').count().min(4);
733        if excl_count > 0 {
734            sum_scores += excl_count as f64 * self.exclamation_boost * sum_scores.signum();
735        }
736
737        // Question mark at end reduces sentiment
738        if text.trim_end().ends_with('?') {
739            sum_scores *= 1.0 - self.question_reduction;
740        }
741
742        // Compute compound score using VADER's normalization
743        let compound = self.normalize(sum_scores);
744
745        // Compute positive, negative, neutral proportions
746        let mut pos_sum = 0.0;
747        let mut neg_sum = 0.0;
748        let mut neu_count = 0.0;
749
750        for &s in &sentiments {
751            if s > 0.0 {
752                pos_sum += s;
753            } else if s < 0.0 {
754                neg_sum += s;
755            } else {
756                neu_count += 1.0;
757            }
758        }
759
760        let total = pos_sum + neg_sum.abs() + neu_count;
761        let (positive, negative, neutral) = if total > 0.0 {
762            (
763                (pos_sum / total).abs(),
764                (neg_sum / total).abs(),
765                neu_count / total,
766            )
767        } else {
768            (0.0, 0.0, 1.0)
769        };
770
771        let sentiment = if compound >= 0.05 {
772            Sentiment::Positive
773        } else if compound <= -0.05 {
774            Sentiment::Negative
775        } else {
776            Sentiment::Neutral
777        };
778
779        Ok(VaderResult {
780            positive,
781            negative,
782            neutral,
783            compound,
784            sentiment,
785        })
786    }
787
788    /// Normalize the sum of sentiments using VADER's formula
789    fn normalize(&self, score: f64) -> f64 {
790        let alpha = 15.0; // Approximation parameter
791        score / (score * score + alpha).sqrt()
792    }
793
794    /// Analyze multiple texts
795    pub fn analyze_batch(&self, texts: &[&str]) -> Result<Vec<VaderResult>> {
796        texts.iter().map(|&text| self.analyze(text)).collect()
797    }
798}
799
impl Default for VaderSentimentAnalyzer {
    /// Equivalent to [`VaderSentimentAnalyzer::new`].
    fn default() -> Self {
        Self::new()
    }
}
805
806// ─── Naive Bayes Sentiment Classifier ────────────────────────────────────────
807
/// Naive Bayes classifier for sentiment analysis
///
/// Implements multinomial Naive Bayes with Laplace smoothing.
/// Can be trained on labeled (text, sentiment) pairs and used to
/// predict sentiment for new text.
pub struct NaiveBayesSentiment {
    /// Word counts per class: class_label -> (word -> count)
    word_counts: HashMap<String, HashMap<String, f64>>,
    /// Total word count per class (likelihood denominator)
    class_word_totals: HashMap<String, f64>,
    /// Document count per class (class prior numerator)
    class_doc_counts: HashMap<String, usize>,
    /// Total number of training documents seen
    total_docs: usize,
    /// Vocabulary of all known words, mapped to insertion-order indices
    vocabulary: HashMap<String, usize>,
    /// Laplace smoothing factor (pseudo-count added per word at prediction)
    alpha: f64,
    /// Tokenizer used for both training and prediction
    tokenizer: Box<dyn Tokenizer + Send + Sync>,
}
829
// Manual Debug impl summarizing model size/shape — presumably because the
// boxed tokenizer trait object cannot be derived; confirm against Tokenizer.
impl std::fmt::Debug for NaiveBayesSentiment {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("NaiveBayesSentiment")
            .field("total_docs", &self.total_docs)
            .field("vocabulary_size", &self.vocabulary.len())
            .field("alpha", &self.alpha)
            .field("classes", &self.class_doc_counts.keys().collect::<Vec<_>>())
            .finish()
    }
}
840
841impl NaiveBayesSentiment {
842    /// Create a new Naive Bayes sentiment classifier
843    pub fn new() -> Self {
844        Self {
845            word_counts: HashMap::new(),
846            class_word_totals: HashMap::new(),
847            class_doc_counts: HashMap::new(),
848            total_docs: 0,
849            vocabulary: HashMap::new(),
850            alpha: 1.0,
851            tokenizer: Box::new(WordTokenizer::default()),
852        }
853    }
854
    /// Set the Laplace smoothing parameter (builder style).
    ///
    /// `alpha` is the pseudo-count added to every word count at prediction
    /// time; 1.0 (the default) is classic Laplace smoothing.
    pub fn with_alpha(mut self, alpha: f64) -> Self {
        self.alpha = alpha;
        self
    }
860
861    /// Train the classifier on labeled examples
862    ///
863    /// # Arguments
864    /// * `texts` - Training text samples
865    /// * `labels` - Corresponding labels (e.g., "positive", "negative", "neutral")
866    pub fn train(&mut self, texts: &[&str], labels: &[&str]) -> Result<()> {
867        if texts.len() != labels.len() {
868            return Err(TextError::InvalidInput(
869                "texts and labels must have the same length".into(),
870            ));
871        }
872
873        if texts.is_empty() {
874            return Err(TextError::InvalidInput("No training data provided".into()));
875        }
876
877        for (text, &label) in texts.iter().zip(labels.iter()) {
878            let tokens = self.tokenizer.tokenize(text)?;
879
880            // Update class document count
881            *self.class_doc_counts.entry(label.to_string()).or_insert(0) += 1;
882            self.total_docs += 1;
883
884            // Update word counts for this class
885            let class_words = self.word_counts.entry(label.to_string()).or_default();
886
887            for token in &tokens {
888                let lower = token.to_lowercase();
889                *class_words.entry(lower.clone()).or_insert(0.0) += 1.0;
890                *self
891                    .class_word_totals
892                    .entry(label.to_string())
893                    .or_insert(0.0) += 1.0;
894
895                // Add to vocabulary
896                let vocab_len = self.vocabulary.len();
897                self.vocabulary.entry(lower).or_insert(vocab_len);
898            }
899        }
900
901        Ok(())
902    }
903
904    /// Predict the class label for a text
905    pub fn predict(&self, text: &str) -> Result<String> {
906        let (label, _) = self.predict_with_score(text)?;
907        Ok(label)
908    }
909
910    /// Predict the class label with log-probability scores
911    pub fn predict_with_score(&self, text: &str) -> Result<(String, f64)> {
912        if self.total_docs == 0 {
913            return Err(TextError::ModelNotFitted(
914                "Classifier not trained. Call train() first".into(),
915            ));
916        }
917
918        let tokens = self.tokenizer.tokenize(text)?;
919        let vocab_size = self.vocabulary.len() as f64;
920
921        let mut best_label = String::new();
922        let mut best_score = f64::NEG_INFINITY;
923
924        for (label, &doc_count) in &self.class_doc_counts {
925            // Log prior: P(class)
926            let log_prior = (doc_count as f64 / self.total_docs as f64).ln();
927
928            // Log likelihood: sum of log P(word|class) for each word
929            let class_words = self.word_counts.get(label);
930            let class_total = self.class_word_totals.get(label).copied().unwrap_or(0.0);
931
932            let mut log_likelihood = 0.0;
933
934            for token in &tokens {
935                let lower = token.to_lowercase();
936                let word_count = class_words
937                    .and_then(|wc| wc.get(&lower))
938                    .copied()
939                    .unwrap_or(0.0);
940
941                // Laplace smoothing: P(word|class) = (count + alpha) / (total + alpha * vocab_size)
942                let prob = (word_count + self.alpha) / (class_total + self.alpha * vocab_size);
943                log_likelihood += prob.ln();
944            }
945
946            let score = log_prior + log_likelihood;
947            if score > best_score {
948                best_score = score;
949                best_label = label.clone();
950            }
951        }
952
953        Ok((best_label, best_score))
954    }
955
956    /// Predict probabilities for all classes
957    pub fn predict_proba(&self, text: &str) -> Result<HashMap<String, f64>> {
958        if self.total_docs == 0 {
959            return Err(TextError::ModelNotFitted("Classifier not trained".into()));
960        }
961
962        let tokens = self.tokenizer.tokenize(text)?;
963        let vocab_size = self.vocabulary.len() as f64;
964
965        let mut log_scores: Vec<(String, f64)> = Vec::new();
966
967        for (label, &doc_count) in &self.class_doc_counts {
968            let log_prior = (doc_count as f64 / self.total_docs as f64).ln();
969
970            let class_words = self.word_counts.get(label);
971            let class_total = self.class_word_totals.get(label).copied().unwrap_or(0.0);
972
973            let mut log_likelihood = 0.0;
974            for token in &tokens {
975                let lower = token.to_lowercase();
976                let word_count = class_words
977                    .and_then(|wc| wc.get(&lower))
978                    .copied()
979                    .unwrap_or(0.0);
980
981                let prob = (word_count + self.alpha) / (class_total + self.alpha * vocab_size);
982                log_likelihood += prob.ln();
983            }
984
985            log_scores.push((label.clone(), log_prior + log_likelihood));
986        }
987
988        // Convert log-scores to probabilities using log-sum-exp trick
989        let max_score = log_scores
990            .iter()
991            .map(|(_, s)| *s)
992            .fold(f64::NEG_INFINITY, f64::max);
993
994        let sum_exp: f64 = log_scores.iter().map(|(_, s)| (s - max_score).exp()).sum();
995
996        let mut probas = HashMap::new();
997        for (label, score) in &log_scores {
998            let prob = (score - max_score).exp() / sum_exp;
999            probas.insert(label.clone(), prob);
1000        }
1001
1002        Ok(probas)
1003    }
1004
1005    /// Get the classes this classifier knows about
1006    pub fn classes(&self) -> Vec<String> {
1007        self.class_doc_counts.keys().cloned().collect()
1008    }
1009}
1010
impl Default for NaiveBayesSentiment {
    /// Equivalent to [`NaiveBayesSentiment::new`].
    fn default() -> Self {
        Self::new()
    }
}
1016
1017// ─── Aspect-Based Sentiment ─────────────────────────────────────────────────
1018
/// An aspect with its associated sentiment
///
/// One instance is produced per *occurrence* of an aspect in the analyzed
/// text, so the same aspect name may appear several times in a result set.
#[derive(Debug, Clone)]
pub struct AspectSentiment {
    /// The aspect/entity name (as passed to the analyzer)
    pub aspect: String,
    /// The sentiment for this aspect, derived from `score`
    pub sentiment: Sentiment,
    /// The sentiment score (sum of lexicon scores in the context window,
    /// sign-flipped after negation words)
    pub score: f64,
    /// The relevant text snippet surrounding the aspect mention
    pub context: String,
}
1031
/// Aspect-based sentiment analyzer
///
/// Extracts sentiment for specific aspects mentioned in text.
/// Uses a window-based approach: for each aspect mention found,
/// computes sentiment from surrounding words.
pub struct AspectSentimentAnalyzer {
    /// The sentiment lexicon used to score context words
    lexicon: SentimentLexicon,
    /// Tokenizer (boxed so implementations can be swapped; must be thread-safe)
    tokenizer: Box<dyn Tokenizer + Send + Sync>,
    /// Window size around aspect for sentiment extraction
    /// (number of tokens on each side of the mention)
    context_window: usize,
    /// Negation words that flip the sign of the next sentiment-bearing word
    negation_words: Vec<String>,
}
1047
1048impl AspectSentimentAnalyzer {
1049    /// Create a new aspect-based sentiment analyzer
1050    pub fn new() -> Self {
1051        Self {
1052            lexicon: SentimentLexicon::with_basiclexicon(),
1053            tokenizer: Box::new(WordTokenizer::default()),
1054            context_window: 5,
1055            negation_words: vec![
1056                "not".to_string(),
1057                "no".to_string(),
1058                "never".to_string(),
1059                "n't".to_string(),
1060                "without".to_string(),
1061            ],
1062        }
1063    }
1064
1065    /// Set a custom lexicon
1066    pub fn with_lexicon(mut self, lexicon: SentimentLexicon) -> Self {
1067        self.lexicon = lexicon;
1068        self
1069    }
1070
1071    /// Set the context window size
1072    pub fn with_context_window(mut self, window: usize) -> Self {
1073        self.context_window = window;
1074        self
1075    }
1076
1077    /// Extract sentiment for specific aspects in text
1078    ///
1079    /// # Arguments
1080    /// * `text` - The text to analyze
1081    /// * `aspects` - List of aspect keywords to look for
1082    pub fn analyze(&self, text: &str, aspects: &[&str]) -> Result<Vec<AspectSentiment>> {
1083        let tokens = self.tokenizer.tokenize(text)?;
1084        let lower_tokens: Vec<String> = tokens.iter().map(|t| t.to_lowercase()).collect();
1085
1086        let mut results = Vec::new();
1087
1088        for &aspect in aspects {
1089            let aspect_lower = aspect.to_lowercase();
1090            let aspect_tokens: Vec<String> =
1091                aspect_lower.split_whitespace().map(String::from).collect();
1092
1093            // Find all positions where the aspect occurs
1094            for pos in 0..lower_tokens.len() {
1095                // Check if the aspect (possibly multi-word) starts at this position
1096                let aspect_matches = if aspect_tokens.len() == 1 {
1097                    lower_tokens[pos] == aspect_tokens[0]
1098                } else {
1099                    pos + aspect_tokens.len() <= lower_tokens.len()
1100                        && aspect_tokens
1101                            .iter()
1102                            .enumerate()
1103                            .all(|(j, at)| lower_tokens[pos + j] == *at)
1104                };
1105
1106                if !aspect_matches {
1107                    continue;
1108                }
1109
1110                // Extract sentiment from surrounding context
1111                // Respect discourse boundaries: "but", "however", "although", "yet"
1112                let discourse_markers = [
1113                    "but",
1114                    "however",
1115                    "although",
1116                    "yet",
1117                    "though",
1118                    "nevertheless",
1119                ];
1120                let mut start = pos.saturating_sub(self.context_window);
1121                let end = (pos + aspect_tokens.len() + self.context_window).min(lower_tokens.len());
1122
1123                // Adjust start to not cross discourse markers before the aspect
1124                // Find the last discourse marker before pos and set start after it
1125                let initial_start = start;
1126                if let Some(last_marker_idx) = (initial_start..pos)
1127                    .rev()
1128                    .find(|&i| discourse_markers.contains(&lower_tokens[i].as_str()))
1129                {
1130                    start = last_marker_idx + 1;
1131                }
1132                // Adjust end to not cross discourse markers after the aspect
1133                let mut effective_end = end;
1134                for i in (pos + aspect_tokens.len())..end {
1135                    if discourse_markers.contains(&lower_tokens[i].as_str()) {
1136                        effective_end = i;
1137                        break;
1138                    }
1139                }
1140
1141                let mut score = 0.0;
1142                let mut is_negated = false;
1143
1144                for i in start..effective_end {
1145                    // Skip the aspect tokens themselves
1146                    if i >= pos && i < pos + aspect_tokens.len() {
1147                        continue;
1148                    }
1149
1150                    let token = &lower_tokens[i];
1151
1152                    // Track negation
1153                    if self.negation_words.contains(token) {
1154                        is_negated = true;
1155                        continue;
1156                    }
1157
1158                    let word_score = self.lexicon.get_score(token);
1159                    if word_score != 0.0 {
1160                        if is_negated {
1161                            score -= word_score;
1162                            is_negated = false;
1163                        } else {
1164                            score += word_score;
1165                        }
1166                    }
1167                }
1168
1169                // Build context string
1170                let context_tokens = &tokens[start..end];
1171                let context = context_tokens.join(" ");
1172
1173                results.push(AspectSentiment {
1174                    aspect: aspect.to_string(),
1175                    sentiment: Sentiment::from_score(score),
1176                    score,
1177                    context,
1178                });
1179            }
1180        }
1181
1182        Ok(results)
1183    }
1184}
1185
impl Default for AspectSentimentAnalyzer {
    /// Equivalent to [`AspectSentimentAnalyzer::new`].
    fn default() -> Self {
        Self::new()
    }
}
1191
1192// ─── Document Sentiment Aggregation ──────────────────────────────────────────
1193
/// Result of aggregating sentiment across multiple texts
///
/// Produced by [`aggregate_sentiment`]; all ratios are fractions in `[0, 1]`
/// of the analyzed texts.
#[derive(Debug, Clone)]
pub struct AggregatedSentiment {
    /// Mean sentiment score across all texts
    pub mean_score: f64,
    /// Standard deviation of scores (population form, i.e. divided by n)
    pub std_score: f64,
    /// Overall sentiment based on mean score
    pub overall_sentiment: Sentiment,
    /// Proportion of positive texts
    pub positive_ratio: f64,
    /// Proportion of negative texts
    pub negative_ratio: f64,
    /// Proportion of neutral texts
    pub neutral_ratio: f64,
    /// Number of texts analyzed
    pub count: usize,
    /// Individual results (copies of the inputs)
    pub results: Vec<SentimentResult>,
}
1214
1215/// Aggregate sentiment analysis results across multiple texts/documents
1216pub fn aggregate_sentiment(results: &[SentimentResult]) -> AggregatedSentiment {
1217    if results.is_empty() {
1218        return AggregatedSentiment {
1219            mean_score: 0.0,
1220            std_score: 0.0,
1221            overall_sentiment: Sentiment::Neutral,
1222            positive_ratio: 0.0,
1223            negative_ratio: 0.0,
1224            neutral_ratio: 0.0,
1225            count: 0,
1226            results: Vec::new(),
1227        };
1228    }
1229
1230    let n = results.len() as f64;
1231
1232    // Calculate mean score
1233    let sum: f64 = results.iter().map(|r| r.score).sum();
1234    let mean_score = sum / n;
1235
1236    // Calculate standard deviation
1237    let variance: f64 = results
1238        .iter()
1239        .map(|r| (r.score - mean_score).powi(2))
1240        .sum::<f64>()
1241        / n;
1242    let std_score = variance.sqrt();
1243
1244    // Count sentiments
1245    let mut pos = 0;
1246    let mut neg = 0;
1247    let mut neu = 0;
1248    for r in results {
1249        match r.sentiment {
1250            Sentiment::Positive => pos += 1,
1251            Sentiment::Negative => neg += 1,
1252            Sentiment::Neutral => neu += 1,
1253        }
1254    }
1255
1256    AggregatedSentiment {
1257        mean_score,
1258        std_score,
1259        overall_sentiment: Sentiment::from_score(mean_score),
1260        positive_ratio: pos as f64 / n,
1261        negative_ratio: neg as f64 / n,
1262        neutral_ratio: neu as f64 / n,
1263        count: results.len(),
1264        results: results.to_vec(),
1265    }
1266}
1267
1268/// Analyze and aggregate sentiment for a batch of texts
1269pub fn analyze_and_aggregate(texts: &[&str]) -> Result<AggregatedSentiment> {
1270    let analyzer = LexiconSentimentAnalyzer::with_basiclexicon();
1271    let results = analyzer.analyze_batch(texts)?;
1272    Ok(aggregate_sentiment(&results))
1273}
1274
// Unit tests covering the lexicon, VADER-style, Naive Bayes, aspect-based,
// and aggregation APIs of this module.
#[cfg(test)]
mod tests {
    use super::*;

    // ─── Lexicon Tests ───────────────────────────────────────────────

    #[test]
    fn test_sentimentlexicon() {
        let mut lexicon = SentimentLexicon::new();
        lexicon.add_word("happy".to_string(), 2.0);
        lexicon.add_word("sad".to_string(), -2.0);

        assert_eq!(lexicon.get_score("happy"), 2.0);
        assert_eq!(lexicon.get_score("sad"), -2.0);
        // Unknown words score 0 (neutral).
        assert_eq!(lexicon.get_score("unknown"), 0.0);
    }

    #[test]
    fn test_basic_sentiment_analysis() {
        let analyzer = LexiconSentimentAnalyzer::with_basiclexicon();

        let positive_result = analyzer
            .analyze("This is a wonderful day!")
            .expect("Operation failed");
        assert_eq!(positive_result.sentiment, Sentiment::Positive);
        assert!(positive_result.score > 0.0);

        let negative_result = analyzer
            .analyze("This is terrible and awful")
            .expect("Operation failed");
        assert_eq!(negative_result.sentiment, Sentiment::Negative);
        assert!(negative_result.score < 0.0);

        // Text with no lexicon words should come out neutral.
        let neutral_result = analyzer
            .analyze("This is a book")
            .expect("Operation failed");
        assert_eq!(neutral_result.sentiment, Sentiment::Neutral);
    }

    #[test]
    fn test_negation_handling() {
        let analyzer = LexiconSentimentAnalyzer::with_basiclexicon();

        // "not good" should flip the positive word to negative.
        let negated_result = analyzer
            .analyze("This is not good")
            .expect("Operation failed");
        assert_eq!(negated_result.sentiment, Sentiment::Negative);
        assert!(negated_result.score < 0.0);
    }

    #[test]
    fn test_rule_based_sentiment() {
        let analyzer = RuleBasedSentimentAnalyzer::with_basiclexicon();

        // An intensifier ("very") should boost the score above the plain form.
        let intensified_result = analyzer
            .analyze("This is very good")
            .expect("Operation failed");
        let normal_result = analyzer.analyze("This is good").expect("Operation failed");

        assert!(intensified_result.score > normal_result.score);
    }

    #[test]
    fn test_sentiment_batch_analysis() {
        let analyzer = LexiconSentimentAnalyzer::with_basiclexicon();
        let texts = vec!["I love this", "I hate this", "This is okay"];

        let results = analyzer.analyze_batch(&texts).expect("Operation failed");
        assert_eq!(results.len(), 3);
        assert_eq!(results[0].sentiment, Sentiment::Positive);
        assert_eq!(results[1].sentiment, Sentiment::Negative);
    }

    // ─── VADER Tests ─────────────────────────────────────────────────

    #[test]
    fn test_vader_positive() {
        let vader = VaderSentimentAnalyzer::new();
        let result = vader
            .analyze("This movie is amazing and wonderful")
            .expect("analyze");
        assert_eq!(result.sentiment, Sentiment::Positive);
        assert!(result.compound > 0.0);
    }

    #[test]
    fn test_vader_negative() {
        let vader = VaderSentimentAnalyzer::new();
        let result = vader
            .analyze("This movie is terrible and awful")
            .expect("analyze");
        assert_eq!(result.sentiment, Sentiment::Negative);
        assert!(result.compound < 0.0);
    }

    #[test]
    fn test_vader_neutral() {
        let vader = VaderSentimentAnalyzer::new();
        let result = vader.analyze("The sky is blue").expect("analyze");
        assert_eq!(result.sentiment, Sentiment::Neutral);
    }

    #[test]
    fn test_vader_negation() {
        let vader = VaderSentimentAnalyzer::new();
        let result = vader.analyze("This is not good at all").expect("analyze");
        assert!(result.compound < 0.0, "Negated positive should be negative");
    }

    #[test]
    fn test_vader_intensifier() {
        let vader = VaderSentimentAnalyzer::new();
        let base = vader.analyze("This is good").expect("analyze");
        let intensified = vader.analyze("This is very good").expect("analyze");
        assert!(
            intensified.compound > base.compound,
            "Intensified should score higher: {} vs {}",
            intensified.compound,
            base.compound
        );
    }

    #[test]
    fn test_vader_but_clause() {
        let vader = VaderSentimentAnalyzer::new();
        let result = vader
            .analyze("The food was good but the service was terrible")
            .expect("analyze");
        // After "but" has more weight, so service's negative should dominate
        assert!(result.compound < 0.0);
    }

    #[test]
    fn test_vader_caps_emphasis() {
        let vader = VaderSentimentAnalyzer::new();
        let normal = vader.analyze("This is good").expect("analyze");
        let caps = vader.analyze("This is GOOD").expect("analyze");
        assert!(
            caps.compound >= normal.compound,
            "CAPS should score higher or equal"
        );
    }

    #[test]
    fn test_vader_batch() {
        let vader = VaderSentimentAnalyzer::new();
        let texts = vec!["I love this!", "I hate this!"];
        let results = vader.analyze_batch(&texts).expect("batch");
        assert_eq!(results.len(), 2);
        assert_eq!(results[0].sentiment, Sentiment::Positive);
        assert_eq!(results[1].sentiment, Sentiment::Negative);
    }

    #[test]
    fn test_vader_compound_range() {
        // Even heavily intensified text must stay within the [-1, 1] range.
        let vader = VaderSentimentAnalyzer::new();
        let result = vader
            .analyze("This is the most absolutely amazing incredible thing ever!!!")
            .expect("analyze");
        assert!(result.compound >= -1.0 && result.compound <= 1.0);
    }

    // ─── Naive Bayes Tests ───────────────────────────────────────────

    #[test]
    fn test_naive_bayes_train_predict() {
        let mut clf = NaiveBayesSentiment::new();

        let texts = vec![
            "I love this product it is amazing",
            "Great quality excellent experience",
            "Wonderful service very happy",
            "This is terrible and awful",
            "Horrible experience very bad",
            "Worst product I have ever bought",
        ];
        let labels = vec![
            "positive", "positive", "positive", "negative", "negative", "negative",
        ];

        clf.train(&texts, &labels).expect("training failed");

        // Positive prediction
        let pred = clf.predict("This is amazing and great").expect("predict");
        assert_eq!(pred, "positive");

        // Negative prediction
        let pred = clf
            .predict("This is terrible and horrible")
            .expect("predict");
        assert_eq!(pred, "negative");
    }

    #[test]
    fn test_naive_bayes_predict_proba() {
        let mut clf = NaiveBayesSentiment::new();

        let texts = vec![
            "good great excellent",
            "good wonderful amazing",
            "bad terrible awful",
            "bad horrible disgusting",
        ];
        let labels = vec!["positive", "positive", "negative", "negative"];

        clf.train(&texts, &labels).expect("training failed");

        let probas = clf.predict_proba("good excellent").expect("predict_proba");
        assert!(probas.contains_key("positive"));
        assert!(probas.contains_key("negative"));

        // Positive should have higher probability
        let pos_prob = probas.get("positive").copied().unwrap_or(0.0);
        let neg_prob = probas.get("negative").copied().unwrap_or(0.0);
        assert!(pos_prob > neg_prob);

        // Probabilities should sum to ~1
        let total: f64 = probas.values().sum();
        assert!((total - 1.0).abs() < 1e-6);
    }

    #[test]
    fn test_naive_bayes_not_trained() {
        // Predicting before train() must fail with a ModelNotFitted error.
        let clf = NaiveBayesSentiment::new();
        let result = clf.predict("test");
        assert!(result.is_err());
    }

    #[test]
    fn test_naive_bayes_classes() {
        let mut clf = NaiveBayesSentiment::new();
        let texts = vec!["a", "b", "c"];
        let labels = vec!["pos", "neg", "pos"];
        clf.train(&texts, &labels).expect("train");

        // Duplicate labels collapse into a single class.
        let classes = clf.classes();
        assert_eq!(classes.len(), 2);
    }

    // ─── Aspect-Based Sentiment Tests ────────────────────────────────

    #[test]
    fn test_aspect_sentiment_basic() {
        let analyzer = AspectSentimentAnalyzer::new();

        // "but" is a discourse marker, so each aspect is scored only from
        // its own clause.
        let results = analyzer
            .analyze(
                "The food was excellent but the service was terrible",
                &["food", "service"],
            )
            .expect("analyze");

        assert_eq!(results.len(), 2);

        let food_result = results.iter().find(|r| r.aspect == "food");
        assert!(food_result.is_some());
        let food = food_result.expect("food aspect");
        assert_eq!(food.sentiment, Sentiment::Positive);

        let service_result = results.iter().find(|r| r.aspect == "service");
        assert!(service_result.is_some());
        let service = service_result.expect("service aspect");
        assert_eq!(service.sentiment, Sentiment::Negative);
    }

    #[test]
    fn test_aspect_sentiment_negation() {
        let analyzer = AspectSentimentAnalyzer::new();

        let results = analyzer
            .analyze("The price was not good", &["price"])
            .expect("analyze");

        assert!(!results.is_empty());
        // "not good" should flip to negative
        assert_eq!(results[0].sentiment, Sentiment::Negative);
    }

    #[test]
    fn test_aspect_sentiment_no_match() {
        // Aspects absent from the text produce no results.
        let analyzer = AspectSentimentAnalyzer::new();
        let results = analyzer
            .analyze("The sky is blue", &["food", "service"])
            .expect("analyze");
        assert!(results.is_empty());
    }

    #[test]
    fn test_aspect_with_custom_window() {
        let analyzer = AspectSentimentAnalyzer::new().with_context_window(2);
        let results = analyzer
            .analyze("The food here is really great and beautiful", &["food"])
            .expect("analyze");
        assert!(!results.is_empty());
    }

    // ─── Aggregation Tests ───────────────────────────────────────────

    #[test]
    fn test_aggregate_sentiment() {
        let analyzer = LexiconSentimentAnalyzer::with_basiclexicon();
        let results = analyzer
            .analyze_batch(&["I love this", "I love this too", "This is terrible"])
            .expect("batch");

        let agg = aggregate_sentiment(&results);
        assert_eq!(agg.count, 3);
        assert!(agg.mean_score > 0.0); // 2 positive, 1 negative
        assert!(agg.positive_ratio > 0.5);
        assert!(agg.std_score > 0.0);
    }

    #[test]
    fn test_aggregate_empty() {
        // Empty input yields a neutral, zero-count aggregate instead of NaN.
        let agg = aggregate_sentiment(&[]);
        assert_eq!(agg.count, 0);
        assert_eq!(agg.overall_sentiment, Sentiment::Neutral);
    }

    #[test]
    fn test_analyze_and_aggregate() {
        let texts = vec!["I love this product", "It is amazing", "Very good quality"];
        let agg = analyze_and_aggregate(&texts).expect("aggregate");
        assert_eq!(agg.count, 3);
        assert!(agg.mean_score > 0.0);
        assert_eq!(agg.overall_sentiment, Sentiment::Positive);
    }

    #[test]
    fn test_sentiment_display() {
        assert_eq!(format!("{}", Sentiment::Positive), "Positive");
        assert_eq!(format!("{}", Sentiment::Negative), "Negative");
        assert_eq!(format!("{}", Sentiment::Neutral), "Neutral");
    }

    #[test]
    fn test_sentiment_from_score_thresholds() {
        // Scores near zero (e.g. 0.03) are treated as neutral.
        assert_eq!(Sentiment::from_score(0.1), Sentiment::Positive);
        assert_eq!(Sentiment::from_score(-0.1), Sentiment::Negative);
        assert_eq!(Sentiment::from_score(0.0), Sentiment::Neutral);
        assert_eq!(Sentiment::from_score(0.03), Sentiment::Neutral);
    }
}