1use crate::error::{Result, TextError};
29use crate::tokenize::{Tokenizer, WordTokenizer};
30use std::collections::HashMap;
31
/// Discrete polarity label assigned to a piece of text.
// Eq and Hash are added: equality on a fieldless enum is total, and the
// extra derives let Sentiment be used as a map key or in exhaustive sets.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Sentiment {
    /// Overall positive polarity.
    Positive,
    /// Overall negative polarity.
    Negative,
    /// No clear polarity either way.
    Neutral,
}
42
43impl std::fmt::Display for Sentiment {
44 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
45 match self {
46 Sentiment::Positive => write!(f, "Positive"),
47 Sentiment::Negative => write!(f, "Negative"),
48 Sentiment::Neutral => write!(f, "Neutral"),
49 }
50 }
51}
52
53impl Sentiment {
54 pub fn to_score(&self) -> f64 {
56 match self {
57 Sentiment::Positive => 1.0,
58 Sentiment::Neutral => 0.0,
59 Sentiment::Negative => -1.0,
60 }
61 }
62
63 pub fn from_score(score: f64) -> Self {
65 if score > 0.05 {
66 Sentiment::Positive
67 } else if score < -0.05 {
68 Sentiment::Negative
69 } else {
70 Sentiment::Neutral
71 }
72 }
73}
74
/// Outcome of a single sentiment analysis over one text.
#[derive(Debug, Clone)]
pub struct SentimentResult {
    /// Discrete polarity label derived from `score`.
    pub sentiment: Sentiment,
    /// Raw summed lexicon score (unbounded; sign gives polarity).
    pub score: f64,
    /// Fraction of tokens that carried sentiment, in [0.0, 1.0].
    pub confidence: f64,
    /// Per-polarity token tallies for the analyzed text.
    pub word_counts: SentimentWordCounts,
}
87
/// Token tallies broken down by polarity.
#[derive(Debug, Clone, Default)]
pub struct SentimentWordCounts {
    /// Tokens that scored above zero.
    pub positive_words: usize,
    /// Tokens that scored below zero.
    pub negative_words: usize,
    /// Tokens with a zero (no-sentiment) score.
    pub neutral_words: usize,
    /// Total token count of the text.
    pub total_words: usize,
}
100
/// Word-to-score valence lexicon used by the lexicon-based analyzers.
///
/// Keys are stored lowercase and lookups are case-insensitive. Words not
/// present in the lexicon fall back to `default_score`.
#[derive(Debug, Clone)]
pub struct SentimentLexicon {
    /// Lowercased word -> valence score.
    lexicon: HashMap<String, f64>,
    /// Score returned for words absent from the lexicon.
    default_score: f64,
}

impl SentimentLexicon {
    /// Creates an empty lexicon with a neutral (0.0) default score.
    pub fn new() -> Self {
        Self {
            lexicon: HashMap::new(),
            default_score: 0.0,
        }
    }

    /// Builds a small built-in English lexicon of common positive and
    /// negative words with hand-assigned valence scores.
    ///
    /// Scores roughly range from +/-1.0 (mild) to +/-3.0 (strong).
    pub fn with_basiclexicon() -> Self {
        let positive_words: &[(&str, f64)] = &[
            ("good", 1.0),
            ("great", 2.0),
            ("excellent", 3.0),
            ("amazing", 3.0),
            ("wonderful", 2.5),
            ("fantastic", 2.5),
            ("love", 2.0),
            ("like", 1.0),
            ("happy", 2.0),
            ("joy", 2.0),
            ("pleased", 1.5),
            ("satisfied", 1.0),
            ("positive", 1.0),
            ("perfect", 3.0),
            ("best", 2.5),
            ("awesome", 2.5),
            ("beautiful", 2.0),
            ("brilliant", 2.5),
            ("superb", 2.5),
            ("nice", 1.0),
            ("outstanding", 3.0),
            ("exceptional", 3.0),
            ("remarkable", 2.0),
            ("delightful", 2.5),
            ("impressive", 2.0),
            ("enjoy", 1.5),
            ("recommend", 1.5),
            ("better", 1.0),
            ("superior", 2.0),
            ("exciting", 2.0),
        ];

        // BUGFIX: the original list contained ("terrible", -2.5) twice;
        // the duplicate entry has been removed.
        let negative_words: &[(&str, f64)] = &[
            ("bad", -1.0),
            ("terrible", -2.5),
            ("awful", -2.5),
            ("horrible", -3.0),
            ("hate", -2.5),
            ("dislike", -1.5),
            ("sad", -2.0),
            ("unhappy", -2.0),
            ("disappointed", -2.0),
            ("negative", -1.0),
            ("worst", -3.0),
            ("poor", -1.5),
            ("disgusting", -3.0),
            ("ugly", -2.0),
            ("nasty", -2.5),
            ("stupid", -2.0),
            ("pathetic", -2.5),
            ("failure", -2.0),
            ("fail", -2.0),
            ("sucks", -2.0),
            ("boring", -1.5),
            ("mediocre", -1.0),
            ("inferior", -2.0),
            ("lousy", -2.0),
            ("dreadful", -2.5),
            ("annoying", -1.5),
            ("frustrating", -2.0),
            ("disappointing", -2.0),
            ("useless", -2.0),
        ];

        // Preallocate to the exact final size and fill from both lists.
        let mut lexicon =
            HashMap::with_capacity(positive_words.len() + negative_words.len());
        for &(word, score) in positive_words.iter().chain(negative_words.iter()) {
            lexicon.insert(word.to_string(), score);
        }

        Self {
            lexicon,
            default_score: 0.0,
        }
    }

    /// Adds (or replaces) a word; the key is lowercased on insertion.
    pub fn add_word(&mut self, word: String, score: f64) {
        self.lexicon.insert(word.to_lowercase(), score);
    }

    /// Case-insensitive score lookup; returns `default_score` for
    /// unknown words.
    pub fn get_score(&self, word: &str) -> f64 {
        self.lexicon
            .get(&word.to_lowercase())
            .copied()
            .unwrap_or(self.default_score)
    }

    /// Case-insensitive membership test.
    pub fn contains(&self, word: &str) -> bool {
        self.lexicon.contains_key(&word.to_lowercase())
    }

    /// Number of words in the lexicon.
    pub fn len(&self) -> usize {
        self.lexicon.len()
    }

    /// True when the lexicon holds no words.
    pub fn is_empty(&self) -> bool {
        self.lexicon.is_empty()
    }

    /// Read-only view of the underlying word -> score map.
    pub fn entries(&self) -> &HashMap<String, f64> {
        &self.lexicon
    }
}

impl Default for SentimentLexicon {
    /// Defaults to an empty lexicon (same as [`SentimentLexicon::new`]).
    fn default() -> Self {
        Self::new()
    }
}
246
/// Lexicon-driven analyzer with simple look-back negation handling.
pub struct LexiconSentimentAnalyzer {
    /// Word-valence lookup table.
    lexicon: SentimentLexicon,
    /// Splits input text into tokens before scoring.
    tokenizer: Box<dyn Tokenizer + Send + Sync>,
    /// Cues (e.g. "not", "never") that flip a following word's score.
    negation_words: Vec<String>,
    /// How many tokens back to search for a negation cue.
    negation_window: usize,
}
260
261impl LexiconSentimentAnalyzer {
262 pub fn new(lexicon: SentimentLexicon) -> Self {
264 let negation_words = vec![
265 "not".to_string(),
266 "no".to_string(),
267 "never".to_string(),
268 "neither".to_string(),
269 "nobody".to_string(),
270 "nothing".to_string(),
271 "nowhere".to_string(),
272 "n't".to_string(),
273 "cannot".to_string(),
274 "without".to_string(),
275 ];
276
277 Self {
278 lexicon,
279 tokenizer: Box::new(WordTokenizer::default()),
280 negation_words,
281 negation_window: 3,
282 }
283 }
284
285 pub fn with_basiclexicon() -> Self {
287 Self::new(SentimentLexicon::with_basiclexicon())
288 }
289
290 pub fn with_tokenizer(mut self, tokenizer: Box<dyn Tokenizer + Send + Sync>) -> Self {
292 self.tokenizer = tokenizer;
293 self
294 }
295
296 pub fn analyze(&self, text: &str) -> Result<SentimentResult> {
298 let tokens = self.tokenizer.tokenize(text)?;
299
300 if tokens.is_empty() {
301 return Ok(SentimentResult {
302 sentiment: Sentiment::Neutral,
303 score: 0.0,
304 confidence: 0.0,
305 word_counts: SentimentWordCounts {
306 positive_words: 0,
307 negative_words: 0,
308 neutral_words: 0,
309 total_words: 0,
310 },
311 });
312 }
313
314 let mut total_score = 0.0;
315 let mut positive_count = 0;
316 let mut negative_count = 0;
317 let mut neutral_count = 0;
318
319 for (i, token) in tokens.iter().enumerate() {
321 let token_lower = token.to_lowercase();
322 let mut score = self.lexicon.get_score(&token_lower);
323
324 if score != 0.0 {
326 for j in 1..=self.negation_window.min(i) {
327 let prev_token = &tokens[i - j].to_lowercase();
328 if self.negation_words.contains(prev_token) {
329 score *= -1.0;
330 break;
331 }
332 }
333 }
334
335 total_score += score;
336
337 if score > 0.0 {
338 positive_count += 1;
339 } else if score < 0.0 {
340 negative_count += 1;
341 } else {
342 neutral_count += 1;
343 }
344 }
345
346 let total_words = tokens.len();
347 let sentiment = Sentiment::from_score(total_score);
348
349 let sentiment_words = positive_count + negative_count;
351 let confidence = if total_words > 0 {
352 (sentiment_words as f64 / total_words as f64).min(1.0)
353 } else {
354 0.0
355 };
356
357 Ok(SentimentResult {
358 sentiment,
359 score: total_score,
360 confidence,
361 word_counts: SentimentWordCounts {
362 positive_words: positive_count,
363 negative_words: negative_count,
364 neutral_words: neutral_count,
365 total_words,
366 },
367 })
368 }
369
370 pub fn analyze_batch(&self, texts: &[&str]) -> Result<Vec<SentimentResult>> {
372 texts.iter().map(|&text| self.analyze(text)).collect()
373 }
374}
375
/// Multiplicative intensifier/diminisher rules applied to base word scores.
#[derive(Debug, Clone)]
pub struct SentimentRules {
    /// Words that amplify a following sentiment word (multiplier > 1.0).
    intensifiers: HashMap<String, f64>,
    /// Words that soften a following sentiment word (multiplier < 1.0).
    diminishers: HashMap<String, f64>,
}

impl Default for SentimentRules {
    /// Builds the stock rule set of common English modifiers.
    fn default() -> Self {
        let intensifiers: HashMap<String, f64> = [
            ("very", 1.5),
            ("extremely", 2.0),
            ("incredibly", 2.0),
            ("really", 1.3),
            ("so", 1.3),
            ("absolutely", 2.0),
            ("truly", 1.5),
            ("totally", 1.5),
            ("utterly", 1.8),
            ("remarkably", 1.5),
        ]
        .iter()
        .map(|&(word, mult)| (word.to_string(), mult))
        .collect();

        // NOTE(review): the multi-word entries ("a little", "kind of",
        // "sort of") can only match if the tokenizer emits them as single
        // tokens — confirm against the tokenizer in use.
        let diminishers: HashMap<String, f64> = [
            ("somewhat", 0.5),
            ("slightly", 0.5),
            ("barely", 0.3),
            ("hardly", 0.3),
            ("a little", 0.5),
            ("kind of", 0.5),
            ("sort of", 0.5),
            ("marginally", 0.4),
        ]
        .iter()
        .map(|&(word, mult)| (word.to_string(), mult))
        .collect();

        Self {
            intensifiers,
            diminishers,
        }
    }
}

impl SentimentRules {
    /// Returns `basescores` with each non-zero score scaled by the first
    /// intensifier or diminisher found within the two preceding tokens.
    pub fn apply(&self, tokens: &[String], basescores: &[f64]) -> Vec<f64> {
        let mut scores = basescores.to_vec();

        for (i, score) in scores.iter_mut().enumerate() {
            if *score == 0.0 {
                continue;
            }

            // Look back at most two tokens for a modifier word.
            for back in 1..=2usize.min(i) {
                let prev = tokens[i - back].to_lowercase();
                let modifier = self
                    .intensifiers
                    .get(&prev)
                    .or_else(|| self.diminishers.get(&prev));
                if let Some(&mult) = modifier {
                    *score *= mult;
                    break;
                }
            }
        }

        scores
    }
}
445
/// Lexicon analyzer augmented with intensifier/diminisher rules.
pub struct RuleBasedSentimentAnalyzer {
    /// Underlying lexicon analyzer (supplies the tokenizer and lexicon).
    base_analyzer: LexiconSentimentAnalyzer,
    /// Score-scaling rules applied after base lexicon scoring.
    rules: SentimentRules,
}
453
454impl RuleBasedSentimentAnalyzer {
455 pub fn new(lexicon: SentimentLexicon) -> Self {
457 Self {
458 base_analyzer: LexiconSentimentAnalyzer::new(lexicon),
459 rules: SentimentRules::default(),
460 }
461 }
462
463 pub fn with_basiclexicon() -> Self {
465 Self::new(SentimentLexicon::with_basiclexicon())
466 }
467
468 pub fn analyze(&self, text: &str) -> Result<SentimentResult> {
470 let tokens = self.base_analyzer.tokenizer.tokenize(text)?;
471
472 if tokens.is_empty() {
473 return self.base_analyzer.analyze(text);
474 }
475
476 let basescores: Vec<f64> = tokens
478 .iter()
479 .map(|token| self.base_analyzer.lexicon.get_score(token))
480 .collect();
481
482 let modified_scores = self.rules.apply(&tokens, &basescores);
484
485 let total_score: f64 = modified_scores.iter().sum();
487 let sentiment = Sentiment::from_score(total_score);
488
489 let mut positive_count = 0;
491 let mut negative_count = 0;
492 let mut neutral_count = 0;
493
494 for &score in &modified_scores {
495 if score > 0.0 {
496 positive_count += 1;
497 } else if score < 0.0 {
498 negative_count += 1;
499 } else {
500 neutral_count += 1;
501 }
502 }
503
504 let total_words = tokens.len();
505 let sentiment_words = positive_count + negative_count;
506 let confidence = if total_words > 0 {
507 (sentiment_words as f64 / total_words as f64).min(1.0)
508 } else {
509 0.0
510 };
511
512 Ok(SentimentResult {
513 sentiment,
514 score: total_score,
515 confidence,
516 word_counts: SentimentWordCounts {
517 positive_words: positive_count,
518 negative_words: negative_count,
519 neutral_words: neutral_count,
520 total_words,
521 },
522 })
523 }
524}
525
/// VADER-style analysis output with normalized component scores.
#[derive(Debug, Clone)]
pub struct VaderResult {
    /// Share of sentiment mass that is positive, in [0.0, 1.0].
    pub positive: f64,
    /// Share of sentiment mass that is negative, in [0.0, 1.0].
    pub negative: f64,
    /// Share of tokens with no sentiment, in [0.0, 1.0].
    pub neutral: f64,
    /// Overall normalized score in (-1.0, 1.0).
    pub compound: f64,
    /// Label derived from `compound` with a +/-0.05 dead-band.
    pub sentiment: Sentiment,
}
542
/// Heuristic analyzer inspired by VADER: lexicon scores adjusted for
/// ALL-CAPS emphasis, boosters/dampeners, negation, contrastive "but"
/// clauses, and trailing '!'/'?' punctuation.
pub struct VaderSentimentAnalyzer {
    /// Word-valence lookup table.
    lexicon: SentimentLexicon,
    /// Splits input text into tokens before scoring.
    tokenizer: Box<dyn Tokenizer + Send + Sync>,
    /// Cues that flip-and-damp a following word's score.
    negation_words: Vec<String>,
    /// Additive boosts (e.g. "very") applied to nearby sentiment words.
    intensifiers: HashMap<String, f64>,
    /// Negative adjustments (e.g. "somewhat") that soften nearby words.
    diminishers: HashMap<String, f64>,
    /// Weight shift across a "but"/"however" clause boundary.
    but_weight: f64,
    /// Additive emphasis for ALL-CAPS sentiment words.
    caps_multiplier: f64,
    /// Additive boost per trailing '!' (count capped at 4).
    exclamation_boost: f64,
    /// Fractional reduction when the text ends with '?'.
    question_reduction: f64,
}
572
573impl VaderSentimentAnalyzer {
574 pub fn new() -> Self {
576 let mut intensifiers = HashMap::new();
577 intensifiers.insert("very".to_string(), 0.293);
578 intensifiers.insert("extremely".to_string(), 0.293);
579 intensifiers.insert("absolutely".to_string(), 0.293);
580 intensifiers.insert("incredibly".to_string(), 0.293);
581 intensifiers.insert("really".to_string(), 0.18);
582 intensifiers.insert("so".to_string(), 0.18);
583 intensifiers.insert("truly".to_string(), 0.18);
584 intensifiers.insert("totally".to_string(), 0.18);
585 intensifiers.insert("quite".to_string(), 0.1);
586
587 let mut diminishers = HashMap::new();
588 diminishers.insert("somewhat".to_string(), -0.1);
589 diminishers.insert("barely".to_string(), -0.2);
590 diminishers.insert("hardly".to_string(), -0.2);
591 diminishers.insert("slightly".to_string(), -0.1);
592 diminishers.insert("kind of".to_string(), -0.1);
593 diminishers.insert("sort of".to_string(), -0.1);
594
595 let negation_words = vec![
596 "not".to_string(),
597 "no".to_string(),
598 "never".to_string(),
599 "neither".to_string(),
600 "nobody".to_string(),
601 "nothing".to_string(),
602 "nowhere".to_string(),
603 "cannot".to_string(),
604 "without".to_string(),
605 "don't".to_string(),
606 "doesn't".to_string(),
607 "didn't".to_string(),
608 "isn't".to_string(),
609 "wasn't".to_string(),
610 "won't".to_string(),
611 "wouldn't".to_string(),
612 "shouldn't".to_string(),
613 "couldn't".to_string(),
614 "aren't".to_string(),
615 "weren't".to_string(),
616 ];
617
618 Self {
619 lexicon: SentimentLexicon::with_basiclexicon(),
620 tokenizer: Box::new(WordTokenizer::default()),
621 negation_words,
622 intensifiers,
623 diminishers,
624 but_weight: 0.5,
625 caps_multiplier: 0.733,
626 exclamation_boost: 0.292,
627 question_reduction: 0.18,
628 }
629 }
630
631 pub fn with_lexicon(mut self, lexicon: SentimentLexicon) -> Self {
633 self.lexicon = lexicon;
634 self
635 }
636
637 pub fn analyze(&self, text: &str) -> Result<VaderResult> {
639 let tokens = self.tokenizer.tokenize(text)?;
640
641 if tokens.is_empty() {
642 return Ok(VaderResult {
643 positive: 0.0,
644 negative: 0.0,
645 neutral: 1.0,
646 compound: 0.0,
647 sentiment: Sentiment::Neutral,
648 });
649 }
650
651 let mut sentiments: Vec<f64> = Vec::with_capacity(tokens.len());
653
654 for (i, token) in tokens.iter().enumerate() {
655 let lower = token.to_lowercase();
656 let mut score = self.lexicon.get_score(&lower);
657
658 if score == 0.0 {
659 sentiments.push(0.0);
660 continue;
661 }
662
663 if token.len() > 1 && token.chars().all(|c| c.is_uppercase()) {
665 if score > 0.0 {
666 score += self.caps_multiplier;
667 } else {
668 score -= self.caps_multiplier;
669 }
670 }
671
672 for j in 1..=3.min(i) {
674 let prev = tokens[i - j].to_lowercase();
675 if let Some(&boost) = self.intensifiers.get(&prev) {
676 if score > 0.0 {
677 score += boost;
678 } else {
679 score -= boost;
680 }
681 break;
682 } else if let Some(&reduce) = self.diminishers.get(&prev) {
683 if score > 0.0 {
684 score += reduce; } else {
686 score -= reduce;
687 }
688 break;
689 }
690 }
691
692 let mut negated = false;
694 for j in 1..=3.min(i) {
695 let prev = tokens[i - j].to_lowercase();
696 if self.negation_words.contains(&prev) {
697 negated = true;
698 break;
699 }
700 }
701
702 if negated {
703 score *= -0.74; }
705
706 sentiments.push(score);
707 }
708
709 let mut but_idx = None;
711 for (i, token) in tokens.iter().enumerate() {
712 if token.to_lowercase() == "but" || token.to_lowercase() == "however" {
713 but_idx = Some(i);
714 }
715 }
716
717 if let Some(idx) = but_idx {
718 for (i, score) in sentiments.iter_mut().enumerate() {
720 if i < idx {
721 *score *= 1.0 - self.but_weight;
722 } else if i > idx {
723 *score *= 1.0 + self.but_weight;
724 }
725 }
726 }
727
728 let mut sum_scores: f64 = sentiments.iter().sum();
730
731 let excl_count = text.chars().filter(|&c| c == '!').count().min(4);
733 if excl_count > 0 {
734 sum_scores += excl_count as f64 * self.exclamation_boost * sum_scores.signum();
735 }
736
737 if text.trim_end().ends_with('?') {
739 sum_scores *= 1.0 - self.question_reduction;
740 }
741
742 let compound = self.normalize(sum_scores);
744
745 let mut pos_sum = 0.0;
747 let mut neg_sum = 0.0;
748 let mut neu_count = 0.0;
749
750 for &s in &sentiments {
751 if s > 0.0 {
752 pos_sum += s;
753 } else if s < 0.0 {
754 neg_sum += s;
755 } else {
756 neu_count += 1.0;
757 }
758 }
759
760 let total = pos_sum + neg_sum.abs() + neu_count;
761 let (positive, negative, neutral) = if total > 0.0 {
762 (
763 (pos_sum / total).abs(),
764 (neg_sum / total).abs(),
765 neu_count / total,
766 )
767 } else {
768 (0.0, 0.0, 1.0)
769 };
770
771 let sentiment = if compound >= 0.05 {
772 Sentiment::Positive
773 } else if compound <= -0.05 {
774 Sentiment::Negative
775 } else {
776 Sentiment::Neutral
777 };
778
779 Ok(VaderResult {
780 positive,
781 negative,
782 neutral,
783 compound,
784 sentiment,
785 })
786 }
787
788 fn normalize(&self, score: f64) -> f64 {
790 let alpha = 15.0; score / (score * score + alpha).sqrt()
792 }
793
794 pub fn analyze_batch(&self, texts: &[&str]) -> Result<Vec<VaderResult>> {
796 texts.iter().map(|&text| self.analyze(text)).collect()
797 }
798}
799
800impl Default for VaderSentimentAnalyzer {
801 fn default() -> Self {
802 Self::new()
803 }
804}
805
/// Multinomial Naive Bayes text classifier with Laplace smoothing.
pub struct NaiveBayesSentiment {
    /// Per-class word frequency tables: class -> (word -> count).
    word_counts: HashMap<String, HashMap<String, f64>>,
    /// Total word occurrences observed per class.
    class_word_totals: HashMap<String, f64>,
    /// Number of training documents seen per class.
    class_doc_counts: HashMap<String, usize>,
    /// Total number of training documents seen.
    total_docs: usize,
    /// Word -> first-seen index; its len() is the vocabulary size.
    vocabulary: HashMap<String, usize>,
    /// Laplace (add-alpha) smoothing parameter.
    alpha: f64,
    /// Splits input text into tokens.
    tokenizer: Box<dyn Tokenizer + Send + Sync>,
}
829
830impl std::fmt::Debug for NaiveBayesSentiment {
831 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
832 f.debug_struct("NaiveBayesSentiment")
833 .field("total_docs", &self.total_docs)
834 .field("vocabulary_size", &self.vocabulary.len())
835 .field("alpha", &self.alpha)
836 .field("classes", &self.class_doc_counts.keys().collect::<Vec<_>>())
837 .finish()
838 }
839}
840
841impl NaiveBayesSentiment {
842 pub fn new() -> Self {
844 Self {
845 word_counts: HashMap::new(),
846 class_word_totals: HashMap::new(),
847 class_doc_counts: HashMap::new(),
848 total_docs: 0,
849 vocabulary: HashMap::new(),
850 alpha: 1.0,
851 tokenizer: Box::new(WordTokenizer::default()),
852 }
853 }
854
855 pub fn with_alpha(mut self, alpha: f64) -> Self {
857 self.alpha = alpha;
858 self
859 }
860
861 pub fn train(&mut self, texts: &[&str], labels: &[&str]) -> Result<()> {
867 if texts.len() != labels.len() {
868 return Err(TextError::InvalidInput(
869 "texts and labels must have the same length".into(),
870 ));
871 }
872
873 if texts.is_empty() {
874 return Err(TextError::InvalidInput("No training data provided".into()));
875 }
876
877 for (text, &label) in texts.iter().zip(labels.iter()) {
878 let tokens = self.tokenizer.tokenize(text)?;
879
880 *self.class_doc_counts.entry(label.to_string()).or_insert(0) += 1;
882 self.total_docs += 1;
883
884 let class_words = self.word_counts.entry(label.to_string()).or_default();
886
887 for token in &tokens {
888 let lower = token.to_lowercase();
889 *class_words.entry(lower.clone()).or_insert(0.0) += 1.0;
890 *self
891 .class_word_totals
892 .entry(label.to_string())
893 .or_insert(0.0) += 1.0;
894
895 let vocab_len = self.vocabulary.len();
897 self.vocabulary.entry(lower).or_insert(vocab_len);
898 }
899 }
900
901 Ok(())
902 }
903
904 pub fn predict(&self, text: &str) -> Result<String> {
906 let (label, _) = self.predict_with_score(text)?;
907 Ok(label)
908 }
909
910 pub fn predict_with_score(&self, text: &str) -> Result<(String, f64)> {
912 if self.total_docs == 0 {
913 return Err(TextError::ModelNotFitted(
914 "Classifier not trained. Call train() first".into(),
915 ));
916 }
917
918 let tokens = self.tokenizer.tokenize(text)?;
919 let vocab_size = self.vocabulary.len() as f64;
920
921 let mut best_label = String::new();
922 let mut best_score = f64::NEG_INFINITY;
923
924 for (label, &doc_count) in &self.class_doc_counts {
925 let log_prior = (doc_count as f64 / self.total_docs as f64).ln();
927
928 let class_words = self.word_counts.get(label);
930 let class_total = self.class_word_totals.get(label).copied().unwrap_or(0.0);
931
932 let mut log_likelihood = 0.0;
933
934 for token in &tokens {
935 let lower = token.to_lowercase();
936 let word_count = class_words
937 .and_then(|wc| wc.get(&lower))
938 .copied()
939 .unwrap_or(0.0);
940
941 let prob = (word_count + self.alpha) / (class_total + self.alpha * vocab_size);
943 log_likelihood += prob.ln();
944 }
945
946 let score = log_prior + log_likelihood;
947 if score > best_score {
948 best_score = score;
949 best_label = label.clone();
950 }
951 }
952
953 Ok((best_label, best_score))
954 }
955
956 pub fn predict_proba(&self, text: &str) -> Result<HashMap<String, f64>> {
958 if self.total_docs == 0 {
959 return Err(TextError::ModelNotFitted("Classifier not trained".into()));
960 }
961
962 let tokens = self.tokenizer.tokenize(text)?;
963 let vocab_size = self.vocabulary.len() as f64;
964
965 let mut log_scores: Vec<(String, f64)> = Vec::new();
966
967 for (label, &doc_count) in &self.class_doc_counts {
968 let log_prior = (doc_count as f64 / self.total_docs as f64).ln();
969
970 let class_words = self.word_counts.get(label);
971 let class_total = self.class_word_totals.get(label).copied().unwrap_or(0.0);
972
973 let mut log_likelihood = 0.0;
974 for token in &tokens {
975 let lower = token.to_lowercase();
976 let word_count = class_words
977 .and_then(|wc| wc.get(&lower))
978 .copied()
979 .unwrap_or(0.0);
980
981 let prob = (word_count + self.alpha) / (class_total + self.alpha * vocab_size);
982 log_likelihood += prob.ln();
983 }
984
985 log_scores.push((label.clone(), log_prior + log_likelihood));
986 }
987
988 let max_score = log_scores
990 .iter()
991 .map(|(_, s)| *s)
992 .fold(f64::NEG_INFINITY, f64::max);
993
994 let sum_exp: f64 = log_scores.iter().map(|(_, s)| (s - max_score).exp()).sum();
995
996 let mut probas = HashMap::new();
997 for (label, score) in &log_scores {
998 let prob = (score - max_score).exp() / sum_exp;
999 probas.insert(label.clone(), prob);
1000 }
1001
1002 Ok(probas)
1003 }
1004
1005 pub fn classes(&self) -> Vec<String> {
1007 self.class_doc_counts.keys().cloned().collect()
1008 }
1009}
1010
1011impl Default for NaiveBayesSentiment {
1012 fn default() -> Self {
1013 Self::new()
1014 }
1015}
1016
/// Sentiment attributed to one occurrence of an aspect term.
#[derive(Debug, Clone)]
pub struct AspectSentiment {
    /// The aspect term as supplied by the caller.
    pub aspect: String,
    /// Polarity label derived from `score`.
    pub sentiment: Sentiment,
    /// Summed lexicon score of the tokens around this aspect mention.
    pub score: f64,
    /// The surrounding tokens (window around the aspect), joined by spaces.
    pub context: String,
}
1031
/// Aspect-based analyzer: scores a token window around each aspect
/// mention, clipped at discourse markers such as "but".
pub struct AspectSentimentAnalyzer {
    /// Word-valence lookup table.
    lexicon: SentimentLexicon,
    /// Splits input text into tokens.
    tokenizer: Box<dyn Tokenizer + Send + Sync>,
    /// Tokens of context considered on each side of an aspect mention.
    context_window: usize,
    /// Cues that invert the next sentiment word inside the window.
    negation_words: Vec<String>,
}
1047
impl AspectSentimentAnalyzer {
    /// Creates an analyzer with the built-in lexicon, a word tokenizer,
    /// a 5-token context window, and a small negation-cue list.
    pub fn new() -> Self {
        Self {
            lexicon: SentimentLexicon::with_basiclexicon(),
            tokenizer: Box::new(WordTokenizer::default()),
            context_window: 5,
            negation_words: vec![
                "not".to_string(),
                "no".to_string(),
                "never".to_string(),
                "n't".to_string(),
                "without".to_string(),
            ],
        }
    }

    /// Replaces the lexicon (builder style).
    pub fn with_lexicon(mut self, lexicon: SentimentLexicon) -> Self {
        self.lexicon = lexicon;
        self
    }

    /// Sets the per-side context window size (builder style).
    pub fn with_context_window(mut self, window: usize) -> Self {
        self.context_window = window;
        self
    }

    /// Finds every occurrence of each aspect term in `text` and scores
    /// the surrounding context window, clipping the window at discourse
    /// markers ("but", "however", ...) so sentiment from a contrasting
    /// clause is not attributed to this aspect. Returns one
    /// `AspectSentiment` per occurrence, in scan order.
    pub fn analyze(&self, text: &str, aspects: &[&str]) -> Result<Vec<AspectSentiment>> {
        let tokens = self.tokenizer.tokenize(text)?;
        // Lowercased copies used for all matching; `tokens` keeps the
        // original casing for the returned context string.
        let lower_tokens: Vec<String> = tokens.iter().map(|t| t.to_lowercase()).collect();

        let mut results = Vec::new();

        for &aspect in aspects {
            let aspect_lower = aspect.to_lowercase();
            // Multi-word aspects are matched token-by-token.
            let aspect_tokens: Vec<String> =
                aspect_lower.split_whitespace().map(String::from).collect();

            for pos in 0..lower_tokens.len() {
                // Does the aspect (single- or multi-token) start at `pos`?
                let aspect_matches = if aspect_tokens.len() == 1 {
                    lower_tokens[pos] == aspect_tokens[0]
                } else {
                    pos + aspect_tokens.len() <= lower_tokens.len()
                        && aspect_tokens
                            .iter()
                            .enumerate()
                            .all(|(j, at)| lower_tokens[pos + j] == *at)
                };

                if !aspect_matches {
                    continue;
                }

                // Words that signal a clause boundary; sentiment beyond
                // them belongs to a different clause.
                let discourse_markers = [
                    "but",
                    "however",
                    "although",
                    "yet",
                    "though",
                    "nevertheless",
                ];
                // Raw window: up to `context_window` tokens on each side.
                let mut start = pos.saturating_sub(self.context_window);
                let end = (pos + aspect_tokens.len() + self.context_window).min(lower_tokens.len());

                // Clip the left edge just after the nearest preceding
                // discourse marker inside the window, if any.
                let initial_start = start;
                if let Some(last_marker_idx) = (initial_start..pos)
                    .rev()
                    .find(|&i| discourse_markers.contains(&lower_tokens[i].as_str()))
                {
                    start = last_marker_idx + 1;
                }
                // Clip the right edge at the first following discourse
                // marker inside the window, if any.
                let mut effective_end = end;
                for i in (pos + aspect_tokens.len())..end {
                    if discourse_markers.contains(&lower_tokens[i].as_str()) {
                        effective_end = i;
                        break;
                    }
                }

                let mut score = 0.0;
                let mut is_negated = false;

                // Score the clipped window; a negation cue inverts only
                // the next sentiment-bearing word.
                for i in start..effective_end {
                    // Skip the aspect's own tokens.
                    if i >= pos && i < pos + aspect_tokens.len() {
                        continue;
                    }

                    let token = &lower_tokens[i];

                    if self.negation_words.contains(token) {
                        is_negated = true;
                        continue;
                    }

                    let word_score = self.lexicon.get_score(token);
                    if word_score != 0.0 {
                        if is_negated {
                            score -= word_score;
                            is_negated = false;
                        } else {
                            score += word_score;
                        }
                    }
                }

                // NOTE(review): the context string spans the full raw
                // window (start..end), not the marker-clipped range used
                // for scoring — presumably intentional, to show the reader
                // more surrounding text; confirm.
                let context_tokens = &tokens[start..end];
                let context = context_tokens.join(" ");

                results.push(AspectSentiment {
                    aspect: aspect.to_string(),
                    sentiment: Sentiment::from_score(score),
                    score,
                    context,
                });
            }
        }

        Ok(results)
    }
}
1185
1186impl Default for AspectSentimentAnalyzer {
1187 fn default() -> Self {
1188 Self::new()
1189 }
1190}
1191
/// Summary statistics over a batch of `SentimentResult`s.
#[derive(Debug, Clone)]
pub struct AggregatedSentiment {
    /// Mean of the individual scores.
    pub mean_score: f64,
    /// Population standard deviation of the scores (divide by n).
    pub std_score: f64,
    /// Label derived from `mean_score`.
    pub overall_sentiment: Sentiment,
    /// Fraction of results labeled Positive.
    pub positive_ratio: f64,
    /// Fraction of results labeled Negative.
    pub negative_ratio: f64,
    /// Fraction of results labeled Neutral.
    pub neutral_ratio: f64,
    /// Number of results aggregated.
    pub count: usize,
    /// The individual results, in input order.
    pub results: Vec<SentimentResult>,
}
1214
1215pub fn aggregate_sentiment(results: &[SentimentResult]) -> AggregatedSentiment {
1217 if results.is_empty() {
1218 return AggregatedSentiment {
1219 mean_score: 0.0,
1220 std_score: 0.0,
1221 overall_sentiment: Sentiment::Neutral,
1222 positive_ratio: 0.0,
1223 negative_ratio: 0.0,
1224 neutral_ratio: 0.0,
1225 count: 0,
1226 results: Vec::new(),
1227 };
1228 }
1229
1230 let n = results.len() as f64;
1231
1232 let sum: f64 = results.iter().map(|r| r.score).sum();
1234 let mean_score = sum / n;
1235
1236 let variance: f64 = results
1238 .iter()
1239 .map(|r| (r.score - mean_score).powi(2))
1240 .sum::<f64>()
1241 / n;
1242 let std_score = variance.sqrt();
1243
1244 let mut pos = 0;
1246 let mut neg = 0;
1247 let mut neu = 0;
1248 for r in results {
1249 match r.sentiment {
1250 Sentiment::Positive => pos += 1,
1251 Sentiment::Negative => neg += 1,
1252 Sentiment::Neutral => neu += 1,
1253 }
1254 }
1255
1256 AggregatedSentiment {
1257 mean_score,
1258 std_score,
1259 overall_sentiment: Sentiment::from_score(mean_score),
1260 positive_ratio: pos as f64 / n,
1261 negative_ratio: neg as f64 / n,
1262 neutral_ratio: neu as f64 / n,
1263 count: results.len(),
1264 results: results.to_vec(),
1265 }
1266}
1267
1268pub fn analyze_and_aggregate(texts: &[&str]) -> Result<AggregatedSentiment> {
1270 let analyzer = LexiconSentimentAnalyzer::with_basiclexicon();
1271 let results = analyzer.analyze_batch(texts)?;
1272 Ok(aggregate_sentiment(&results))
1273}
1274
1275#[cfg(test)]
1276mod tests {
1277 use super::*;
1278
1279 #[test]
1282 fn test_sentimentlexicon() {
1283 let mut lexicon = SentimentLexicon::new();
1284 lexicon.add_word("happy".to_string(), 2.0);
1285 lexicon.add_word("sad".to_string(), -2.0);
1286
1287 assert_eq!(lexicon.get_score("happy"), 2.0);
1288 assert_eq!(lexicon.get_score("sad"), -2.0);
1289 assert_eq!(lexicon.get_score("unknown"), 0.0);
1290 }
1291
1292 #[test]
1293 fn test_basic_sentiment_analysis() {
1294 let analyzer = LexiconSentimentAnalyzer::with_basiclexicon();
1295
1296 let positive_result = analyzer
1297 .analyze("This is a wonderful day!")
1298 .expect("Operation failed");
1299 assert_eq!(positive_result.sentiment, Sentiment::Positive);
1300 assert!(positive_result.score > 0.0);
1301
1302 let negative_result = analyzer
1303 .analyze("This is terrible and awful")
1304 .expect("Operation failed");
1305 assert_eq!(negative_result.sentiment, Sentiment::Negative);
1306 assert!(negative_result.score < 0.0);
1307
1308 let neutral_result = analyzer
1309 .analyze("This is a book")
1310 .expect("Operation failed");
1311 assert_eq!(neutral_result.sentiment, Sentiment::Neutral);
1312 }
1313
1314 #[test]
1315 fn test_negation_handling() {
1316 let analyzer = LexiconSentimentAnalyzer::with_basiclexicon();
1317
1318 let negated_result = analyzer
1319 .analyze("This is not good")
1320 .expect("Operation failed");
1321 assert_eq!(negated_result.sentiment, Sentiment::Negative);
1322 assert!(negated_result.score < 0.0);
1323 }
1324
1325 #[test]
1326 fn test_rule_based_sentiment() {
1327 let analyzer = RuleBasedSentimentAnalyzer::with_basiclexicon();
1328
1329 let intensified_result = analyzer
1330 .analyze("This is very good")
1331 .expect("Operation failed");
1332 let normal_result = analyzer.analyze("This is good").expect("Operation failed");
1333
1334 assert!(intensified_result.score > normal_result.score);
1335 }
1336
1337 #[test]
1338 fn test_sentiment_batch_analysis() {
1339 let analyzer = LexiconSentimentAnalyzer::with_basiclexicon();
1340 let texts = vec!["I love this", "I hate this", "This is okay"];
1341
1342 let results = analyzer.analyze_batch(&texts).expect("Operation failed");
1343 assert_eq!(results.len(), 3);
1344 assert_eq!(results[0].sentiment, Sentiment::Positive);
1345 assert_eq!(results[1].sentiment, Sentiment::Negative);
1346 }
1347
1348 #[test]
1351 fn test_vader_positive() {
1352 let vader = VaderSentimentAnalyzer::new();
1353 let result = vader
1354 .analyze("This movie is amazing and wonderful")
1355 .expect("analyze");
1356 assert_eq!(result.sentiment, Sentiment::Positive);
1357 assert!(result.compound > 0.0);
1358 }
1359
1360 #[test]
1361 fn test_vader_negative() {
1362 let vader = VaderSentimentAnalyzer::new();
1363 let result = vader
1364 .analyze("This movie is terrible and awful")
1365 .expect("analyze");
1366 assert_eq!(result.sentiment, Sentiment::Negative);
1367 assert!(result.compound < 0.0);
1368 }
1369
1370 #[test]
1371 fn test_vader_neutral() {
1372 let vader = VaderSentimentAnalyzer::new();
1373 let result = vader.analyze("The sky is blue").expect("analyze");
1374 assert_eq!(result.sentiment, Sentiment::Neutral);
1375 }
1376
1377 #[test]
1378 fn test_vader_negation() {
1379 let vader = VaderSentimentAnalyzer::new();
1380 let result = vader.analyze("This is not good at all").expect("analyze");
1381 assert!(result.compound < 0.0, "Negated positive should be negative");
1382 }
1383
1384 #[test]
1385 fn test_vader_intensifier() {
1386 let vader = VaderSentimentAnalyzer::new();
1387 let base = vader.analyze("This is good").expect("analyze");
1388 let intensified = vader.analyze("This is very good").expect("analyze");
1389 assert!(
1390 intensified.compound > base.compound,
1391 "Intensified should score higher: {} vs {}",
1392 intensified.compound,
1393 base.compound
1394 );
1395 }
1396
1397 #[test]
1398 fn test_vader_but_clause() {
1399 let vader = VaderSentimentAnalyzer::new();
1400 let result = vader
1401 .analyze("The food was good but the service was terrible")
1402 .expect("analyze");
1403 assert!(result.compound < 0.0);
1405 }
1406
1407 #[test]
1408 fn test_vader_caps_emphasis() {
1409 let vader = VaderSentimentAnalyzer::new();
1410 let normal = vader.analyze("This is good").expect("analyze");
1411 let caps = vader.analyze("This is GOOD").expect("analyze");
1412 assert!(
1413 caps.compound >= normal.compound,
1414 "CAPS should score higher or equal"
1415 );
1416 }
1417
1418 #[test]
1419 fn test_vader_batch() {
1420 let vader = VaderSentimentAnalyzer::new();
1421 let texts = vec!["I love this!", "I hate this!"];
1422 let results = vader.analyze_batch(&texts).expect("batch");
1423 assert_eq!(results.len(), 2);
1424 assert_eq!(results[0].sentiment, Sentiment::Positive);
1425 assert_eq!(results[1].sentiment, Sentiment::Negative);
1426 }
1427
1428 #[test]
1429 fn test_vader_compound_range() {
1430 let vader = VaderSentimentAnalyzer::new();
1431 let result = vader
1432 .analyze("This is the most absolutely amazing incredible thing ever!!!")
1433 .expect("analyze");
1434 assert!(result.compound >= -1.0 && result.compound <= 1.0);
1435 }
1436
1437 #[test]
1440 fn test_naive_bayes_train_predict() {
1441 let mut clf = NaiveBayesSentiment::new();
1442
1443 let texts = vec![
1444 "I love this product it is amazing",
1445 "Great quality excellent experience",
1446 "Wonderful service very happy",
1447 "This is terrible and awful",
1448 "Horrible experience very bad",
1449 "Worst product I have ever bought",
1450 ];
1451 let labels = vec![
1452 "positive", "positive", "positive", "negative", "negative", "negative",
1453 ];
1454
1455 clf.train(&texts, &labels).expect("training failed");
1456
1457 let pred = clf.predict("This is amazing and great").expect("predict");
1459 assert_eq!(pred, "positive");
1460
1461 let pred = clf
1463 .predict("This is terrible and horrible")
1464 .expect("predict");
1465 assert_eq!(pred, "negative");
1466 }
1467
1468 #[test]
1469 fn test_naive_bayes_predict_proba() {
1470 let mut clf = NaiveBayesSentiment::new();
1471
1472 let texts = vec![
1473 "good great excellent",
1474 "good wonderful amazing",
1475 "bad terrible awful",
1476 "bad horrible disgusting",
1477 ];
1478 let labels = vec!["positive", "positive", "negative", "negative"];
1479
1480 clf.train(&texts, &labels).expect("training failed");
1481
1482 let probas = clf.predict_proba("good excellent").expect("predict_proba");
1483 assert!(probas.contains_key("positive"));
1484 assert!(probas.contains_key("negative"));
1485
1486 let pos_prob = probas.get("positive").copied().unwrap_or(0.0);
1488 let neg_prob = probas.get("negative").copied().unwrap_or(0.0);
1489 assert!(pos_prob > neg_prob);
1490
1491 let total: f64 = probas.values().sum();
1493 assert!((total - 1.0).abs() < 1e-6);
1494 }
1495
1496 #[test]
1497 fn test_naive_bayes_not_trained() {
1498 let clf = NaiveBayesSentiment::new();
1499 let result = clf.predict("test");
1500 assert!(result.is_err());
1501 }
1502
1503 #[test]
1504 fn test_naive_bayes_classes() {
1505 let mut clf = NaiveBayesSentiment::new();
1506 let texts = vec!["a", "b", "c"];
1507 let labels = vec!["pos", "neg", "pos"];
1508 clf.train(&texts, &labels).expect("train");
1509
1510 let classes = clf.classes();
1511 assert_eq!(classes.len(), 2);
1512 }
1513
1514 #[test]
1517 fn test_aspect_sentiment_basic() {
1518 let analyzer = AspectSentimentAnalyzer::new();
1519
1520 let results = analyzer
1521 .analyze(
1522 "The food was excellent but the service was terrible",
1523 &["food", "service"],
1524 )
1525 .expect("analyze");
1526
1527 assert_eq!(results.len(), 2);
1528
1529 let food_result = results.iter().find(|r| r.aspect == "food");
1530 assert!(food_result.is_some());
1531 let food = food_result.expect("food aspect");
1532 assert_eq!(food.sentiment, Sentiment::Positive);
1533
1534 let service_result = results.iter().find(|r| r.aspect == "service");
1535 assert!(service_result.is_some());
1536 let service = service_result.expect("service aspect");
1537 assert_eq!(service.sentiment, Sentiment::Negative);
1538 }
1539
1540 #[test]
1541 fn test_aspect_sentiment_negation() {
1542 let analyzer = AspectSentimentAnalyzer::new();
1543
1544 let results = analyzer
1545 .analyze("The price was not good", &["price"])
1546 .expect("analyze");
1547
1548 assert!(!results.is_empty());
1549 assert_eq!(results[0].sentiment, Sentiment::Negative);
1551 }
1552
1553 #[test]
1554 fn test_aspect_sentiment_no_match() {
1555 let analyzer = AspectSentimentAnalyzer::new();
1556 let results = analyzer
1557 .analyze("The sky is blue", &["food", "service"])
1558 .expect("analyze");
1559 assert!(results.is_empty());
1560 }
1561
1562 #[test]
1563 fn test_aspect_with_custom_window() {
1564 let analyzer = AspectSentimentAnalyzer::new().with_context_window(2);
1565 let results = analyzer
1566 .analyze("The food here is really great and beautiful", &["food"])
1567 .expect("analyze");
1568 assert!(!results.is_empty());
1569 }
1570
1571 #[test]
1574 fn test_aggregate_sentiment() {
1575 let analyzer = LexiconSentimentAnalyzer::with_basiclexicon();
1576 let results = analyzer
1577 .analyze_batch(&["I love this", "I love this too", "This is terrible"])
1578 .expect("batch");
1579
1580 let agg = aggregate_sentiment(&results);
1581 assert_eq!(agg.count, 3);
1582 assert!(agg.mean_score > 0.0); assert!(agg.positive_ratio > 0.5);
1584 assert!(agg.std_score > 0.0);
1585 }
1586
1587 #[test]
1588 fn test_aggregate_empty() {
1589 let agg = aggregate_sentiment(&[]);
1590 assert_eq!(agg.count, 0);
1591 assert_eq!(agg.overall_sentiment, Sentiment::Neutral);
1592 }
1593
1594 #[test]
1595 fn test_analyze_and_aggregate() {
1596 let texts = vec!["I love this product", "It is amazing", "Very good quality"];
1597 let agg = analyze_and_aggregate(&texts).expect("aggregate");
1598 assert_eq!(agg.count, 3);
1599 assert!(agg.mean_score > 0.0);
1600 assert_eq!(agg.overall_sentiment, Sentiment::Positive);
1601 }
1602
1603 #[test]
1604 fn test_sentiment_display() {
1605 assert_eq!(format!("{}", Sentiment::Positive), "Positive");
1606 assert_eq!(format!("{}", Sentiment::Negative), "Negative");
1607 assert_eq!(format!("{}", Sentiment::Neutral), "Neutral");
1608 }
1609
1610 #[test]
1611 fn test_sentiment_from_score_thresholds() {
1612 assert_eq!(Sentiment::from_score(0.1), Sentiment::Positive);
1613 assert_eq!(Sentiment::from_score(-0.1), Sentiment::Negative);
1614 assert_eq!(Sentiment::from_score(0.0), Sentiment::Neutral);
1615 assert_eq!(Sentiment::from_score(0.03), Sentiment::Neutral);
1616 }
1617}