xls_rs/text_analysis/
analyzer.rs1use crate::common::collection;
4use std::collections::HashSet;
5
6use super::types::*;
7
8pub struct TextAnalyzer {
10 stop_words: HashSet<String>,
11 sentiment_words: SentimentWords,
12}
13
14impl TextAnalyzer {
15 pub fn new() -> Self {
17 Self {
18 stop_words: Self::default_stop_words(),
19 sentiment_words: Self::default_sentiment_words(),
20 }
21 }
22
23 pub fn analyze_stats(&self, text: &str) -> TextStats {
25 let words = self.extract_words(text);
26 let sentences = self.extract_sentences(text);
27 let paragraphs = self.extract_paragraphs(text);
28
29 let word_count = words.len();
30 let character_count = text.chars().count();
31 let sentence_count = sentences.len();
32 let paragraph_count = paragraphs.len();
33
34 let avg_word_length = if word_count > 0 {
35 words.iter().map(|w| w.len()).sum::<usize>() as f64 / word_count as f64
36 } else {
37 0.0
38 };
39
40 let avg_sentence_length = if sentence_count > 0 {
41 words.len() as f64 / sentence_count as f64
42 } else {
43 0.0
44 };
45
46 let readability_score = self.calculate_readability_score(&words, &sentences);
47
48 let unique_words = collection::unique_preserve_order(&words).len();
49 let lexical_diversity = if word_count > 0 {
50 unique_words as f64 / word_count as f64
51 } else {
52 0.0
53 };
54
55 TextStats {
56 word_count,
57 character_count,
58 sentence_count,
59 paragraph_count,
60 avg_word_length,
61 avg_sentence_length,
62 readability_score,
63 unique_words,
64 lexical_diversity,
65 }
66 }
67
68 pub fn analyze_sentiment(&self, text: &str) -> SentimentResult {
70 let words = self.extract_words(text);
71
72 let mut positive_count = 0;
73 let mut negative_count = 0;
74 let mut neutral_count = 0;
75
76 for word in &words {
77 let lower_word = word.to_lowercase();
78 if self.sentiment_words.positive.contains(&lower_word) {
79 positive_count += 1;
80 } else if self.sentiment_words.negative.contains(&lower_word) {
81 negative_count += 1;
82 } else {
83 neutral_count += 1;
84 }
85 }
86
87 let total = positive_count + negative_count + neutral_count;
88 let (positive_score, negative_score, neutral_score) = if total > 0 {
89 (
90 positive_count as f64 / total as f64,
91 negative_count as f64 / total as f64,
92 neutral_count as f64 / total as f64,
93 )
94 } else {
95 (0.0, 0.0, 1.0)
96 };
97
98 let (sentiment, confidence) =
99 if positive_score > negative_score && positive_score > neutral_score {
100 (Sentiment::Positive, positive_score)
101 } else if negative_score > positive_score && negative_score > neutral_score {
102 (Sentiment::Negative, negative_score)
103 } else {
104 (Sentiment::Neutral, neutral_score)
105 };
106
107 SentimentResult {
108 sentiment,
109 confidence,
110 positive_score,
111 negative_score,
112 neutral_score,
113 }
114 }
115
116 pub fn extract_keywords(&self, text: &str, max_keywords: usize) -> KeywordResult {
118 let words = self.extract_words(text);
119
120 let mut word_frequencies: std::collections::HashMap<String, usize> =
122 std::collections::HashMap::new();
123
124 for word in &words {
125 let lower_word = word.to_lowercase();
126 if !self.stop_words.contains(&lower_word) && word.len() > 2 {
127 *word_frequencies.entry(lower_word.clone()).or_insert(0) += 1;
128 }
129 }
130
131 let total_words = words.len();
133 let mut keywords: Vec<Keyword> = word_frequencies
134 .into_iter()
135 .map(|(word, frequency)| {
136 let tf = frequency as f64 / total_words as f64;
137 let score = tf * (1.0 + word.len() as f64 * 0.1);
138
139 let importance = if frequency >= total_words / 10 {
140 Importance::High
141 } else if frequency >= total_words / 20 {
142 Importance::Medium
143 } else {
144 Importance::Low
145 };
146
147 Keyword {
148 word,
149 score,
150 frequency,
151 importance,
152 }
153 })
154 .collect();
155
156 keywords.sort_by(|a, b| {
158 b.score
159 .partial_cmp(&a.score)
160 .unwrap_or(std::cmp::Ordering::Equal)
161 .then_with(|| b.frequency.cmp(&a.frequency))
162 });
163
164 keywords.truncate(max_keywords);
166 let total_keywords = keywords.len();
167
168 KeywordResult {
169 keywords,
170 total_keywords,
171 }
172 }
173
174 pub fn detect_language(&self, text: &str) -> LanguageResult {
176 let supported_languages = vec![
180 "English".to_string(),
181 "Spanish".to_string(),
182 "French".to_string(),
183 "German".to_string(),
184 "Chinese".to_string(),
185 "Japanese".to_string(),
186 ];
187
188 let lower_text = text.to_lowercase();
190 let mut language_scores: std::collections::HashMap<String, f64> =
191 std::collections::HashMap::new();
192
193 if lower_text.contains("the ")
195 || lower_text.contains(" and ")
196 || lower_text.contains(" is ")
197 {
198 *language_scores.entry("English".to_string()).or_insert(0.0) += 0.3;
199 }
200
201 if lower_text.contains(" el ") || lower_text.contains(" la ") || lower_text.contains(" de ")
203 {
204 *language_scores.entry("Spanish".to_string()).or_insert(0.0) += 0.3;
205 }
206
207 if lower_text.contains(" le ") || lower_text.contains(" la ") || lower_text.contains(" et ")
209 {
210 *language_scores.entry("French".to_string()).or_insert(0.0) += 0.3;
211 }
212
213 let (language, confidence) = language_scores
215 .into_iter()
216 .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
217 .unwrap_or_else(|| ("English".to_string(), 0.5));
218
219 LanguageResult {
220 language,
221 confidence,
222 supported_languages,
223 }
224 }
225}
226
227impl Default for TextAnalyzer {
228 fn default() -> Self {
229 Self::new()
230 }
231}