Skip to main content

xls_rs/text_analysis/
analyzer.rs

1//! Main text analyzer implementation
2
3use crate::common::collection;
4use std::collections::HashSet;
5
6use super::types::*;
7
8/// Text analyzer
9pub struct TextAnalyzer {
10    stop_words: HashSet<String>,
11    sentiment_words: SentimentWords,
12}
13
14impl TextAnalyzer {
15    /// Create a new text analyzer
16    pub fn new() -> Self {
17        Self {
18            stop_words: Self::default_stop_words(),
19            sentiment_words: Self::default_sentiment_words(),
20        }
21    }
22
23    /// Analyze text statistics
24    pub fn analyze_stats(&self, text: &str) -> TextStats {
25        let words = self.extract_words(text);
26        let sentences = self.extract_sentences(text);
27        let paragraphs = self.extract_paragraphs(text);
28
29        let word_count = words.len();
30        let character_count = text.chars().count();
31        let sentence_count = sentences.len();
32        let paragraph_count = paragraphs.len();
33
34        let avg_word_length = if word_count > 0 {
35            words.iter().map(|w| w.len()).sum::<usize>() as f64 / word_count as f64
36        } else {
37            0.0
38        };
39
40        let avg_sentence_length = if sentence_count > 0 {
41            words.len() as f64 / sentence_count as f64
42        } else {
43            0.0
44        };
45
46        let readability_score = self.calculate_readability_score(&words, &sentences);
47
48        let unique_words = collection::unique_preserve_order(&words).len();
49        let lexical_diversity = if word_count > 0 {
50            unique_words as f64 / word_count as f64
51        } else {
52            0.0
53        };
54
55        TextStats {
56            word_count,
57            character_count,
58            sentence_count,
59            paragraph_count,
60            avg_word_length,
61            avg_sentence_length,
62            readability_score,
63            unique_words,
64            lexical_diversity,
65        }
66    }
67
68    /// Perform sentiment analysis
69    pub fn analyze_sentiment(&self, text: &str) -> SentimentResult {
70        let words = self.extract_words(text);
71
72        let mut positive_count = 0;
73        let mut negative_count = 0;
74        let mut neutral_count = 0;
75
76        for word in &words {
77            let lower_word = word.to_lowercase();
78            if self.sentiment_words.positive.contains(&lower_word) {
79                positive_count += 1;
80            } else if self.sentiment_words.negative.contains(&lower_word) {
81                negative_count += 1;
82            } else {
83                neutral_count += 1;
84            }
85        }
86
87        let total = positive_count + negative_count + neutral_count;
88        let (positive_score, negative_score, neutral_score) = if total > 0 {
89            (
90                positive_count as f64 / total as f64,
91                negative_count as f64 / total as f64,
92                neutral_count as f64 / total as f64,
93            )
94        } else {
95            (0.0, 0.0, 1.0)
96        };
97
98        let (sentiment, confidence) =
99            if positive_score > negative_score && positive_score > neutral_score {
100                (Sentiment::Positive, positive_score)
101            } else if negative_score > positive_score && negative_score > neutral_score {
102                (Sentiment::Negative, negative_score)
103            } else {
104                (Sentiment::Neutral, neutral_score)
105            };
106
107        SentimentResult {
108            sentiment,
109            confidence,
110            positive_score,
111            negative_score,
112            neutral_score,
113        }
114    }
115
116    /// Extract keywords from text
117    pub fn extract_keywords(&self, text: &str, max_keywords: usize) -> KeywordResult {
118        let words = self.extract_words(text);
119
120        // Filter out stop words and count frequencies
121        let mut word_frequencies: std::collections::HashMap<String, usize> =
122            std::collections::HashMap::new();
123
124        for word in &words {
125            let lower_word = word.to_lowercase();
126            if !self.stop_words.contains(&lower_word) && word.len() > 2 {
127                *word_frequencies.entry(lower_word.clone()).or_insert(0) += 1;
128            }
129        }
130
131        // Calculate TF-IDF-like scores
132        let total_words = words.len();
133        let mut keywords: Vec<Keyword> = word_frequencies
134            .into_iter()
135            .map(|(word, frequency)| {
136                let tf = frequency as f64 / total_words as f64;
137                let score = tf * (1.0 + word.len() as f64 * 0.1);
138
139                let importance = if frequency >= total_words / 10 {
140                    Importance::High
141                } else if frequency >= total_words / 20 {
142                    Importance::Medium
143                } else {
144                    Importance::Low
145                };
146
147                Keyword {
148                    word,
149                    score,
150                    frequency,
151                    importance,
152                }
153            })
154            .collect();
155
156        // Sort by score and frequency
157        keywords.sort_by(|a, b| {
158            b.score
159                .partial_cmp(&a.score)
160                .unwrap_or(std::cmp::Ordering::Equal)
161                .then_with(|| b.frequency.cmp(&a.frequency))
162        });
163
164        // Limit to max_keywords
165        keywords.truncate(max_keywords);
166        let total_keywords = keywords.len();
167
168        KeywordResult {
169            keywords,
170            total_keywords,
171        }
172    }
173
174    /// Detect language of text (simplified)
175    pub fn detect_language(&self, text: &str) -> LanguageResult {
176        // This is a very simplified language detection
177        // In a real implementation, you'd use proper language detection libraries
178
179        let supported_languages = vec![
180            "English".to_string(),
181            "Spanish".to_string(),
182            "French".to_string(),
183            "German".to_string(),
184            "Chinese".to_string(),
185            "Japanese".to_string(),
186        ];
187
188        // Simple heuristics for language detection
189        let lower_text = text.to_lowercase();
190        let mut language_scores: std::collections::HashMap<String, f64> =
191            std::collections::HashMap::new();
192
193        // English indicators
194        if lower_text.contains("the ")
195            || lower_text.contains(" and ")
196            || lower_text.contains(" is ")
197        {
198            *language_scores.entry("English".to_string()).or_insert(0.0) += 0.3;
199        }
200
201        // Spanish indicators
202        if lower_text.contains(" el ") || lower_text.contains(" la ") || lower_text.contains(" de ")
203        {
204            *language_scores.entry("Spanish".to_string()).or_insert(0.0) += 0.3;
205        }
206
207        // French indicators
208        if lower_text.contains(" le ") || lower_text.contains(" la ") || lower_text.contains(" et ")
209        {
210            *language_scores.entry("French".to_string()).or_insert(0.0) += 0.3;
211        }
212
213        // Default to English if no indicators found
214        let (language, confidence) = language_scores
215            .into_iter()
216            .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
217            .unwrap_or_else(|| ("English".to_string(), 0.5));
218
219        LanguageResult {
220            language,
221            confidence,
222            supported_languages,
223        }
224    }
225}
226
227impl Default for TextAnalyzer {
228    fn default() -> Self {
229        Self::new()
230    }
231}