Skip to main content

xls_rs/text_analysis/
helpers.rs

1//! Helper methods for text analysis
2
3use super::types::SentimentWords;
4
5use std::collections::HashMap;
6
7impl super::analyzer::TextAnalyzer {
8    /// Extract words from text
9    pub fn extract_words(&self, text: &str) -> Vec<String> {
10        text.split_whitespace()
11            .map(|word| {
12                word.chars()
13                    .filter(|c| c.is_alphabetic() || c.is_ascii_digit())
14                    .collect::<String>()
15            })
16            .filter(|word| !word.is_empty())
17            .collect()
18    }
19
20    /// Extract sentences from text
21    pub fn extract_sentences(&self, text: &str) -> Vec<String> {
22        text.split(&['.', '!', '?'][..])
23            .map(|s| s.trim())
24            .filter(|s| !s.is_empty())
25            .map(|s| s.to_string())
26            .collect()
27    }
28
29    /// Extract paragraphs from text
30    pub fn extract_paragraphs(&self, text: &str) -> Vec<String> {
31        text.split('\n')
32            .map(|p| p.trim())
33            .filter(|p| !p.is_empty())
34            .map(|p| p.to_string())
35            .collect()
36    }
37
38    /// Calculate readability score (simplified Flesch Reading Ease)
39    pub fn calculate_readability_score(&self, words: &[String], sentences: &[String]) -> f64 {
40        if sentences.is_empty() || words.is_empty() {
41            return 0.0;
42        }
43
44        let avg_sentence_length = words.len() as f64 / sentences.len() as f64;
45        let avg_syllables = self.estimate_syllables(words) as f64 / words.len() as f64;
46
47        // Simplified Flesch Reading Ease formula
48        206.835 - (1.015 * avg_sentence_length) - (84.6 * avg_syllables)
49    }
50
51    /// Estimate syllables in words (simplified)
52    pub fn estimate_syllables(&self, words: &[String]) -> usize {
53        words
54            .iter()
55            .map(|word| {
56                let word_lower = word.to_lowercase();
57                let vowel_groups = word_lower
58                    .chars()
59                    .fold((0, false), |(count, in_vowel_group), c| {
60                        let is_vowel = matches!(c, 'a' | 'e' | 'i' | 'o' | 'u' | 'y');
61                        if is_vowel && !in_vowel_group {
62                            (count + 1, true)
63                        } else if !is_vowel {
64                            (count, false)
65                        } else {
66                            (count, true)
67                        }
68                    })
69                    .0;
70
71                // At least one syllable per word
72                vowel_groups.max(1)
73            })
74            .sum()
75    }
76
77    /// Calculate word frequencies
78    pub fn calculate_word_frequencies(&self, words: &[String]) -> HashMap<String, usize> {
79        let mut frequencies = HashMap::new();
80
81        for word in words {
82            let lower_word = word.to_lowercase();
83            *frequencies.entry(lower_word).or_insert(0) += 1;
84        }
85
86        frequencies
87    }
88
89    /// Calculate language scores based on word patterns
90    pub fn calculate_language_scores(&self, words: &[String]) -> HashMap<String, f64> {
91        let mut scores = HashMap::new();
92
93        // This is a very simplified language detection
94        // In practice, you'd use n-gram models or statistical methods
95
96        for word in words {
97            let lower_word = word.to_lowercase();
98
99            // English indicators
100            if lower_word.contains("the") || lower_word.contains("and") || lower_word.contains("is")
101            {
102                *scores.entry("english".to_string()).or_insert(0.0) += 0.1;
103            }
104
105            // Spanish indicators
106            if lower_word.contains("el") || lower_word.contains("la") || lower_word.contains("de") {
107                *scores.entry("spanish".to_string()).or_insert(0.0) += 0.1;
108            }
109
110            // French indicators
111            if lower_word.contains("le") || lower_word.contains("la") || lower_word.contains("et") {
112                *scores.entry("french".to_string()).or_insert(0.0) += 0.1;
113            }
114
115            // German indicators
116            if lower_word.contains("der")
117                || lower_word.contains("die")
118                || lower_word.contains("und")
119            {
120                *scores.entry("german".to_string()).or_insert(0.0) += 0.1;
121            }
122        }
123
124        scores
125    }
126
127    /// Get default stop words
128    pub fn default_stop_words() -> std::collections::HashSet<String> {
129        vec![
130            "a",
131            "an",
132            "and",
133            "are",
134            "as",
135            "at",
136            "be",
137            "but",
138            "by",
139            "for",
140            "if",
141            "in",
142            "into",
143            "is",
144            "it",
145            "no",
146            "not",
147            "of",
148            "on",
149            "or",
150            "such",
151            "that",
152            "the",
153            "their",
154            "then",
155            "there",
156            "these",
157            "they",
158            "this",
159            "to",
160            "was",
161            "will",
162            "with",
163            "the",
164            "is",
165            "at",
166            "which",
167            "on",
168            "and",
169            "a",
170            "an",
171            "as",
172            "are",
173            "was",
174            "were",
175            "been",
176            "be",
177            "have",
178            "has",
179            "had",
180            "do",
181            "does",
182            "did",
183            "will",
184            "would",
185            "should",
186            "could",
187            "may",
188            "might",
189            "must",
190            "shall",
191            "can",
192            "cannot",
193            "cant",
194            "won't",
195            "wouldn't",
196            "shouldn't",
197            "couldn't",
198            "mustn't",
199            "shan't",
200            "mightn't",
201            "mustn't",
202        ]
203        .into_iter()
204        .map(|s| s.to_string())
205        .collect()
206    }
207
208    /// Get default sentiment word lists
209    pub fn default_sentiment_words() -> SentimentWords {
210        let positive: std::collections::HashSet<String> = vec![
211            "good",
212            "great",
213            "excellent",
214            "amazing",
215            "wonderful",
216            "fantastic",
217            "awesome",
218            "brilliant",
219            "outstanding",
220            "superb",
221            "magnificent",
222            "perfect",
223            "love",
224            "like",
225            "enjoy",
226            "happy",
227            "joy",
228            "delight",
229            "pleasure",
230            "satisfied",
231            "pleased",
232            "thrilled",
233            "excited",
234            "enthusiastic",
235            "positive",
236            "optimistic",
237            "hopeful",
238            "confident",
239            "proud",
240            "grateful",
241            "thankful",
242            "appreciate",
243            "beautiful",
244            "nice",
245            "pretty",
246            "handsome",
247            "attractive",
248            "gorgeous",
249            "stunning",
250            "elegant",
251        ]
252        .into_iter()
253        .map(|s| s.to_string())
254        .collect();
255
256        let negative: std::collections::HashSet<String> = vec![
257            "bad",
258            "terrible",
259            "awful",
260            "horrible",
261            "disgusting",
262            "disappointing",
263            "frustrating",
264            "annoying",
265            "irritating",
266            "angry",
267            "mad",
268            "furious",
269            "enraged",
270            "upset",
271            "sad",
272            "depressed",
273            "miserable",
274            "unhappy",
275            "gloomy",
276            "pessimistic",
277            "negative",
278            "worried",
279            "anxious",
280            "stressed",
281            "overwhelmed",
282            "exhausted",
283            "tired",
284            "bored",
285            "uninterested",
286            "apathetic",
287            "indifferent",
288            "ugly",
289            "disgusting",
290            "repulsive",
291            "hideous",
292            "grotesque",
293            "unpleasant",
294            "nasty",
295            "vile",
296        ]
297        .into_iter()
298        .map(|s| s.to_string())
299        .collect();
300
301        let neutral: std::collections::HashSet<String> = vec![
302            "okay",
303            "fine",
304            "average",
305            "normal",
306            "typical",
307            "standard",
308            "regular",
309            "ordinary",
310            "common",
311            "usual",
312            "expected",
313            "anticipated",
314            "predicted",
315            "forecasted",
316            "planned",
317            "scheduled",
318            "arranged",
319            "organized",
320            "prepared",
321            "ready",
322            "available",
323            "present",
324            "existing",
325            "current",
326            "ongoing",
327            "continuing",
328            "proceeding",
329            "happening",
330            "occurring",
331            "taking place",
332            "underway",
333            "in progress",
334        ]
335        .into_iter()
336        .map(|s| s.to_string())
337        .collect();
338
339        SentimentWords {
340            positive,
341            negative,
342            neutral,
343        }
344    }
345}