1use super::types::SentimentWords;
4
5use std::collections::HashMap;
6
7impl super::analyzer::TextAnalyzer {
8 pub fn extract_words(&self, text: &str) -> Vec<String> {
10 text.split_whitespace()
11 .map(|word| {
12 word.chars()
13 .filter(|c| c.is_alphabetic() || c.is_ascii_digit())
14 .collect::<String>()
15 })
16 .filter(|word| !word.is_empty())
17 .collect()
18 }
19
20 pub fn extract_sentences(&self, text: &str) -> Vec<String> {
22 text.split(&['.', '!', '?'][..])
23 .map(|s| s.trim())
24 .filter(|s| !s.is_empty())
25 .map(|s| s.to_string())
26 .collect()
27 }
28
29 pub fn extract_paragraphs(&self, text: &str) -> Vec<String> {
31 text.split('\n')
32 .map(|p| p.trim())
33 .filter(|p| !p.is_empty())
34 .map(|p| p.to_string())
35 .collect()
36 }
37
38 pub fn calculate_readability_score(&self, words: &[String], sentences: &[String]) -> f64 {
40 if sentences.is_empty() || words.is_empty() {
41 return 0.0;
42 }
43
44 let avg_sentence_length = words.len() as f64 / sentences.len() as f64;
45 let avg_syllables = self.estimate_syllables(words) as f64 / words.len() as f64;
46
47 206.835 - (1.015 * avg_sentence_length) - (84.6 * avg_syllables)
49 }
50
51 pub fn estimate_syllables(&self, words: &[String]) -> usize {
53 words
54 .iter()
55 .map(|word| {
56 let word_lower = word.to_lowercase();
57 let vowel_groups = word_lower
58 .chars()
59 .fold((0, false), |(count, in_vowel_group), c| {
60 let is_vowel = matches!(c, 'a' | 'e' | 'i' | 'o' | 'u' | 'y');
61 if is_vowel && !in_vowel_group {
62 (count + 1, true)
63 } else if !is_vowel {
64 (count, false)
65 } else {
66 (count, true)
67 }
68 })
69 .0;
70
71 vowel_groups.max(1)
73 })
74 .sum()
75 }
76
77 pub fn calculate_word_frequencies(&self, words: &[String]) -> HashMap<String, usize> {
79 let mut frequencies = HashMap::new();
80
81 for word in words {
82 let lower_word = word.to_lowercase();
83 *frequencies.entry(lower_word).or_insert(0) += 1;
84 }
85
86 frequencies
87 }
88
89 pub fn calculate_language_scores(&self, words: &[String]) -> HashMap<String, f64> {
91 let mut scores = HashMap::new();
92
93 for word in words {
97 let lower_word = word.to_lowercase();
98
99 if lower_word.contains("the") || lower_word.contains("and") || lower_word.contains("is")
101 {
102 *scores.entry("english".to_string()).or_insert(0.0) += 0.1;
103 }
104
105 if lower_word.contains("el") || lower_word.contains("la") || lower_word.contains("de") {
107 *scores.entry("spanish".to_string()).or_insert(0.0) += 0.1;
108 }
109
110 if lower_word.contains("le") || lower_word.contains("la") || lower_word.contains("et") {
112 *scores.entry("french".to_string()).or_insert(0.0) += 0.1;
113 }
114
115 if lower_word.contains("der")
117 || lower_word.contains("die")
118 || lower_word.contains("und")
119 {
120 *scores.entry("german".to_string()).or_insert(0.0) += 0.1;
121 }
122 }
123
124 scores
125 }
126
/// Builds the default set of English stop words (59 distinct entries).
///
/// The list is kept free of duplicates so the literal mirrors the resulting set
/// one-to-one, and it is stored as a constant slice so no temporary `Vec` is built.
///
/// NOTE(review): `"cant"` is listed without an apostrophe while the other
/// contractions (`"won't"`, `"wouldn't"`, ...) keep theirs. `extract_words` in this
/// impl strips apostrophes, so tokens it produces can match `"cant"` but never the
/// apostrophe forms — confirm which tokenizer feeds this set.
pub fn default_stop_words() -> std::collections::HashSet<String> {
    const STOP_WORDS: &[&str] = &[
        // Articles, conjunctions, prepositions, pronouns and similar function words.
        "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into",
        "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then",
        "there", "these", "they", "this", "to", "was", "will", "with", "which",
        // Auxiliary and modal verbs.
        "were", "been", "have", "has", "had", "do", "does", "did", "would", "should",
        "could", "may", "might", "must", "shall", "can", "cannot", "cant",
        // Negative contractions.
        "won't", "wouldn't", "shouldn't", "couldn't", "mustn't", "shan't", "mightn't",
    ];

    STOP_WORDS.iter().copied().map(String::from).collect()
}
207
208 pub fn default_sentiment_words() -> SentimentWords {
210 let positive: std::collections::HashSet<String> = vec![
211 "good",
212 "great",
213 "excellent",
214 "amazing",
215 "wonderful",
216 "fantastic",
217 "awesome",
218 "brilliant",
219 "outstanding",
220 "superb",
221 "magnificent",
222 "perfect",
223 "love",
224 "like",
225 "enjoy",
226 "happy",
227 "joy",
228 "delight",
229 "pleasure",
230 "satisfied",
231 "pleased",
232 "thrilled",
233 "excited",
234 "enthusiastic",
235 "positive",
236 "optimistic",
237 "hopeful",
238 "confident",
239 "proud",
240 "grateful",
241 "thankful",
242 "appreciate",
243 "beautiful",
244 "nice",
245 "pretty",
246 "handsome",
247 "attractive",
248 "gorgeous",
249 "stunning",
250 "elegant",
251 ]
252 .into_iter()
253 .map(|s| s.to_string())
254 .collect();
255
256 let negative: std::collections::HashSet<String> = vec![
257 "bad",
258 "terrible",
259 "awful",
260 "horrible",
261 "disgusting",
262 "disappointing",
263 "frustrating",
264 "annoying",
265 "irritating",
266 "angry",
267 "mad",
268 "furious",
269 "enraged",
270 "upset",
271 "sad",
272 "depressed",
273 "miserable",
274 "unhappy",
275 "gloomy",
276 "pessimistic",
277 "negative",
278 "worried",
279 "anxious",
280 "stressed",
281 "overwhelmed",
282 "exhausted",
283 "tired",
284 "bored",
285 "uninterested",
286 "apathetic",
287 "indifferent",
288 "ugly",
289 "disgusting",
290 "repulsive",
291 "hideous",
292 "grotesque",
293 "unpleasant",
294 "nasty",
295 "vile",
296 ]
297 .into_iter()
298 .map(|s| s.to_string())
299 .collect();
300
301 let neutral: std::collections::HashSet<String> = vec![
302 "okay",
303 "fine",
304 "average",
305 "normal",
306 "typical",
307 "standard",
308 "regular",
309 "ordinary",
310 "common",
311 "usual",
312 "expected",
313 "anticipated",
314 "predicted",
315 "forecasted",
316 "planned",
317 "scheduled",
318 "arranged",
319 "organized",
320 "prepared",
321 "ready",
322 "available",
323 "present",
324 "existing",
325 "current",
326 "ongoing",
327 "continuing",
328 "proceeding",
329 "happening",
330 "occurring",
331 "taking place",
332 "underway",
333 "in progress",
334 ]
335 .into_iter()
336 .map(|s| s.to_string())
337 .collect();
338
339 SentimentWords {
340 positive,
341 negative,
342 neutral,
343 }
344 }
345}