1use crate::error::{Result, TextError};
7use std::collections::HashMap;
8
/// Languages that the text-processing code can label.
///
/// `Unknown` is the fallback used whenever a code or a text cannot be
/// attributed to one of the listed languages.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Language {
    /// English (`en`).
    English,
    /// Spanish (`es`).
    Spanish,
    /// French (`fr`).
    French,
    /// German (`de`).
    German,
    /// Italian (`it`).
    Italian,
    /// Portuguese (`pt`).
    Portuguese,
    /// Dutch (`nl`).
    Dutch,
    /// Russian (`ru`).
    Russian,
    /// Chinese (`zh`).
    Chinese,
    /// Japanese (`ja`).
    Japanese,
    /// Korean (`ko`).
    Korean,
    /// Arabic (`ar`).
    Arabic,
    /// Undetermined language (`und`).
    Unknown,
}

impl Language {
    /// Returns the two-letter code for the language, or `"und"` for
    /// [`Language::Unknown`].
    pub fn iso_code(&self) -> &'static str {
        match self {
            Language::English => "en",
            Language::Spanish => "es",
            Language::French => "fr",
            Language::German => "de",
            Language::Italian => "it",
            Language::Portuguese => "pt",
            Language::Dutch => "nl",
            Language::Russian => "ru",
            Language::Chinese => "zh",
            Language::Japanese => "ja",
            Language::Korean => "ko",
            Language::Arabic => "ar",
            Language::Unknown => "und",
        }
    }

    /// Parses a language code into a [`Language`].
    ///
    /// Matching is case-insensitive. Surrounding whitespace is ignored and
    /// only the primary subtag is considered, so region- or script-qualified
    /// tags such as `"en-US"`, `"pt_BR"` or `"zh-Hant"` resolve to their base
    /// language. Anything unrecognised (including `"und"` and the empty
    /// string) yields [`Language::Unknown`].
    pub fn from_iso_code(code: &str) -> Self {
        // `split` always yields at least one piece, so the fallback is only a
        // formality; the first piece is the primary language subtag.
        let primary = code
            .trim()
            .split(|c: char| c == '-' || c == '_')
            .next()
            .unwrap_or("");

        match primary.to_lowercase().as_str() {
            "en" => Language::English,
            "es" => Language::Spanish,
            "fr" => Language::French,
            "de" => Language::German,
            "it" => Language::Italian,
            "pt" => Language::Portuguese,
            "nl" => Language::Dutch,
            "ru" => Language::Russian,
            "zh" => Language::Chinese,
            "ja" => Language::Japanese,
            "ko" => Language::Korean,
            "ar" => Language::Arabic,
            _ => Language::Unknown,
        }
    }

    /// Returns the English display name of the language.
    pub fn name(&self) -> &'static str {
        match self {
            Language::English => "English",
            Language::Spanish => "Spanish",
            Language::French => "French",
            Language::German => "German",
            Language::Italian => "Italian",
            Language::Portuguese => "Portuguese",
            Language::Dutch => "Dutch",
            Language::Russian => "Russian",
            Language::Chinese => "Chinese",
            Language::Japanese => "Japanese",
            Language::Korean => "Korean",
            Language::Arabic => "Arabic",
            Language::Unknown => "Unknown",
        }
    }
}
98
99#[derive(Debug, Clone)]
101pub struct LanguageDetectionResult {
102 pub language: Language,
104 pub confidence: f64,
106 pub alternatives: Vec<(Language, f64)>,
108}
109
110pub struct LanguageDetector {
112 profiles: HashMap<Language, HashMap<String, f64>>,
114 n_gram_size: usize,
116}
117
118impl LanguageDetector {
119 pub fn new() -> Self {
121 let mut detector = Self {
122 profiles: HashMap::new(),
123 n_gram_size: 3,
124 };
125 detector.initialize_default_profiles();
126 detector
127 }
128
129 pub fn with_ngram_size(n_gramsize: usize) -> Result<Self> {
131 if !(1..=5).contains(&n_gramsize) {
132 return Err(TextError::InvalidInput(
133 "N-gram size must be between 1 and 5".to_string(),
134 ));
135 }
136 let mut detector = Self {
137 profiles: HashMap::new(),
138 n_gram_size: n_gramsize,
139 };
140 detector.initialize_default_profiles();
141 Ok(detector)
142 }
143
144 fn initialize_default_profiles(&mut self) {
146 let mut english_profile = HashMap::new();
148 for (ngram, freq) in &[
149 ("the", 0.05),
150 ("and", 0.03),
151 ("ing", 0.025),
152 ("ion", 0.02),
153 ("tio", 0.018),
154 ("ent", 0.015),
155 ("ati", 0.013),
156 ("her", 0.012),
157 ("for", 0.011),
158 ("ter", 0.01),
159 ("hat", 0.009),
160 ("tha", 0.009),
161 ("ere", 0.008),
162 ("ate", 0.008),
163 ("ver", 0.007),
164 ("his", 0.007),
165 ] {
166 english_profile.insert(ngram.to_string(), *freq);
167 }
168 self.profiles.insert(Language::English, english_profile);
169
170 let mut spanish_profile = HashMap::new();
172 for (ngram, freq) in &[
173 ("que", 0.04),
174 ("de_", 0.035),
175 ("la_", 0.03),
176 ("el_", 0.025),
177 ("es_", 0.02),
178 ("los", 0.018),
179 ("las", 0.015),
180 ("ión", 0.013),
181 ("ado", 0.012),
182 ("nte", 0.011),
183 ("con", 0.01),
184 ("par", 0.009),
185 ("ara", 0.008),
186 ("una", 0.008),
187 ("por", 0.007),
188 ("est", 0.007),
189 ] {
190 spanish_profile.insert(ngram.to_string(), *freq);
191 }
192 self.profiles.insert(Language::Spanish, spanish_profile);
193
194 let mut french_profile = HashMap::new();
196 for (ngram, freq) in &[
197 ("de_", 0.05),
198 ("le_", 0.04),
199 ("que", 0.03),
200 ("les", 0.025),
201 ("la_", 0.02),
202 ("des", 0.018),
203 ("ent", 0.015),
204 ("ion", 0.013),
205 ("est", 0.012),
206 ("ait", 0.011),
207 ("pour", 0.01),
208 ("ais", 0.009),
209 ("ans", 0.008),
210 ("ont", 0.008),
211 ("une", 0.007),
212 ("qui", 0.007),
213 ] {
214 french_profile.insert(ngram.to_string(), *freq);
215 }
216 self.profiles.insert(Language::French, french_profile);
217
218 let mut german_profile = HashMap::new();
220 for (ngram, freq) in &[
221 ("der", 0.05),
222 ("die", 0.04),
223 ("und", 0.03),
224 ("den", 0.025),
225 ("das", 0.02),
226 ("ein", 0.018),
227 ("ich", 0.015),
228 ("ist", 0.013),
229 ("sch", 0.012),
230 ("cht", 0.011),
231 ("ung", 0.01),
232 ("gen", 0.009),
233 ("eit", 0.008),
234 ("ver", 0.008),
235 ("ber", 0.007),
236 ("ten", 0.007),
237 ] {
238 german_profile.insert(ngram.to_string(), *freq);
239 }
240 self.profiles.insert(Language::German, german_profile);
241 }
242
243 pub fn detect(&self, text: &str) -> Result<LanguageDetectionResult> {
245 if text.trim().is_empty() {
246 return Err(TextError::InvalidInput(
247 "Cannot detect language of empty text".to_string(),
248 ));
249 }
250
251 let text_profile = self.createtext_profile(text);
253
254 let mut scores: Vec<(Language, f64)> = self
256 .profiles
257 .iter()
258 .map(|(lang, profile)| {
259 let score = self.calculate_similarity(&text_profile, profile);
260 (*lang, score)
261 })
262 .collect();
263
264 scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
266
267 if scores.is_empty() {
268 return Ok(LanguageDetectionResult {
269 language: Language::Unknown,
270 confidence: 0.0,
271 alternatives: vec![],
272 });
273 }
274
275 let best_score = scores[0].1;
276 let best_language = scores[0].0;
277
278 let confidence = if scores.len() > 1 {
280 let second_score = scores[1].1;
281 let diff = best_score - second_score;
282 (diff / best_score).clamp(0.0, 1.0)
284 } else {
285 best_score
286 };
287
288 Ok(LanguageDetectionResult {
289 language: best_language,
290 confidence,
291 alternatives: scores.into_iter().skip(1).take(3).collect(),
292 })
293 }
294
295 fn createtext_profile(&self, text: &str) -> HashMap<String, f64> {
297 let mut profile = HashMap::new();
298 let text_lower = text.to_lowercase();
299 let chars: Vec<char> = text_lower.chars().collect();
300 let total_ngrams = chars.len().saturating_sub(self.n_gram_size - 1) as f64;
301
302 if total_ngrams <= 0.0 {
303 return profile;
304 }
305
306 let mut ngram_counts: HashMap<String, usize> = HashMap::new();
308 for i in 0..=chars.len().saturating_sub(self.n_gram_size) {
309 let ngram: String = chars[i..i + self.n_gram_size].iter().collect();
310 let ngram = ngram.replace(' ', "_");
312 *ngram_counts.entry(ngram).or_insert(0) += 1;
313 }
314
315 for (ngram, count) in ngram_counts {
317 profile.insert(ngram, count as f64 / total_ngrams);
318 }
319
320 profile
321 }
322
323 fn calculate_similarity(
325 &self,
326 profile1: &HashMap<String, f64>,
327 profile2: &HashMap<String, f64>,
328 ) -> f64 {
329 let mut similarity = 0.0;
330 let mut total_weight = 0.0;
331
332 for (ngram, freq1) in profile1 {
334 if let Some(freq2) = profile2.get(ngram) {
335 similarity += freq1 * freq2;
336 }
337 total_weight += freq1 * freq1;
338 }
339
340 if total_weight > 0.0 {
341 similarity / total_weight.sqrt()
342 } else {
343 0.0
344 }
345 }
346
347 pub fn supported_languages(&self) -> Vec<Language> {
349 self.profiles.keys().copied().collect()
350 }
351}
352
353impl Default for LanguageDetector {
354 fn default() -> Self {
355 Self::new()
356 }
357}
358
359pub struct StopWords {
361 stop_words: HashMap<Language, Vec<String>>,
363}
364
365impl StopWords {
366 pub fn new() -> Self {
368 let mut stop_words = HashMap::new();
369
370 stop_words.insert(
372 Language::English,
373 vec![
374 "a", "an", "and", "are", "as", "at", "be", "by", "for", "from", "has", "he", "in",
375 "is", "it", "its", "of", "on", "that", "the", "to", "was", "will", "with", "you",
376 "your", "this", "have", "had", "been", "but", "not", "they", "were", "what",
377 "when", "where", "who", "which", "their", "them", "these", "those", "there",
378 "here", "than",
379 ]
380 .iter()
381 .map(|s| s.to_string())
382 .collect(),
383 );
384
385 stop_words.insert(
387 Language::Spanish,
388 vec![
389 "a", "al", "algo", "algunas", "algunos", "ante", "antes", "como", "con", "contra",
390 "cual", "cuando", "de", "del", "desde", "donde", "durante", "e", "el", "ella",
391 "ellas", "ellos", "en", "entre", "era", "erais", "eran", "eras", "eres", "es",
392 "esa", "esas", "ese", "eso", "esos", "esta", "estas", "este", "esto", "estos",
393 "fue", "fueron", "fui", "la", "las", "lo", "los", "más", "mi", "mis", "mucho",
394 "muchos", "muy", "ni", "no", "nos", "nosotras", "nosotros", "o", "otra", "otras",
395 "otro", "otros", "para", "pero", "por", "porque", "que", "quien", "quienes", "se",
396 "si", "sin", "sobre", "su", "sus", "también", "tanto", "te", "tu", "tus", "un",
397 "una", "uno", "unos", "y", "ya", "yo",
398 ]
399 .iter()
400 .map(|s| s.to_string())
401 .collect(),
402 );
403
404 stop_words.insert(
406 Language::French,
407 vec![
408 "au", "aux", "avec", "ce", "ces", "dans", "de", "des", "du", "elle", "en", "et",
409 "eux", "il", "je", "la", "le", "les", "leur", "lui", "ma", "mais", "me", "même",
410 "mes", "moi", "mon", "ne", "nos", "notre", "nous", "on", "ou", "par", "pas",
411 "pour", "qu", "que", "qui", "sa", "se", "ses", "son", "sur", "ta", "te", "tes",
412 "toi", "ton", "tu", "un", "une", "vos", "votre", "vous",
413 ]
414 .iter()
415 .map(|s| s.to_string())
416 .collect(),
417 );
418
419 Self { stop_words }
420 }
421
422 pub fn get(&self, language: Language) -> Option<&Vec<String>> {
424 self.stop_words.get(&language)
425 }
426
427 pub fn is_stop_word(&self, word: &str, language: Language) -> bool {
429 if let Some(words) = self.stop_words.get(&language) {
430 words.iter().any(|sw| sw == &word.to_lowercase())
431 } else {
432 false
433 }
434 }
435
436 pub fn remove_stop_words(&self, tokens: &[String], language: Language) -> Vec<String> {
438 tokens
439 .iter()
440 .filter(|token| !self.is_stop_word(token, language))
441 .cloned()
442 .collect()
443 }
444}
445
446impl Default for StopWords {
447 fn default() -> Self {
448 Self::new()
449 }
450}
451
452pub struct MultilingualProcessor {
454 detector: LanguageDetector,
456 stop_words: StopWords,
458}
459
460impl MultilingualProcessor {
461 pub fn new() -> Self {
463 Self {
464 detector: LanguageDetector::new(),
465 stop_words: StopWords::new(),
466 }
467 }
468
469 pub fn process(&self, text: &str) -> Result<ProcessedText> {
471 let detection = self.detector.detect(text)?;
473
474 let tokens: Vec<String> = text.split_whitespace().map(|s| s.to_string()).collect();
476
477 let filtered_tokens = self
479 .stop_words
480 .remove_stop_words(&tokens, detection.language);
481
482 Ok(ProcessedText {
483 original: text.to_string(),
484 language: detection.language,
485 confidence: detection.confidence,
486 tokens,
487 filtered_tokens,
488 })
489 }
490}
491
492impl Default for MultilingualProcessor {
493 fn default() -> Self {
494 Self::new()
495 }
496}
497
498#[derive(Debug, Clone)]
500pub struct ProcessedText {
501 pub original: String,
503 pub language: Language,
505 pub confidence: f64,
507 pub tokens: Vec<String>,
509 pub filtered_tokens: Vec<String>,
511}
512
#[cfg(test)]
mod tests {
    use super::*;

    /// Codes, names and code parsing for a handful of variants.
    #[test]
    fn test_language_enum() {
        let english_code = Language::English.iso_code();
        let spanish_name = Language::Spanish.name();

        assert_eq!(english_code, "en");
        assert_eq!(spanish_name, "Spanish");
        assert_eq!(Language::from_iso_code("fr"), Language::French);
        assert_eq!(Language::from_iso_code("unknown"), Language::Unknown);
    }

    /// A clearly English sentence is labelled English; empty input is an error.
    #[test]
    fn test_language_detection() {
        let detector = LanguageDetector::new();
        let sample = "The quick brown fox jumps over the lazy dog. This is definitely an English sentence with many common words.";

        let detected = detector.detect(sample).unwrap();
        assert_eq!(detected.language, Language::English);

        assert!(detector.detect("").is_err());
    }

    /// Membership checks and token filtering for the English list.
    #[test]
    fn test_stop_words() {
        let stop_words = StopWords::new();

        for word in ["the", "and"].iter() {
            assert!(stop_words.is_stop_word(word, Language::English));
        }
        assert!(!stop_words.is_stop_word("hello", Language::English));

        let tokens: Vec<String> = ["the", "cat", "is", "happy"]
            .iter()
            .map(|token| token.to_string())
            .collect();
        let filtered = stop_words.remove_stop_words(&tokens, Language::English);
        assert_eq!(filtered, vec!["cat", "happy"]);
    }

    /// The pipeline labels English text and removes at least one token.
    #[test]
    fn test_multilingual_processor() {
        let processor = MultilingualProcessor::new();
        let sample = "The quick brown fox jumps over the lazy dog. This sentence has many English words.";

        let result = processor.process(sample).unwrap();
        assert_eq!(result.language, Language::English);
        assert!(!result.tokens.is_empty());
        assert!(result.filtered_tokens.len() < result.tokens.len());
    }

    /// The text profile is non-empty and contains an expected trigram.
    #[test]
    fn test_createtext_profile() {
        let detector = LanguageDetector::new();
        let profile = detector.createtext_profile("hello world");

        assert!(!profile.is_empty());
        let has_expected_trigram = profile.contains_key("hel") || profile.contains_key("llo");
        assert!(has_expected_trigram);
    }
}