1use crate::error::{Result, TextError};
7use std::collections::HashMap;
8
/// Languages that the text utilities in this module can label.
///
/// `Unknown` is the fallback used when a language code is not recognized.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Language {
    English,
    Spanish,
    French,
    German,
    Italian,
    Portuguese,
    Dutch,
    Russian,
    Chinese,
    Japanese,
    Korean,
    Arabic,
    Unknown,
}

impl Language {
    /// Every variant except `Unknown`, in declaration order.
    const KNOWN: [Language; 12] = [
        Language::English,
        Language::Spanish,
        Language::French,
        Language::German,
        Language::Italian,
        Language::Portuguese,
        Language::Dutch,
        Language::Russian,
        Language::Chinese,
        Language::Japanese,
        Language::Korean,
        Language::Arabic,
    ];

    /// Returns the lowercase language code for this variant (for example
    /// `"en"`); `Unknown` maps to `"und"`.
    pub fn iso_code(&self) -> &'static str {
        match *self {
            Self::English => "en",
            Self::Spanish => "es",
            Self::French => "fr",
            Self::German => "de",
            Self::Italian => "it",
            Self::Portuguese => "pt",
            Self::Dutch => "nl",
            Self::Russian => "ru",
            Self::Chinese => "zh",
            Self::Japanese => "ja",
            Self::Korean => "ko",
            Self::Arabic => "ar",
            Self::Unknown => "und",
        }
    }

    /// Parses a language code, ignoring letter case.
    ///
    /// Any code that does not belong to one of the known languages
    /// (including `"und"`) yields `Language::Unknown`.
    pub fn from_iso_code(code: &str) -> Self {
        let wanted = code.to_lowercase();
        Self::KNOWN
            .iter()
            .copied()
            .find(|language| language.iso_code() == wanted)
            .unwrap_or(Self::Unknown)
    }

    /// Returns the English display name of this variant.
    pub fn name(&self) -> &'static str {
        match *self {
            Self::English => "English",
            Self::Spanish => "Spanish",
            Self::French => "French",
            Self::German => "German",
            Self::Italian => "Italian",
            Self::Portuguese => "Portuguese",
            Self::Dutch => "Dutch",
            Self::Russian => "Russian",
            Self::Chinese => "Chinese",
            Self::Japanese => "Japanese",
            Self::Korean => "Korean",
            Self::Arabic => "Arabic",
            Self::Unknown => "Unknown",
        }
    }
}
98
99#[derive(Debug, Clone)]
101pub struct LanguageDetectionResult {
102 pub language: Language,
104 pub confidence: f64,
106 pub alternatives: Vec<(Language, f64)>,
108}
109
110pub struct LanguageDetector {
112 profiles: HashMap<Language, HashMap<String, f64>>,
114 n_gram_size: usize,
116}
117
118impl LanguageDetector {
119 pub fn new() -> Self {
121 let mut detector = Self {
122 profiles: HashMap::new(),
123 n_gram_size: 3,
124 };
125 detector.initialize_default_profiles();
126 detector
127 }
128
129 pub fn with_ngram_size(n_gramsize: usize) -> Result<Self> {
131 if !(1..=5).contains(&n_gramsize) {
132 return Err(TextError::InvalidInput(
133 "N-gram size must be between 1 and 5".to_string(),
134 ));
135 }
136 let mut detector = Self {
137 profiles: HashMap::new(),
138 n_gram_size: n_gramsize,
139 };
140 detector.initialize_default_profiles();
141 Ok(detector)
142 }
143
144 fn initialize_default_profiles(&mut self) {
146 let mut english_profile = HashMap::new();
148 for (ngram, freq) in &[
149 ("the", 0.05),
150 ("and", 0.03),
151 ("ing", 0.025),
152 ("ion", 0.02),
153 ("tio", 0.018),
154 ("ent", 0.015),
155 ("ati", 0.013),
156 ("her", 0.012),
157 ("for", 0.011),
158 ("ter", 0.01),
159 ("hat", 0.009),
160 ("tha", 0.009),
161 ("ere", 0.008),
162 ("ate", 0.008),
163 ("ver", 0.007),
164 ("his", 0.007),
165 ] {
166 english_profile.insert(ngram.to_string(), *freq);
167 }
168 self.profiles.insert(Language::English, english_profile);
169
170 let mut spanish_profile = HashMap::new();
172 for (ngram, freq) in &[
173 ("que", 0.04),
174 ("de_", 0.035),
175 ("la_", 0.03),
176 ("el_", 0.025),
177 ("es_", 0.02),
178 ("los", 0.018),
179 ("las", 0.015),
180 ("ión", 0.013),
181 ("ado", 0.012),
182 ("nte", 0.011),
183 ("con", 0.01),
184 ("par", 0.009),
185 ("ara", 0.008),
186 ("una", 0.008),
187 ("por", 0.007),
188 ("est", 0.007),
189 ] {
190 spanish_profile.insert(ngram.to_string(), *freq);
191 }
192 self.profiles.insert(Language::Spanish, spanish_profile);
193
194 let mut french_profile = HashMap::new();
196 for (ngram, freq) in &[
197 ("de_", 0.05),
198 ("le_", 0.04),
199 ("que", 0.03),
200 ("les", 0.025),
201 ("la_", 0.02),
202 ("des", 0.018),
203 ("ent", 0.015),
204 ("ion", 0.013),
205 ("est", 0.012),
206 ("ait", 0.011),
207 ("pour", 0.01),
208 ("ais", 0.009),
209 ("ans", 0.008),
210 ("ont", 0.008),
211 ("une", 0.007),
212 ("qui", 0.007),
213 ] {
214 french_profile.insert(ngram.to_string(), *freq);
215 }
216 self.profiles.insert(Language::French, french_profile);
217
218 let mut german_profile = HashMap::new();
220 for (ngram, freq) in &[
221 ("der", 0.05),
222 ("die", 0.04),
223 ("und", 0.03),
224 ("den", 0.025),
225 ("das", 0.02),
226 ("ein", 0.018),
227 ("ich", 0.015),
228 ("ist", 0.013),
229 ("sch", 0.012),
230 ("cht", 0.011),
231 ("ung", 0.01),
232 ("gen", 0.009),
233 ("eit", 0.008),
234 ("ver", 0.008),
235 ("ber", 0.007),
236 ("ten", 0.007),
237 ] {
238 german_profile.insert(ngram.to_string(), *freq);
239 }
240 self.profiles.insert(Language::German, german_profile);
241
242 let mut italian_profile = HashMap::new();
244 for (ngram, freq) in &[
245 ("che", 0.05),
246 ("la_", 0.04),
247 ("il_", 0.03),
248 ("di_", 0.025),
249 ("del", 0.02),
250 ("le_", 0.018),
251 ("lla", 0.015),
252 ("per", 0.013),
253 ("ato", 0.012),
254 ("gli", 0.011),
255 ("sta", 0.01),
256 ("con", 0.009),
257 ("ent", 0.008),
258 ("ion", 0.008),
259 ("are", 0.007),
260 ("una", 0.007),
261 ] {
262 italian_profile.insert(ngram.to_string(), *freq);
263 }
264 self.profiles.insert(Language::Italian, italian_profile);
265
266 let mut portuguese_profile = HashMap::new();
268 for (ngram, freq) in &[
269 ("que", 0.05),
270 ("de_", 0.04),
271 ("os_", 0.03),
272 ("as_", 0.025),
273 ("da_", 0.02),
274 ("do_", 0.018),
275 ("ão_", 0.015),
276 ("ent", 0.013),
277 ("com", 0.012),
278 ("para", 0.011),
279 ("uma", 0.01),
280 ("est", 0.009),
281 ("nte", 0.008),
282 ("ção", 0.008),
283 ("por", 0.007),
284 ("não", 0.007),
285 ] {
286 portuguese_profile.insert(ngram.to_string(), *freq);
287 }
288 self.profiles
289 .insert(Language::Portuguese, portuguese_profile);
290
291 let mut dutch_profile = HashMap::new();
293 for (ngram, freq) in &[
294 ("de_", 0.05),
295 ("het", 0.04),
296 ("een", 0.03),
297 ("van", 0.025),
298 ("en_", 0.02),
299 ("dat", 0.018),
300 ("te_", 0.015),
301 ("op_", 0.013),
302 ("aar", 0.012),
303 ("oor", 0.011),
304 ("eer", 0.01),
305 ("sch", 0.009),
306 ("ver", 0.008),
307 ("ing", 0.008),
308 ("cht", 0.007),
309 ("ter", 0.007),
310 ] {
311 dutch_profile.insert(ngram.to_string(), *freq);
312 }
313 self.profiles.insert(Language::Dutch, dutch_profile);
314
315 let mut russian_profile = HashMap::new();
317 for (ngram, freq) in &[
318 ("что", 0.05),
319 ("ого", 0.04),
320 ("как", 0.03),
321 ("это", 0.025),
322 ("все", 0.02),
323 ("был", 0.018),
324 ("ени", 0.015),
325 ("ост", 0.013),
326 ("ова", 0.012),
327 ("про", 0.011),
328 ("сто", 0.01),
329 ("ого", 0.009),
330 ("при", 0.008),
331 ("ени", 0.008),
332 ("ать", 0.007),
333 ("ный", 0.007),
334 ] {
335 russian_profile.insert(ngram.to_string(), *freq);
336 }
337 self.profiles.insert(Language::Russian, russian_profile);
338
339 let mut chinese_profile = HashMap::new();
341 for (ngram, freq) in &[
342 ("的_", 0.06),
343 ("是_", 0.045),
344 ("了_", 0.035),
345 ("在_", 0.03),
346 ("和_", 0.025),
347 ("有_", 0.022),
348 ("我_", 0.02),
349 ("他_", 0.018),
350 ("不_", 0.016),
351 ("为_", 0.014),
352 ("这_", 0.013),
353 ("个_", 0.012),
354 ("们_", 0.011),
355 ("人_", 0.01),
356 ("要_", 0.009),
357 ("会_", 0.008),
358 ] {
359 chinese_profile.insert(ngram.to_string(), *freq);
360 }
361 self.profiles.insert(Language::Chinese, chinese_profile);
362
363 let mut japanese_profile = HashMap::new();
365 for (ngram, freq) in &[
366 ("の_", 0.05),
367 ("に_", 0.04),
368 ("は_", 0.035),
369 ("を_", 0.03),
370 ("た_", 0.025),
371 ("と_", 0.022),
372 ("が_", 0.02),
373 ("で_", 0.018),
374 ("る_", 0.016),
375 ("す_", 0.014),
376 ("い_", 0.013),
377 ("ます", 0.012),
378 ("した", 0.011),
379 ("して", 0.01),
380 ("です", 0.009),
381 ("ない", 0.008),
382 ] {
383 japanese_profile.insert(ngram.to_string(), *freq);
384 }
385 self.profiles.insert(Language::Japanese, japanese_profile);
386
387 let mut korean_profile = HashMap::new();
389 for (ngram, freq) in &[
390 ("의_", 0.05),
391 ("이_", 0.04),
392 ("가_", 0.035),
393 ("을_", 0.03),
394 ("는_", 0.025),
395 ("에_", 0.022),
396 ("하_", 0.02),
397 ("고_", 0.018),
398 ("다_", 0.016),
399 ("지_", 0.014),
400 ("한_", 0.013),
401 ("로_", 0.012),
402 ("서_", 0.011),
403 ("도_", 0.01),
404 ("와_", 0.009),
405 ("니_", 0.008),
406 ] {
407 korean_profile.insert(ngram.to_string(), *freq);
408 }
409 self.profiles.insert(Language::Korean, korean_profile);
410
411 let mut arabic_profile = HashMap::new();
413 for (ngram, freq) in &[
414 ("ال_", 0.06),
415 ("في_", 0.045),
416 ("من_", 0.035),
417 ("على", 0.03),
418 ("إلى", 0.025),
419 ("ها_", 0.022),
420 ("أن_", 0.02),
421 ("ما_", 0.018),
422 ("هو_", 0.016),
423 ("كان", 0.014),
424 ("هذا", 0.013),
425 ("عن_", 0.012),
426 ("بين", 0.011),
427 ("لا_", 0.01),
428 ("قد_", 0.009),
429 ("كل_", 0.008),
430 ] {
431 arabic_profile.insert(ngram.to_string(), *freq);
432 }
433 self.profiles.insert(Language::Arabic, arabic_profile);
434 }
435
436 pub fn detect(&self, text: &str) -> Result<LanguageDetectionResult> {
438 if text.trim().is_empty() {
439 return Err(TextError::InvalidInput(
440 "Cannot detect language of empty text".to_string(),
441 ));
442 }
443
444 let text_profile = self.createtext_profile(text);
446
447 let mut scores: Vec<(Language, f64)> = self
449 .profiles
450 .iter()
451 .map(|(lang, profile)| {
452 let score = self.calculate_similarity(&text_profile, profile);
453 (*lang, score)
454 })
455 .collect();
456
457 scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
459
460 if scores.is_empty() {
461 return Ok(LanguageDetectionResult {
462 language: Language::Unknown,
463 confidence: 0.0,
464 alternatives: vec![],
465 });
466 }
467
468 let best_score = scores[0].1;
469 let best_language = scores[0].0;
470
471 let confidence = if scores.len() > 1 {
473 let second_score = scores[1].1;
474 let diff = best_score - second_score;
475 (diff / best_score).clamp(0.0, 1.0)
477 } else {
478 best_score
479 };
480
481 Ok(LanguageDetectionResult {
482 language: best_language,
483 confidence,
484 alternatives: scores.into_iter().skip(1).take(3).collect(),
485 })
486 }
487
488 fn createtext_profile(&self, text: &str) -> HashMap<String, f64> {
490 let mut profile = HashMap::new();
491 let text_lower = text.to_lowercase();
492 let chars: Vec<char> = text_lower.chars().collect();
493 let total_ngrams = chars.len().saturating_sub(self.n_gram_size - 1) as f64;
494
495 if total_ngrams <= 0.0 {
496 return profile;
497 }
498
499 let mut ngram_counts: HashMap<String, usize> = HashMap::new();
501 for i in 0..=chars.len().saturating_sub(self.n_gram_size) {
502 let ngram: String = chars[i..i + self.n_gram_size].iter().collect();
503 let ngram = ngram.replace(' ', "_");
505 *ngram_counts.entry(ngram).or_insert(0) += 1;
506 }
507
508 for (ngram, count) in ngram_counts {
510 profile.insert(ngram, count as f64 / total_ngrams);
511 }
512
513 profile
514 }
515
516 fn calculate_similarity(
518 &self,
519 profile1: &HashMap<String, f64>,
520 profile2: &HashMap<String, f64>,
521 ) -> f64 {
522 let mut similarity = 0.0;
523 let mut total_weight = 0.0;
524
525 for (ngram, freq1) in profile1 {
527 if let Some(freq2) = profile2.get(ngram) {
528 similarity += freq1 * freq2;
529 }
530 total_weight += freq1 * freq1;
531 }
532
533 if total_weight > 0.0 {
534 similarity / total_weight.sqrt()
535 } else {
536 0.0
537 }
538 }
539
540 pub fn supported_languages(&self) -> Vec<Language> {
542 self.profiles.keys().copied().collect()
543 }
544}
545
546impl Default for LanguageDetector {
547 fn default() -> Self {
548 Self::new()
549 }
550}
551
552pub struct StopWords {
554 stop_words: HashMap<Language, Vec<String>>,
556}
557
558impl StopWords {
559 pub fn new() -> Self {
561 let mut stop_words = HashMap::new();
562
563 stop_words.insert(
565 Language::English,
566 vec![
567 "a", "an", "and", "are", "as", "at", "be", "by", "for", "from", "has", "he", "in",
568 "is", "it", "its", "of", "on", "that", "the", "to", "was", "will", "with", "you",
569 "your", "this", "have", "had", "been", "but", "not", "they", "were", "what",
570 "when", "where", "who", "which", "their", "them", "these", "those", "there",
571 "here", "than",
572 ]
573 .iter()
574 .map(|s| s.to_string())
575 .collect(),
576 );
577
578 stop_words.insert(
580 Language::Spanish,
581 vec![
582 "a", "al", "algo", "algunas", "algunos", "ante", "antes", "como", "con", "contra",
583 "cual", "cuando", "de", "del", "desde", "donde", "durante", "e", "el", "ella",
584 "ellas", "ellos", "en", "entre", "era", "erais", "eran", "eras", "eres", "es",
585 "esa", "esas", "ese", "eso", "esos", "esta", "estas", "este", "esto", "estos",
586 "fue", "fueron", "fui", "la", "las", "lo", "los", "más", "mi", "mis", "mucho",
587 "muchos", "muy", "ni", "no", "nos", "nosotras", "nosotros", "o", "otra", "otras",
588 "otro", "otros", "para", "pero", "por", "porque", "que", "quien", "quienes", "se",
589 "si", "sin", "sobre", "su", "sus", "también", "tanto", "te", "tu", "tus", "un",
590 "una", "uno", "unos", "y", "ya", "yo",
591 ]
592 .iter()
593 .map(|s| s.to_string())
594 .collect(),
595 );
596
597 stop_words.insert(
599 Language::French,
600 vec![
601 "au", "aux", "avec", "ce", "ces", "dans", "de", "des", "du", "elle", "en", "et",
602 "eux", "il", "je", "la", "le", "les", "leur", "lui", "ma", "mais", "me", "même",
603 "mes", "moi", "mon", "ne", "nos", "notre", "nous", "on", "ou", "par", "pas",
604 "pour", "qu", "que", "qui", "sa", "se", "ses", "son", "sur", "ta", "te", "tes",
605 "toi", "ton", "tu", "un", "une", "vos", "votre", "vous",
606 ]
607 .iter()
608 .map(|s| s.to_string())
609 .collect(),
610 );
611
612 Self { stop_words }
613 }
614
615 pub fn get(&self, language: Language) -> Option<&Vec<String>> {
617 self.stop_words.get(&language)
618 }
619
620 pub fn is_stop_word(&self, word: &str, language: Language) -> bool {
622 if let Some(words) = self.stop_words.get(&language) {
623 words.iter().any(|sw| sw == &word.to_lowercase())
624 } else {
625 false
626 }
627 }
628
629 pub fn remove_stop_words(&self, tokens: &[String], language: Language) -> Vec<String> {
631 tokens
632 .iter()
633 .filter(|token| !self.is_stop_word(token, language))
634 .cloned()
635 .collect()
636 }
637}
638
639impl Default for StopWords {
640 fn default() -> Self {
641 Self::new()
642 }
643}
644
645pub struct MultilingualProcessor {
647 detector: LanguageDetector,
649 stop_words: StopWords,
651}
652
653impl MultilingualProcessor {
654 pub fn new() -> Self {
656 Self {
657 detector: LanguageDetector::new(),
658 stop_words: StopWords::new(),
659 }
660 }
661
662 pub fn process(&self, text: &str) -> Result<ProcessedText> {
664 let detection = self.detector.detect(text)?;
666
667 let tokens: Vec<String> = text.split_whitespace().map(|s| s.to_string()).collect();
669
670 let filtered_tokens = self
672 .stop_words
673 .remove_stop_words(&tokens, detection.language);
674
675 Ok(ProcessedText {
676 original: text.to_string(),
677 language: detection.language,
678 confidence: detection.confidence,
679 tokens,
680 filtered_tokens,
681 })
682 }
683}
684
685impl Default for MultilingualProcessor {
686 fn default() -> Self {
687 Self::new()
688 }
689}
690
691#[derive(Debug, Clone)]
693pub struct ProcessedText {
694 pub original: String,
696 pub language: Language,
698 pub confidence: f64,
700 pub tokens: Vec<String>,
702 pub filtered_tokens: Vec<String>,
704}
705
#[cfg(test)]
mod tests {
    use super::*;

    /// Input for the detector test; expected to be classified as English.
    const DETECTION_SAMPLE: &str = "The quick brown fox jumps over the lazy dog. This is definitely an English sentence with many common words.";

    /// Input for the processor test; expected to be classified as English.
    const PROCESSOR_SAMPLE: &str =
        "The quick brown fox jumps over the lazy dog. This sentence has many English words.";

    /// Code, name and parsing helpers on `Language` agree with each other.
    #[test]
    fn test_language_enum() {
        let english = Language::English;
        let spanish = Language::Spanish;

        assert_eq!(english.iso_code(), "en");
        assert_eq!(spanish.name(), "Spanish");
        assert_eq!(Language::from_iso_code("fr"), Language::French);
        assert_eq!(Language::from_iso_code("unknown"), Language::Unknown);
    }

    /// English text is recognized and empty input is rejected.
    #[test]
    fn test_language_detection() {
        let detector = LanguageDetector::new();

        let detected = detector
            .detect(DETECTION_SAMPLE)
            .expect("Operation failed");
        assert_eq!(detected.language, Language::English);

        assert!(detector.detect("").is_err());
    }

    /// Stop-word lookup and filtering work for English.
    #[test]
    fn test_stop_words() {
        let stop_words = StopWords::new();

        for word in ["the", "and"].iter() {
            assert!(stop_words.is_stop_word(word, Language::English));
        }
        assert!(!stop_words.is_stop_word("hello", Language::English));

        let tokens: Vec<String> = ["the", "cat", "is", "happy"]
            .iter()
            .map(|token| token.to_string())
            .collect();
        let filtered = stop_words.remove_stop_words(&tokens, Language::English);
        assert_eq!(filtered, vec!["cat", "happy"]);
    }

    /// The processor detects English and drops at least one stop word.
    #[test]
    fn test_multilingual_processor() {
        let processor = MultilingualProcessor::new();

        let processed = processor
            .process(PROCESSOR_SAMPLE)
            .expect("Operation failed");

        assert_eq!(processed.language, Language::English);
        assert!(!processed.tokens.is_empty());
        assert!(processed.filtered_tokens.len() < processed.tokens.len());
    }

    /// The text profile contains trigrams taken from the input.
    #[test]
    fn test_createtext_profile() {
        let detector = LanguageDetector::new();
        let profile = detector.createtext_profile("hello world");

        assert!(!profile.is_empty());
        assert!(["hel", "llo"]
            .iter()
            .any(|ngram| profile.contains_key(*ngram)));
    }
}