Skip to main content

scirs2_text/
multilingual.rs

1//! Multilingual text processing and language detection
2//!
3//! This module provides functionality for detecting languages
4//! and processing text in multiple languages.
5
6use crate::error::{Result, TextError};
7use std::collections::HashMap;
8
/// Languages the detector can report, plus an `Unknown` fallback.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Language {
    /// English (`en`)
    English,
    /// Spanish (`es`)
    Spanish,
    /// French (`fr`)
    French,
    /// German (`de`)
    German,
    /// Italian (`it`)
    Italian,
    /// Portuguese (`pt`)
    Portuguese,
    /// Dutch (`nl`)
    Dutch,
    /// Russian (`ru`)
    Russian,
    /// Chinese (`zh`)
    Chinese,
    /// Japanese (`ja`)
    Japanese,
    /// Korean (`ko`)
    Korean,
    /// Arabic (`ar`)
    Arabic,
    /// Fallback used when none of the languages above applies (`und`)
    Unknown,
}

impl Language {
    /// Returns the ISO 639-1 code of the language.
    ///
    /// `Unknown` has no two-letter code and yields the three-letter
    /// placeholder `"und"` instead.
    pub fn iso_code(&self) -> &'static str {
        match *self {
            Self::English => "en",
            Self::Spanish => "es",
            Self::French => "fr",
            Self::German => "de",
            Self::Italian => "it",
            Self::Portuguese => "pt",
            Self::Dutch => "nl",
            Self::Russian => "ru",
            Self::Chinese => "zh",
            Self::Japanese => "ja",
            Self::Korean => "ko",
            Self::Arabic => "ar",
            Self::Unknown => "und",
        }
    }

    /// Looks a language up by its ISO 639-1 code.
    ///
    /// The code is lowercased before comparison, so `"FR"` and `"fr"` are
    /// equivalent. Anything that is not one of the twelve known two-letter
    /// codes (including `"und"`) maps to `Language::Unknown`.
    pub fn from_iso_code(code: &str) -> Self {
        // `Unknown` is intentionally absent: it is the fallback, not a parseable code.
        const KNOWN_CODES: [(&str, Language); 12] = [
            ("en", Language::English),
            ("es", Language::Spanish),
            ("fr", Language::French),
            ("de", Language::German),
            ("it", Language::Italian),
            ("pt", Language::Portuguese),
            ("nl", Language::Dutch),
            ("ru", Language::Russian),
            ("zh", Language::Chinese),
            ("ja", Language::Japanese),
            ("ko", Language::Korean),
            ("ar", Language::Arabic),
        ];

        let wanted = code.to_lowercase();
        KNOWN_CODES
            .iter()
            .find(|&&(iso, _)| iso == wanted)
            .map_or(Language::Unknown, |&(_, language)| language)
    }

    /// Returns the English display name of the language.
    pub fn name(&self) -> &'static str {
        match *self {
            Self::English => "English",
            Self::Spanish => "Spanish",
            Self::French => "French",
            Self::German => "German",
            Self::Italian => "Italian",
            Self::Portuguese => "Portuguese",
            Self::Dutch => "Dutch",
            Self::Russian => "Russian",
            Self::Chinese => "Chinese",
            Self::Japanese => "Japanese",
            Self::Korean => "Korean",
            Self::Arabic => "Arabic",
            Self::Unknown => "Unknown",
        }
    }
}
98
/// Result of language detection
///
/// Produced by `LanguageDetector::detect`: the top-scoring language, a
/// confidence value, and the next-best candidates.
#[derive(Debug, Clone)]
pub struct LanguageDetectionResult {
    /// The detected language (the candidate with the highest score)
    pub language: Language,
    /// Confidence score, intended to lie in 0.0 to 1.0; `detect` derives it
    /// from the relative gap between the best and second-best score
    pub confidence: f64,
    /// Alternative language candidates with their raw similarity scores;
    /// `detect` fills this with at most three runners-up, best first
    pub alternatives: Vec<(Language, f64)>,
}
109
/// Language detector using character n-gram profiles
///
/// Each supported language has a small table of n-grams with relative
/// weights; input text is scored against every table.
pub struct LanguageDetector {
    /// Character n-gram profiles for each language (n-gram -> weight)
    profiles: HashMap<Language, HashMap<String, f64>>,
    /// N-gram size (typically 2 or 3); the constructors accept 1 through 5
    /// and `new` uses 3
    n_gram_size: usize,
}
117
118impl LanguageDetector {
119    /// Create a new language detector with default profiles
120    pub fn new() -> Self {
121        let mut detector = Self {
122            profiles: HashMap::new(),
123            n_gram_size: 3,
124        };
125        detector.initialize_default_profiles();
126        detector
127    }
128
129    /// Create a language detector with custom n-gram size
130    pub fn with_ngram_size(n_gramsize: usize) -> Result<Self> {
131        if !(1..=5).contains(&n_gramsize) {
132            return Err(TextError::InvalidInput(
133                "N-gram size must be between 1 and 5".to_string(),
134            ));
135        }
136        let mut detector = Self {
137            profiles: HashMap::new(),
138            n_gram_size: n_gramsize,
139        };
140        detector.initialize_default_profiles();
141        Ok(detector)
142    }
143
144    /// Initialize default language profiles with common n-grams
145    fn initialize_default_profiles(&mut self) {
146        // English profile
147        let mut english_profile = HashMap::new();
148        for (ngram, freq) in &[
149            ("the", 0.05),
150            ("and", 0.03),
151            ("ing", 0.025),
152            ("ion", 0.02),
153            ("tio", 0.018),
154            ("ent", 0.015),
155            ("ati", 0.013),
156            ("her", 0.012),
157            ("for", 0.011),
158            ("ter", 0.01),
159            ("hat", 0.009),
160            ("tha", 0.009),
161            ("ere", 0.008),
162            ("ate", 0.008),
163            ("ver", 0.007),
164            ("his", 0.007),
165        ] {
166            english_profile.insert(ngram.to_string(), *freq);
167        }
168        self.profiles.insert(Language::English, english_profile);
169
170        // Spanish profile
171        let mut spanish_profile = HashMap::new();
172        for (ngram, freq) in &[
173            ("que", 0.04),
174            ("de_", 0.035),
175            ("la_", 0.03),
176            ("el_", 0.025),
177            ("es_", 0.02),
178            ("los", 0.018),
179            ("las", 0.015),
180            ("ión", 0.013),
181            ("ado", 0.012),
182            ("nte", 0.011),
183            ("con", 0.01),
184            ("par", 0.009),
185            ("ara", 0.008),
186            ("una", 0.008),
187            ("por", 0.007),
188            ("est", 0.007),
189        ] {
190            spanish_profile.insert(ngram.to_string(), *freq);
191        }
192        self.profiles.insert(Language::Spanish, spanish_profile);
193
194        // French profile
195        let mut french_profile = HashMap::new();
196        for (ngram, freq) in &[
197            ("de_", 0.05),
198            ("le_", 0.04),
199            ("que", 0.03),
200            ("les", 0.025),
201            ("la_", 0.02),
202            ("des", 0.018),
203            ("ent", 0.015),
204            ("ion", 0.013),
205            ("est", 0.012),
206            ("ait", 0.011),
207            ("pour", 0.01),
208            ("ais", 0.009),
209            ("ans", 0.008),
210            ("ont", 0.008),
211            ("une", 0.007),
212            ("qui", 0.007),
213        ] {
214            french_profile.insert(ngram.to_string(), *freq);
215        }
216        self.profiles.insert(Language::French, french_profile);
217
218        // German profile
219        let mut german_profile = HashMap::new();
220        for (ngram, freq) in &[
221            ("der", 0.05),
222            ("die", 0.04),
223            ("und", 0.03),
224            ("den", 0.025),
225            ("das", 0.02),
226            ("ein", 0.018),
227            ("ich", 0.015),
228            ("ist", 0.013),
229            ("sch", 0.012),
230            ("cht", 0.011),
231            ("ung", 0.01),
232            ("gen", 0.009),
233            ("eit", 0.008),
234            ("ver", 0.008),
235            ("ber", 0.007),
236            ("ten", 0.007),
237        ] {
238            german_profile.insert(ngram.to_string(), *freq);
239        }
240        self.profiles.insert(Language::German, german_profile);
241
242        // Italian profile
243        let mut italian_profile = HashMap::new();
244        for (ngram, freq) in &[
245            ("che", 0.05),
246            ("la_", 0.04),
247            ("il_", 0.03),
248            ("di_", 0.025),
249            ("del", 0.02),
250            ("le_", 0.018),
251            ("lla", 0.015),
252            ("per", 0.013),
253            ("ato", 0.012),
254            ("gli", 0.011),
255            ("sta", 0.01),
256            ("con", 0.009),
257            ("ent", 0.008),
258            ("ion", 0.008),
259            ("are", 0.007),
260            ("una", 0.007),
261        ] {
262            italian_profile.insert(ngram.to_string(), *freq);
263        }
264        self.profiles.insert(Language::Italian, italian_profile);
265
266        // Portuguese profile
267        let mut portuguese_profile = HashMap::new();
268        for (ngram, freq) in &[
269            ("que", 0.05),
270            ("de_", 0.04),
271            ("os_", 0.03),
272            ("as_", 0.025),
273            ("da_", 0.02),
274            ("do_", 0.018),
275            ("ão_", 0.015),
276            ("ent", 0.013),
277            ("com", 0.012),
278            ("para", 0.011),
279            ("uma", 0.01),
280            ("est", 0.009),
281            ("nte", 0.008),
282            ("ção", 0.008),
283            ("por", 0.007),
284            ("não", 0.007),
285        ] {
286            portuguese_profile.insert(ngram.to_string(), *freq);
287        }
288        self.profiles
289            .insert(Language::Portuguese, portuguese_profile);
290
291        // Dutch profile
292        let mut dutch_profile = HashMap::new();
293        for (ngram, freq) in &[
294            ("de_", 0.05),
295            ("het", 0.04),
296            ("een", 0.03),
297            ("van", 0.025),
298            ("en_", 0.02),
299            ("dat", 0.018),
300            ("te_", 0.015),
301            ("op_", 0.013),
302            ("aar", 0.012),
303            ("oor", 0.011),
304            ("eer", 0.01),
305            ("sch", 0.009),
306            ("ver", 0.008),
307            ("ing", 0.008),
308            ("cht", 0.007),
309            ("ter", 0.007),
310        ] {
311            dutch_profile.insert(ngram.to_string(), *freq);
312        }
313        self.profiles.insert(Language::Dutch, dutch_profile);
314
315        // Russian profile
316        let mut russian_profile = HashMap::new();
317        for (ngram, freq) in &[
318            ("что", 0.05),
319            ("ого", 0.04),
320            ("как", 0.03),
321            ("это", 0.025),
322            ("все", 0.02),
323            ("был", 0.018),
324            ("ени", 0.015),
325            ("ост", 0.013),
326            ("ова", 0.012),
327            ("про", 0.011),
328            ("сто", 0.01),
329            ("ого", 0.009),
330            ("при", 0.008),
331            ("ени", 0.008),
332            ("ать", 0.007),
333            ("ный", 0.007),
334        ] {
335            russian_profile.insert(ngram.to_string(), *freq);
336        }
337        self.profiles.insert(Language::Russian, russian_profile);
338
339        // Chinese profile (using pinyin representation)
340        let mut chinese_profile = HashMap::new();
341        for (ngram, freq) in &[
342            ("的_", 0.06),
343            ("是_", 0.045),
344            ("了_", 0.035),
345            ("在_", 0.03),
346            ("和_", 0.025),
347            ("有_", 0.022),
348            ("我_", 0.02),
349            ("他_", 0.018),
350            ("不_", 0.016),
351            ("为_", 0.014),
352            ("这_", 0.013),
353            ("个_", 0.012),
354            ("们_", 0.011),
355            ("人_", 0.01),
356            ("要_", 0.009),
357            ("会_", 0.008),
358        ] {
359            chinese_profile.insert(ngram.to_string(), *freq);
360        }
361        self.profiles.insert(Language::Chinese, chinese_profile);
362
363        // Japanese profile (using hiragana/katakana)
364        let mut japanese_profile = HashMap::new();
365        for (ngram, freq) in &[
366            ("の_", 0.05),
367            ("に_", 0.04),
368            ("は_", 0.035),
369            ("を_", 0.03),
370            ("た_", 0.025),
371            ("と_", 0.022),
372            ("が_", 0.02),
373            ("で_", 0.018),
374            ("る_", 0.016),
375            ("す_", 0.014),
376            ("い_", 0.013),
377            ("ます", 0.012),
378            ("した", 0.011),
379            ("して", 0.01),
380            ("です", 0.009),
381            ("ない", 0.008),
382        ] {
383            japanese_profile.insert(ngram.to_string(), *freq);
384        }
385        self.profiles.insert(Language::Japanese, japanese_profile);
386
387        // Korean profile (using Hangul)
388        let mut korean_profile = HashMap::new();
389        for (ngram, freq) in &[
390            ("의_", 0.05),
391            ("이_", 0.04),
392            ("가_", 0.035),
393            ("을_", 0.03),
394            ("는_", 0.025),
395            ("에_", 0.022),
396            ("하_", 0.02),
397            ("고_", 0.018),
398            ("다_", 0.016),
399            ("지_", 0.014),
400            ("한_", 0.013),
401            ("로_", 0.012),
402            ("서_", 0.011),
403            ("도_", 0.01),
404            ("와_", 0.009),
405            ("니_", 0.008),
406        ] {
407            korean_profile.insert(ngram.to_string(), *freq);
408        }
409        self.profiles.insert(Language::Korean, korean_profile);
410
411        // Arabic profile (using Arabic script)
412        let mut arabic_profile = HashMap::new();
413        for (ngram, freq) in &[
414            ("ال_", 0.06),
415            ("في_", 0.045),
416            ("من_", 0.035),
417            ("على", 0.03),
418            ("إلى", 0.025),
419            ("ها_", 0.022),
420            ("أن_", 0.02),
421            ("ما_", 0.018),
422            ("هو_", 0.016),
423            ("كان", 0.014),
424            ("هذا", 0.013),
425            ("عن_", 0.012),
426            ("بين", 0.011),
427            ("لا_", 0.01),
428            ("قد_", 0.009),
429            ("كل_", 0.008),
430        ] {
431            arabic_profile.insert(ngram.to_string(), *freq);
432        }
433        self.profiles.insert(Language::Arabic, arabic_profile);
434    }
435
436    /// Detect the language of a text
437    pub fn detect(&self, text: &str) -> Result<LanguageDetectionResult> {
438        if text.trim().is_empty() {
439            return Err(TextError::InvalidInput(
440                "Cannot detect language of empty text".to_string(),
441            ));
442        }
443
444        // Extract n-grams from the text
445        let text_profile = self.createtext_profile(text);
446
447        // Score each language profile
448        let mut scores: Vec<(Language, f64)> = self
449            .profiles
450            .iter()
451            .map(|(lang, profile)| {
452                let score = self.calculate_similarity(&text_profile, profile);
453                (*lang, score)
454            })
455            .collect();
456
457        // Sort by score (descending)
458        scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
459
460        if scores.is_empty() {
461            return Ok(LanguageDetectionResult {
462                language: Language::Unknown,
463                confidence: 0.0,
464                alternatives: vec![],
465            });
466        }
467
468        let best_score = scores[0].1;
469        let best_language = scores[0].0;
470
471        // Calculate confidence based on the difference between top scores
472        let confidence = if scores.len() > 1 {
473            let second_score = scores[1].1;
474            let diff = best_score - second_score;
475            // Normalize confidence to [0, 1]
476            (diff / best_score).clamp(0.0, 1.0)
477        } else {
478            best_score
479        };
480
481        Ok(LanguageDetectionResult {
482            language: best_language,
483            confidence,
484            alternatives: scores.into_iter().skip(1).take(3).collect(),
485        })
486    }
487
488    /// Create n-gram profile for a text
489    fn createtext_profile(&self, text: &str) -> HashMap<String, f64> {
490        let mut profile = HashMap::new();
491        let text_lower = text.to_lowercase();
492        let chars: Vec<char> = text_lower.chars().collect();
493        let total_ngrams = chars.len().saturating_sub(self.n_gram_size - 1) as f64;
494
495        if total_ngrams <= 0.0 {
496            return profile;
497        }
498
499        // Count n-grams
500        let mut ngram_counts: HashMap<String, usize> = HashMap::new();
501        for i in 0..=chars.len().saturating_sub(self.n_gram_size) {
502            let ngram: String = chars[i..i + self.n_gram_size].iter().collect();
503            // Replace spaces with underscores for consistency
504            let ngram = ngram.replace(' ', "_");
505            *ngram_counts.entry(ngram).or_insert(0) += 1;
506        }
507
508        // Convert counts to frequencies
509        for (ngram, count) in ngram_counts {
510            profile.insert(ngram, count as f64 / total_ngrams);
511        }
512
513        profile
514    }
515
516    /// Calculate similarity between two n-gram profiles
517    fn calculate_similarity(
518        &self,
519        profile1: &HashMap<String, f64>,
520        profile2: &HashMap<String, f64>,
521    ) -> f64 {
522        let mut similarity = 0.0;
523        let mut total_weight = 0.0;
524
525        // Use cosine similarity
526        for (ngram, freq1) in profile1 {
527            if let Some(freq2) = profile2.get(ngram) {
528                similarity += freq1 * freq2;
529            }
530            total_weight += freq1 * freq1;
531        }
532
533        if total_weight > 0.0 {
534            similarity / total_weight.sqrt()
535        } else {
536            0.0
537        }
538    }
539
540    /// Get supported languages
541    pub fn supported_languages(&self) -> Vec<Language> {
542        self.profiles.keys().copied().collect()
543    }
544}
545
546impl Default for LanguageDetector {
547    fn default() -> Self {
548        Self::new()
549    }
550}
551
/// Language-specific stop words
///
/// Holds one list of lowercase stop words per language; languages without
/// an entry simply have no stop words.
pub struct StopWords {
    /// Stop words organized by language
    stop_words: HashMap<Language, Vec<String>>,
}
557
558impl StopWords {
559    /// Create a new stop words collection
560    pub fn new() -> Self {
561        let mut stop_words = HashMap::new();
562
563        // English stop words
564        stop_words.insert(
565            Language::English,
566            vec![
567                "a", "an", "and", "are", "as", "at", "be", "by", "for", "from", "has", "he", "in",
568                "is", "it", "its", "of", "on", "that", "the", "to", "was", "will", "with", "you",
569                "your", "this", "have", "had", "been", "but", "not", "they", "were", "what",
570                "when", "where", "who", "which", "their", "them", "these", "those", "there",
571                "here", "than",
572            ]
573            .iter()
574            .map(|s| s.to_string())
575            .collect(),
576        );
577
578        // Spanish stop words
579        stop_words.insert(
580            Language::Spanish,
581            vec![
582                "a", "al", "algo", "algunas", "algunos", "ante", "antes", "como", "con", "contra",
583                "cual", "cuando", "de", "del", "desde", "donde", "durante", "e", "el", "ella",
584                "ellas", "ellos", "en", "entre", "era", "erais", "eran", "eras", "eres", "es",
585                "esa", "esas", "ese", "eso", "esos", "esta", "estas", "este", "esto", "estos",
586                "fue", "fueron", "fui", "la", "las", "lo", "los", "más", "mi", "mis", "mucho",
587                "muchos", "muy", "ni", "no", "nos", "nosotras", "nosotros", "o", "otra", "otras",
588                "otro", "otros", "para", "pero", "por", "porque", "que", "quien", "quienes", "se",
589                "si", "sin", "sobre", "su", "sus", "también", "tanto", "te", "tu", "tus", "un",
590                "una", "uno", "unos", "y", "ya", "yo",
591            ]
592            .iter()
593            .map(|s| s.to_string())
594            .collect(),
595        );
596
597        // French stop words
598        stop_words.insert(
599            Language::French,
600            vec![
601                "au", "aux", "avec", "ce", "ces", "dans", "de", "des", "du", "elle", "en", "et",
602                "eux", "il", "je", "la", "le", "les", "leur", "lui", "ma", "mais", "me", "même",
603                "mes", "moi", "mon", "ne", "nos", "notre", "nous", "on", "ou", "par", "pas",
604                "pour", "qu", "que", "qui", "sa", "se", "ses", "son", "sur", "ta", "te", "tes",
605                "toi", "ton", "tu", "un", "une", "vos", "votre", "vous",
606            ]
607            .iter()
608            .map(|s| s.to_string())
609            .collect(),
610        );
611
612        Self { stop_words }
613    }
614
615    /// Get stop words for a specific language
616    pub fn get(&self, language: Language) -> Option<&Vec<String>> {
617        self.stop_words.get(&language)
618    }
619
620    /// Check if a word is a stop word in a specific language
621    pub fn is_stop_word(&self, word: &str, language: Language) -> bool {
622        if let Some(words) = self.stop_words.get(&language) {
623            words.iter().any(|sw| sw == &word.to_lowercase())
624        } else {
625            false
626        }
627    }
628
629    /// Remove stop words from a list of tokens
630    pub fn remove_stop_words(&self, tokens: &[String], language: Language) -> Vec<String> {
631        tokens
632            .iter()
633            .filter(|token| !self.is_stop_word(token, language))
634            .cloned()
635            .collect()
636    }
637}
638
639impl Default for StopWords {
640    fn default() -> Self {
641        Self::new()
642    }
643}
644
/// Language-specific text processor
///
/// Combines language detection with stop word removal for the detected
/// language.
pub struct MultilingualProcessor {
    /// Language detector
    detector: LanguageDetector,
    /// Stop words collection
    stop_words: StopWords,
}
652
653impl MultilingualProcessor {
654    /// Create a new multilingual processor
655    pub fn new() -> Self {
656        Self {
657            detector: LanguageDetector::new(),
658            stop_words: StopWords::new(),
659        }
660    }
661
662    /// Process text with automatic language detection
663    pub fn process(&self, text: &str) -> Result<ProcessedText> {
664        // Detect language
665        let detection = self.detector.detect(text)?;
666
667        // Tokenize (simple whitespace tokenization for now)
668        let tokens: Vec<String> = text.split_whitespace().map(|s| s.to_string()).collect();
669
670        // Remove stop words
671        let filtered_tokens = self
672            .stop_words
673            .remove_stop_words(&tokens, detection.language);
674
675        Ok(ProcessedText {
676            original: text.to_string(),
677            language: detection.language,
678            confidence: detection.confidence,
679            tokens,
680            filtered_tokens,
681        })
682    }
683}
684
685impl Default for MultilingualProcessor {
686    fn default() -> Self {
687        Self::new()
688    }
689}
690
/// Result of multilingual text processing
///
/// Returned by `MultilingualProcessor::process`.
#[derive(Debug, Clone)]
pub struct ProcessedText {
    /// Original text, copied unchanged from the input
    pub original: String,
    /// Detected language
    pub language: Language,
    /// Language detection confidence, as reported by the detector
    pub confidence: f64,
    /// All tokens (the input split on whitespace)
    pub tokens: Vec<String>,
    /// Tokens after stop word removal for the detected language
    pub filtered_tokens: Vec<String>,
}
705
#[cfg(test)]
mod tests {
    use super::*;

    /// ISO code and display-name lookups on `Language`.
    #[test]
    fn test_language_enum() {
        let english_code = Language::English.iso_code();
        let spanish_name = Language::Spanish.name();
        assert_eq!(english_code, "en");
        assert_eq!(spanish_name, "Spanish");

        let from_known = Language::from_iso_code("fr");
        let from_unknown = Language::from_iso_code("unknown");
        assert_eq!(from_known, Language::French);
        assert_eq!(from_unknown, Language::Unknown);
    }

    /// A long English sentence is detected as English; empty input is rejected.
    #[test]
    fn test_language_detection() {
        let detector = LanguageDetector::new();

        // Test English detection with more text
        let sample = "The quick brown fox jumps over the lazy dog. This is definitely an English sentence with many common words.";
        let detected = detector.detect(sample).expect("Operation failed");
        assert_eq!(detected.language, Language::English);

        // Test with empty text
        assert!(detector.detect("").is_err());
    }

    /// Stop word membership checks and token filtering for English.
    #[test]
    fn test_stop_words() {
        let stop_words = StopWords::new();

        // Test English stop words
        for word in ["the", "and"] {
            assert!(stop_words.is_stop_word(word, Language::English));
        }
        assert!(!stop_words.is_stop_word("hello", Language::English));

        // Test stop word removal
        let tokens: Vec<String> = ["the", "cat", "is", "happy"]
            .iter()
            .map(|s| s.to_string())
            .collect();
        let filtered = stop_words.remove_stop_words(&tokens, Language::English);
        assert_eq!(filtered, vec!["cat", "happy"]);
    }

    /// End-to-end processing detects English and drops at least one stop word.
    #[test]
    fn test_multilingual_processor() {
        let processor = MultilingualProcessor::new();

        let sample =
            "The quick brown fox jumps over the lazy dog. This sentence has many English words.";
        let processed = processor.process(sample).expect("Operation failed");
        assert_eq!(processed.language, Language::English);
        assert!(!processed.tokens.is_empty());
        assert!(processed.filtered_tokens.len() < processed.tokens.len());
    }

    /// The text profile of a short phrase contains the expected trigrams.
    #[test]
    fn test_createtext_profile() {
        let detector = LanguageDetector::new();
        let profile = detector.createtext_profile("hello world");

        // Check that profile contains some n-grams
        assert!(!profile.is_empty());
        let has_expected_trigram = profile.contains_key("hel") || profile.contains_key("llo");
        assert!(has_expected_trigram);
    }
}