scirs2_text/
multilingual.rs

//! Multilingual text processing and language detection
//!
//! This module provides functionality for detecting languages
//! and processing text in multiple languages.
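//!
//! # Example
//!
//! A minimal usage sketch (illustrative; it assumes the crate exposes this
//! module as `scirs2_text::multilingual`, which may differ in the actual
//! crate layout, so the snippet is not compiled as a doctest):
//!
//! ```ignore
//! use scirs2_text::multilingual::{Language, LanguageDetector};
//!
//! let detector = LanguageDetector::new();
//! let result = detector
//!     .detect("The quick brown fox jumps over the lazy dog and the cat")
//!     .unwrap();
//! println!("{} ({:.2})", result.language.name(), result.confidence);
//! assert_ne!(result.language, Language::Unknown);
//! ```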

use crate::error::{Result, TextError};
use std::collections::HashMap;

/// Supported languages for detection
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Language {
    /// English
    English,
    /// Spanish
    Spanish,
    /// French
    French,
    /// German
    German,
    /// Italian
    Italian,
    /// Portuguese
    Portuguese,
    /// Dutch
    Dutch,
    /// Russian
    Russian,
    /// Chinese
    Chinese,
    /// Japanese
    Japanese,
    /// Korean
    Korean,
    /// Arabic
    Arabic,
    /// Unknown language
    Unknown,
}

impl Language {
    /// Get the ISO 639-1 code for the language
    pub fn iso_code(&self) -> &'static str {
        match self {
            Language::English => "en",
            Language::Spanish => "es",
            Language::French => "fr",
            Language::German => "de",
            Language::Italian => "it",
            Language::Portuguese => "pt",
            Language::Dutch => "nl",
            Language::Russian => "ru",
            Language::Chinese => "zh",
            Language::Japanese => "ja",
            Language::Korean => "ko",
            Language::Arabic => "ar",
            Language::Unknown => "und",
        }
    }

    /// Get the language from ISO 639-1 code
    pub fn from_iso_code(code: &str) -> Self {
        match code.to_lowercase().as_str() {
            "en" => Language::English,
            "es" => Language::Spanish,
            "fr" => Language::French,
            "de" => Language::German,
            "it" => Language::Italian,
            "pt" => Language::Portuguese,
            "nl" => Language::Dutch,
            "ru" => Language::Russian,
            "zh" => Language::Chinese,
            "ja" => Language::Japanese,
            "ko" => Language::Korean,
            "ar" => Language::Arabic,
            _ => Language::Unknown,
        }
    }

    /// Get the full name of the language
    pub fn name(&self) -> &'static str {
        match self {
            Language::English => "English",
            Language::Spanish => "Spanish",
            Language::French => "French",
            Language::German => "German",
            Language::Italian => "Italian",
            Language::Portuguese => "Portuguese",
            Language::Dutch => "Dutch",
            Language::Russian => "Russian",
            Language::Chinese => "Chinese",
            Language::Japanese => "Japanese",
            Language::Korean => "Korean",
            Language::Arabic => "Arabic",
            Language::Unknown => "Unknown",
        }
    }
}

/// Result of language detection
#[derive(Debug, Clone)]
pub struct LanguageDetectionResult {
    /// The detected language
    pub language: Language,
    /// Confidence score (0.0 to 1.0)
    pub confidence: f64,
    /// Alternative language candidates with scores
    pub alternatives: Vec<(Language, f64)>,
}

/// Language detector using character n-gram profiles
pub struct LanguageDetector {
    /// Character n-gram profiles for each language
    profiles: HashMap<Language, HashMap<String, f64>>,
    /// N-gram size (typically 2 or 3)
    n_gram_size: usize,
}

impl LanguageDetector {
    /// Create a new language detector with default profiles
    pub fn new() -> Self {
        let mut detector = Self {
            profiles: HashMap::new(),
            n_gram_size: 3,
        };
        detector.initialize_default_profiles();
        detector
    }

    /// Create a language detector with custom n-gram size
    pub fn with_ngram_size(n_gram_size: usize) -> Result<Self> {
        if !(1..=5).contains(&n_gram_size) {
            return Err(TextError::InvalidInput(
                "N-gram size must be between 1 and 5".to_string(),
            ));
        }
        let mut detector = Self {
            profiles: HashMap::new(),
            n_gram_size,
        };
        detector.initialize_default_profiles();
        Ok(detector)
    }

    /// Initialize default language profiles with common n-grams
    fn initialize_default_profiles(&mut self) {
        // English profile
        let mut english_profile = HashMap::new();
        for (ngram, freq) in &[
            ("the", 0.05),
            ("and", 0.03),
            ("ing", 0.025),
            ("ion", 0.02),
            ("tio", 0.018),
            ("ent", 0.015),
            ("ati", 0.013),
            ("her", 0.012),
            ("for", 0.011),
            ("ter", 0.01),
            ("hat", 0.009),
            ("tha", 0.009),
            ("ere", 0.008),
            ("ate", 0.008),
            ("ver", 0.007),
            ("his", 0.007),
        ] {
            english_profile.insert(ngram.to_string(), *freq);
        }
        self.profiles.insert(Language::English, english_profile);

        // Spanish profile
        let mut spanish_profile = HashMap::new();
        for (ngram, freq) in &[
            ("que", 0.04),
            ("de_", 0.035),
            ("la_", 0.03),
            ("el_", 0.025),
            ("es_", 0.02),
            ("los", 0.018),
            ("las", 0.015),
            ("ión", 0.013),
            ("ado", 0.012),
            ("nte", 0.011),
            ("con", 0.01),
            ("par", 0.009),
            ("ara", 0.008),
            ("una", 0.008),
            ("por", 0.007),
            ("est", 0.007),
        ] {
            spanish_profile.insert(ngram.to_string(), *freq);
        }
        self.profiles.insert(Language::Spanish, spanish_profile);

        // French profile
        let mut french_profile = HashMap::new();
        for (ngram, freq) in &[
            ("de_", 0.05),
            ("le_", 0.04),
            ("que", 0.03),
            ("les", 0.025),
            ("la_", 0.02),
            ("des", 0.018),
            ("ent", 0.015),
            ("ion", 0.013),
            ("est", 0.012),
            ("ait", 0.011),
            ("pou", 0.01),
            ("ais", 0.009),
            ("ans", 0.008),
            ("ont", 0.008),
            ("une", 0.007),
            ("qui", 0.007),
        ] {
            french_profile.insert(ngram.to_string(), *freq);
        }
        self.profiles.insert(Language::French, french_profile);

        // German profile
        let mut german_profile = HashMap::new();
        for (ngram, freq) in &[
            ("der", 0.05),
            ("die", 0.04),
            ("und", 0.03),
            ("den", 0.025),
            ("das", 0.02),
            ("ein", 0.018),
            ("ich", 0.015),
            ("ist", 0.013),
            ("sch", 0.012),
            ("cht", 0.011),
            ("ung", 0.01),
            ("gen", 0.009),
            ("eit", 0.008),
            ("ver", 0.008),
            ("ber", 0.007),
            ("ten", 0.007),
        ] {
            german_profile.insert(ngram.to_string(), *freq);
        }
        self.profiles.insert(Language::German, german_profile);
    }

    /// Detect the language of a text
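    ///
    /// Returns an error for empty or whitespace-only input; otherwise the
    /// best-matching language, a confidence in `[0, 1]`, and up to three
    /// alternative candidates are reported.
    ///
    /// # Example
    ///
    /// A small sketch of the call pattern (module path assumed as in the
    /// crate-level example, so it is not compiled as a doctest):
    ///
    /// ```ignore
    /// let detector = LanguageDetector::new();
    /// let result = detector.detect("der die und den das ein ich ist").unwrap();
    /// assert_eq!(result.language, Language::German);
    /// ```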
    pub fn detect(&self, text: &str) -> Result<LanguageDetectionResult> {
        if text.trim().is_empty() {
            return Err(TextError::InvalidInput(
                "Cannot detect language of empty text".to_string(),
            ));
        }

        // Extract n-grams from the text
        let text_profile = self.create_text_profile(text);

        // Score each language profile
        let mut scores: Vec<(Language, f64)> = self
            .profiles
            .iter()
            .map(|(lang, profile)| {
                let score = self.calculate_similarity(&text_profile, profile);
                (*lang, score)
            })
            .collect();

        // Sort by score (descending)
        scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));

        if scores.is_empty() {
            return Ok(LanguageDetectionResult {
                language: Language::Unknown,
                confidence: 0.0,
                alternatives: vec![],
            });
        }

        let best_score = scores[0].1;
        let best_language = scores[0].0;

        // Calculate confidence based on the difference between top scores;
        // guard against a zero best score (no overlap with any profile),
        // which would otherwise produce a NaN confidence.
        let confidence = if best_score <= 0.0 {
            0.0
        } else if scores.len() > 1 {
            let second_score = scores[1].1;
            let diff = best_score - second_score;
            // Normalize confidence to [0, 1]
            (diff / best_score).clamp(0.0, 1.0)
        } else {
            best_score
        };

        Ok(LanguageDetectionResult {
            language: best_language,
            confidence,
            alternatives: scores.into_iter().skip(1).take(3).collect(),
        })
    }

    /// Create n-gram profile for a text
    fn create_text_profile(&self, text: &str) -> HashMap<String, f64> {
        let mut profile = HashMap::new();
        let text_lower = text.to_lowercase();
        let chars: Vec<char> = text_lower.chars().collect();
        let total_ngrams = chars.len().saturating_sub(self.n_gram_size - 1) as f64;

        if total_ngrams <= 0.0 {
            return profile;
        }

        // Count n-grams
        let mut ngram_counts: HashMap<String, usize> = HashMap::new();
        for i in 0..=chars.len().saturating_sub(self.n_gram_size) {
            let ngram: String = chars[i..i + self.n_gram_size].iter().collect();
            // Replace spaces with underscores for consistency
            let ngram = ngram.replace(' ', "_");
            *ngram_counts.entry(ngram).or_insert(0) += 1;
        }

        // Convert counts to frequencies
        for (ngram, count) in ngram_counts {
            profile.insert(ngram, count as f64 / total_ngrams);
        }

        profile
    }

    /// Calculate similarity between two n-gram profiles
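    ///
    /// The score is the dot product of the two frequency maps, normalized by
    /// the magnitude of `profile1` (the text profile):
    /// `sum(f1 * f2) / sqrt(sum(f1 * f1))`. Note that `profile2`'s magnitude
    /// is not included, so this is a cosine-like score rather than a full
    /// cosine similarity.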
    fn calculate_similarity(
        &self,
        profile1: &HashMap<String, f64>,
        profile2: &HashMap<String, f64>,
    ) -> f64 {
        let mut similarity = 0.0;
        let mut total_weight = 0.0;

        // Accumulate the dot product and the squared magnitude of profile1
        for (ngram, freq1) in profile1 {
            if let Some(freq2) = profile2.get(ngram) {
                similarity += freq1 * freq2;
            }
            total_weight += freq1 * freq1;
        }

        if total_weight > 0.0 {
            similarity / total_weight.sqrt()
        } else {
            0.0
        }
    }

    /// Get supported languages
    pub fn supported_languages(&self) -> Vec<Language> {
        self.profiles.keys().copied().collect()
    }
}

impl Default for LanguageDetector {
    fn default() -> Self {
        Self::new()
    }
}

/// Language-specific stop words
pub struct StopWords {
    /// Stop words organized by language
    stop_words: HashMap<Language, Vec<String>>,
}

impl StopWords {
    /// Create a new stop words collection
    pub fn new() -> Self {
        let mut stop_words = HashMap::new();

        // English stop words
        stop_words.insert(
            Language::English,
            vec![
                "a", "an", "and", "are", "as", "at", "be", "by", "for", "from", "has", "he", "in",
                "is", "it", "its", "of", "on", "that", "the", "to", "was", "will", "with", "you",
                "your", "this", "have", "had", "been", "but", "not", "they", "were", "what",
                "when", "where", "who", "which", "their", "them", "these", "those", "there",
                "here", "than",
            ]
            .iter()
            .map(|s| s.to_string())
            .collect(),
        );

        // Spanish stop words
        stop_words.insert(
            Language::Spanish,
            vec![
                "a", "al", "algo", "algunas", "algunos", "ante", "antes", "como", "con", "contra",
                "cual", "cuando", "de", "del", "desde", "donde", "durante", "e", "el", "ella",
                "ellas", "ellos", "en", "entre", "era", "erais", "eran", "eras", "eres", "es",
                "esa", "esas", "ese", "eso", "esos", "esta", "estas", "este", "esto", "estos",
                "fue", "fueron", "fui", "la", "las", "lo", "los", "más", "mi", "mis", "mucho",
                "muchos", "muy", "ni", "no", "nos", "nosotras", "nosotros", "o", "otra", "otras",
                "otro", "otros", "para", "pero", "por", "porque", "que", "quien", "quienes", "se",
                "si", "sin", "sobre", "su", "sus", "también", "tanto", "te", "tu", "tus", "un",
                "una", "uno", "unos", "y", "ya", "yo",
            ]
            .iter()
            .map(|s| s.to_string())
            .collect(),
        );

        // French stop words
        stop_words.insert(
            Language::French,
            vec![
                "au", "aux", "avec", "ce", "ces", "dans", "de", "des", "du", "elle", "en", "et",
                "eux", "il", "je", "la", "le", "les", "leur", "lui", "ma", "mais", "me", "même",
                "mes", "moi", "mon", "ne", "nos", "notre", "nous", "on", "ou", "par", "pas",
                "pour", "qu", "que", "qui", "sa", "se", "ses", "son", "sur", "ta", "te", "tes",
                "toi", "ton", "tu", "un", "une", "vos", "votre", "vous",
            ]
            .iter()
            .map(|s| s.to_string())
            .collect(),
        );

        Self { stop_words }
    }

    /// Get stop words for a specific language
    pub fn get(&self, language: Language) -> Option<&Vec<String>> {
        self.stop_words.get(&language)
    }

    /// Check if a word is a stop word in a specific language
    pub fn is_stop_word(&self, word: &str, language: Language) -> bool {
        if let Some(words) = self.stop_words.get(&language) {
            let word_lower = word.to_lowercase();
            words.iter().any(|sw| sw == &word_lower)
        } else {
            false
        }
    }

    /// Remove stop words from a list of tokens
    pub fn remove_stop_words(&self, tokens: &[String], language: Language) -> Vec<String> {
        tokens
            .iter()
            .filter(|token| !self.is_stop_word(token, language))
            .cloned()
            .collect()
    }
}

impl Default for StopWords {
    fn default() -> Self {
        Self::new()
    }
}

/// Language-specific text processor
pub struct MultilingualProcessor {
    /// Language detector
    detector: LanguageDetector,
    /// Stop words collection
    stop_words: StopWords,
}

impl MultilingualProcessor {
    /// Create a new multilingual processor
    pub fn new() -> Self {
        Self {
            detector: LanguageDetector::new(),
            stop_words: StopWords::new(),
        }
    }

    /// Process text with automatic language detection
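    ///
    /// The pipeline is: detect the language, tokenize on whitespace, then drop
    /// stop words for the detected language.
    ///
    /// # Example
    ///
    /// A sketch of typical use (module path assumed as in the crate-level
    /// example, so it is not compiled as a doctest):
    ///
    /// ```ignore
    /// let processor = MultilingualProcessor::new();
    /// let processed = processor.process("the cat is happy").unwrap();
    /// assert!(processed.filtered_tokens.len() <= processed.tokens.len());
    /// ```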
    pub fn process(&self, text: &str) -> Result<ProcessedText> {
        // Detect language
        let detection = self.detector.detect(text)?;

        // Tokenize (simple whitespace tokenization for now)
        let tokens: Vec<String> = text.split_whitespace().map(|s| s.to_string()).collect();

        // Remove stop words
        let filtered_tokens = self
            .stop_words
            .remove_stop_words(&tokens, detection.language);

        Ok(ProcessedText {
            original: text.to_string(),
            language: detection.language,
            confidence: detection.confidence,
            tokens,
            filtered_tokens,
        })
    }
}

impl Default for MultilingualProcessor {
    fn default() -> Self {
        Self::new()
    }
}

/// Result of multilingual text processing
#[derive(Debug, Clone)]
pub struct ProcessedText {
    /// Original text
    pub original: String,
    /// Detected language
    pub language: Language,
    /// Language detection confidence
    pub confidence: f64,
    /// All tokens
    pub tokens: Vec<String>,
    /// Tokens after stop word removal
    pub filtered_tokens: Vec<String>,
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_language_enum() {
        assert_eq!(Language::English.iso_code(), "en");
        assert_eq!(Language::Spanish.name(), "Spanish");
        assert_eq!(Language::from_iso_code("fr"), Language::French);
        assert_eq!(Language::from_iso_code("unknown"), Language::Unknown);
    }

    #[test]
    fn test_language_detection() {
        let detector = LanguageDetector::new();

        // Test English detection with more text
        let result = detector.detect("The quick brown fox jumps over the lazy dog. This is definitely an English sentence with many common words.").unwrap();
        assert_eq!(result.language, Language::English);

        // Test with empty text
        let empty_result = detector.detect("");
        assert!(empty_result.is_err());
    }

    #[test]
    fn test_stop_words() {
        let stop_words = StopWords::new();

        // Test English stop words
        assert!(stop_words.is_stop_word("the", Language::English));
        assert!(stop_words.is_stop_word("and", Language::English));
        assert!(!stop_words.is_stop_word("hello", Language::English));

        // Test stop word removal
        let tokens = vec![
            "the".to_string(),
            "cat".to_string(),
            "is".to_string(),
            "happy".to_string(),
        ];
        let filtered = stop_words.remove_stop_words(&tokens, Language::English);
        assert_eq!(filtered, vec!["cat", "happy"]);
    }

    #[test]
    fn test_multilingual_processor() {
        let processor = MultilingualProcessor::new();

        let result = processor.process("The quick brown fox jumps over the lazy dog. This sentence has many English words.").unwrap();
        assert_eq!(result.language, Language::English);
        assert!(!result.tokens.is_empty());
        assert!(result.filtered_tokens.len() < result.tokens.len());
    }

    #[test]
    fn test_create_text_profile() {
        let detector = LanguageDetector::new();
        let profile = detector.create_text_profile("hello world");

        // Check that profile contains some n-grams
        assert!(!profile.is_empty());
        assert!(profile.contains_key("hel") || profile.contains_key("llo"));
    }
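
    // Extra checks, added as illustrations of the constructor validation and
    // the ISO-code round trip; they rely only on items defined in this module.
    #[test]
    fn test_with_ngram_size_validation() {
        // Sizes in 1..=5 are accepted; anything outside that range is rejected.
        assert!(LanguageDetector::with_ngram_size(2).is_ok());
        assert!(LanguageDetector::with_ngram_size(0).is_err());
        assert!(LanguageDetector::with_ngram_size(6).is_err());
    }

    #[test]
    fn test_iso_code_round_trip() {
        // Every language with a built-in profile maps to an ISO code and back.
        let detector = LanguageDetector::new();
        for language in detector.supported_languages() {
            assert_eq!(Language::from_iso_code(language.iso_code()), language);
        }
    }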
}