// graphrag_core/nlp/multilingual.rs

//! Multilingual Support
//!
//! This module provides language detection and language-specific text processing:
//! - Automatic language detection using n-gram analysis
//! - Language-specific tokenization and normalization
//! - Multi-language entity extraction
//! - Cross-lingual entity linking
//!
//! ## Supported Languages
//!
//! - English (en)
//! - Spanish (es)
//! - French (fr)
//! - German (de)
//! - Chinese (zh)
//! - Japanese (ja)
//! - Korean (ko)
//! - Arabic (ar)
//! - Russian (ru)
//! - Portuguese (pt)
21
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
24
25/// Supported languages
26#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
27pub enum Language {
28    /// English language
29    English,
30    /// Spanish language
31    Spanish,
32    /// French language
33    French,
34    /// German language
35    German,
36    /// Chinese language
37    Chinese,
38    /// Japanese language
39    Japanese,
40    /// Korean language
41    Korean,
42    /// Arabic language
43    Arabic,
44    /// Russian language
45    Russian,
46    /// Portuguese language
47    Portuguese,
48    /// Unknown or unsupported language
49    Unknown,
50}
51
52impl Language {
53    /// Get ISO 639-1 language code
54    pub fn code(&self) -> &str {
55        match self {
56            Language::English => "en",
57            Language::Spanish => "es",
58            Language::French => "fr",
59            Language::German => "de",
60            Language::Chinese => "zh",
61            Language::Japanese => "ja",
62            Language::Korean => "ko",
63            Language::Arabic => "ar",
64            Language::Russian => "ru",
65            Language::Portuguese => "pt",
66            Language::Unknown => "unknown",
67        }
68    }
69
70    /// Parse from ISO 639-1 code
71    pub fn from_code(code: &str) -> Self {
72        match code.to_lowercase().as_str() {
73            "en" => Language::English,
74            "es" => Language::Spanish,
75            "fr" => Language::French,
76            "de" => Language::German,
77            "zh" => Language::Chinese,
78            "ja" => Language::Japanese,
79            "ko" => Language::Korean,
80            "ar" => Language::Arabic,
81            "ru" => Language::Russian,
82            "pt" => Language::Portuguese,
83            _ => Language::Unknown,
84        }
85    }
86
87    /// Get language name
88    pub fn name(&self) -> &str {
89        match self {
90            Language::English => "English",
91            Language::Spanish => "Spanish",
92            Language::French => "French",
93            Language::German => "German",
94            Language::Chinese => "Chinese",
95            Language::Japanese => "Japanese",
96            Language::Korean => "Korean",
97            Language::Arabic => "Arabic",
98            Language::Russian => "Russian",
99            Language::Portuguese => "Portuguese",
100            Language::Unknown => "Unknown",
101        }
102    }
103
104    /// Check if language uses CJK (Chinese, Japanese, Korean) script
105    pub fn is_cjk(&self) -> bool {
106        matches!(self, Language::Chinese | Language::Japanese | Language::Korean)
107    }
108
109    /// Check if language is right-to-left
110    pub fn is_rtl(&self) -> bool {
111        matches!(self, Language::Arabic)
112    }
113}
114
115/// Language detection result
116#[derive(Debug, Clone, Serialize, Deserialize)]
117pub struct DetectionResult {
118    /// Detected language
119    pub language: Language,
120    /// Confidence score (0.0 to 1.0)
121    pub confidence: f32,
122    /// Alternative languages with scores
123    pub alternatives: Vec<(Language, f32)>,
124}
125
126/// Language detector using n-gram frequency analysis
127pub struct LanguageDetector {
128    /// N-gram language models
129    models: HashMap<Language, LanguageModel>,
130}
131
/// Character n-gram frequency model for a single language.
struct LanguageModel {
    /// Observed n-gram counts, keyed by the n-gram string.
    ngrams: HashMap<String, f32>,
    /// Sum of all counts; turns raw counts into probabilities.
    total: f32,
}

impl LanguageModel {
    /// Build an empty, untrained model.
    fn new() -> Self {
        LanguageModel {
            ngrams: HashMap::new(),
            total: 0.0,
        }
    }

    /// Fold the character n-grams of `text` into the frequency table.
    fn train(&mut self, text: &str, n: usize) {
        let chars: Vec<char> = text.chars().collect();
        for gram in chars.windows(n).map(|w| w.iter().collect::<String>()) {
            *self.ngrams.entry(gram).or_insert(0.0) += 1.0;
            self.total += 1.0;
        }
    }

    /// Average log-probability of `text` under the model.
    ///
    /// Unseen n-grams fall back to an additive-smoothing probability of
    /// `1 / (total + 1)`. Returns 0.0 when `text` has fewer than `n` chars.
    fn score(&self, text: &str, n: usize) -> f32 {
        let chars: Vec<char> = text.chars().collect();
        let (log_sum, seen) = chars.windows(n).fold((0.0f32, 0usize), |(acc, cnt), w| {
            let gram: String = w.iter().collect();
            let prob = match self.ngrams.get(&gram) {
                Some(&freq) => freq / self.total,
                None => 1.0 / (self.total + 1.0), // smoothing for unseen n-grams
            };
            (acc + prob.ln(), cnt + 1)
        });
        if seen == 0 {
            0.0
        } else {
            log_sum / seen as f32
        }
    }
}
182
183impl LanguageDetector {
184    /// Create new language detector
185    pub fn new() -> Self {
186        let mut detector = Self {
187            models: HashMap::new(),
188        };
189
190        // Initialize with basic language models
191        detector.initialize_models();
192        detector
193    }
194
195    /// Initialize language models with sample text
196    fn initialize_models(&mut self) {
197        // English
198        let mut english_model = LanguageModel::new();
199        english_model.train("the quick brown fox jumps over the lazy dog", 3);
200        english_model.train("this is a test of the english language", 3);
201        self.models.insert(Language::English, english_model);
202
203        // Spanish
204        let mut spanish_model = LanguageModel::new();
205        spanish_model.train("el rápido zorro marrón salta sobre el perro perezoso", 3);
206        spanish_model.train("esta es una prueba del idioma español", 3);
207        self.models.insert(Language::Spanish, spanish_model);
208
209        // French
210        let mut french_model = LanguageModel::new();
211        french_model.train("le renard brun rapide saute par-dessus le chien paresseux", 3);
212        french_model.train("ceci est un test de la langue française", 3);
213        self.models.insert(Language::French, french_model);
214
215        // German
216        let mut german_model = LanguageModel::new();
217        german_model.train("der schnelle braune fuchs springt über den faulen hund", 3);
218        german_model.train("dies ist ein test der deutschen sprache", 3);
219        self.models.insert(Language::German, german_model);
220
221        // Portuguese
222        let mut portuguese_model = LanguageModel::new();
223        portuguese_model.train("a rápida raposa marrom pula sobre o cão preguiçoso", 3);
224        portuguese_model.train("este é um teste da língua portuguesa", 3);
225        self.models.insert(Language::Portuguese, portuguese_model);
226
227        // TODO: Add models for Chinese, Japanese, Korean, Arabic, Russian
228        // These require proper training data with representative character sets
229    }
230
231    /// Detect language of text
232    pub fn detect(&self, text: &str) -> DetectionResult {
233        if text.trim().is_empty() {
234            return DetectionResult {
235                language: Language::Unknown,
236                confidence: 0.0,
237                alternatives: Vec::new(),
238            };
239        }
240
241        // Quick heuristics for CJK and RTL languages
242        if self.is_likely_chinese(text) {
243            return DetectionResult {
244                language: Language::Chinese,
245                confidence: 0.9,
246                alternatives: vec![
247                    (Language::Japanese, 0.1),
248                ],
249            };
250        }
251
252        if self.is_likely_japanese(text) {
253            return DetectionResult {
254                language: Language::Japanese,
255                confidence: 0.9,
256                alternatives: vec![
257                    (Language::Chinese, 0.1),
258                ],
259            };
260        }
261
262        if self.is_likely_korean(text) {
263            return DetectionResult {
264                language: Language::Korean,
265                confidence: 0.95,
266                alternatives: Vec::new(),
267            };
268        }
269
270        if self.is_likely_arabic(text) {
271            return DetectionResult {
272                language: Language::Arabic,
273                confidence: 0.95,
274                alternatives: Vec::new(),
275            };
276        }
277
278        if self.is_likely_russian(text) {
279            return DetectionResult {
280                language: Language::Russian,
281                confidence: 0.9,
282                alternatives: Vec::new(),
283            };
284        }
285
286        // Score against all models
287        let mut scores: Vec<(Language, f32)> = self
288            .models
289            .iter()
290            .map(|(lang, model)| (*lang, model.score(text, 3)))
291            .collect();
292
293        scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
294
295        if scores.is_empty() {
296            return DetectionResult {
297                language: Language::Unknown,
298                confidence: 0.0,
299                alternatives: Vec::new(),
300            };
301        }
302
303        // Normalize scores to confidence (0.0 to 1.0)
304        let max_score = scores[0].1;
305        let min_score = scores.last().unwrap().1;
306        let range = max_score - min_score;
307
308        let confidence = if range > 0.0 {
309            ((max_score - min_score) / range).clamp(0.0, 1.0)
310        } else {
311            0.5
312        };
313
314        DetectionResult {
315            language: scores[0].0,
316            confidence,
317            alternatives: scores.into_iter().skip(1).take(3).collect(),
318        }
319    }
320
321    /// Check if text is likely Chinese (simplified or traditional)
322    fn is_likely_chinese(&self, text: &str) -> bool {
323        let chinese_chars = text.chars().filter(|c| {
324            let code = *c as u32;
325            (0x4E00..=0x9FFF).contains(&code) // CJK Unified Ideographs
326        }).count();
327
328        chinese_chars as f32 / text.chars().count() as f32 > 0.3
329    }
330
331    /// Check if text is likely Japanese (hiragana/katakana present)
332    fn is_likely_japanese(&self, text: &str) -> bool {
333        let japanese_chars = text.chars().filter(|c| {
334            let code = *c as u32;
335            (0x3040..=0x309F).contains(&code) || // Hiragana
336            (0x30A0..=0x30FF).contains(&code)    // Katakana
337        }).count();
338
339        japanese_chars > 0
340    }
341
342    /// Check if text is likely Korean (Hangul)
343    fn is_likely_korean(&self, text: &str) -> bool {
344        let korean_chars = text.chars().filter(|c| {
345            let code = *c as u32;
346            (0xAC00..=0xD7AF).contains(&code) // Hangul Syllables
347        }).count();
348
349        korean_chars as f32 / text.chars().count() as f32 > 0.3
350    }
351
352    /// Check if text is likely Arabic
353    fn is_likely_arabic(&self, text: &str) -> bool {
354        let arabic_chars = text.chars().filter(|c| {
355            let code = *c as u32;
356            (0x0600..=0x06FF).contains(&code) // Arabic
357        }).count();
358
359        arabic_chars as f32 / text.chars().count() as f32 > 0.3
360    }
361
362    /// Check if text is likely Russian (Cyrillic)
363    fn is_likely_russian(&self, text: &str) -> bool {
364        let cyrillic_chars = text.chars().filter(|c| {
365            let code = *c as u32;
366            (0x0400..=0x04FF).contains(&code) // Cyrillic
367        }).count();
368
369        cyrillic_chars as f32 / text.chars().count() as f32 > 0.3
370    }
371}
372
373impl Default for LanguageDetector {
374    fn default() -> Self {
375        Self::new()
376    }
377}
378
379/// Language-specific text processor
380pub struct MultilingualProcessor {
381    detector: LanguageDetector,
382}
383
384impl MultilingualProcessor {
385    /// Create new multilingual processor
386    pub fn new() -> Self {
387        Self {
388            detector: LanguageDetector::new(),
389        }
390    }
391
392    /// Detect language and return processor configuration
393    pub fn process(&self, text: &str) -> ProcessedText {
394        let detection = self.detector.detect(text);
395        let normalized = self.normalize_text(text, detection.language);
396        let tokens = self.tokenize(&normalized, detection.language);
397
398        ProcessedText {
399            original: text.to_string(),
400            normalized,
401            tokens,
402            language: detection.language,
403            confidence: detection.confidence,
404        }
405    }
406
407    /// Normalize text based on language
408    fn normalize_text(&self, text: &str, language: Language) -> String {
409        let mut normalized = text.to_string();
410
411        // Remove extra whitespace
412        normalized = normalized.split_whitespace().collect::<Vec<_>>().join(" ");
413
414        // Language-specific normalization
415        match language {
416            Language::Arabic => {
417                // Remove Arabic diacritics
418                normalized = normalized.chars()
419                    .filter(|c| {
420                        let code = *c as u32;
421                        !(0x064B..=0x0652).contains(&code) // Arabic diacritics
422                    })
423                    .collect();
424            }
425            Language::Chinese | Language::Japanese => {
426                // Full-width to half-width conversion for ASCII characters
427                normalized = normalized.chars()
428                    .map(|c| {
429                        let code = c as u32;
430                        if (0xFF01..=0xFF5E).contains(&code) {
431                            char::from_u32(code - 0xFEE0).unwrap_or(c)
432                        } else {
433                            c
434                        }
435                    })
436                    .collect();
437            }
438            _ => {}
439        }
440
441        normalized
442    }
443
444    /// Tokenize text based on language
445    fn tokenize(&self, text: &str, language: Language) -> Vec<String> {
446        match language {
447            Language::Chinese | Language::Japanese => {
448                // Character-level tokenization for CJK
449                // TODO: Implement proper word segmentation (e.g., jieba for Chinese)
450                text.chars()
451                    .filter(|c| !c.is_whitespace())
452                    .map(|c| c.to_string())
453                    .collect()
454            }
455            _ => {
456                // Word-level tokenization
457                text.split_whitespace()
458                    .map(|s| s.to_string())
459                    .collect()
460            }
461        }
462    }
463}
464
465impl Default for MultilingualProcessor {
466    fn default() -> Self {
467        Self::new()
468    }
469}
470
471/// Processed text result
472#[derive(Debug, Clone)]
473pub struct ProcessedText {
474    /// Original text
475    pub original: String,
476    /// Normalized text
477    pub normalized: String,
478    /// Tokens
479    pub tokens: Vec<String>,
480    /// Detected language
481    pub language: Language,
482    /// Detection confidence
483    pub confidence: f32,
484}
485
#[cfg(test)]
mod tests {
    use super::*;

    /// Helper: run a fresh detector over `text`.
    fn run_detect(text: &str) -> DetectionResult {
        LanguageDetector::new().detect(text)
    }

    #[test]
    fn test_language_codes() {
        for (lang, code) in [(Language::English, "en"), (Language::Spanish, "es")] {
            assert_eq!(lang.code(), code);
        }
        assert_eq!(Language::from_code("fr"), Language::French);
        assert_eq!(Language::from_code("unknown"), Language::Unknown);
    }

    #[test]
    fn test_cjk_detection() {
        for lang in [Language::Chinese, Language::Japanese, Language::Korean] {
            assert!(lang.is_cjk());
        }
        assert!(!Language::English.is_cjk());
    }

    #[test]
    fn test_rtl_detection() {
        assert!(Language::Arabic.is_rtl());
        assert!(!Language::English.is_rtl());
    }

    #[test]
    fn test_language_detection() {
        let result = run_detect("This is English text");
        assert_eq!(result.language, Language::English);
        assert!(result.confidence > 0.0);

        assert_eq!(run_detect("Esto es texto en español").language, Language::Spanish);
        assert_eq!(run_detect("Ceci est du texte français").language, Language::French);
    }

    #[test]
    fn test_chinese_detection() {
        let result = run_detect("这是中文文本");
        assert_eq!(result.language, Language::Chinese);
        assert!(result.confidence > 0.8);
    }

    #[test]
    fn test_japanese_detection() {
        let result = run_detect("これは日本語のテキストです");
        assert_eq!(result.language, Language::Japanese);
        assert!(result.confidence > 0.8);
    }

    #[test]
    fn test_korean_detection() {
        let result = run_detect("이것은 한국어 텍스트입니다");
        assert_eq!(result.language, Language::Korean);
        assert!(result.confidence > 0.8);
    }

    #[test]
    fn test_multilingual_processing() {
        let processor = MultilingualProcessor::new();

        let english = processor.process("This is a test");
        assert_eq!(english.language, Language::English);
        assert!(!english.tokens.is_empty());

        let spanish = processor.process("Esto es una prueba");
        assert_eq!(spanish.language, Language::Spanish);
    }

    #[test]
    fn test_text_normalization() {
        let processor = MultilingualProcessor::new();
        let result = processor.process("This   has   extra   spaces");
        assert_eq!(result.normalized, "This has extra spaces");
    }
}