oxirs_vec/sparql_integration/
cross_language.rs

1//! Cross-language search capabilities for SPARQL vector integration
2
3use std::collections::HashMap;
4
/// Cross-language search processor
///
/// Expands a query into weighted variants (dictionary translations,
/// transliterations and stemmed forms) for a set of target languages.
/// Everything here is heuristic and driven by small built-in tables; no
/// external translation or language-identification service is involved.
#[derive(Debug, Clone)]
pub struct CrossLanguageProcessor {
    /// Base weight per language code, applied to every variant generated for
    /// that language.
    language_weights: HashMap<String, f32>,
    /// Two-letter codes of the languages this processor advertises.
    supported_languages: Vec<String>,
}

impl CrossLanguageProcessor {
    /// Base weight for target languages that have no entry in the weight table.
    const FALLBACK_LANGUAGE_WEIGHT: f32 = 0.8;
    /// Multiplier applied to dictionary translations.
    const TRANSLATION_FACTOR: f32 = 0.9;
    /// Multiplier applied to transliterations.
    const TRANSLITERATION_FACTOR: f32 = 0.8;
    /// Multiplier applied to stemmed variants.
    const STEMMED_FACTOR: f32 = 0.7;

    /// Creates a processor with ten built-in languages, each weighted `1.0`.
    pub fn new() -> Self {
        let codes = ["en", "es", "fr", "de", "it", "pt", "ru", "zh", "ja", "ar"];

        let supported_languages: Vec<String> = codes.iter().map(|code| code.to_string()).collect();
        let language_weights: HashMap<String, f32> = supported_languages
            .iter()
            .map(|code| (code.clone(), 1.0))
            .collect();

        Self {
            language_weights,
            supported_languages,
        }
    }

    /// Process a query with cross-language capabilities
    ///
    /// Returns `(query_text, weight)` pairs. The unmodified `query` is always
    /// the first entry with weight `1.0`. For every entry of
    /// `target_languages` that differs from the detected source language,
    /// translations, transliterations and stemmed variants are appended (in
    /// that order), weighted by the language's base weight times `0.9`, `0.8`
    /// and `0.7` respectively. Languages without a configured weight use
    /// `0.8` as their base weight.
    pub fn process_cross_language_query(
        &self,
        query: &str,
        target_languages: &[String],
    ) -> Vec<(String, f32)> {
        // The original query leads the list and carries the highest weight.
        let mut processed_queries = vec![(query.to_string(), 1.0)];

        let detected_lang = self.detect_language(query);

        for target_lang in target_languages {
            // Expanding into the language the query is already in adds nothing.
            if *target_lang == detected_lang {
                continue;
            }

            let weight = self
                .language_weights
                .get(target_lang.as_str())
                .copied()
                .unwrap_or(Self::FALLBACK_LANGUAGE_WEIGHT);

            processed_queries.extend(
                self.generate_translations(query, target_lang)
                    .into_iter()
                    .map(|text| (text, weight * Self::TRANSLATION_FACTOR)),
            );
            processed_queries.extend(
                self.generate_transliterations(query, target_lang)
                    .into_iter()
                    .map(|text| (text, weight * Self::TRANSLITERATION_FACTOR)),
            );
            processed_queries.extend(
                self.generate_stemmed_variants(query, target_lang)
                    .into_iter()
                    .map(|text| (text, weight * Self::STEMMED_FACTOR)),
            );
        }

        processed_queries
    }

    /// Detect language using simple heuristics
    ///
    /// Checks run in a fixed order and the first hit wins:
    /// English key phrases, Spanish, French and German keywords (all matched
    /// case-insensitively as substrings), then Cyrillic (`ru`), kana (`ja`),
    /// CJK ideographs (`zh`) and Arabic (`ar`) characters. Text that matches
    /// nothing is reported as `"en"`.
    pub fn detect_language(&self, text: &str) -> String {
        let lowered = text.to_lowercase();

        let code = if Self::contains_any(
            &lowered,
            &["machine learning", "artificial intelligence", "deep learning"],
        ) {
            "en"
        } else if Self::contains_any(&lowered, &["aprendizaje", "inteligencia", "máquina"]) {
            "es"
        } else if Self::contains_any(&lowered, &["apprentissage", "intelligence", "automatique"]) {
            "fr"
        } else if Self::contains_any(&lowered, &["lernen", "künstlich"]) {
            "de"
        } else if text.chars().any(|c| matches!(c, '\u{0400}'..='\u{04FF}')) {
            // Cyrillic block.
            "ru"
        } else if text
            .chars()
            .any(|c| matches!(c, '\u{3041}'..='\u{3096}' | '\u{30A1}'..='\u{30FA}'))
        {
            // Hiragana / katakana letters. This must precede the ideograph
            // check because Japanese text mixes kana with kanji, and kanji
            // share code points with Chinese.
            "ja"
        } else if text
            .chars()
            .any(|c| matches!(c, '\u{4E00}'..='\u{9FFF}' | '\u{3400}'..='\u{4DBF}'))
        {
            // CJK Unified Ideographs and Extension A.
            "zh"
        } else if text.chars().any(|c| matches!(c, '\u{0600}'..='\u{06FF}')) {
            // Arabic block.
            "ar"
        } else {
            // Default to English
            "en"
        };

        code.to_string()
    }

    /// Returns `true` when `haystack` contains at least one of `needles`.
    fn contains_any(haystack: &str, needles: &[&str]) -> bool {
        needles.iter().any(|needle| haystack.contains(*needle))
    }

    /// Generate basic translations using simple dictionaries
    ///
    /// The dictionaries map English terms to `es`, `fr` and `de`; any other
    /// `target_lang` yields no translations. The query is lower-cased first,
    /// and one output string is produced per dictionary term found in it,
    /// with only that term replaced.
    fn generate_translations(&self, query: &str, target_lang: &str) -> Vec<String> {
        let dictionary: &[(&str, &str)] = match target_lang {
            "es" => &[
                ("artificial intelligence", "inteligencia artificial"),
                ("machine learning", "aprendizaje automático"),
                ("data science", "ciencia de datos"),
                ("neural network", "red neuronal"),
                ("deep learning", "aprendizaje profundo"),
            ],
            "fr" => &[
                ("artificial intelligence", "intelligence artificielle"),
                ("machine learning", "apprentissage automatique"),
                ("data science", "science des données"),
                ("neural network", "réseau de neurones"),
                ("deep learning", "apprentissage profond"),
            ],
            "de" => &[
                ("artificial intelligence", "künstliche Intelligenz"),
                ("machine learning", "maschinelles Lernen"),
                ("data science", "Datenwissenschaft"),
                ("neural network", "neuronales Netzwerk"),
                ("deep learning", "tiefes Lernen"),
            ],
            _ => &[],
        };

        let query_lower = query.to_lowercase();
        dictionary
            .iter()
            .copied()
            .filter(|&(en_term, _)| query_lower.contains(en_term))
            .map(|(en_term, target_term)| query_lower.replace(en_term, target_term))
            .collect()
    }

    /// Generate transliteration variations for different scripts
    ///
    /// Only `ru` and `ar` have (simplified) tables; Latin-script targets need
    /// no transliteration and produce nothing. The table entries are applied
    /// one after another to the lower-cased query, and the result is returned
    /// only if at least one entry matched.
    ///
    /// NOTE: matching is substring-based, so a short key such as `ai` is also
    /// rewritten inside longer words (e.g. `train`).
    fn generate_transliterations(&self, query: &str, target_lang: &str) -> Vec<String> {
        let table: &[(&str, &str)] = match target_lang {
            // Cyrillic transliteration (simplified)
            "ru" => &[
                ("ai", "ай"),
                ("machine", "машин"),
                ("data", "дата"),
                ("network", "сеть"),
                ("learning", "обучение"),
            ],
            // Arabic transliteration (simplified)
            "ar" => &[("data", "بيانات"), ("machine", "آلة"), ("network", "شبكة")],
            _ => return Vec::new(),
        };

        let query_lower = query.to_lowercase();
        let transliterated = table
            .iter()
            .fold(query_lower.clone(), |acc, &(latin, native)| {
                acc.replace(latin, native)
            });

        if transliterated != query_lower {
            vec![transliterated]
        } else {
            Vec::new()
        }
    }

    /// Generate stemmed variants for better cross-language matching
    ///
    /// Each whitespace-separated word is lower-cased and has the first
    /// matching suffix from the `target_lang` rule set removed (`es`, `fr`,
    /// `de` and `en` have rules; other languages yield nothing). A word is
    /// left untouched when no suffix matches or when stripping would leave it
    /// empty. If at least one word changed, the words are joined with single
    /// spaces, in their original positions, and returned as one variant;
    /// otherwise the result is empty.
    fn generate_stemmed_variants(&self, query: &str, target_lang: &str) -> Vec<String> {
        // Simple stemming rules by language; order matters, first match wins.
        let suffixes: &[&str] = match target_lang {
            "es" => &["ción", "mente"],
            "fr" => &["ment", "ique"],
            "de" => &["ung", "lich"],
            "en" => &["ing", "ed", "ly"],
            _ => return Vec::new(),
        };

        let mut any_stemmed = false;
        let words: Vec<String> = query
            .split_whitespace()
            .map(|word| {
                let lower = word.to_lowercase();
                let stem = Self::strip_first_suffix(&lower, suffixes).map(str::to_owned);
                match stem {
                    Some(stem) => {
                        any_stemmed = true;
                        stem
                    }
                    None => lower,
                }
            })
            .collect();

        if any_stemmed {
            vec![words.join(" ")]
        } else {
            Vec::new()
        }
    }

    /// Removes the first suffix in `suffixes` that `word` ends with.
    ///
    /// Only the trailing occurrence is removed. Returns `None` when no suffix
    /// matches or when the remaining stem would be empty.
    fn strip_first_suffix<'a>(word: &'a str, suffixes: &[&str]) -> Option<&'a str> {
        suffixes
            .iter()
            .find_map(|suffix| word.strip_suffix(*suffix))
            .filter(|stem| !stem.is_empty())
    }

    /// Set weight for a specific language
    ///
    /// Inserts or overwrites the base weight for `language`. This does not
    /// add the language to the supported-language list.
    pub fn set_language_weight(&mut self, language: &str, weight: f32) {
        self.language_weights.insert(language.to_string(), weight);
    }

    /// Get supported languages
    pub fn supported_languages(&self) -> &[String] {
        &self.supported_languages
    }

    /// Check if a language is supported
    ///
    /// The comparison is an exact, case-sensitive match on the language code.
    pub fn is_language_supported(&self, language: &str) -> bool {
        self.supported_languages
            .iter()
            .any(|code| code == language)
    }
}

impl Default for CrossLanguageProcessor {
    /// Equivalent to [`CrossLanguageProcessor::new`].
    fn default() -> Self {
        Self::new()
    }
}
331
#[cfg(test)]
mod tests {
    use super::*;

    /// Keyword heuristics map well-known phrases to the expected language code.
    #[test]
    fn test_language_detection() {
        let detector = CrossLanguageProcessor::new();

        let expectations = [
            ("machine learning algorithm", "en"),
            ("aprendizaje automático", "es"),
            ("apprentissage automatique", "fr"),
            ("maschinelles Lernen", "de"),
        ];

        for &(phrase, language) in &expectations {
            assert_eq!(detector.detect_language(phrase), language);
        }
    }

    /// Dictionary terms are replaced by their target-language counterpart.
    #[test]
    fn test_translation_generation() {
        let translator = CrossLanguageProcessor::new();

        let spanish = translator.generate_translations("machine learning", "es");
        assert!(spanish.iter().any(|t| t == "aprendizaje automático"));

        let french = translator.generate_translations("artificial intelligence", "fr");
        assert!(french.iter().any(|t| t == "intelligence artificielle"));
    }

    /// The expansion keeps the original query first and adds variations after it.
    #[test]
    fn test_cross_language_processing() {
        let expander = CrossLanguageProcessor::new();
        let targets = vec!["es".to_string(), "fr".to_string()];

        let expanded = expander.process_cross_language_query("machine learning", &targets);

        // Should include original query plus variations
        assert!(expanded.len() > 1);

        let (first_text, first_weight) = &expanded[0];
        assert_eq!(first_text, "machine learning");
        assert_eq!(*first_weight, 1.0); // Original gets highest weight
    }

    /// Suffix rules shorten words they apply to and yield at most one variant.
    #[test]
    fn test_stemming() {
        let stemmer = CrossLanguageProcessor::new();

        let english = stemmer.generate_stemmed_variants("learning", "en");
        assert!(english.iter().any(|variant| variant.contains("learn")));

        // Should generate stemmed variant if rules apply
        let spanish = stemmer.generate_stemmed_variants("automático", "es");
        assert!(spanish.len() <= 1); // Simplified stemming
    }

    /// Built-in codes are reported as supported; unknown codes are not.
    #[test]
    fn test_language_support() {
        let registry = CrossLanguageProcessor::new();

        for code in &["en", "es", "fr"] {
            assert!(registry.is_language_supported(code));
        }
        assert!(!registry.is_language_supported("xyz"));
    }
}