oxirs_vec/sparql_integration/
cross_language.rs1use std::collections::HashMap;
4
/// Expands a search query into weighted variants for other languages.
///
/// The processor keeps a list of supported language codes and a per-language
/// weight that scales the score of every variant generated for that language.
#[derive(Debug, Clone)]
pub struct CrossLanguageProcessor {
    /// Weight applied to variants generated for a language, keyed by language code.
    language_weights: HashMap<String, f32>,
    /// Language codes this processor reports as supported.
    supported_languages: Vec<String>,
}
10
11impl CrossLanguageProcessor {
12 pub fn new() -> Self {
13 let supported_languages = vec![
14 "en".to_string(),
15 "es".to_string(),
16 "fr".to_string(),
17 "de".to_string(),
18 "it".to_string(),
19 "pt".to_string(),
20 "ru".to_string(),
21 "zh".to_string(),
22 "ja".to_string(),
23 "ar".to_string(),
24 ];
25
26 let mut language_weights = HashMap::new();
27 for lang in &supported_languages {
28 language_weights.insert(lang.clone(), 1.0);
29 }
30
31 Self {
32 language_weights,
33 supported_languages,
34 }
35 }
36
37 pub fn process_cross_language_query(
39 &self,
40 query: &str,
41 target_languages: &[String],
42 ) -> Vec<(String, f32)> {
43 let mut processed_queries = Vec::new();
44
45 processed_queries.push((query.to_string(), 1.0));
47
48 let detected_lang = self.detect_language(query);
50
51 for target_lang in target_languages {
53 if target_lang == &detected_lang {
54 continue; }
56
57 let weight = self
58 .language_weights
59 .get(target_lang)
60 .copied()
61 .unwrap_or(0.8);
62
63 let translations = self.generate_translations(query, target_lang);
65 for translation in translations {
66 processed_queries.push((translation, weight * 0.9));
67 }
68
69 let transliterations = self.generate_transliterations(query, target_lang);
71 for transliteration in transliterations {
72 processed_queries.push((transliteration, weight * 0.8));
73 }
74
75 let stemmed_variants = self.generate_stemmed_variants(query, target_lang);
77 for variant in stemmed_variants {
78 processed_queries.push((variant, weight * 0.7));
79 }
80 }
81
82 processed_queries
83 }
84
85 pub fn detect_language(&self, text: &str) -> String {
87 let text_lower = text.to_lowercase();
88
89 if text_lower.contains("machine learning")
91 || text_lower.contains("artificial intelligence")
92 || text_lower.contains("deep learning")
93 {
94 return "en".to_string();
95 }
96
97 if text_lower.contains("aprendizaje")
98 || text_lower.contains("inteligencia")
99 || text_lower.contains("máquina")
100 {
101 return "es".to_string();
102 }
103
104 if text_lower.contains("apprentissage")
105 || text_lower.contains("intelligence")
106 || text_lower.contains("automatique")
107 {
108 return "fr".to_string();
109 }
110
111 if text_lower.contains("lernen") || text_lower.contains("künstlich") {
112 return "de".to_string();
113 }
114
115 if text.chars().any(|c| ('\u{0400}'..='\u{04FF}').contains(&c)) {
117 return "ru".to_string();
118 }
119
120 if text.chars().any(|c| {
122 ('\u{4E00}'..='\u{9FFF}').contains(&c) || ('\u{3400}'..='\u{4DBF}').contains(&c)
123 }) {
124 return "zh".to_string();
125 }
126
127 if text.chars().any(|c| ('\u{0600}'..='\u{06FF}').contains(&c)) {
129 return "ar".to_string();
130 }
131
132 "en".to_string()
134 }
135
136 fn generate_translations(&self, query: &str, target_lang: &str) -> Vec<String> {
138 let mut translations = Vec::new();
139
140 let basic_dict = match target_lang {
141 "es" => vec![
142 ("artificial intelligence", "inteligencia artificial"),
143 ("machine learning", "aprendizaje automático"),
144 ("data science", "ciencia de datos"),
145 ("neural network", "red neuronal"),
146 ("deep learning", "aprendizaje profundo"),
147 ],
148 "fr" => vec![
149 ("artificial intelligence", "intelligence artificielle"),
150 ("machine learning", "apprentissage automatique"),
151 ("data science", "science des données"),
152 ("neural network", "réseau de neurones"),
153 ("deep learning", "apprentissage profond"),
154 ],
155 "de" => vec![
156 ("artificial intelligence", "künstliche Intelligenz"),
157 ("machine learning", "maschinelles Lernen"),
158 ("data science", "Datenwissenschaft"),
159 ("neural network", "neuronales Netzwerk"),
160 ("deep learning", "tiefes Lernen"),
161 ],
162 _ => vec![],
163 };
164
165 let query_lower = query.to_lowercase();
166 for (en_term, target_term) in basic_dict {
167 if query_lower.contains(en_term) {
168 let translated = query_lower.replace(en_term, target_term);
169 translations.push(translated);
170 }
171 }
172
173 translations
174 }
175
176 fn generate_transliterations(&self, query: &str, target_lang: &str) -> Vec<String> {
178 let mut transliterations = Vec::new();
179
180 match target_lang {
182 "ru" => {
183 let latin_to_cyrillic = vec![
185 ("ai", "ай"),
186 ("machine", "машин"),
187 ("data", "дата"),
188 ("network", "сеть"),
189 ("learning", "обучение"),
190 ];
191
192 let mut transliterated = query.to_lowercase();
193 for (latin, cyrillic) in latin_to_cyrillic {
194 transliterated = transliterated.replace(latin, cyrillic);
195 }
196 if transliterated != query.to_lowercase() {
197 transliterations.push(transliterated);
198 }
199 }
200 "ar" => {
201 let latin_to_arabic =
203 vec![("data", "بيانات"), ("machine", "آلة"), ("network", "شبكة")];
204
205 let mut transliterated = query.to_lowercase();
206 for (latin, arabic) in latin_to_arabic {
207 transliterated = transliterated.replace(latin, arabic);
208 }
209 if transliterated != query.to_lowercase() {
210 transliterations.push(transliterated);
211 }
212 }
213 _ => {
214 }
216 }
217
218 transliterations
219 }
220
221 fn generate_stemmed_variants(&self, query: &str, target_lang: &str) -> Vec<String> {
223 let mut variants = Vec::new();
224
225 let words: Vec<&str> = query.split_whitespace().collect();
227
228 for word in words {
229 let stemmed = match target_lang {
230 "es" => {
231 let word_lower = word.to_lowercase();
233 if word_lower.ends_with("ción") {
234 word_lower.replace("ción", "")
235 } else if word_lower.ends_with("mente") {
236 word_lower.replace("mente", "")
237 } else {
238 word_lower
239 }
240 }
241 "fr" => {
242 let word_lower = word.to_lowercase();
244 if word_lower.ends_with("ment") {
245 word_lower.replace("ment", "")
246 } else if word_lower.ends_with("ique") {
247 word_lower.replace("ique", "")
248 } else {
249 word_lower
250 }
251 }
252 "de" => {
253 let word_lower = word.to_lowercase();
255 if word_lower.ends_with("ung") {
256 word_lower.replace("ung", "")
257 } else if word_lower.ends_with("lich") {
258 word_lower.replace("lich", "")
259 } else {
260 word_lower
261 }
262 }
263 "en" => {
264 let word_lower = word.to_lowercase();
266 if word_lower.ends_with("ing") {
267 word_lower.replace("ing", "")
268 } else if word_lower.ends_with("ed") {
269 word_lower.replace("ed", "")
270 } else if word_lower.ends_with("ly") {
271 word_lower.replace("ly", "")
272 } else {
273 word_lower
274 }
275 }
276 _ => word.to_lowercase(),
277 };
278
279 if stemmed != word.to_lowercase() && !stemmed.is_empty() {
280 variants.push(stemmed);
281 }
282 }
283
284 if !variants.is_empty() {
286 let original_words: Vec<&str> = query.split_whitespace().collect();
287 let mut variant_query = String::new();
288
289 for (i, word) in original_words.iter().enumerate() {
290 if i < variants.len() && !variants[i].is_empty() {
291 variant_query.push_str(&variants[i]);
292 } else {
293 variant_query.push_str(word);
294 }
295 if i < original_words.len() - 1 {
296 variant_query.push(' ');
297 }
298 }
299
300 if variant_query != query.to_lowercase() {
301 vec![variant_query]
302 } else {
303 vec![]
304 }
305 } else {
306 vec![]
307 }
308 }
309
310 pub fn set_language_weight(&mut self, language: &str, weight: f32) {
312 self.language_weights.insert(language.to_string(), weight);
313 }
314
315 pub fn supported_languages(&self) -> &[String] {
317 &self.supported_languages
318 }
319
320 pub fn is_language_supported(&self, language: &str) -> bool {
322 self.supported_languages.contains(&language.to_string())
323 }
324}
325
326impl Default for CrossLanguageProcessor {
327 fn default() -> Self {
328 Self::new()
329 }
330}
331
#[cfg(test)]
mod tests {
    use super::*;

    /// Keyword-based detection recognises the four sample phrases.
    #[test]
    fn test_language_detection() {
        let detector = CrossLanguageProcessor::new();
        let cases = [
            ("machine learning algorithm", "en"),
            ("aprendizaje automático", "es"),
            ("apprentissage automatique", "fr"),
            ("maschinelles Lernen", "de"),
        ];

        for &(sample, expected) in cases.iter() {
            assert_eq!(detector.detect_language(sample), expected);
        }
    }

    /// Dictionary translation yields the expected Spanish and French phrases.
    #[test]
    fn test_translation_generation() {
        let translator = CrossLanguageProcessor::new();

        let spanish = translator.generate_translations("machine learning", "es");
        assert!(spanish
            .iter()
            .any(|candidate| candidate.as_str() == "aprendizaje automático"));

        let french = translator.generate_translations("artificial intelligence", "fr");
        assert!(french
            .iter()
            .any(|candidate| candidate.as_str() == "intelligence artificielle"));
    }

    /// The expanded list starts with the untouched query at full weight.
    #[test]
    fn test_cross_language_processing() {
        let expander = CrossLanguageProcessor::new();
        let targets = vec!["es".to_string(), "fr".to_string()];

        let expanded = expander.process_cross_language_query("machine learning", &targets);

        assert!(expanded.len() > 1);
        let (first_text, first_weight) = &expanded[0];
        assert_eq!(first_text.as_str(), "machine learning");
        assert_eq!(*first_weight, 1.0);
    }

    /// English stemming exposes the stem; Spanish yields at most one variant.
    #[test]
    fn test_stemming() {
        let stemmer = CrossLanguageProcessor::new();

        let english = stemmer.generate_stemmed_variants("learning", "en");
        assert!(english.iter().any(|variant| variant.contains("learn")));

        let spanish = stemmer.generate_stemmed_variants("automático", "es");
        assert!(spanish.len() <= 1);
    }

    /// Known codes are supported; an unknown code is not.
    #[test]
    fn test_language_support() {
        let registry = CrossLanguageProcessor::new();

        for code in ["en", "es", "fr"].iter() {
            assert!(registry.is_language_supported(code));
        }
        assert!(!registry.is_language_supported("xyz"));
    }
}