1use crate::{embeddings::EmbeddingGenerator, similarity::SimilarityMetric, Vector};
9
10use anyhow::{anyhow, Context, Result};
11use serde::{Deserialize, Serialize};
12use std::collections::{HashMap, HashSet};
13use std::sync::{Arc, RwLock};
14use tracing::{info, span, Level};
15
/// Settings that drive language detection, vector alignment and result
/// filtering for `CrossLanguageAligner`.
16#[derive(Debug, Clone, Serialize, Deserialize)]
18pub struct CrossLanguageConfig {
    /// Language codes the aligner works with. Aligned vectors are produced for
    /// every entry that differs from the content's own language, and the same
    /// list seeds the `SimpleLanguageDetector` built by `CrossLanguageAligner::new`.
19 pub supported_languages: Vec<String>,
    /// Language assigned to content when `enable_language_detection` is `false`.
21 pub primary_language: String,
    /// When `true`, `process_content` asks the language detector for the
    /// content's language; otherwise `primary_language` is used with confidence 1.0.
23 pub enable_language_detection: bool,
    /// Selects how `create_aligned_vectors` builds the per-language vectors.
25 pub alignment_strategy: AlignmentStrategy,
    /// Optional translation settings. When `None`, the translation cache is
    /// capped at 10000 entries.
27 pub translation_config: Option<TranslationConfig>,
    /// Settings describing the multilingual embedding model.
    /// NOTE(review): not read by any code in this file — presumably consumed by
    /// whoever constructs the embedding generator; confirm.
29 pub multilingual_embeddings: MultilingualEmbeddingConfig,
    /// Minimum similarity a content item must reach to be included in the
    /// results of `cross_language_search`.
31 pub cross_lingual_threshold: f32,
33}
34
35impl Default for CrossLanguageConfig {
36 fn default() -> Self {
37 Self {
38 supported_languages: vec![
39 "en".to_string(), "es".to_string(), "fr".to_string(), "de".to_string(), "it".to_string(), "pt".to_string(), "ru".to_string(), "zh".to_string(), "ja".to_string(), "ar".to_string(), ],
50 primary_language: "en".to_string(),
51 enable_language_detection: true,
52 alignment_strategy: AlignmentStrategy::MultilingualEmbeddings,
53 translation_config: None,
54 multilingual_embeddings: MultilingualEmbeddingConfig::default(),
55 cross_lingual_threshold: 0.6,
56 }
57 }
58}
59
/// How `CrossLanguageAligner::create_aligned_vectors` derives a vector for each
/// target language.
60#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
62pub enum AlignmentStrategy {
    /// Embed the original text prefixed with a `[<target language>]` tag.
63 MultilingualEmbeddings,
    /// Translate the text into the target language, then embed the translation.
65 TranslationBased,
    /// Compute both of the above and average them element-wise.
67 Hybrid,
    /// Apply a stored transformation matrix to the source vector; when no
    /// mapping (or no matrix) exists for the language pair the source vector is
    /// returned unchanged.
69 LearnedMappings,
71}
72
/// Settings for the translation step used by translation-based alignment.
///
/// NOTE(review): `provider`, `endpoint` and `api_key` are not consulted by
/// `translate_text` in this file, which produces placeholder translations.
73#[derive(Debug, Clone, Serialize, Deserialize)]
75pub struct TranslationConfig {
    /// Which translation backend to use.
76 pub provider: TranslationProvider,
    /// Optional service endpoint — presumably a URL; confirm against the caller.
78 pub endpoint: Option<String>,
    /// Optional credential for the translation service.
80 pub api_key: Option<String>,
    /// Flag for enabling the translation cache.
82 pub enable_caching: bool,
    /// Upper bound on cached translations; `translate_text` evicts an entry
    /// once the cache has reached this size.
84 pub max_cache_size: usize,
86}
87
/// Identifies a translation backend.
///
/// NOTE(review): no code in this file dispatches on the variant; the meaning of
/// each variant is inferred from its name only — confirm against the
/// integration code.
88#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
90pub enum TranslationProvider {
91 Google,
93 Microsoft,
95 Aws,
97 Local,
99}
100
/// Describes the multilingual embedding model and its post-processing.
101#[derive(Debug, Clone, Serialize, Deserialize)]
103pub struct MultilingualEmbeddingConfig {
    /// Name of the embedding model.
104 pub model_name: String,
    /// Embedding dimensionality (384 in the default configuration).
106 pub dimensions: usize,
    /// Normalization to apply to embeddings.
    /// NOTE(review): not applied anywhere in this file — confirm where it is used.
108 pub normalization: NormalizationStrategy,
    /// String-keyed lists of strings; empty by default.
    /// Presumably language code -> preprocessing step names — TODO confirm.
110 pub language_preprocessing: HashMap<String, Vec<String>>,
112}
113
114impl Default for MultilingualEmbeddingConfig {
115 fn default() -> Self {
116 Self {
117 model_name: "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2".to_string(),
118 dimensions: 384,
119 normalization: NormalizationStrategy::L2,
120 language_preprocessing: HashMap::new(),
121 }
122 }
123}
124
/// Names a normalization scheme for embeddings.
///
/// NOTE(review): this file only stores the choice (default `L2`); the
/// normalization itself is not implemented here, so the exact semantics of each
/// variant must be confirmed where it is applied.
125#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
127pub enum NormalizationStrategy {
128 L2,
130 MeanCentering,
132 Standardization,
    /// No normalization requested.
134 None,
136}
137
/// Outcome of a language detection call.
138#[derive(Debug, Clone, Serialize, Deserialize)]
140pub struct LanguageDetection {
    /// Code of the detected language (e.g. "en").
141 pub language: String,
    /// Detector's confidence in `language`.
143 pub confidence: f32,
    /// Other candidate languages as `(language code, score)` pairs.
145 pub alternatives: Vec<(String, f32)>,
147}
148
/// A piece of text together with its detected language and embeddings, as
/// produced by `CrossLanguageAligner::process_content`.
149#[derive(Debug, Clone)]
151pub struct CrossLanguageContent {
    /// Caller-supplied identifier.
152 pub id: String,
    /// The original text.
154 pub text: String,
    /// Language code assigned to the text.
156 pub language: String,
    /// Confidence reported for `language`.
158 pub language_confidence: f32,
    /// Embedding of the original text; `process_content` always fills this in.
160 pub vector: Option<Vector>,
    /// Vectors aligned to other languages, keyed by target language code.
    /// The content's own language has no entry.
162 pub aligned_vectors: HashMap<String, Vector>,
164}
165
/// Detects content language, builds per-language aligned vectors and performs
/// cross-language similarity search.
166pub struct CrossLanguageAligner {
    /// Behaviour settings supplied at construction.
168 config: CrossLanguageConfig,
    /// Detector used by `process_content` when detection is enabled.
169 language_detector: Box<dyn LanguageDetector + Send + Sync>,
    /// Produces embeddings for text.
170 embedding_generator: Box<dyn EmbeddingGenerator + Send + Sync>,
    /// Translation results keyed by `"<source>:<target>:<text>"`.
171 translation_cache: Arc<RwLock<HashMap<String, String>>>,
    /// Learned mappings keyed by `"<source>:<target>"`.
172 alignment_mappings: Arc<RwLock<HashMap<String, AlignmentMapping>>>,
    /// Embedding store that is only read (by `get_language_statistics`) in this
    /// file. NOTE(review): nothing visible here writes to it — confirm its purpose.
173 multilingual_embeddings: Arc<RwLock<HashMap<String, Vector>>>,
174}
175
/// Abstraction over language detection back-ends.
176pub trait LanguageDetector {
    /// Detects the language of `text`, returning the best guess together with
    /// its confidence and any alternative candidates.
178 fn detect_language(&self, text: &str) -> Result<LanguageDetection>;
180
    /// Reports whether `language` (a language code) is handled by this detector.
181 fn is_supported(&self, language: &str) -> bool;
183}
184
/// Heuristic detector that guesses a language from characteristic characters
/// found in the text and falls back to English.
185pub struct SimpleLanguageDetector {
    /// Language codes for which `is_supported` answers `true`.
187 supported_languages: HashSet<String>,
188}
189
190impl SimpleLanguageDetector {
191 pub fn new(supported_languages: Vec<String>) -> Self {
192 Self {
193 supported_languages: supported_languages.into_iter().collect(),
194 }
195 }
196}
197
198impl LanguageDetector for SimpleLanguageDetector {
199 fn detect_language(&self, text: &str) -> Result<LanguageDetection> {
200 let text_lower = text.to_lowercase();
202
203 let language = if text_lower
205 .chars()
206 .any(|c| matches!(c, 'ñ' | 'ü' | 'é' | 'á' | 'í' | 'ó' | 'ú'))
207 {
208 "es" } else if text_lower
210 .chars()
211 .any(|c| matches!(c, 'ç' | 'à' | 'è' | 'ù' | 'ê' | 'ô'))
212 {
213 "fr" } else if text_lower
215 .chars()
216 .any(|c| matches!(c, 'ä' | 'ö' | 'ü' | 'ß'))
217 {
218 "de" } else if text_lower
220 .chars()
221 .any(|c| ('\u{4e00}'..='\u{9fff}').contains(&c))
222 {
223 "zh" } else if text_lower
225 .chars()
226 .any(|c| ('\u{3040}'..='\u{309f}').contains(&c))
227 {
228 "ja" } else if text_lower
230 .chars()
231 .any(|c| ('\u{0600}'..='\u{06ff}').contains(&c))
232 {
233 "ar" } else if text_lower
235 .chars()
236 .any(|c| ('\u{0400}'..='\u{04ff}').contains(&c))
237 {
238 "ru" } else {
240 "en" };
242
243 let confidence = if language == "en" { 0.7 } else { 0.8 };
244
245 Ok(LanguageDetection {
246 language: language.to_string(),
247 confidence,
248 alternatives: vec![("en".to_string(), 0.3)],
249 })
250 }
251
252 fn is_supported(&self, language: &str) -> bool {
253 self.supported_languages.contains(language)
254 }
255}
256
/// A learned mapping from one language's embedding space to another's, as
/// stored by `CrossLanguageAligner::learn_alignment_mapping`.
257#[derive(Debug, Clone)]
259pub struct AlignmentMapping {
    /// Language code of the vectors the mapping takes as input.
260 pub source_language: String,
    /// Language code the mapping projects into.
262 pub target_language: String,
    /// Row-major matrix applied as a matrix-vector product; each row must have
    /// as many entries as the input vector. `None` means no transformation is
    /// available.
264 pub transformation_matrix: Option<Vec<Vec<f32>>>,
    /// The `(source text, target text)` pairs the mapping was learned from.
266 pub translation_pairs: Vec<(String, String)>,
    /// Mean cosine similarity between transformed source vectors and their
    /// target vectors over the training pairs.
268 pub quality_score: f32,
270}
271
/// One hit returned by `CrossLanguageAligner::cross_language_search`.
272#[derive(Debug, Clone, Serialize, Deserialize)]
274pub struct CrossLanguageSearchResult {
    /// Identifier of the matching content item.
275 pub id: String,
    /// Best similarity found for the item (same-language or cross-lingual).
277 pub similarity: f32,
    /// Language code of the matching content.
279 pub language: String,
    /// Original text of the matching content.
281 pub text: String,
    /// Translation of `text`; `cross_language_search` always leaves this `None`.
283 pub translated_text: Option<String>,
    /// Cross-lingual scores by metric name; the search stores a `"cosine"`
    /// entry when an aligned vector exists for the query language.
285 pub cross_lingual_metrics: HashMap<String, f32>,
287}
288
289impl CrossLanguageAligner {
290 pub fn new(
292 config: CrossLanguageConfig,
293 embedding_generator: Box<dyn EmbeddingGenerator + Send + Sync>,
294 ) -> Self {
295 let language_detector = Box::new(SimpleLanguageDetector::new(
296 config.supported_languages.clone(),
297 ));
298
299 Self {
300 config,
301 language_detector,
302 embedding_generator,
303 translation_cache: Arc::new(RwLock::new(HashMap::new())),
304 alignment_mappings: Arc::new(RwLock::new(HashMap::new())),
305 multilingual_embeddings: Arc::new(RwLock::new(HashMap::new())),
306 }
307 }
308
309 pub async fn process_content(&self, content: &str, id: &str) -> Result<CrossLanguageContent> {
311 let span = span!(Level::INFO, "process_content", content_id = %id);
312 let _enter = span.enter();
313
314 let detection = if self.config.enable_language_detection {
316 self.language_detector.detect_language(content)?
317 } else {
318 LanguageDetection {
319 language: self.config.primary_language.clone(),
320 confidence: 1.0,
321 alternatives: Vec::new(),
322 }
323 };
324
325 let embeddable_content = crate::embeddings::EmbeddableContent::Text(content.to_string());
327 let vector = self
328 .embedding_generator
329 .generate(&embeddable_content)
330 .context("Failed to generate embedding")?;
331
332 let aligned_vectors = self
334 .create_aligned_vectors(content, &detection.language, &vector)
335 .await?;
336
337 Ok(CrossLanguageContent {
338 id: id.to_string(),
339 text: content.to_string(),
340 language: detection.language,
341 language_confidence: detection.confidence,
342 vector: Some(vector),
343 aligned_vectors,
344 })
345 }
346
347 async fn create_aligned_vectors(
349 &self,
350 content: &str,
351 source_language: &str,
352 source_vector: &Vector,
353 ) -> Result<HashMap<String, Vector>> {
354 let mut aligned_vectors = HashMap::new();
355
356 match self.config.alignment_strategy {
357 AlignmentStrategy::MultilingualEmbeddings => {
358 for target_lang in &self.config.supported_languages {
360 if target_lang != source_language {
361 let aligned_vector =
362 self.create_multilingual_embedding(content, target_lang)?;
363 aligned_vectors.insert(target_lang.clone(), aligned_vector);
364 }
365 }
366 }
367 AlignmentStrategy::TranslationBased => {
368 for target_lang in &self.config.supported_languages {
370 if target_lang != source_language {
371 let translated_text = self
372 .translate_text(content, source_language, target_lang)
373 .await?;
374 let embeddable_content =
375 crate::embeddings::EmbeddableContent::Text(translated_text);
376 let translated_vector =
377 self.embedding_generator.generate(&embeddable_content)?;
378 aligned_vectors.insert(target_lang.clone(), translated_vector);
379 }
380 }
381 }
382 AlignmentStrategy::Hybrid => {
383 for target_lang in &self.config.supported_languages {
385 if target_lang != source_language {
386 let multilingual_vector =
387 self.create_multilingual_embedding(content, target_lang)?;
388 let translated_text = self
389 .translate_text(content, source_language, target_lang)
390 .await?;
391 let embeddable_content =
392 crate::embeddings::EmbeddableContent::Text(translated_text);
393 let translated_vector =
394 self.embedding_generator.generate(&embeddable_content)?;
395
396 let combined_vector =
398 self.combine_vectors(&multilingual_vector, &translated_vector)?;
399 aligned_vectors.insert(target_lang.clone(), combined_vector);
400 }
401 }
402 }
403 AlignmentStrategy::LearnedMappings => {
404 for target_lang in &self.config.supported_languages {
406 if target_lang != source_language {
407 let mapped_vector = self.apply_learned_mapping(
408 source_vector,
409 source_language,
410 target_lang,
411 )?;
412 aligned_vectors.insert(target_lang.clone(), mapped_vector);
413 }
414 }
415 }
416 }
417
418 Ok(aligned_vectors)
419 }
420
421 fn create_multilingual_embedding(
423 &self,
424 content: &str,
425 target_language: &str,
426 ) -> Result<Vector> {
427 let prefixed_content = format!("[{target_language}] {content}");
429 let embeddable_content = crate::embeddings::EmbeddableContent::Text(prefixed_content);
430 self.embedding_generator.generate(&embeddable_content)
431 }
432
433 async fn translate_text(
435 &self,
436 text: &str,
437 source_lang: &str,
438 target_lang: &str,
439 ) -> Result<String> {
440 let cache_key = format!("{source_lang}:{target_lang}:{text}");
441
442 {
444 let cache = self.translation_cache.read().unwrap();
445 if let Some(cached_translation) = cache.get(&cache_key) {
446 return Ok(cached_translation.clone());
447 }
448 }
449
450 let translated = match (source_lang, target_lang) {
452 ("en", "es") => format!("[ES] {text}"),
453 ("en", "fr") => format!("[FR] {text}"),
454 ("en", "de") => format!("[DE] {text}"),
455 ("es", "en") => text.replace("[ES]", "[EN]"),
456 ("fr", "en") => text.replace("[FR]", "[EN]"),
457 ("de", "en") => text.replace("[DE]", "[EN]"),
458 _ => {
459 let upper_lang = target_lang.to_uppercase();
460 format!("[{upper_lang}] {text}")
461 }
462 };
463
464 {
466 let mut cache = self.translation_cache.write().unwrap();
467 if cache.len()
468 >= self
469 .config
470 .translation_config
471 .as_ref()
472 .map(|c| c.max_cache_size)
473 .unwrap_or(10000)
474 {
475 if let Some(key) = cache.keys().next().cloned() {
477 cache.remove(&key);
478 }
479 }
480 cache.insert(cache_key, translated.clone());
481 }
482
483 Ok(translated)
484 }
485
486 fn combine_vectors(&self, vector1: &Vector, vector2: &Vector) -> Result<Vector> {
488 let v1_f32 = vector1.as_f32();
489 let v2_f32 = vector2.as_f32();
490
491 if v1_f32.len() != v2_f32.len() {
492 return Err(anyhow!("Vector dimensions must match for combination"));
493 }
494
495 let combined: Vec<f32> = v1_f32
496 .iter()
497 .zip(v2_f32.iter())
498 .map(|(a, b)| (a + b) / 2.0)
499 .collect();
500
501 Ok(Vector::new(combined))
502 }
503
504 fn apply_learned_mapping(
506 &self,
507 source_vector: &Vector,
508 source_lang: &str,
509 target_lang: &str,
510 ) -> Result<Vector> {
511 let mapping_key = format!("{source_lang}:{target_lang}");
512 let mappings = self.alignment_mappings.read().unwrap();
513
514 if let Some(mapping) = mappings.get(&mapping_key) {
515 if let Some(ref matrix) = mapping.transformation_matrix {
516 return self.apply_matrix_transformation(source_vector, matrix);
517 }
518 }
519
520 Ok(source_vector.clone())
522 }
523
524 fn apply_matrix_transformation(&self, vector: &Vector, matrix: &[Vec<f32>]) -> Result<Vector> {
526 let v_f32 = vector.as_f32();
527
528 if matrix.is_empty() || matrix[0].len() != v_f32.len() {
529 return Err(anyhow!("Matrix dimensions incompatible with vector"));
530 }
531
532 let transformed: Vec<f32> = matrix
533 .iter()
534 .map(|row| row.iter().zip(v_f32.iter()).map(|(m, v)| m * v).sum())
535 .collect();
536
537 Ok(Vector::new(transformed))
538 }
539
540 pub fn cross_language_search(
542 &self,
543 query: &str,
544 query_language: &str,
545 content_items: &[CrossLanguageContent],
546 k: usize,
547 ) -> Result<Vec<CrossLanguageSearchResult>> {
548 let span = span!(Level::INFO, "cross_language_search", query_lang = %query_language);
549 let _enter = span.enter();
550
551 let embeddable_content = crate::embeddings::EmbeddableContent::Text(query.to_string());
553 let query_vector = self.embedding_generator.generate(&embeddable_content)?;
554
555 let mut results = Vec::new();
556
557 for content in content_items {
558 let primary_similarity = if content.language == query_language {
560 if let Some(ref content_vector) = content.vector {
561 SimilarityMetric::Cosine.compute(&query_vector, content_vector)?
562 } else {
563 0.0
564 }
565 } else {
566 0.0
567 };
568
569 let mut cross_lingual_similarities = HashMap::new();
571 if let Some(aligned_vector) = content.aligned_vectors.get(query_language) {
572 let cross_similarity =
573 SimilarityMetric::Cosine.compute(&query_vector, aligned_vector)?;
574 cross_lingual_similarities.insert("cosine".to_string(), cross_similarity);
575 }
576
577 let best_similarity = primary_similarity.max(
579 cross_lingual_similarities
580 .values()
581 .copied()
582 .fold(0.0, f32::max),
583 );
584
585 if best_similarity >= self.config.cross_lingual_threshold {
586 results.push(CrossLanguageSearchResult {
587 id: content.id.clone(),
588 similarity: best_similarity,
589 language: content.language.clone(),
590 text: content.text.clone(),
591 translated_text: None, cross_lingual_metrics: cross_lingual_similarities,
593 });
594 }
595 }
596
597 results.sort_by(|a, b| b.similarity.partial_cmp(&a.similarity).unwrap());
599 results.truncate(k);
600
601 Ok(results)
602 }
603
604 pub fn learn_alignment_mapping(
606 &mut self,
607 source_language: &str,
608 target_language: &str,
609 translation_pairs: Vec<(String, String)>,
610 ) -> Result<()> {
611 let span = span!(Level::INFO, "learn_alignment_mapping",
612 source = %source_language, target = %target_language);
613 let _enter = span.enter();
614
615 let mut source_vectors = Vec::new();
617 let mut target_vectors = Vec::new();
618
619 for (source_text, target_text) in &translation_pairs {
620 let source_embeddable = crate::embeddings::EmbeddableContent::Text(source_text.clone());
621 let target_embeddable = crate::embeddings::EmbeddableContent::Text(target_text.clone());
622 let source_vector = self.embedding_generator.generate(&source_embeddable)?;
623 let target_vector = self.embedding_generator.generate(&target_embeddable)?;
624
625 source_vectors.push(source_vector.as_f32());
626 target_vectors.push(target_vector.as_f32());
627 }
628
629 let transformation_matrix =
631 self.compute_transformation_matrix(&source_vectors, &target_vectors)?;
632
633 let quality_score = self.evaluate_mapping_quality(
635 &source_vectors,
636 &target_vectors,
637 &transformation_matrix,
638 )?;
639
640 let mapping = AlignmentMapping {
641 source_language: source_language.to_string(),
642 target_language: target_language.to_string(),
643 transformation_matrix: Some(transformation_matrix),
644 translation_pairs,
645 quality_score,
646 };
647
648 let mapping_key = format!("{source_language}:{target_language}");
649 let mut mappings = self.alignment_mappings.write().unwrap();
650 mappings.insert(mapping_key, mapping);
651
652 info!(
653 "Learned alignment mapping with quality score: {:.3}",
654 quality_score
655 );
656 Ok(())
657 }
658
659 fn compute_transformation_matrix(
661 &self,
662 source_vectors: &[Vec<f32>],
663 target_vectors: &[Vec<f32>],
664 ) -> Result<Vec<Vec<f32>>> {
665 if source_vectors.is_empty() || source_vectors.len() != target_vectors.len() {
666 return Err(anyhow!("Invalid vector sets for learning transformation"));
667 }
668
669 let dim = source_vectors[0].len();
670
671 let mut matrix = vec![vec![0.0; dim]; dim];
673 for (i, row) in matrix.iter_mut().enumerate().take(dim) {
674 row[i] = 1.0;
675 }
676
677 for (i, row) in matrix.iter_mut().enumerate().take(dim) {
679 for (j, row_val) in row.iter_mut().enumerate().take(dim) {
680 if i != j {
681 *row_val = (i as f32 * j as f32 * 0.001) % 0.1 - 0.05;
682 }
683 }
684 }
685
686 Ok(matrix)
687 }
688
689 fn evaluate_mapping_quality(
691 &self,
692 source_vectors: &[Vec<f32>],
693 target_vectors: &[Vec<f32>],
694 matrix: &[Vec<f32>],
695 ) -> Result<f32> {
696 let mut total_similarity = 0.0;
697 let mut count = 0;
698
699 for (source, target) in source_vectors.iter().zip(target_vectors) {
700 let transformed_vector = Vector::new(source.clone());
701 let transformed = self.apply_matrix_transformation(&transformed_vector, matrix)?;
702 let target_vector = Vector::new(target.clone());
703
704 let similarity = SimilarityMetric::Cosine.compute(&transformed, &target_vector)?;
705 total_similarity += similarity;
706 count += 1;
707 }
708
709 Ok(if count > 0 {
710 total_similarity / count as f32
711 } else {
712 0.0
713 })
714 }
715
716 pub fn get_language_statistics(&self) -> HashMap<String, usize> {
718 let embeddings = self.multilingual_embeddings.read().unwrap();
719 let mut stats = HashMap::new();
720
721 for lang in &self.config.supported_languages {
722 stats.insert(lang.clone(), embeddings.len());
723 }
724
725 stats
726 }
727
728 pub fn get_supported_languages(&self) -> &[String] {
730 &self.config.supported_languages
731 }
732}
733
#[cfg(test)]
mod tests {
    use super::*;
    use crate::embeddings::MockEmbeddingGenerator;

    /// Builds an aligner with the default config and a mock embedding generator.
    fn default_aligner() -> CrossLanguageAligner {
        CrossLanguageAligner::new(
            CrossLanguageConfig::default(),
            Box::new(MockEmbeddingGenerator::new()),
        )
    }

    #[test]
    fn test_cross_language_config_creation() {
        let defaults = CrossLanguageConfig::default();

        assert!(!defaults.supported_languages.is_empty());
        assert_eq!(defaults.primary_language, "en");
        assert!(defaults.enable_language_detection);
    }

    #[test]
    fn test_language_detector_creation() {
        let detector = SimpleLanguageDetector::new(vec![
            "en".to_string(),
            "es".to_string(),
            "fr".to_string(),
        ]);

        assert!(detector.is_supported("en"));
        assert!(detector.is_supported("es"));
        assert!(!detector.is_supported("de"));
    }

    #[test]
    fn test_language_detection() {
        let detector = SimpleLanguageDetector::new(vec!["en".to_string(), "es".to_string()]);

        let english = detector.detect_language("Hello world").unwrap();
        assert_eq!(english.language, "en");
        assert!(english.confidence > 0.0);

        // No accented characters, so the heuristic falls through to English.
        let unaccented_spanish = detector.detect_language("Hola mundo").unwrap();
        assert_eq!(unaccented_spanish.language, "en");
    }

    #[test]
    fn test_alignment_strategy_variants() {
        let all_strategies = [
            AlignmentStrategy::MultilingualEmbeddings,
            AlignmentStrategy::TranslationBased,
            AlignmentStrategy::Hybrid,
            AlignmentStrategy::LearnedMappings,
        ];

        for strategy in all_strategies.iter() {
            let config = CrossLanguageConfig {
                alignment_strategy: strategy.clone(),
                ..Default::default()
            };
            assert_eq!(&config.alignment_strategy, strategy);
        }
    }

    #[tokio::test]
    async fn test_cross_language_aligner_creation() {
        let aligner = default_aligner();

        assert_eq!(aligner.get_supported_languages().len(), 10);
    }

    #[tokio::test]
    async fn test_content_processing() {
        let aligner = default_aligner();

        let processed = aligner
            .process_content("Hello world", "test_id")
            .await
            .unwrap();

        assert_eq!(processed.id, "test_id");
        assert_eq!(processed.text, "Hello world");
        assert!(processed.vector.is_some());
        assert!(!processed.aligned_vectors.is_empty());
    }

    #[test]
    fn test_vector_combination() {
        let aligner = default_aligner();

        let first = Vector::new(vec![1.0, 2.0, 3.0]);
        let second = Vector::new(vec![2.0, 4.0, 6.0]);

        let averaged = aligner.combine_vectors(&first, &second).unwrap();

        assert_eq!(averaged.as_f32(), vec![1.5, 3.0, 4.5]);
    }

    #[test]
    fn test_cross_language_search_result() {
        let hit = CrossLanguageSearchResult {
            id: "test".to_string(),
            similarity: 0.8,
            language: "en".to_string(),
            text: "test content".to_string(),
            translated_text: Some("contenido de prueba".to_string()),
            cross_lingual_metrics: HashMap::new(),
        };

        assert_eq!(hit.id, "test");
        assert_eq!(hit.similarity, 0.8);
        assert_eq!(hit.language, "en");
    }
}