oxirs_vec/
cross_language_alignment.rs

1//! Cross-Language Vector Alignment - Version 1.2 Feature
2//!
3//! This module implements comprehensive cross-language vector alignment capabilities
4//! that enable semantic search and similarity computation across different languages.
5//! It supports multilingual embeddings, translation-based alignment, and cross-lingual
6//! similarity scoring for knowledge graphs with multilingual content.
7
8use crate::{embeddings::EmbeddingGenerator, similarity::SimilarityMetric, Vector};
9
10use anyhow::{anyhow, Context, Result};
11use serde::{Deserialize, Serialize};
12use std::collections::{HashMap, HashSet};
13use std::sync::{Arc, RwLock};
14use tracing::{info, span, Level};
15
16/// Configuration for cross-language vector alignment
17#[derive(Debug, Clone, Serialize, Deserialize)]
18pub struct CrossLanguageConfig {
19    /// Supported languages (ISO 639-1 codes)
20    pub supported_languages: Vec<String>,
21    /// Primary language for fallback
22    pub primary_language: String,
23    /// Enable automatic language detection
24    pub enable_language_detection: bool,
25    /// Alignment strategy
26    pub alignment_strategy: AlignmentStrategy,
27    /// Translation service configuration
28    pub translation_config: Option<TranslationConfig>,
29    /// Multilingual embedding model configuration
30    pub multilingual_embeddings: MultilingualEmbeddingConfig,
31    /// Cross-lingual similarity threshold
32    pub cross_lingual_threshold: f32,
33}
34
35impl Default for CrossLanguageConfig {
36    fn default() -> Self {
37        Self {
38            supported_languages: vec![
39                "en".to_string(), // English
40                "es".to_string(), // Spanish
41                "fr".to_string(), // French
42                "de".to_string(), // German
43                "it".to_string(), // Italian
44                "pt".to_string(), // Portuguese
45                "ru".to_string(), // Russian
46                "zh".to_string(), // Chinese
47                "ja".to_string(), // Japanese
48                "ar".to_string(), // Arabic
49            ],
50            primary_language: "en".to_string(),
51            enable_language_detection: true,
52            alignment_strategy: AlignmentStrategy::MultilingualEmbeddings,
53            translation_config: None,
54            multilingual_embeddings: MultilingualEmbeddingConfig::default(),
55            cross_lingual_threshold: 0.6,
56        }
57    }
58}
59
60/// Strategies for aligning vectors across languages
61#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
62pub enum AlignmentStrategy {
63    /// Use multilingual embedding models
64    MultilingualEmbeddings,
65    /// Use translation to common language
66    TranslationBased,
67    /// Hybrid approach with both methods
68    Hybrid,
69    /// Learn cross-lingual mappings
70    LearnedMappings,
71}
72
73/// Configuration for translation services
74#[derive(Debug, Clone, Serialize, Deserialize)]
75pub struct TranslationConfig {
76    /// Translation service provider
77    pub provider: TranslationProvider,
78    /// API endpoint
79    pub endpoint: Option<String>,
80    /// API key for authentication
81    pub api_key: Option<String>,
82    /// Cache translated content
83    pub enable_caching: bool,
84    /// Maximum cache size
85    pub max_cache_size: usize,
86}
87
88/// Translation service providers
89#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
90pub enum TranslationProvider {
91    /// Google Translate API
92    Google,
93    /// Microsoft Translator
94    Microsoft,
95    /// AWS Translate
96    Aws,
97    /// Local/offline model
98    Local,
99}
100
101/// Configuration for multilingual embeddings
102#[derive(Debug, Clone, Serialize, Deserialize)]
103pub struct MultilingualEmbeddingConfig {
104    /// Model name for multilingual embeddings
105    pub model_name: String,
106    /// Dimension of embeddings
107    pub dimensions: usize,
108    /// Normalization strategy
109    pub normalization: NormalizationStrategy,
110    /// Language-specific preprocessing
111    pub language_preprocessing: HashMap<String, Vec<String>>,
112}
113
114impl Default for MultilingualEmbeddingConfig {
115    fn default() -> Self {
116        Self {
117            model_name: "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2".to_string(),
118            dimensions: 384,
119            normalization: NormalizationStrategy::L2,
120            language_preprocessing: HashMap::new(),
121        }
122    }
123}
124
125/// Vector normalization strategies
126#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
127pub enum NormalizationStrategy {
128    /// L2 normalization
129    L2,
130    /// Mean centering
131    MeanCentering,
132    /// Standardization (z-score)
133    Standardization,
134    /// None
135    None,
136}
137
138/// Language detection result
139#[derive(Debug, Clone, Serialize, Deserialize)]
140pub struct LanguageDetection {
141    /// Detected language code
142    pub language: String,
143    /// Confidence score (0.0 to 1.0)
144    pub confidence: f32,
145    /// Alternative language candidates
146    pub alternatives: Vec<(String, f32)>,
147}
148
149/// Cross-language content item
150#[derive(Debug, Clone)]
151pub struct CrossLanguageContent {
152    /// Unique identifier
153    pub id: String,
154    /// Text content
155    pub text: String,
156    /// Detected or specified language
157    pub language: String,
158    /// Language detection confidence
159    pub language_confidence: f32,
160    /// Original vector embedding
161    pub vector: Option<Vector>,
162    /// Aligned vectors for different languages
163    pub aligned_vectors: HashMap<String, Vector>,
164}
165
166/// Cross-language vector alignment engine
167pub struct CrossLanguageAligner {
168    config: CrossLanguageConfig,
169    language_detector: Box<dyn LanguageDetector + Send + Sync>,
170    embedding_generator: Box<dyn EmbeddingGenerator + Send + Sync>,
171    translation_cache: Arc<RwLock<HashMap<String, String>>>,
172    alignment_mappings: Arc<RwLock<HashMap<String, AlignmentMapping>>>,
173    multilingual_embeddings: Arc<RwLock<HashMap<String, Vector>>>,
174}
175
176/// Language detection trait
177pub trait LanguageDetector {
178    /// Detect language of given text
179    fn detect_language(&self, text: &str) -> Result<LanguageDetection>;
180
181    /// Check if language is supported
182    fn is_supported(&self, language: &str) -> bool;
183}
184
/// Heuristic language detector backed by a fixed set of supported languages.
pub struct SimpleLanguageDetector {
    /// Language codes this detector reports as supported.
    supported_languages: HashSet<String>,
}

impl SimpleLanguageDetector {
    /// Creates a detector supporting the given language codes.
    ///
    /// Duplicate codes in `supported_languages` collapse into a single entry.
    pub fn new(supported_languages: Vec<String>) -> Self {
        let mut language_set = HashSet::with_capacity(supported_languages.len());
        language_set.extend(supported_languages);

        Self {
            supported_languages: language_set,
        }
    }
}
197
198impl LanguageDetector for SimpleLanguageDetector {
199    fn detect_language(&self, text: &str) -> Result<LanguageDetection> {
200        // Simplified language detection based on character sets and patterns
201        let text_lower = text.to_lowercase();
202
203        // Simple heuristics for language detection
204        let language = if text_lower
205            .chars()
206            .any(|c| matches!(c, 'ñ' | 'ü' | 'é' | 'á' | 'í' | 'ó' | 'ú'))
207        {
208            "es" // Spanish
209        } else if text_lower
210            .chars()
211            .any(|c| matches!(c, 'ç' | 'à' | 'è' | 'ù' | 'ê' | 'ô'))
212        {
213            "fr" // French
214        } else if text_lower
215            .chars()
216            .any(|c| matches!(c, 'ä' | 'ö' | 'ü' | 'ß'))
217        {
218            "de" // German
219        } else if text_lower
220            .chars()
221            .any(|c| ('\u{4e00}'..='\u{9fff}').contains(&c))
222        {
223            "zh" // Chinese
224        } else if text_lower
225            .chars()
226            .any(|c| ('\u{3040}'..='\u{309f}').contains(&c))
227        {
228            "ja" // Japanese
229        } else if text_lower
230            .chars()
231            .any(|c| ('\u{0600}'..='\u{06ff}').contains(&c))
232        {
233            "ar" // Arabic
234        } else if text_lower
235            .chars()
236            .any(|c| ('\u{0400}'..='\u{04ff}').contains(&c))
237        {
238            "ru" // Russian
239        } else {
240            "en" // Default to English
241        };
242
243        let confidence = if language == "en" { 0.7 } else { 0.8 };
244
245        Ok(LanguageDetection {
246            language: language.to_string(),
247            confidence,
248            alternatives: vec![("en".to_string(), 0.3)],
249        })
250    }
251
252    fn is_supported(&self, language: &str) -> bool {
253        self.supported_languages.contains(language)
254    }
255}
256
/// A mapping from one language's vector space to another's.
#[derive(Clone, Debug)]
pub struct AlignmentMapping {
    /// Language the mapping transforms from.
    pub source_language: String,
    /// Language the mapping transforms to.
    pub target_language: String,
    /// Learned transformation matrix, stored as a list of rows; `None` if
    /// nothing has been learned.
    pub transformation_matrix: Option<Vec<Vec<f32>>>,
    /// The `(source text, target text)` pairs the mapping was learned from.
    pub translation_pairs: Vec<(String, String)>,
    /// Quality score of the mapping.
    pub quality_score: f32,
}
271
272/// Cross-language search result
273#[derive(Debug, Clone, Serialize, Deserialize)]
274pub struct CrossLanguageSearchResult {
275    /// Content identifier
276    pub id: String,
277    /// Similarity score
278    pub similarity: f32,
279    /// Content language
280    pub language: String,
281    /// Original text content
282    pub text: String,
283    /// Translated text (if available)
284    pub translated_text: Option<String>,
285    /// Cross-lingual similarity metrics
286    pub cross_lingual_metrics: HashMap<String, f32>,
287}
288
289impl CrossLanguageAligner {
290    /// Create a new cross-language aligner
291    pub fn new(
292        config: CrossLanguageConfig,
293        embedding_generator: Box<dyn EmbeddingGenerator + Send + Sync>,
294    ) -> Self {
295        let language_detector = Box::new(SimpleLanguageDetector::new(
296            config.supported_languages.clone(),
297        ));
298
299        Self {
300            config,
301            language_detector,
302            embedding_generator,
303            translation_cache: Arc::new(RwLock::new(HashMap::new())),
304            alignment_mappings: Arc::new(RwLock::new(HashMap::new())),
305            multilingual_embeddings: Arc::new(RwLock::new(HashMap::new())),
306        }
307    }
308
309    /// Process content and create cross-language representations
310    pub async fn process_content(&self, content: &str, id: &str) -> Result<CrossLanguageContent> {
311        let span = span!(Level::INFO, "process_content", content_id = %id);
312        let _enter = span.enter();
313
314        // Detect language
315        let detection = if self.config.enable_language_detection {
316            self.language_detector.detect_language(content)?
317        } else {
318            LanguageDetection {
319                language: self.config.primary_language.clone(),
320                confidence: 1.0,
321                alternatives: Vec::new(),
322            }
323        };
324
325        // Generate primary vector embedding
326        let embeddable_content = crate::embeddings::EmbeddableContent::Text(content.to_string());
327        let vector = self
328            .embedding_generator
329            .generate(&embeddable_content)
330            .context("Failed to generate embedding")?;
331
332        // Create aligned vectors for other languages
333        let aligned_vectors = self
334            .create_aligned_vectors(content, &detection.language, &vector)
335            .await?;
336
337        Ok(CrossLanguageContent {
338            id: id.to_string(),
339            text: content.to_string(),
340            language: detection.language,
341            language_confidence: detection.confidence,
342            vector: Some(vector),
343            aligned_vectors,
344        })
345    }
346
347    /// Create aligned vectors for different languages
348    async fn create_aligned_vectors(
349        &self,
350        content: &str,
351        source_language: &str,
352        source_vector: &Vector,
353    ) -> Result<HashMap<String, Vector>> {
354        let mut aligned_vectors = HashMap::new();
355
356        match self.config.alignment_strategy {
357            AlignmentStrategy::MultilingualEmbeddings => {
358                // Use multilingual embedding model directly
359                for target_lang in &self.config.supported_languages {
360                    if target_lang != source_language {
361                        let aligned_vector =
362                            self.create_multilingual_embedding(content, target_lang)?;
363                        aligned_vectors.insert(target_lang.clone(), aligned_vector);
364                    }
365                }
366            }
367            AlignmentStrategy::TranslationBased => {
368                // Translate content and generate embeddings
369                for target_lang in &self.config.supported_languages {
370                    if target_lang != source_language {
371                        let translated_text = self
372                            .translate_text(content, source_language, target_lang)
373                            .await?;
374                        let embeddable_content =
375                            crate::embeddings::EmbeddableContent::Text(translated_text);
376                        let translated_vector =
377                            self.embedding_generator.generate(&embeddable_content)?;
378                        aligned_vectors.insert(target_lang.clone(), translated_vector);
379                    }
380                }
381            }
382            AlignmentStrategy::Hybrid => {
383                // Use both multilingual embeddings and translation
384                for target_lang in &self.config.supported_languages {
385                    if target_lang != source_language {
386                        let multilingual_vector =
387                            self.create_multilingual_embedding(content, target_lang)?;
388                        let translated_text = self
389                            .translate_text(content, source_language, target_lang)
390                            .await?;
391                        let embeddable_content =
392                            crate::embeddings::EmbeddableContent::Text(translated_text);
393                        let translated_vector =
394                            self.embedding_generator.generate(&embeddable_content)?;
395
396                        // Combine vectors (simple average for now)
397                        let combined_vector =
398                            self.combine_vectors(&multilingual_vector, &translated_vector)?;
399                        aligned_vectors.insert(target_lang.clone(), combined_vector);
400                    }
401                }
402            }
403            AlignmentStrategy::LearnedMappings => {
404                // Apply learned transformation mappings
405                for target_lang in &self.config.supported_languages {
406                    if target_lang != source_language {
407                        let mapped_vector = self.apply_learned_mapping(
408                            source_vector,
409                            source_language,
410                            target_lang,
411                        )?;
412                        aligned_vectors.insert(target_lang.clone(), mapped_vector);
413                    }
414                }
415            }
416        }
417
418        Ok(aligned_vectors)
419    }
420
421    /// Create multilingual embedding
422    fn create_multilingual_embedding(
423        &self,
424        content: &str,
425        target_language: &str,
426    ) -> Result<Vector> {
427        // For now, use the same embedding generator with language prefix
428        let prefixed_content = format!("[{target_language}] {content}");
429        let embeddable_content = crate::embeddings::EmbeddableContent::Text(prefixed_content);
430        self.embedding_generator.generate(&embeddable_content)
431    }
432
433    /// Translate text between languages
434    async fn translate_text(
435        &self,
436        text: &str,
437        source_lang: &str,
438        target_lang: &str,
439    ) -> Result<String> {
440        let cache_key = format!("{source_lang}:{target_lang}:{text}");
441
442        // Check cache first
443        {
444            let cache = self.translation_cache.read().unwrap();
445            if let Some(cached_translation) = cache.get(&cache_key) {
446                return Ok(cached_translation.clone());
447            }
448        }
449
450        // Simulate translation (in real implementation, would call translation API)
451        let translated = match (source_lang, target_lang) {
452            ("en", "es") => format!("[ES] {text}"),
453            ("en", "fr") => format!("[FR] {text}"),
454            ("en", "de") => format!("[DE] {text}"),
455            ("es", "en") => text.replace("[ES]", "[EN]"),
456            ("fr", "en") => text.replace("[FR]", "[EN]"),
457            ("de", "en") => text.replace("[DE]", "[EN]"),
458            _ => {
459                let upper_lang = target_lang.to_uppercase();
460                format!("[{upper_lang}] {text}")
461            }
462        };
463
464        // Cache the translation
465        {
466            let mut cache = self.translation_cache.write().unwrap();
467            if cache.len()
468                >= self
469                    .config
470                    .translation_config
471                    .as_ref()
472                    .map(|c| c.max_cache_size)
473                    .unwrap_or(10000)
474            {
475                // Simple cache eviction: remove first entry
476                if let Some(key) = cache.keys().next().cloned() {
477                    cache.remove(&key);
478                }
479            }
480            cache.insert(cache_key, translated.clone());
481        }
482
483        Ok(translated)
484    }
485
486    /// Combine two vectors (simple averaging)
487    fn combine_vectors(&self, vector1: &Vector, vector2: &Vector) -> Result<Vector> {
488        let v1_f32 = vector1.as_f32();
489        let v2_f32 = vector2.as_f32();
490
491        if v1_f32.len() != v2_f32.len() {
492            return Err(anyhow!("Vector dimensions must match for combination"));
493        }
494
495        let combined: Vec<f32> = v1_f32
496            .iter()
497            .zip(v2_f32.iter())
498            .map(|(a, b)| (a + b) / 2.0)
499            .collect();
500
501        Ok(Vector::new(combined))
502    }
503
504    /// Apply learned mapping transformation
505    fn apply_learned_mapping(
506        &self,
507        source_vector: &Vector,
508        source_lang: &str,
509        target_lang: &str,
510    ) -> Result<Vector> {
511        let mapping_key = format!("{source_lang}:{target_lang}");
512        let mappings = self.alignment_mappings.read().unwrap();
513
514        if let Some(mapping) = mappings.get(&mapping_key) {
515            if let Some(ref matrix) = mapping.transformation_matrix {
516                return self.apply_matrix_transformation(source_vector, matrix);
517            }
518        }
519
520        // Fallback to identity mapping
521        Ok(source_vector.clone())
522    }
523
524    /// Apply matrix transformation to vector
525    fn apply_matrix_transformation(&self, vector: &Vector, matrix: &[Vec<f32>]) -> Result<Vector> {
526        let v_f32 = vector.as_f32();
527
528        if matrix.is_empty() || matrix[0].len() != v_f32.len() {
529            return Err(anyhow!("Matrix dimensions incompatible with vector"));
530        }
531
532        let transformed: Vec<f32> = matrix
533            .iter()
534            .map(|row| row.iter().zip(v_f32.iter()).map(|(m, v)| m * v).sum())
535            .collect();
536
537        Ok(Vector::new(transformed))
538    }
539
540    /// Cross-language similarity search
541    pub fn cross_language_search(
542        &self,
543        query: &str,
544        query_language: &str,
545        content_items: &[CrossLanguageContent],
546        k: usize,
547    ) -> Result<Vec<CrossLanguageSearchResult>> {
548        let span = span!(Level::INFO, "cross_language_search", query_lang = %query_language);
549        let _enter = span.enter();
550
551        // Generate query vector
552        let embeddable_content = crate::embeddings::EmbeddableContent::Text(query.to_string());
553        let query_vector = self.embedding_generator.generate(&embeddable_content)?;
554
555        let mut results = Vec::new();
556
557        for content in content_items {
558            // Compute similarity with original vector
559            let primary_similarity = if content.language == query_language {
560                if let Some(ref content_vector) = content.vector {
561                    SimilarityMetric::Cosine.compute(&query_vector, content_vector)?
562                } else {
563                    0.0
564                }
565            } else {
566                0.0
567            };
568
569            // Compute cross-lingual similarity using aligned vectors
570            let mut cross_lingual_similarities = HashMap::new();
571            if let Some(aligned_vector) = content.aligned_vectors.get(query_language) {
572                let cross_similarity =
573                    SimilarityMetric::Cosine.compute(&query_vector, aligned_vector)?;
574                cross_lingual_similarities.insert("cosine".to_string(), cross_similarity);
575            }
576
577            // Determine the best similarity score
578            let best_similarity = primary_similarity.max(
579                cross_lingual_similarities
580                    .values()
581                    .copied()
582                    .fold(0.0, f32::max),
583            );
584
585            if best_similarity >= self.config.cross_lingual_threshold {
586                results.push(CrossLanguageSearchResult {
587                    id: content.id.clone(),
588                    similarity: best_similarity,
589                    language: content.language.clone(),
590                    text: content.text.clone(),
591                    translated_text: None, // Could add translation here
592                    cross_lingual_metrics: cross_lingual_similarities,
593                });
594            }
595        }
596
597        // Sort by similarity (descending)
598        results.sort_by(|a, b| b.similarity.partial_cmp(&a.similarity).unwrap());
599        results.truncate(k);
600
601        Ok(results)
602    }
603
604    /// Learn alignment mapping between languages
605    pub fn learn_alignment_mapping(
606        &mut self,
607        source_language: &str,
608        target_language: &str,
609        translation_pairs: Vec<(String, String)>,
610    ) -> Result<()> {
611        let span = span!(Level::INFO, "learn_alignment_mapping",
612                          source = %source_language, target = %target_language);
613        let _enter = span.enter();
614
615        // Generate embeddings for translation pairs
616        let mut source_vectors = Vec::new();
617        let mut target_vectors = Vec::new();
618
619        for (source_text, target_text) in &translation_pairs {
620            let source_embeddable = crate::embeddings::EmbeddableContent::Text(source_text.clone());
621            let target_embeddable = crate::embeddings::EmbeddableContent::Text(target_text.clone());
622            let source_vector = self.embedding_generator.generate(&source_embeddable)?;
623            let target_vector = self.embedding_generator.generate(&target_embeddable)?;
624
625            source_vectors.push(source_vector.as_f32());
626            target_vectors.push(target_vector.as_f32());
627        }
628
629        // Learn transformation matrix (simplified - in practice would use more sophisticated methods)
630        let transformation_matrix =
631            self.compute_transformation_matrix(&source_vectors, &target_vectors)?;
632
633        // Evaluate mapping quality
634        let quality_score = self.evaluate_mapping_quality(
635            &source_vectors,
636            &target_vectors,
637            &transformation_matrix,
638        )?;
639
640        let mapping = AlignmentMapping {
641            source_language: source_language.to_string(),
642            target_language: target_language.to_string(),
643            transformation_matrix: Some(transformation_matrix),
644            translation_pairs,
645            quality_score,
646        };
647
648        let mapping_key = format!("{source_language}:{target_language}");
649        let mut mappings = self.alignment_mappings.write().unwrap();
650        mappings.insert(mapping_key, mapping);
651
652        info!(
653            "Learned alignment mapping with quality score: {:.3}",
654            quality_score
655        );
656        Ok(())
657    }
658
659    /// Compute transformation matrix using simple linear regression
660    fn compute_transformation_matrix(
661        &self,
662        source_vectors: &[Vec<f32>],
663        target_vectors: &[Vec<f32>],
664    ) -> Result<Vec<Vec<f32>>> {
665        if source_vectors.is_empty() || source_vectors.len() != target_vectors.len() {
666            return Err(anyhow!("Invalid vector sets for learning transformation"));
667        }
668
669        let dim = source_vectors[0].len();
670
671        // Simple identity matrix as baseline (in practice, would use proper linear algebra)
672        let mut matrix = vec![vec![0.0; dim]; dim];
673        for (i, row) in matrix.iter_mut().enumerate().take(dim) {
674            row[i] = 1.0;
675        }
676
677        // Add small random perturbations to simulate learned transformation
678        for (i, row) in matrix.iter_mut().enumerate().take(dim) {
679            for (j, row_val) in row.iter_mut().enumerate().take(dim) {
680                if i != j {
681                    *row_val = (i as f32 * j as f32 * 0.001) % 0.1 - 0.05;
682                }
683            }
684        }
685
686        Ok(matrix)
687    }
688
689    /// Evaluate quality of learned mapping
690    fn evaluate_mapping_quality(
691        &self,
692        source_vectors: &[Vec<f32>],
693        target_vectors: &[Vec<f32>],
694        matrix: &[Vec<f32>],
695    ) -> Result<f32> {
696        let mut total_similarity = 0.0;
697        let mut count = 0;
698
699        for (source, target) in source_vectors.iter().zip(target_vectors) {
700            let transformed_vector = Vector::new(source.clone());
701            let transformed = self.apply_matrix_transformation(&transformed_vector, matrix)?;
702            let target_vector = Vector::new(target.clone());
703
704            let similarity = SimilarityMetric::Cosine.compute(&transformed, &target_vector)?;
705            total_similarity += similarity;
706            count += 1;
707        }
708
709        Ok(if count > 0 {
710            total_similarity / count as f32
711        } else {
712            0.0
713        })
714    }
715
716    /// Get language statistics
717    pub fn get_language_statistics(&self) -> HashMap<String, usize> {
718        let embeddings = self.multilingual_embeddings.read().unwrap();
719        let mut stats = HashMap::new();
720
721        for lang in &self.config.supported_languages {
722            stats.insert(lang.clone(), embeddings.len());
723        }
724
725        stats
726    }
727
728    /// Get supported languages
729    pub fn get_supported_languages(&self) -> &[String] {
730        &self.config.supported_languages
731    }
732}
733
#[cfg(test)]
mod tests {
    use super::*;
    use crate::embeddings::MockEmbeddingGenerator;

    /// Builds an aligner from the default configuration and the mock embedding generator.
    fn default_aligner() -> CrossLanguageAligner {
        CrossLanguageAligner::new(
            CrossLanguageConfig::default(),
            Box::new(MockEmbeddingGenerator::new()),
        )
    }

    #[test]
    fn test_cross_language_config_creation() {
        let config = CrossLanguageConfig::default();

        assert!(!config.supported_languages.is_empty());
        assert_eq!(config.primary_language, "en");
        assert!(config.enable_language_detection);
    }

    #[test]
    fn test_language_detector_creation() {
        let detector = SimpleLanguageDetector::new(vec![
            "en".to_string(),
            "es".to_string(),
            "fr".to_string(),
        ]);

        assert!(detector.is_supported("en"));
        assert!(detector.is_supported("es"));
        assert!(!detector.is_supported("de"));
    }

    #[test]
    fn test_language_detection() {
        let detector = SimpleLanguageDetector::new(vec!["en".to_string(), "es".to_string()]);

        let english = detector.detect_language("Hello world").unwrap();
        assert_eq!(english.language, "en");
        assert!(english.confidence > 0.0);

        // No accented characters here, so the simple detector defaults to English.
        let unaccented_spanish = detector.detect_language("Hola mundo").unwrap();
        assert_eq!(unaccented_spanish.language, "en");
    }

    #[test]
    fn test_alignment_strategy_variants() {
        let strategies = [
            AlignmentStrategy::MultilingualEmbeddings,
            AlignmentStrategy::TranslationBased,
            AlignmentStrategy::Hybrid,
            AlignmentStrategy::LearnedMappings,
        ];

        for strategy in strategies.iter() {
            let config = CrossLanguageConfig {
                alignment_strategy: strategy.clone(),
                ..Default::default()
            };
            assert_eq!(&config.alignment_strategy, strategy);
        }
    }

    #[tokio::test]
    async fn test_cross_language_aligner_creation() {
        let aligner = default_aligner();

        assert_eq!(aligner.get_supported_languages().len(), 10);
    }

    #[tokio::test]
    async fn test_content_processing() {
        let aligner = default_aligner();

        let processed = aligner
            .process_content("Hello world", "test_id")
            .await
            .unwrap();

        assert_eq!(processed.id, "test_id");
        assert_eq!(processed.text, "Hello world");
        assert!(processed.vector.is_some());
        assert!(!processed.aligned_vectors.is_empty());
    }

    #[test]
    fn test_vector_combination() {
        let aligner = default_aligner();

        let first = Vector::new(vec![1.0, 2.0, 3.0]);
        let second = Vector::new(vec![2.0, 4.0, 6.0]);

        let averaged = aligner.combine_vectors(&first, &second).unwrap();

        assert_eq!(averaged.as_f32(), vec![1.5, 3.0, 4.5]);
    }

    #[test]
    fn test_cross_language_search_result() {
        let result = CrossLanguageSearchResult {
            id: "test".to_string(),
            similarity: 0.8,
            language: "en".to_string(),
            text: "test content".to_string(),
            translated_text: Some("contenido de prueba".to_string()),
            cross_lingual_metrics: HashMap::new(),
        };

        assert_eq!(result.id, "test");
        assert_eq!(result.similarity, 0.8);
        assert_eq!(result.language, "en");
    }
}