oxirs-embed 0.2.4

//! Text, KG, and alignment network encoders for multi-modal embeddings

use anyhow::Result;
use scirs2_core::ndarray_ext::{Array1, Array2};
use scirs2_core::random::{Random, RngExt};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

/// Text encoder for multi-modal embeddings
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TextEncoder {
    /// Encoder type (BERT, RoBERTa, etc.)
    pub encoder_type: String,
    /// Input dimension
    pub input_dim: usize,
    /// Output dimension
    pub output_dim: usize,
    /// Learned parameters (simplified representation)
    pub parameters: HashMap<String, Array2<f32>>,
}

impl TextEncoder {
    pub fn new(encoder_type: String, input_dim: usize, output_dim: usize) -> Self {
        let mut parameters = HashMap::new();

        // Initialize key transformation matrices
        let mut random = Random::default();
        parameters.insert(
            "projection".to_string(),
            Array2::from_shape_fn((output_dim, input_dim), |(_, _)| {
                (random.random::<f32>() - 0.5) * 0.1
            }),
        );

        let mut random = Random::default();
        parameters.insert(
            "attention".to_string(),
            Array2::from_shape_fn((output_dim, output_dim), |(_, _)| {
                (random.random::<f32>() - 0.5) * 0.1
            }),
        );

        Self {
            encoder_type,
            input_dim,
            output_dim,
            parameters,
        }
    }

    /// Encode text into embeddings
    pub fn encode(&self, text: &str) -> Result<Array1<f32>> {
        let input_features = self.extract_text_features(text);
        let projection = self
            .parameters
            .get("projection")
            .expect("parameter 'projection' should be initialized");

        // Simple linear projection (in real implementation would be full transformer)
        let encoded = projection.dot(&input_features);

        // Apply layer normalization
        let mean = encoded.mean().unwrap_or(0.0);
        let var = encoded.var(0.0);
        let normalized = encoded.mapv(|x| (x - mean) / (var + 1e-8).sqrt());

        Ok(normalized)
    }

    /// Extract features from text (simplified)
    fn extract_text_features(&self, text: &str) -> Array1<f32> {
        let mut features = vec![0.0; self.input_dim];

        // Simple bag-of-words features (would be tokenization + embeddings in real implementation)
        let words: Vec<&str> = text.split_whitespace().collect();
        for (i, word) in words.iter().enumerate() {
            if i < self.input_dim {
                features[i] = word.len() as f32 / 10.0; // Simple word length feature
            }
        }

        // Add sentence-level features
        if self.input_dim > words.len() {
            features[words.len()] = text.len() as f32 / 100.0; // Text length
            if self.input_dim > words.len() + 1 {
                features[words.len() + 1] = words.len() as f32 / 20.0; // Word count
            }
        }

        Array1::from_vec(features)
    }
}

/// Knowledge graph encoder for multi-modal embeddings
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KGEncoder {
    /// Encoder architecture (TransE, ComplEx, etc.)
    pub architecture: String,
    /// Entity embedding dimension
    pub entity_dim: usize,
    /// Relation embedding dimension
    pub relation_dim: usize,
    /// Output dimension
    pub output_dim: usize,
    /// Learned parameters
    pub parameters: HashMap<String, Array2<f32>>,
}

impl KGEncoder {
    pub fn new(
        architecture: String,
        entity_dim: usize,
        relation_dim: usize,
        output_dim: usize,
    ) -> Self {
        let mut parameters = HashMap::new();

        // Initialize transformation matrices
        let mut random = Random::default();
        parameters.insert(
            "entity_projection".to_string(),
            Array2::from_shape_fn((output_dim, entity_dim), |(_, _)| {
                (random.random::<f32>() - 0.5) * 0.1
            }),
        );

        let mut random = Random::default();
        parameters.insert(
            "relation_projection".to_string(),
            Array2::from_shape_fn((output_dim, relation_dim), |(_, _)| {
                (random.random::<f32>() - 0.5) * 0.1
            }),
        );

        Self {
            architecture,
            entity_dim,
            relation_dim,
            output_dim,
            parameters,
        }
    }

    /// Encode knowledge graph entity
    pub fn encode_entity(&self, entity_embedding: &Array1<f32>) -> Result<Array1<f32>> {
        let projection = self
            .parameters
            .get("entity_projection")
            .expect("parameter 'entity_projection' should be initialized");

        // Ensure dimension compatibility for matrix-vector multiplication
        if projection.ncols() != entity_embedding.len() {
            // Truncate or pad entity embedding to match projection input dimension
            let target_dim = projection.ncols();
            let mut adjusted_embedding = Array1::zeros(target_dim);

            let copy_len = entity_embedding.len().min(target_dim);
            adjusted_embedding
                .slice_mut(scirs2_core::ndarray_ext::s![..copy_len])
                .assign(&entity_embedding.slice(scirs2_core::ndarray_ext::s![..copy_len]));

            Ok(projection.dot(&adjusted_embedding))
        } else {
            Ok(projection.dot(entity_embedding))
        }
    }

    /// Encode knowledge graph relation
    pub fn encode_relation(&self, relation_embedding: &Array1<f32>) -> Result<Array1<f32>> {
        let projection = self
            .parameters
            .get("relation_projection")
            .expect("parameter 'relation_projection' should be initialized");

        // Ensure dimension compatibility for matrix-vector multiplication
        if projection.ncols() != relation_embedding.len() {
            // Truncate or pad relation embedding to match projection input dimension
            let target_dim = projection.ncols();
            let mut adjusted_embedding = Array1::zeros(target_dim);

            let copy_len = relation_embedding.len().min(target_dim);
            adjusted_embedding
                .slice_mut(scirs2_core::ndarray_ext::s![..copy_len])
                .assign(&relation_embedding.slice(scirs2_core::ndarray_ext::s![..copy_len]));

            Ok(projection.dot(&adjusted_embedding))
        } else {
            Ok(projection.dot(relation_embedding))
        }
    }

    /// Encode structured knowledge (entity + relations)
    pub fn encode_structured(
        &self,
        entity: &Array1<f32>,
        relations: &[Array1<f32>],
    ) -> Result<Array1<f32>> {
        let entity_encoded = self.encode_entity(entity)?;

        // Aggregate relation information
        let mut relation_agg = Array1::<f32>::zeros(self.output_dim);
        for relation in relations {
            let rel_encoded = self.encode_relation(relation)?;
            relation_agg = &relation_agg + &rel_encoded;
        }

        if !relations.is_empty() {
            relation_agg /= relations.len() as f32;
        }

        // Combine entity and relation information
        Ok(&entity_encoded + &relation_agg)
    }
}

/// Alignment network for cross-modal learning
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlignmentNetwork {
    /// Network architecture
    pub architecture: String,
    /// Input dimensions (text_dim, kg_dim)
    pub input_dims: (usize, usize),
    /// Hidden dimension
    pub hidden_dim: usize,
    /// Output dimension
    pub output_dim: usize,
    /// Network parameters
    pub parameters: HashMap<String, Array2<f32>>,
}

impl AlignmentNetwork {
    pub fn new(
        architecture: String,
        text_dim: usize,
        kg_dim: usize,
        hidden_dim: usize,
        output_dim: usize,
    ) -> Self {
        let mut parameters = HashMap::new();

        // Text pathway
        let mut random = Random::default();
        parameters.insert(
            "text_hidden".to_string(),
            Array2::from_shape_fn((hidden_dim, text_dim), |(_, _)| {
                (random.random::<f32>() - 0.5) * 0.1
            }),
        );

        let mut random = Random::default();
        parameters.insert(
            "text_output".to_string(),
            Array2::from_shape_fn((output_dim, hidden_dim), |(_, _)| {
                (random.random::<f32>() - 0.5) * 0.1
            }),
        );

        // KG pathway
        let mut random = Random::default();
        parameters.insert(
            "kg_hidden".to_string(),
            Array2::from_shape_fn((hidden_dim, kg_dim), |(_, _)| {
                (random.random::<f32>() - 0.5) * 0.1
            }),
        );

        let mut random = Random::default();
        parameters.insert(
            "kg_output".to_string(),
            Array2::from_shape_fn((output_dim, hidden_dim), |(_, _)| {
                (random.random::<f32>() - 0.5) * 0.1
            }),
        );

        // Cross-modal attention
        let mut random = Random::default();
        parameters.insert(
            "cross_attention".to_string(),
            Array2::from_shape_fn((output_dim, output_dim), |(_, _)| {
                (random.random::<f32>() - 0.5) * 0.1
            }),
        );

        Self {
            architecture,
            input_dims: (text_dim, kg_dim),
            hidden_dim,
            output_dim,
            parameters,
        }
    }

    /// Align text and KG embeddings
    pub fn align(
        &self,
        text_emb: &Array1<f32>,
        kg_emb: &Array1<f32>,
    ) -> Result<(Array1<f32>, f32)> {
        // Process text embedding
        let text_hidden_matrix = self
            .parameters
            .get("text_hidden")
            .expect("parameter 'text_hidden' should be initialized");
        let text_hidden = text_hidden_matrix.dot(text_emb);
        let text_hidden = text_hidden.mapv(|x| x.max(0.0)); // ReLU activation
        let text_output_matrix = self
            .parameters
            .get("text_output")
            .expect("parameter 'text_output' should be initialized");
        let text_output = text_output_matrix.dot(&text_hidden);

        // Process KG embedding
        let kg_hidden_matrix = self
            .parameters
            .get("kg_hidden")
            .expect("parameter 'kg_hidden' should be initialized");
        let kg_hidden = kg_hidden_matrix.dot(kg_emb);
        let kg_hidden = kg_hidden.mapv(|x| x.max(0.0)); // ReLU activation
        let kg_output_matrix = self
            .parameters
            .get("kg_output")
            .expect("parameter 'kg_output' should be initialized");
        let kg_output = kg_output_matrix.dot(&kg_hidden);

        // Cross-modal attention
        let attention_weights = self.compute_attention(&text_output, &kg_output)?;

        // Weighted combination (ensure same dimensions)
        let min_dim = text_output.len().min(kg_output.len());
        let text_slice = text_output
            .slice(scirs2_core::ndarray_ext::s![..min_dim])
            .to_owned();
        let kg_slice = kg_output
            .slice(scirs2_core::ndarray_ext::s![..min_dim])
            .to_owned();
        let unified = &text_slice * attention_weights + &kg_slice * (1.0 - attention_weights);

        // Compute alignment score
        let alignment_score = self.compute_alignment_score(&text_output, &kg_output);

        Ok((unified, alignment_score))
    }

    /// Compute cross-modal attention weights
    fn compute_attention(&self, text_emb: &Array1<f32>, kg_emb: &Array1<f32>) -> Result<f32> {
        // Ensure both embeddings have the same dimension
        let min_dim = text_emb.len().min(kg_emb.len());
        let text_slice = text_emb.slice(scirs2_core::ndarray_ext::s![..min_dim]);
        let kg_slice = kg_emb.slice(scirs2_core::ndarray_ext::s![..min_dim]);

        // Simple dot product attention (avoiding matrix multiplication dimension issues)
        let attention_score = text_slice.dot(&kg_slice);
        let attention_weight = 1.0 / (1.0 + (-attention_score).exp()); // Sigmoid

        Ok(attention_weight)
    }

    /// Compute alignment score between modalities
    pub fn compute_alignment_score(&self, text_emb: &Array1<f32>, kg_emb: &Array1<f32>) -> f32 {
        // Ensure same dimensions for cosine similarity
        let min_dim = text_emb.len().min(kg_emb.len());
        let text_slice = text_emb.slice(scirs2_core::ndarray_ext::s![..min_dim]);
        let kg_slice = kg_emb.slice(scirs2_core::ndarray_ext::s![..min_dim]);

        // Cosine similarity
        let dot_product = text_slice.dot(&kg_slice);
        let text_norm = text_slice.dot(&text_slice).sqrt();
        let kg_norm = kg_slice.dot(&kg_slice).sqrt();

        if text_norm > 0.0 && kg_norm > 0.0 {
            dot_product / (text_norm * kg_norm)
        } else {
            0.0
        }
    }
}