axonml-llm 0.6.2

Large Language Model architectures for the Axonml ML framework
//! Embedding Module — Token, Position, Segment, and Sinusoidal Embeddings
//!
//! Embedding-layer building blocks for the transformer models in `axonml-llm`.
//! `TokenEmbedding` wraps `axonml_nn::Embedding` and casts `Tensor<u32>` token
//! IDs to `f32` indices before the embedding lookup. `PositionalEmbedding` is
//! the learned
//! variant with `forward_positions(seq_len, batch_size)` generating a
//! `[batch, seq, embed]` tensor via `Variable::expand`.
//! `SinusoidalPositionalEncoding` precomputes the classic sin/cos table with
//! `div_term = 10000^(2i/d)`, uses `Tensor::narrow` to slice `seq_len` rows
//! when the request is shorter than the table, and falls back to the full
//! tensor otherwise. `BertEmbedding` combines word, position, and token-type
//! (segment) `Embedding` layers, runs them through a local `LayerNorm`
//! (weight/bias `Parameter`, `mean_dim`/`var_dim` over the last axis, scale +
//! shift), then applies `Dropout`; `forward_with_ids` supports optional
//! `token_type_ids` and `position_ids` with auto-generated defaults.
//! `GPT2Embedding` is the GPT-style sibling with `wte` (token) + `wpe`
//! (position) lookups, no segment embedding, and no layer norm. The `Module`
//! impls assume the input `Variable` already holds `f32` token indices,
//! auto-build position/type tensors from the batch/seq shape, and thread
//! `train()` / `eval()` into the `Dropout`. A private `u32_to_f32_tensor`
//! helper does the index cast. Tests
//! cover token embedding shape `[2, 2, 64]`, learned positional embedding
//! shape `[2, 16, 64]`, sinusoidal slice shape `[16, 64]`, and GPT-2
//! embedding shape `[2, 2, 64]`.
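//!
//! # Examples
//!
//! A minimal usage sketch mirroring the tests at the bottom of this file
//! (reaching the types through `axonml_llm::embedding` is an assumption about
//! how the crate re-exports this module):
//!
//! ```ignore
//! use axonml_llm::embedding::{SinusoidalPositionalEncoding, TokenEmbedding};
//! use axonml_tensor::Tensor;
//!
//! // Token lookup: [2, 2] u32 IDs -> [2, 2, 64] f32 embeddings.
//! let tok = TokenEmbedding::new(1000, 64);
//! let ids = Tensor::from_vec(vec![1u32, 2, 3, 4], &[2, 2]).unwrap();
//! assert_eq!(tok.forward_ids(&ids).data().shape(), &[2, 2, 64]);
//!
//! // Fixed sin/cos table sliced to the requested length: [16, 64].
//! let pe = SinusoidalPositionalEncoding::new(100, 64);
//! assert_eq!(pe.forward_seq(16).data().shape(), &[16, 64]);
//! ```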
//!
//! # File
//! `crates/axonml-llm/src/embedding.rs`
//!
//! # Author
//! Andrew Jewell Sr. — AutomataNexus LLC
//! ORCID: 0009-0005-2158-7060
//!
//! # Updated
//! April 16, 2026 11:15 PM EST
//!
//! # Disclaimer
//! Use at your own risk. This software is provided "as is", without warranty of any
//! kind, express or implied. The author and AutomataNexus shall not be held
//! liable for any damages arising from the use of this software.

// =============================================================================
// Imports
// =============================================================================

use axonml_autograd::Variable;
use axonml_nn::{Dropout, Embedding, Module, Parameter};
use axonml_tensor::Tensor;
use axonml_tensor::creation::{ones, zeros};

// =============================================================================
// TokenEmbedding
// =============================================================================

/// Token embedding layer.
#[derive(Debug)]
pub struct TokenEmbedding {
    /// Embedding layer
    pub embedding: Embedding,
}

impl TokenEmbedding {
    /// Creates a new token embedding.
    pub fn new(vocab_size: usize, embed_dim: usize) -> Self {
        Self {
            embedding: Embedding::new(vocab_size, embed_dim),
        }
    }

    /// Gets embeddings for token IDs.
    pub fn forward_ids(&self, input_ids: &Tensor<u32>) -> Variable {
        // Convert u32 indices to f32 and delegate to Embedding::forward,
        // which has proper EmbeddingBackward for gradient tracking
        let ids_f32: Vec<f32> = input_ids.to_vec().iter().map(|&x| x as f32).collect();
        let ids_var = Variable::new(Tensor::from_vec(ids_f32, input_ids.shape()).unwrap(), false);
        self.embedding.forward(&ids_var)
    }
}

impl Module for TokenEmbedding {
    fn forward(&self, input: &Variable) -> Variable {
        self.embedding.forward(input)
    }

    fn parameters(&self) -> Vec<Parameter> {
        self.embedding.parameters()
    }
}

// =============================================================================
// PositionalEmbedding (Learned)
// =============================================================================

/// Learned positional embedding.
#[derive(Debug)]
pub struct PositionalEmbedding {
    /// Position embedding weights
    pub embedding: Embedding,
    /// Maximum sequence length
    pub max_len: usize,
}

impl PositionalEmbedding {
    /// Creates a new learned positional embedding.
    pub fn new(max_len: usize, embed_dim: usize) -> Self {
        Self {
            embedding: Embedding::new(max_len, embed_dim),
            max_len,
        }
    }

    /// Gets positional embeddings for a sequence length.
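    ///
    /// A minimal call sketch, grounded in the shape test at the bottom of this
    /// file (the `axonml_llm::embedding` path is an assumption):
    ///
    /// ```ignore
    /// use axonml_llm::embedding::PositionalEmbedding;
    ///
    /// let pos = PositionalEmbedding::new(128, 64);
    /// // Positions 0..16, expanded across a batch of 2: shape [2, 16, 64].
    /// let out = pos.forward_positions(16, 2);
    /// assert_eq!(out.data().shape(), &[2, 16, 64]);
    /// ```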
    pub fn forward_positions(&self, seq_len: usize, batch_size: usize) -> Variable {
        let embed_dim = self.embedding.embedding_dim();

        // Create position indices [0, 1, 2, ..., seq_len-1]
        let positions: Vec<f32> = (0..seq_len).map(|p| p as f32).collect();
        let position_tensor = Tensor::from_vec(positions, &[1, seq_len]).unwrap();
        let position_var = Variable::new(position_tensor, false);

        // Lookup embeddings
        let pos_embeds = self.embedding.forward(&position_var);

        // Expand to batch size
        if batch_size > 1 {
            pos_embeds.expand(&[batch_size, seq_len, embed_dim])
        } else {
            pos_embeds
        }
    }
}

impl Module for PositionalEmbedding {
    fn forward(&self, input: &Variable) -> Variable {
        self.embedding.forward(input)
    }

    fn parameters(&self) -> Vec<Parameter> {
        self.embedding.parameters()
    }
}

// =============================================================================
// SinusoidalPositionalEncoding (Fixed)
// =============================================================================

/// Sinusoidal positional encoding (fixed, not learned).
#[derive(Debug)]
pub struct SinusoidalPositionalEncoding {
    /// Precomputed positional encodings
    pub encodings: Tensor<f32>,
    /// Maximum sequence length
    pub max_len: usize,
    /// Embedding dimension
    pub embed_dim: usize,
}

impl SinusoidalPositionalEncoding {
    /// Creates sinusoidal positional encodings.
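    ///
    /// Each row `pos` of the table holds the classic transformer encoding
    /// `PE[pos, 2i] = sin(pos / 10000^(2i / embed_dim))` and
    /// `PE[pos, 2i+1] = cos(pos / 10000^(2i / embed_dim))`, matching the loop
    /// below; an odd `embed_dim` leaves the final column at zero.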
    pub fn new(max_len: usize, embed_dim: usize) -> Self {
        let mut encodings = vec![0.0f32; max_len * embed_dim];

        for pos in 0..max_len {
            for i in 0..embed_dim / 2 {
                let div_term = (10000.0f32).powf(2.0 * i as f32 / embed_dim as f32);
                let angle = pos as f32 / div_term;

                encodings[pos * embed_dim + 2 * i] = angle.sin();
                encodings[pos * embed_dim + 2 * i + 1] = angle.cos();
            }
        }

        Self {
            encodings: Tensor::from_vec(encodings, &[max_len, embed_dim]).unwrap(),
            max_len,
            embed_dim,
        }
    }

    /// Gets positional encodings for a sequence.
    pub fn forward_seq(&self, seq_len: usize) -> Variable {
        if seq_len >= self.max_len {
            // Requested length covers the whole table; return all max_len rows
            Variable::new(self.encodings.clone(), false)
        } else {
            // Use narrow to slice first seq_len rows — stays on GPU if encodings are on GPU
            let sliced = self.encodings.narrow(0, 0, seq_len).unwrap();
            Variable::new(sliced, false)
        }
    }
}

// =============================================================================
// BertEmbedding and LayerNorm
// =============================================================================

/// BERT-style embeddings (token + position + segment).
#[derive(Debug)]
pub struct BertEmbedding {
    /// Token embeddings
    pub word_embeddings: Embedding,
    /// Position embeddings
    pub position_embeddings: Embedding,
    /// Token type embeddings (segment embeddings)
    pub token_type_embeddings: Embedding,
    /// Layer normalization
    pub layer_norm: LayerNorm,
    /// Dropout
    pub dropout: Dropout,
    /// Embedding dimension
    pub embed_dim: usize,
}

/// Simple layer norm implementation for embeddings.
#[derive(Debug)]
pub struct LayerNorm {
    weight: Parameter,
    bias: Parameter,
    eps: f32,
}

// -----------------------------------------------------------------------------
// LayerNorm (local to embedding module)
// -----------------------------------------------------------------------------

impl LayerNorm {
    fn new(dim: usize, eps: f32) -> Self {
        let weight = Parameter::new(ones::<f32>(&[dim]), true);
        let bias = Parameter::new(zeros::<f32>(&[dim]), true);
        Self { weight, bias, eps }
    }

    fn forward(&self, x: &Variable) -> Variable {
        // Normalize over last dimension
        let mean = x.mean_dim(-1, true);
        let variance = x.var_dim(-1, true);

        let x_normalized = x.sub(&mean).div(&variance.add_scalar(self.eps).sqrt());

        // Scale and shift
        let weight_var = Variable::from_tensor_with_grad(
            self.weight.data().clone(),
            self.weight.requires_grad(),
        );
        let bias_var =
            Variable::from_tensor_with_grad(self.bias.data().clone(), self.bias.requires_grad());

        x_normalized.mul(&weight_var).add(&bias_var)
    }

    fn parameters(&self) -> Vec<Parameter> {
        vec![self.weight.clone(), self.bias.clone()]
    }
}

// -----------------------------------------------------------------------------
// BertEmbedding
// -----------------------------------------------------------------------------

impl BertEmbedding {
    /// Creates BERT embeddings.
    pub fn new(
        vocab_size: usize,
        max_position_embeddings: usize,
        type_vocab_size: usize,
        hidden_size: usize,
        layer_norm_eps: f32,
        dropout_prob: f32,
    ) -> Self {
        Self {
            word_embeddings: Embedding::new(vocab_size, hidden_size),
            position_embeddings: Embedding::new(max_position_embeddings, hidden_size),
            token_type_embeddings: Embedding::new(type_vocab_size, hidden_size),
            layer_norm: LayerNorm::new(hidden_size, layer_norm_eps),
            dropout: Dropout::new(dropout_prob),
            embed_dim: hidden_size,
        }
    }

    /// Forward pass with token IDs, position IDs, and token type IDs.
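    ///
    /// When `position_ids` / `token_type_ids` are `None`, positions default to
    /// `0..seq_len` and segment IDs to all zeros. A minimal sketch with
    /// illustrative sizes (vocab 1000, 128 positions, 2 segments, hidden 64;
    /// the crate path is an assumption):
    ///
    /// ```ignore
    /// use axonml_llm::embedding::BertEmbedding;
    /// use axonml_tensor::Tensor;
    ///
    /// let emb = BertEmbedding::new(1000, 128, 2, 64, 1e-12, 0.1);
    /// let ids = Tensor::from_vec(vec![1u32, 2, 3, 4], &[2, 2]).unwrap();
    /// let out = emb.forward_with_ids(&ids, None, None);
    /// assert_eq!(out.data().shape(), &[2, 2, 64]); // [batch, seq, hidden]
    /// ```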
    pub fn forward_with_ids(
        &self,
        input_ids: &Tensor<u32>,
        token_type_ids: Option<&Tensor<u32>>,
        position_ids: Option<&Tensor<u32>>,
    ) -> Variable {
        let batch_size = input_ids.shape()[0];
        let seq_len = input_ids.shape()[1];

        // Token embeddings
        let input_ids_f32 = Self::u32_to_f32_tensor(input_ids);
        let word_embeds = self
            .word_embeddings
            .forward(&Variable::new(input_ids_f32, false));

        // Position embeddings
        let pos_ids = if let Some(ids) = position_ids {
            Self::u32_to_f32_tensor(ids)
        } else {
            let positions: Vec<f32> = (0..seq_len).map(|p| p as f32).collect();
            let pos_data: Vec<f32> = (0..batch_size)
                .flat_map(|_| positions.iter().cloned())
                .collect();
            Tensor::from_vec(pos_data, &[batch_size, seq_len]).unwrap()
        };
        let position_embeds = self
            .position_embeddings
            .forward(&Variable::new(pos_ids, false));

        // Token type embeddings
        let type_ids = if let Some(ids) = token_type_ids {
            Self::u32_to_f32_tensor(ids)
        } else {
            zeros::<f32>(&[batch_size, seq_len])
        };
        let token_type_embeds = self
            .token_type_embeddings
            .forward(&Variable::new(type_ids, false));

        // Combine embeddings
        let embeddings = word_embeds.add(&position_embeds).add(&token_type_embeds);

        // Layer norm and dropout
        let embeddings = self.layer_norm.forward(&embeddings);
        self.dropout.forward(&embeddings)
    }

    fn u32_to_f32_tensor(t: &Tensor<u32>) -> Tensor<f32> {
        let data: Vec<f32> = t.to_vec().iter().map(|&x| x as f32).collect();
        Tensor::from_vec(data, t.shape()).unwrap()
    }
}

impl Module for BertEmbedding {
    fn forward(&self, input: &Variable) -> Variable {
        // For Module trait, assume input is already f32 token indices
        let input_data = input.data();
        let shape = input_data.shape();
        let batch_size = shape[0];
        let seq_len = shape[1];

        let word_embeds = self.word_embeddings.forward(input);

        // Generate position IDs
        let positions: Vec<f32> = (0..seq_len).map(|p| p as f32).collect();
        let pos_data: Vec<f32> = (0..batch_size)
            .flat_map(|_| positions.iter().cloned())
            .collect();
        let pos_tensor = Tensor::from_vec(pos_data, &[batch_size, seq_len]).unwrap();
        let position_embeds = self
            .position_embeddings
            .forward(&Variable::new(pos_tensor, false));

        // Token type embeddings (assume all zeros)
        let type_tensor = zeros::<f32>(&[batch_size, seq_len]);
        let token_type_embeds = self
            .token_type_embeddings
            .forward(&Variable::new(type_tensor, false));

        let embeddings = word_embeds.add(&position_embeds).add(&token_type_embeds);
        let embeddings = self.layer_norm.forward(&embeddings);
        self.dropout.forward(&embeddings)
    }

    fn parameters(&self) -> Vec<Parameter> {
        let mut params = Vec::new();
        params.extend(self.word_embeddings.parameters());
        params.extend(self.position_embeddings.parameters());
        params.extend(self.token_type_embeddings.parameters());
        params.extend(self.layer_norm.parameters());
        params
    }

    fn train(&mut self) {
        self.dropout.train();
    }

    fn eval(&mut self) {
        self.dropout.eval();
    }
}

// =============================================================================
// GPT2Embedding
// =============================================================================

/// GPT-2 style embeddings (token + position).
#[derive(Debug)]
pub struct GPT2Embedding {
    /// Token embeddings
    pub wte: Embedding,
    /// Position embeddings
    pub wpe: Embedding,
    /// Dropout
    pub dropout: Dropout,
    /// Embedding dimension
    pub n_embd: usize,
}

impl GPT2Embedding {
    /// Creates GPT-2 embeddings.
    pub fn new(vocab_size: usize, n_ctx: usize, n_embd: usize, dropout: f32) -> Self {
        Self {
            wte: Embedding::new(vocab_size, n_embd),
            wpe: Embedding::new(n_ctx, n_embd),
            dropout: Dropout::new(dropout),
            n_embd,
        }
    }

    /// Forward pass with token IDs.
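    ///
    /// A minimal sketch mirroring the GPT-2 test at the bottom of this file
    /// (the crate path is an assumption):
    ///
    /// ```ignore
    /// use axonml_llm::embedding::GPT2Embedding;
    /// use axonml_tensor::Tensor;
    ///
    /// let emb = GPT2Embedding::new(1000, 128, 64, 0.0);
    /// let ids = Tensor::from_vec(vec![1u32, 2, 3, 4], &[2, 2]).unwrap();
    /// // wte(ids) + wpe(positions), then dropout: shape [2, 2, 64].
    /// let out = emb.forward_ids(&ids);
    /// assert_eq!(out.data().shape(), &[2, 2, 64]);
    /// ```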
    pub fn forward_ids(&self, input_ids: &Tensor<u32>) -> Variable {
        let batch_size = input_ids.shape()[0];
        let seq_len = input_ids.shape()[1];

        // Token embeddings
        let input_ids_f32 = Self::u32_to_f32_tensor(input_ids);
        let token_embeds = self.wte.forward(&Variable::new(input_ids_f32, false));

        // Position embeddings
        let positions: Vec<f32> = (0..seq_len).map(|p| p as f32).collect();
        let pos_data: Vec<f32> = (0..batch_size)
            .flat_map(|_| positions.iter().cloned())
            .collect();
        let pos_tensor = Tensor::from_vec(pos_data, &[batch_size, seq_len]).unwrap();
        let position_embeds = self.wpe.forward(&Variable::new(pos_tensor, false));

        // Combine and apply dropout
        let embeddings = token_embeds.add(&position_embeds);
        self.dropout.forward(&embeddings)
    }

    fn u32_to_f32_tensor(t: &Tensor<u32>) -> Tensor<f32> {
        let data: Vec<f32> = t.to_vec().iter().map(|&x| x as f32).collect();
        Tensor::from_vec(data, t.shape()).unwrap()
    }
}

impl Module for GPT2Embedding {
    fn forward(&self, input: &Variable) -> Variable {
        let input_data = input.data();
        let shape = input_data.shape();
        let batch_size = shape[0];
        let seq_len = shape[1];

        let token_embeds = self.wte.forward(input);

        // Position embeddings
        let positions: Vec<f32> = (0..seq_len).map(|p| p as f32).collect();
        let pos_data: Vec<f32> = (0..batch_size)
            .flat_map(|_| positions.iter().cloned())
            .collect();
        let pos_tensor = Tensor::from_vec(pos_data, &[batch_size, seq_len]).unwrap();
        let position_embeds = self.wpe.forward(&Variable::new(pos_tensor, false));

        let embeddings = token_embeds.add(&position_embeds);
        self.dropout.forward(&embeddings)
    }

    fn parameters(&self) -> Vec<Parameter> {
        let mut params = Vec::new();
        params.extend(self.wte.parameters());
        params.extend(self.wpe.parameters());
        params
    }

    fn train(&mut self) {
        self.dropout.train();
    }

    fn eval(&mut self) {
        self.dropout.eval();
    }
}

// =============================================================================
// Tests
// =============================================================================

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_token_embedding() {
        let embed = TokenEmbedding::new(1000, 64);
        let input_ids = Tensor::from_vec(vec![1u32, 2, 3, 4], &[2, 2]).unwrap();
        let output = embed.forward_ids(&input_ids);

        assert_eq!(output.data().shape(), &[2, 2, 64]);
    }

    #[test]
    fn test_positional_embedding() {
        let embed = PositionalEmbedding::new(128, 64);
        let output = embed.forward_positions(16, 2);

        assert_eq!(output.data().shape(), &[2, 16, 64]);
    }

    #[test]
    fn test_sinusoidal_encoding() {
        let encoding = SinusoidalPositionalEncoding::new(100, 64);
        let output = encoding.forward_seq(16);

        assert_eq!(output.data().shape(), &[16, 64]);
    }

    #[test]
    fn test_gpt2_embedding() {
        let embed = GPT2Embedding::new(1000, 128, 64, 0.0);
        let input_ids = Tensor::from_vec(vec![1u32, 2, 3, 4], &[2, 2]).unwrap();
        let output = embed.forward_ids(&input_ids);

        assert_eq!(output.data().shape(), &[2, 2, 64]);
    }
}