scirs2-neural 0.4.2

Neural network building blocks module for SciRS2 (scirs2-neural) - Minimal Version
//! Multi-Query Attention (MQA) implementation
//!
//! This module implements Multi-Query Attention as described in:
//! "Fast Transformer Decoding: One Write-Head is All You Need"
//! by Noam Shazeer (2019).
//!
//! In MQA, all query heads share a single set of key and value projections.
//! This drastically reduces the KV cache size during autoregressive generation
//! (by a factor of `num_heads`), while maintaining most of the quality of
//! standard multi-head attention.
//!
//! When `num_kv_heads == num_heads`, MQA reduces to standard MHA; values in
//! between correspond to grouped-query attention (GQA).
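//!
//! For example, with `num_heads = 8`, `head_dim = 64`, 1024 cached tokens and
//! `f32` activations, MHA stores 2 * 1024 * 8 * 64 * 4 B = 4 MiB of keys and
//! values per batch element, while pure MQA (`num_kv_heads = 1`) stores
//! 2 * 1024 * 1 * 64 * 4 B = 512 KiB (an 8x reduction).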

use crate::error::{NeuralError, Result};
use crate::layers::Layer;
use scirs2_core::ndarray::{Array, Array4, IxDyn, ScalarOperand};
use scirs2_core::numeric::{Float, NumAssign};
use scirs2_core::random::{Rng, RngExt};
use std::fmt::Debug;

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

/// Xavier-uniform weight initialisation returning an IxDyn array.
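/// Samples from `U(-a, a)` with `a = sqrt(6 / (rows + cols))`.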
fn mk_weight<F: Float, R: Rng>(rows: usize, cols: usize, rng: &mut R) -> Result<Array<F, IxDyn>> {
    let scale = (6.0_f64 / (rows + cols) as f64).sqrt();
    let mut data = Vec::with_capacity(rows * cols);
    for _ in 0..(rows * cols) {
        let x: f64 = rng.random_range(-scale..scale);
        let f = F::from(x)
            .ok_or_else(|| NeuralError::InvalidArchitecture("xavier cast failed".into()))?;
        data.push(f);
    }
    Array::from_shape_vec(IxDyn(&[rows, cols]), data)
        .map_err(|e| NeuralError::InvalidArchitecture(format!("mk_weight: {e}")))
}

/// Softmax over a mutable slice (in-place, numerically stable).
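/// Computes `exp(x_i - max(x)) / sum_j exp(x_j - max(x))`; subtracting the
/// maximum first keeps the exponentials from overflowing.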
fn softmax_inplace<F: Float + NumAssign>(s: &mut [F]) {
    let max_v = s
        .iter()
        .fold(F::neg_infinity(), |a, &b| if b > a { b } else { a });
    let mut sum = F::zero();
    for v in s.iter_mut() {
        *v = (*v - max_v).exp();
        sum += *v;
    }
    let eps = F::from(1e-12).unwrap_or(F::zero());
    let norm = if sum < eps { eps } else { sum };
    for v in s.iter_mut() {
        *v /= norm;
    }
}

// ---------------------------------------------------------------------------
// KV Cache
// ---------------------------------------------------------------------------

/// Key-Value cache for autoregressive generation
///
/// Stores past key and value tensors so they do not need to be recomputed
/// during incremental decoding.
#[derive(Debug, Clone)]
pub struct KvCache<F: Float> {
    /// Cached keys: [batch, past_len, num_kv_heads, head_dim]
    pub keys: Array<F, IxDyn>,
    /// Cached values: [batch, past_len, num_kv_heads, head_dim]
    pub values: Array<F, IxDyn>,
}

// ---------------------------------------------------------------------------
// Configuration
// ---------------------------------------------------------------------------

/// Configuration for Multi-Query Attention
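///
/// A minimal sketch of the builder API defined below, including a
/// grouped-query configuration:
///
/// ```rust
/// use scirs2_neural::layers::MultiQueryAttentionConfig;
///
/// // 8 query heads sharing 2 KV heads, with causal masking.
/// let cfg = MultiQueryAttentionConfig::new(8, 64)
///     .with_num_kv_heads(2)
///     .with_causal(true);
/// assert_eq!(cfg.num_heads / cfg.num_kv_heads, 4); // 4 query heads per KV head
/// ```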
#[derive(Debug, Clone)]
pub struct MultiQueryAttentionConfig {
    /// Number of query heads
    pub num_heads: usize,
    /// Number of KV heads (1 = pure MQA, num_heads = standard MHA)
    pub num_kv_heads: usize,
    /// Per-head dimension
    pub head_dim: usize,
    /// Dropout probability (reserved; not applied in the current forward pass)
    pub dropout_prob: f64,
    /// Whether to apply causal masking
    pub causal: bool,
}

impl Default for MultiQueryAttentionConfig {
    fn default() -> Self {
        Self {
            num_heads: 8,
            num_kv_heads: 1,
            head_dim: 64,
            dropout_prob: 0.0,
            causal: false,
        }
    }
}

impl MultiQueryAttentionConfig {
    /// Create a pure MQA config (1 KV head)
    pub fn new(num_heads: usize, head_dim: usize) -> Self {
        Self {
            num_heads,
            num_kv_heads: 1,
            head_dim,
            ..Default::default()
        }
    }

    /// Set number of KV heads (1 = MQA, num_heads = MHA)
    pub fn with_num_kv_heads(mut self, n: usize) -> Self {
        self.num_kv_heads = n;
        self
    }

    /// Enable or disable causal masking
    pub fn with_causal(mut self, causal: bool) -> Self {
        self.causal = causal;
        self
    }

    /// Set dropout probability
    pub fn with_dropout(mut self, prob: f64) -> Self {
        self.dropout_prob = prob;
        self
    }
}

// ---------------------------------------------------------------------------
// Layer
// ---------------------------------------------------------------------------

/// Multi-Query Attention layer
///
/// Projects queries with `num_heads` independent heads but uses only
/// `num_kv_heads` (default 1) shared key/value heads.
///
/// # Input
/// 3D tensor `[batch, seq_len, d_model]`
///
/// # Output
/// 3D tensor `[batch, seq_len, d_model]`
///
/// # Examples
///
/// ```rust
/// use scirs2_neural::layers::{MultiQueryAttention, MultiQueryAttentionConfig, Layer};
/// use scirs2_core::ndarray::Array3;
/// use scirs2_core::random::rng;
///
/// let mut rng = rng();
/// let config = MultiQueryAttentionConfig::new(4, 16); // 4 Q heads, 1 KV head
/// let mqa = MultiQueryAttention::<f64>::new(64, config, &mut rng).expect("failed");
///
/// let input = Array3::<f64>::from_elem((2, 8, 64), 0.1).into_dyn();
/// let output = mqa.forward(&input).expect("failed");
/// assert_eq!(output.shape(), &[2, 8, 64]);
/// ```
#[derive(Debug)]
pub struct MultiQueryAttention<F: Float + Debug + Send + Sync + NumAssign> {
    d_model: usize,
    config: MultiQueryAttentionConfig,
    /// [d_model, num_heads * head_dim]
    w_q: Array<F, IxDyn>,
    /// [d_model, num_kv_heads * head_dim]
    w_k: Array<F, IxDyn>,
    /// [d_model, num_kv_heads * head_dim]
    w_v: Array<F, IxDyn>,
    /// [num_heads * head_dim, d_model]
    w_o: Array<F, IxDyn>,
    scale: F,
}

impl<F: Float + Debug + ScalarOperand + Send + Sync + 'static + NumAssign> MultiQueryAttention<F> {
    /// Create a new Multi-Query Attention layer
    pub fn new<R: Rng>(
        d_model: usize,
        config: MultiQueryAttentionConfig,
        rng: &mut R,
    ) -> Result<Self> {
        if config.num_heads == 0 || config.num_kv_heads == 0 || config.head_dim == 0 {
            return Err(NeuralError::InvalidArchitecture(
                "num_heads, num_kv_heads, head_dim must be > 0".into(),
            ));
        }

        if !config.num_heads.is_multiple_of(config.num_kv_heads) {
            return Err(NeuralError::InvalidArchitecture(format!(
                "num_heads ({}) must be divisible by num_kv_heads ({})",
                config.num_heads, config.num_kv_heads
            )));
        }

        let q_dim = config.num_heads * config.head_dim;
        let kv_dim = config.num_kv_heads * config.head_dim;

        if q_dim != d_model {
            return Err(NeuralError::InvalidArchitecture(format!(
                "num_heads * head_dim ({q_dim}) must equal d_model ({d_model})"
            )));
        }

        let w_q = mk_weight(d_model, q_dim, rng)?;
        let w_k = mk_weight(d_model, kv_dim, rng)?;
        let w_v = mk_weight(d_model, kv_dim, rng)?;
        let w_o = mk_weight(q_dim, d_model, rng)?;

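        // Scaled dot-product attention divides logits by sqrt(head_dim) so
        // their variance stays roughly constant as head_dim grows.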
        let scale = F::one()
            / F::from(config.head_dim)
                .ok_or_else(|| NeuralError::InvalidArchitecture("scale cast".into()))?
                .sqrt();

        Ok(Self {
            d_model,
            config,
            w_q,
            w_k,
            w_v,
            w_o,
            scale,
        })
    }

    /// Forward pass with optional KV cache for autoregressive generation
    ///
    /// # Arguments
    /// * `input` - [batch, seq_len, d_model]
    /// * `past_kv` - Optional past KV cache
    ///
    /// # Returns
    /// (output [batch, seq_len, d_model], updated KV cache)
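    ///
    /// # Example
    ///
    /// A minimal sketch mirroring the KV-cache test in this module: encode a
    /// prefix once, then decode single tokens while reusing the returned cache.
    ///
    /// ```rust
    /// use scirs2_neural::layers::{MultiQueryAttention, MultiQueryAttentionConfig};
    /// use scirs2_core::ndarray::Array3;
    /// use scirs2_core::random::rng;
    ///
    /// let mut rng = rng();
    /// let config = MultiQueryAttentionConfig::new(4, 16).with_causal(true);
    /// let mqa = MultiQueryAttention::<f64>::new(64, config, &mut rng).expect("failed");
    ///
    /// // Encode a 4-token prefix with no cache.
    /// let prefix = Array3::<f64>::from_elem((1, 4, 64), 0.1).into_dyn();
    /// let (_, cache) = mqa.forward_with_cache(&prefix, None).expect("failed");
    ///
    /// // Decode one new token, reusing the cached keys/values.
    /// let token = Array3::<f64>::from_elem((1, 1, 64), 0.2).into_dyn();
    /// let (out, cache) = mqa.forward_with_cache(&token, Some(&cache)).expect("failed");
    /// assert_eq!(out.shape(), &[1, 1, 64]);
    /// assert_eq!(cache.keys.shape()[1], 5); // 4 cached + 1 new
    /// ```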
    pub fn forward_with_cache(
        &self,
        input: &Array<F, IxDyn>,
        past_kv: Option<&KvCache<F>>,
    ) -> Result<(Array<F, IxDyn>, KvCache<F>)> {
        if input.ndim() != 3 {
            return Err(NeuralError::InvalidArchitecture(format!(
                "MQA expects 3D input, got {}D",
                input.ndim()
            )));
        }

        let shape = input.shape();
        let (batch, seq_len, d_model) = (shape[0], shape[1], shape[2]);

        if d_model != self.d_model {
            return Err(NeuralError::InvalidArchitecture(format!(
                "input dim {d_model} != d_model {}",
                self.d_model
            )));
        }

        let num_heads = self.config.num_heads;
        let num_kv_heads = self.config.num_kv_heads;
        let head_dim = self.config.head_dim;
        let group_size = num_heads / num_kv_heads;

        // Project Q, K, V
        let q_4d =
            self.project_and_reshape(input, &self.w_q, batch, seq_len, num_heads, head_dim)?;
        let k_new =
            self.project_and_reshape(input, &self.w_k, batch, seq_len, num_kv_heads, head_dim)?;
        let v_new =
            self.project_and_reshape(input, &self.w_v, batch, seq_len, num_kv_heads, head_dim)?;

        // Concatenate with past cache if provided
        let (k_4d, v_4d, total_kv_len) = if let Some(cache) = past_kv {
            let past_len = cache.keys.shape()[1];
            let total = past_len + seq_len;
            let k_full =
                self.concat_cache(&cache.keys, &k_new, batch, total, num_kv_heads, head_dim)?;
            let v_full =
                self.concat_cache(&cache.values, &v_new, batch, total, num_kv_heads, head_dim)?;
            (k_full, v_full, total)
        } else {
            (k_new, v_new, seq_len)
        };

        // Build updated cache
        let new_cache = KvCache {
            keys: k_4d.clone().into_dyn(),
            values: v_4d.clone().into_dyn(),
        };

        // Compute attention
        // Q: [batch, seq_len, num_heads, head_dim]
        // K, V: [batch, total_kv_len, num_kv_heads, head_dim]
        let past_len = past_kv.map(|c| c.keys.shape()[1]).unwrap_or(0);
        let mut output_4d = Array4::<F>::zeros((batch, seq_len, num_heads, head_dim));

        for b in 0..batch {
            for kv_h in 0..num_kv_heads {
                let q_h_start = kv_h * group_size;
                let q_h_end = q_h_start + group_size;
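                // Query heads q_h_start..q_h_end all attend through the single
                // shared KV head kv_h.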

                for q_h in q_h_start..q_h_end {
                    for t in 0..seq_len {
                        // Absolute position of this query token in the full
                        // (cached + new) sequence, used for causal masking.
                        let global_t = past_len + t;

                        let mut scores = Vec::with_capacity(total_kv_len);
                        for s_idx in 0..total_kv_len {
                            if self.config.causal && s_idx > global_t {
                                scores.push(F::neg_infinity());
                            } else {
                                let mut dot = F::zero();
                                for d in 0..head_dim {
                                    dot += q_4d[[b, t, q_h, d]] * k_4d[[b, s_idx, kv_h, d]];
                                }
                                scores.push(dot * self.scale);
                            }
                        }

                        softmax_inplace(&mut scores);

                        // Weighted sum of values
                        for d in 0..head_dim {
                            let mut acc = F::zero();
                            for s_idx in 0..total_kv_len {
                                acc += scores[s_idx] * v_4d[[b, s_idx, kv_h, d]];
                            }
                            output_4d[[b, t, q_h, d]] = acc;
                        }
                    }
                }
            }
        }

        // Reshape to [batch, seq_len, d_model] and project output
        let output_3d = output_4d
            .into_shape_with_order((batch, seq_len, d_model))
            .map_err(|e| NeuralError::InferenceError(format!("reshape output: {e}")))?;

        let output_2d = output_3d
            .into_shape_with_order((batch * seq_len, d_model))
            .map_err(|e| NeuralError::InferenceError(format!("reshape for O proj: {e}")))?;

        let w_o_2d = self
            .w_o
            .view()
            .into_dimensionality::<scirs2_core::ndarray::Ix2>()
            .map_err(|_| NeuralError::InferenceError("O weights 2D".into()))?;

        let final_out = output_2d.dot(&w_o_2d);

        let result = final_out
            .into_shape_with_order((batch, seq_len, d_model))
            .map_err(|e| NeuralError::InferenceError(format!("reshape final: {e}")))?;

        Ok((result.into_dyn(), new_cache))
    }

    /// Project input and reshape to [batch, seq, heads, head_dim]
    fn project_and_reshape(
        &self,
        input: &Array<F, IxDyn>,
        weight: &Array<F, IxDyn>,
        batch: usize,
        seq: usize,
        heads: usize,
        head_dim: usize,
    ) -> Result<Array4<F>> {
        let d_model = input.shape()[2];
        let proj_dim = heads * head_dim;

        // [batch * seq, d_model] @ [d_model, proj_dim] = [batch * seq, proj_dim]
        let input_2d = input
            .clone()
            .into_shape_with_order(IxDyn(&[batch * seq, d_model]))
            .map_err(|e| NeuralError::InferenceError(format!("reshape: {e}")))?;

        let input_2d_view = input_2d
            .view()
            .into_dimensionality::<scirs2_core::ndarray::Ix2>()
            .map_err(|_| NeuralError::InferenceError("to Ix2".into()))?;

        let w_2d = weight
            .view()
            .into_dimensionality::<scirs2_core::ndarray::Ix2>()
            .map_err(|_| NeuralError::InferenceError("weight to Ix2".into()))?;

        let projected = input_2d_view.dot(&w_2d);

        projected
            .into_shape_with_order((batch, seq, heads, head_dim))
            .map_err(|e| NeuralError::InferenceError(format!("reshape projected: {e}")))
    }

    /// Concatenate past cache with new KV along the seq dimension
    fn concat_cache(
        &self,
        past: &Array<F, IxDyn>,
        new: &Array4<F>,
        batch: usize,
        total_len: usize,
        heads: usize,
        head_dim: usize,
    ) -> Result<Array4<F>> {
        let past_len = past.shape()[1];
        let new_len = new.shape()[1];

        if past_len + new_len != total_len {
            return Err(NeuralError::InferenceError(
                "cache concat length mismatch".into(),
            ));
        }

        let mut result = Array4::<F>::zeros((batch, total_len, heads, head_dim));

        // Copy past
        for b in 0..batch {
            for t in 0..past_len {
                for h in 0..heads {
                    for d in 0..head_dim {
                        result[[b, t, h, d]] = past[[b, t, h, d]];
                    }
                }
            }
            // Copy new
            for t in 0..new_len {
                for h in 0..heads {
                    for d in 0..head_dim {
                        result[[b, past_len + t, h, d]] = new[[b, t, h, d]];
                    }
                }
            }
        }

        Ok(result)
    }

    /// Get configuration
    pub fn config(&self) -> &MultiQueryAttentionConfig {
        &self.config
    }

    /// Get model dimension
    pub fn d_model(&self) -> usize {
        self.d_model
    }
}

impl<F> Layer<F> for MultiQueryAttention<F>
where
    F: Float + Debug + ScalarOperand + Send + Sync + 'static + NumAssign,
{
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }

    fn as_any_mut(&mut self) -> &mut dyn std::any::Any {
        self
    }

    fn forward(&self, input: &Array<F, IxDyn>) -> Result<Array<F, IxDyn>> {
        let (output, _cache) = self.forward_with_cache(input, None)?;
        Ok(output)
    }

    fn backward(
        &self,
        _input: &Array<F, IxDyn>,
        _grad_output: &Array<F, IxDyn>,
    ) -> Result<Array<F, IxDyn>> {
        Err(NeuralError::NotImplementedError(
            "MQA backward not yet implemented".into(),
        ))
    }

    fn update(&mut self, _learning_rate: F) -> Result<()> {
        Ok(())
    }

    fn layer_type(&self) -> &str {
        "MultiQueryAttention"
    }

    fn parameter_count(&self) -> usize {
        let q_dim = self.config.num_heads * self.config.head_dim;
        let kv_dim = self.config.num_kv_heads * self.config.head_dim;
        let dm = self.d_model;
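        // W_q: dm x q_dim, W_k and W_v: dm x kv_dim each, W_o: q_dim x dm (no biases).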
        dm * q_dim + 2 * dm * kv_dim + q_dim * dm
    }
}

// ===========================================================================
// Tests
// ===========================================================================

#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::Array3;

    #[test]
    fn test_mqa_creation() {
        let mut rng = scirs2_core::random::rng();
        let config = MultiQueryAttentionConfig::new(4, 16); // 4 Q heads, 1 KV head
        let mqa = MultiQueryAttention::<f64>::new(64, config, &mut rng);
        assert!(mqa.is_ok());
    }

    #[test]
    fn test_mqa_forward_shape() {
        let mut rng = scirs2_core::random::rng();
        let config = MultiQueryAttentionConfig::new(4, 16);
        let mqa = MultiQueryAttention::<f64>::new(64, config, &mut rng).expect("creation failed");

        let input = Array3::<f64>::from_elem((2, 8, 64), 0.1).into_dyn();
        let output = mqa.forward(&input).expect("forward failed");
        assert_eq!(output.shape(), &[2, 8, 64]);
    }

    #[test]
    fn test_mqa_kv_cache() {
        let mut rng = scirs2_core::random::rng();
        let config = MultiQueryAttentionConfig::new(4, 16).with_causal(true);
        let mqa = MultiQueryAttention::<f64>::new(64, config, &mut rng).expect("creation failed");

        // First step: process prefix
        let prefix = Array3::<f64>::from_elem((1, 4, 64), 0.1).into_dyn();
        let (out1, cache1) = mqa
            .forward_with_cache(&prefix, None)
            .expect("step 1 failed");
        assert_eq!(out1.shape(), &[1, 4, 64]);
        assert_eq!(cache1.keys.shape()[1], 4);
        assert_eq!(cache1.values.shape()[1], 4);

        // Second step: process one new token with cache
        let new_token = Array3::<f64>::from_elem((1, 1, 64), 0.2).into_dyn();
        let (out2, cache2) = mqa
            .forward_with_cache(&new_token, Some(&cache1))
            .expect("step 2 failed");
        assert_eq!(out2.shape(), &[1, 1, 64]);
        assert_eq!(cache2.keys.shape()[1], 5); // 4 + 1
        assert_eq!(cache2.values.shape()[1], 5);
    }

    #[test]
    fn test_mqa_with_num_heads_equals_mha() {
        // When num_kv_heads == num_heads, MQA should behave like MHA
        let mut rng = scirs2_core::random::rng();
        let config = MultiQueryAttentionConfig::new(4, 16).with_num_kv_heads(4); // num_kv_heads == num_heads, i.e. MHA
        let mqa = MultiQueryAttention::<f64>::new(64, config, &mut rng).expect("creation failed");

        let input = Array3::<f64>::from_elem((1, 6, 64), 0.15).into_dyn();
        let output = mqa.forward(&input).expect("forward failed");
        assert_eq!(output.shape(), &[1, 6, 64]);

        // Output should be finite
        for val in output.iter() {
            assert!(val.is_finite(), "MHA-mode output has non-finite value");
        }
    }

    #[test]
    fn test_mqa_causal_masking() {
        let mut rng = scirs2_core::random::rng();
        let config = MultiQueryAttentionConfig::new(2, 8).with_causal(true);
        let mqa = MultiQueryAttention::<f64>::new(16, config, &mut rng).expect("creation failed");

        let mut input = Array3::<f64>::zeros((1, 6, 16));
        for t in 0..6 {
            for d in 0..16 {
                input[[0, t, d]] = (t as f64 + 1.0) * 0.1 + d as f64 * 0.01;
            }
        }

        let output = mqa.forward(&input.into_dyn()).expect("forward failed");
        assert_eq!(output.shape(), &[1, 6, 16]);

        for val in output.iter() {
            assert!(val.is_finite(), "causal output non-finite");
        }
    }

    #[test]
    fn test_mqa_invalid_config() {
        let mut rng = scirs2_core::random::rng();

        // num_heads not divisible by num_kv_heads
        let config = MultiQueryAttentionConfig::new(5, 16).with_num_kv_heads(3);
        let result = MultiQueryAttention::<f64>::new(80, config, &mut rng);
        assert!(result.is_err());
    }

    #[test]
    fn test_mqa_parameter_count() {
        let mut rng = scirs2_core::random::rng();
        let config = MultiQueryAttentionConfig::new(4, 16); // 1 KV head
        let mqa = MultiQueryAttention::<f64>::new(64, config, &mut rng).expect("creation failed");

        // Q: 64 * 64 = 4096
        // K: 64 * 16 = 1024
        // V: 64 * 16 = 1024
        // O: 64 * 64 = 4096
        assert_eq!(mqa.parameter_count(), 4096 + 1024 + 1024 + 4096);
    }
}