tabicl-model 2.1.1

//! Transformer encoder blocks + stacks — port of:
//!
//!   - `tabicl._model.layers.MultiheadAttentionBlock` (MAB)
//!   - `tabicl._model.encoders.Encoder` (a stack of MABs with optional RoPE)
//!
//! The MAB is a standard transformer-encoder layer: pre/post LayerNorm,
//! multi-head self/cross attention, GELU feed-forward, residual additions.
//! The Encoder stack composes `num_blocks` MABs and a single shared RoPE.
//!
//! Status: reference (host fp32) implementation of MAB, the encoder stack,
//! and train-size-aware attention (where keys/values come from the first
//! `train_size` positions). KV caching and the `cached_kv` fast path will
//! follow.

use ndarray::{Array3, ArrayView2, ArrayView3, Axis};
use thiserror::Error;

use crate::attention::{AttentionConfig, AttentionParams};
use crate::layers::{layer_norm_last, linear3d};
use crate::rope::{RopeConfig, RopeTables};
use crate::state_dict::{StateDict, StateDictError};
use crate::tabicl::Activation;

/// Errors reported while constructing an encoder or set-transformer stack.
#[derive(Debug, Error)]
pub enum EncoderError {
    /// `d_model` is not an exact multiple of `nhead`, so the model width
    /// cannot be split evenly across attention heads. Returned by
    /// `EncoderStack::new` and `SetTransformerStack::new`.
    #[error("d_model ({d_model}) must be divisible by nhead ({nhead})")]
    BadDims { d_model: usize, nhead: usize },
}

/// Per-block static configuration.
///
/// One `MabConfig` is shared by every block of a stack; it fixes tensor
/// shapes and selects the forward-pass variant, while the weights live
/// in `MabParams`.
#[derive(Debug, Clone)]
pub struct MabConfig {
    /// Model (embedding) width; forwarded to attention as `embed_dim`.
    pub d_model: usize,
    /// Number of attention heads; forwarded to attention as `num_heads`.
    pub nhead: usize,
    /// Hidden width of the feed-forward sub-block (`linear1` output rows).
    pub dim_feedforward: usize,
    /// Dropout rate, passed through unchanged to the attention config.
    pub dropout: f32,
    /// Non-linearity applied between `linear1` and `linear2`.
    pub activation: Activation,
    /// `true` selects the pre-norm branch of the MAB forward, `false`
    /// the post-norm branch.
    pub norm_first: bool,
    /// When `true`, zero-initialised LayerNorms are built without a beta
    /// (bias) vector.
    pub bias_free_ln: bool,
}

impl MabConfig {
    /// Width of a single attention head: `d_model / nhead` (integer
    /// division).
    ///
    /// # Panics
    ///
    /// Panics if `nhead` is zero.
    pub fn head_dim(&self) -> usize {
        let Self { d_model, nhead, .. } = self;
        *d_model / *nhead
    }

    /// Derive the attention configuration for this block. Projection
    /// biases are always enabled.
    pub fn attention_cfg(&self) -> AttentionConfig {
        let (embed_dim, num_heads, dropout) = (self.d_model, self.nhead, self.dropout);
        AttentionConfig {
            embed_dim,
            num_heads,
            dropout,
            bias: true,
        }
    }
}

/// Optional SSMax block inside an MAB. When present, the attention
/// forward multiplies queries by the SSMax-computed scale.
#[derive(Debug, Clone)]
pub struct MabSsmax {
    /// Static SSMax description, as produced by `SsmaxSpec::create`.
    pub spec: crate::ssmax::SsmaxSpec,
    /// SSMax weights matching `spec`.
    pub params: crate::ssmax::SsmaxParams,
}

/// Per-block parameters. Layout matches the `nn.TransformerEncoderLayer`
/// state-dict naming used by PyTorch (and therefore the Python TabICL
/// checkpoints):
///
///   - `norm1.{weight, bias}` (`bias` absent if `bias_free_ln`)
///   - `norm2.{weight, bias}`
///   - `attn.in_proj_{weight, bias}`, `attn.out_proj.{weight, bias}`
///   - `linear1.{weight, bias}`, `linear2.{weight, bias}`
#[derive(Debug, Clone)]
pub struct MabParams {
    /// LayerNorm 1 scale, length `d_model`.
    pub norm1_gamma: Vec<f32>,
    /// LayerNorm 1 shift; `None` for bias-free LayerNorm.
    pub norm1_beta: Option<Vec<f32>>,
    /// LayerNorm 2 scale, length `d_model`.
    pub norm2_gamma: Vec<f32>,
    /// LayerNorm 2 shift; `None` for bias-free LayerNorm.
    pub norm2_beta: Option<Vec<f32>>,
    /// Multi-head attention projections (packed in-projection + out-projection).
    pub attn: AttentionParams,
    /// Optional SSMax: present when ssmax != "none".
    pub ssmax: Option<MabSsmax>,
    /// Linear1: `(dim_feedforward, d_model)`.
    pub linear1: ndarray::Array2<f32>,
    /// Linear1 bias, length `dim_feedforward`.
    pub linear1_bias: Option<Vec<f32>>,
    /// Linear2: `(d_model, dim_feedforward)`.
    pub linear2: ndarray::Array2<f32>,
    /// Linear2 bias, length `d_model`.
    pub linear2_bias: Option<Vec<f32>>,
}

impl MabParams {
    /// Variant of [`Self::load_from`] that also loads an optional SSMax
    /// block. The SSMax loader is handed the `{prefix}.attn` prefix.
    ///
    /// Pass the ssmax kind configured for *this* MAB (ISAB's second MAB
    /// always has `SsmaxKind::None`). `self.ssmax` is only written when
    /// `SsmaxSpec::create` yields a spec; an error returned by `create`
    /// is discarded and treated like "no SSMax", and a previously set
    /// `self.ssmax` is left untouched in that case.
    ///
    /// # Errors
    ///
    /// Propagates any failure from [`Self::load_from`] or from loading
    /// the SSMax parameters.
    pub fn load_from_with_ssmax(
        &mut self,
        sd: &StateDict,
        prefix: &str,
        cfg: &MabConfig,
        ssmax_kind: crate::ssmax::SsmaxKind,
    ) -> Result<(), StateDictError> {
        self.load_from(sd, prefix, cfg)?;

        let maybe_spec = crate::ssmax::SsmaxSpec::create(ssmax_kind, cfg.nhead, cfg.d_model);
        if let Ok(Some(spec)) = maybe_spec {
            let attn_prefix = format!("{prefix}.attn");
            let mut params = crate::ssmax::SsmaxParams::zeros(&spec);
            params.load_from(sd, &attn_prefix, &spec)?;
            self.ssmax = Some(MabSsmax { spec, params });
        }
        Ok(())
    }

    /// Load from a state dict. PyTorch `nn.TransformerEncoderLayer` keys
    /// under `{prefix}`:
    ///
    ///   - `norm1.weight`, `norm1.bias` (bias absent if `bias_free_ln`)
    ///   - `norm2.weight`, `norm2.bias`
    ///   - `attn.in_proj_weight`, `attn.in_proj_bias`,
    ///     `attn.out_proj.weight`, `attn.out_proj.bias`
    ///   - `linear1.weight`, `linear1.bias`
    ///   - `linear2.weight`, `linear2.bias`
    ///
    /// Weight tensors are mandatory. LayerNorm and linear biases are
    /// optional: when a bias key is missing from `sd`, the corresponding
    /// field keeps whatever value it already had.
    ///
    /// # Errors
    ///
    /// Returns the first lookup failure. Fields assigned before the
    /// failing key keep their newly loaded values.
    pub fn load_from(
        &mut self,
        sd: &StateDict,
        prefix: &str,
        cfg: &MabConfig,
    ) -> Result<(), StateDictError> {
        let (d, ff) = (cfg.d_model, cfg.dim_feedforward);
        let key = |leaf: &str| format!("{prefix}.{leaf}");
        // Fetch a vector only if its key exists; `Ok(None)` means "absent".
        let optional = |name: String, len: usize| -> Result<Option<Vec<f32>>, StateDictError> {
            if sd.tensors.contains_key(&name) {
                Ok(Some(sd.take_vec(&name, len)?))
            } else {
                Ok(None)
            }
        };

        // LayerNorm 1.
        self.norm1_gamma = sd.take_vec(&key("norm1.weight"), d)?;
        if let Some(beta) = optional(key("norm1.bias"), d)? {
            self.norm1_beta = Some(beta);
        }

        // LayerNorm 2.
        self.norm2_gamma = sd.take_vec(&key("norm2.weight"), d)?;
        if let Some(beta) = optional(key("norm2.bias"), d)? {
            self.norm2_beta = Some(beta);
        }

        // Attention projections.
        self.attn.load_from(sd, &key("attn"), d)?;

        // Feed-forward: linear1 is (ff, d), linear2 is (d, ff).
        self.linear1 = sd.take_array2(&key("linear1.weight"), ff, d)?;
        if let Some(bias) = optional(key("linear1.bias"), ff)? {
            self.linear1_bias = Some(bias);
        }
        self.linear2 = sd.take_array2(&key("linear2.weight"), d, ff)?;
        if let Some(bias) = optional(key("linear2.bias"), d)? {
            self.linear2_bias = Some(bias);
        }
        Ok(())
    }

    /// Build placeholder parameters: every weight matrix and bias is
    /// zero, LayerNorm gammas are ones, and LayerNorm betas are zero (or
    /// absent when `cfg.bias_free_ln` is set). No SSMax block is attached.
    ///
    /// Zero `attn.out_proj` and `linear2` follow the Python init scheme
    /// (`nn.init.zeros_` in `MultiheadAttentionBlock.init_weights`); the
    /// Q/K/V projections and `linear1` are zero only as placeholders —
    /// concrete checkpoints will overwrite them.
    pub fn zeros(cfg: &MabConfig) -> Self {
        let (d, ff) = (cfg.d_model, cfg.dim_feedforward);
        let ln_beta = || (!cfg.bias_free_ln).then(|| vec![0.0_f32; d]);
        let zero_matrix =
            |rows: usize, cols: usize| ndarray::Array2::<f32>::zeros((rows, cols));

        let attn = AttentionParams {
            in_proj_weight: zero_matrix(3 * d, d),
            in_proj_bias: Some(vec![0.0; 3 * d]),
            out_proj_weight: zero_matrix(d, d),
            out_proj_bias: Some(vec![0.0; d]),
        };

        Self {
            norm1_gamma: vec![1.0; d],
            norm1_beta: ln_beta(),
            norm2_gamma: vec![1.0; d],
            norm2_beta: ln_beta(),
            attn,
            ssmax: None,
            linear1: zero_matrix(ff, d),
            linear1_bias: Some(vec![0.0; ff]),
            linear2: zero_matrix(d, ff),
            linear2_bias: Some(vec![0.0; d]),
        }
    }
}

/// Apply the selected non-linearity to every element of `x`, in place.
fn apply_activation(x: &mut Array3<f32>, kind: Activation) {
    match kind {
        // Negative inputs become 0.0; everything else (including NaN and
        // signed zero) is written back unchanged.
        Activation::Relu => x.mapv_inplace(|v| if v < 0.0 { 0.0 } else { v }),
        // GELU(x) = 0.5 * x * (1 + erf(x / sqrt(2))). PyTorch's
        // `nn.GELU()` default is `approximate='none'` (exact erf), so the
        // erf form is used for cross-stack numerical parity.
        Activation::Gelu => {
            x.mapv_inplace(|v| 0.5 * v * (1.0 + erf(v / std::f32::consts::SQRT_2)))
        }
        // SiLU(x) = x * sigmoid(x) = x / (1 + e^-x).
        Activation::Silu => x.mapv_inplace(|v| v / (1.0 + (-v).exp())),
    }
}

/// Position-wise feed-forward sub-block, matching
/// `MultiheadAttentionBlock._ff_block`:
/// `linear2(activation(linear1(x)))`.
///
/// The activation is chosen by `cfg.activation`; both linear layers use
/// their bias when one is present in `params`.
pub fn ff_block(x: ArrayView3<f32>, cfg: &MabConfig, params: &MabParams) -> Array3<f32> {
    let (w1, b1) = (params.linear1.view(), params.linear1_bias.as_deref());
    let (w2, b2) = (params.linear2.view(), params.linear2_bias.as_deref());

    let mut hidden = linear3d(x, w1, b1);
    apply_activation(&mut hidden, cfg.activation);
    linear3d(hidden.view(), w2, b2)
}

/// LayerNorm over the last axis of `x` with the fixed epsilon used
/// throughout this module.
fn ln_3d(x: ArrayView3<f32>, gamma: &[f32], beta: Option<&[f32]>) -> Array3<f32> {
    // 1e-5 — presumably chosen to match PyTorch's `nn.LayerNorm` default.
    let eps = 1e-5;
    layer_norm_last(x, gamma, beta, eps)
}

/// Multi-head attention block forward (self-attention).
///
/// Mirrors `MultiheadAttentionBlock.forward(q=x)` with `k=v=None`,
/// `train_size=None`, `cached_kv=None`. Both pre-norm and post-norm
/// branches are implemented.
pub fn mab_forward(
    x: ArrayView3<f32>,
    cfg: &MabConfig,
    params: &MabParams,
    rope: Option<&RopeTables>,
) -> Array3<f32> {
    mab_forward_qkv(x, x, x, cfg, params, rope)
}

/// MAB forward with optional `train_size` slicing.
///
/// With `Some(k)`, the query stays the full `x` while keys and values are
/// restricted to the first `k` positions along axis 1 (`k` is clamped to
/// the sequence length). With `None`, this is plain self-attention.
///
/// This matches Python's `MultiheadAttentionBlock.forward(q,
/// train_size=k)` semantics, including SSMax's `log(n_src) = log(k)`
/// scaling — crucial for parity.
pub fn mab_forward_train_size(
    x: ArrayView3<f32>,
    cfg: &MabConfig,
    params: &MabParams,
    rope: Option<&RopeTables>,
    train_size: Option<usize>,
) -> Array3<f32> {
    // Keys and values always share one view: either the leading rows of
    // `x` or `x` itself.
    let kv = match train_size {
        Some(limit) => {
            let rows = limit.min(x.shape()[1]);
            x.slice(ndarray::s![.., ..rows, ..])
        }
        None => x.view(),
    };
    mab_forward_qkv_masked(x, kv, kv, cfg, params, rope, None)
}

/// Cross-attention variant: q, k, v supplied separately. Used by ISAB
/// where the inducing points serve as queries against the input set
/// (and vice versa in the second stage).
pub fn mab_forward_qkv(
    q: ArrayView3<f32>,
    k: ArrayView3<f32>,
    v: ArrayView3<f32>,
    cfg: &MabConfig,
    params: &MabParams,
    rope: Option<&RopeTables>,
) -> Array3<f32> {
    mab_forward_qkv_masked(q, k, v, cfg, params, rope, None)
}

/// Cross-attention MAB with optional additive `(tgt_len, src_len)` mask.
///
/// The branch is selected by `cfg.norm_first`:
///
///   - **pre-norm**: `h = q + attn(LN1(q), LN1(k), LN1(v))`, then
///     `h + ff(LN2(h))`;
///   - **post-norm**: `z = LN1(q + attn(q, k, v))`, then `LN2(z + ff(z))`.
///
/// `rope`, `attn_mask` and the block's optional SSMax parameters are
/// forwarded unchanged to the attention kernel. The result has the same
/// shape as `q`.
pub fn mab_forward_qkv_masked(
    q: ArrayView3<f32>,
    k: ArrayView3<f32>,
    v: ArrayView3<f32>,
    cfg: &MabConfig,
    params: &MabParams,
    rope: Option<&RopeTables>,
    attn_mask: Option<ArrayView2<f32>>,
) -> Array3<f32> {
    /// True only when both views address exactly the same elements.
    ///
    /// The base pointer alone is not enough (a leading slice of `q`
    /// shares `as_ptr()` with `q`), and pointer + shape is not enough
    /// either (a step-sliced or permuted view can share both while
    /// covering different elements), so strides are compared as well.
    fn is_same_view(a: &ArrayView3<f32>, b: &ArrayView3<f32>) -> bool {
        std::ptr::eq(a.as_ptr(), b.as_ptr())
            && a.shape() == b.shape()
            && a.strides() == b.strides()
    }

    let attn_cfg = cfg.attention_cfg();

    if cfg.norm_first {
        let q_normed = ln_3d(q, &params.norm1_gamma, params.norm1_beta.as_deref());

        // LN1 is shared by q, k and v. When k/v are literally the same
        // view as an already-normalised input, reuse that result instead
        // of recomputing (or cloning) it.
        let k_is_q = is_same_view(&k, &q);
        let v_is_k = is_same_view(&v, &k);
        let v_is_q = is_same_view(&v, &q);

        let k_owned = if k_is_q {
            None
        } else {
            Some(ln_3d(k, &params.norm1_gamma, params.norm1_beta.as_deref()))
        };
        let v_owned = if v_is_k || v_is_q {
            None
        } else {
            Some(ln_3d(v, &params.norm1_gamma, params.norm1_beta.as_deref()))
        };

        let k_normed = match &k_owned {
            Some(arr) => arr.view(),
            None => q_normed.view(),
        };
        let v_normed = match &v_owned {
            Some(arr) => arr.view(),
            None if v_is_k => k_normed,
            None => q_normed.view(),
        };

        let attn_out = crate::attention::multi_head_attention_forward_with_ssmax(
            q_normed.view(),
            k_normed,
            v_normed,
            &params.attn,
            &attn_cfg,
            rope,
            attn_mask,
            params.ssmax.as_ref(),
        );

        // Residual connections use the un-normalised query.
        let mut after_attn = q.to_owned() + &attn_out;
        let ff_in = ln_3d(
            after_attn.view(),
            &params.norm2_gamma,
            params.norm2_beta.as_deref(),
        );
        let ff_out = ff_block(ff_in.view(), cfg, params);
        after_attn += &ff_out;
        after_attn
    } else {
        let attn_out = crate::attention::multi_head_attention_forward_with_ssmax(
            q,
            k,
            v,
            &params.attn,
            &attn_cfg,
            rope,
            attn_mask,
            params.ssmax.as_ref(),
        );
        let after_attn_pre = q.to_owned() + &attn_out;
        let z = ln_3d(
            after_attn_pre.view(),
            &params.norm1_gamma,
            params.norm1_beta.as_deref(),
        );
        let ff_out = ff_block(z.view(), cfg, params);
        let combined = &z + &ff_out;
        ln_3d(
            combined.view(),
            &params.norm2_gamma,
            params.norm2_beta.as_deref(),
        )
    }
}

/// Encoder stack — port of `tabicl._model.encoders.Encoder`.
///
/// A sequence of MABs that all share one `MabConfig` and, optionally,
/// one RoPE configuration.
#[derive(Debug, Clone)]
pub struct EncoderStack {
    /// Configuration shared by every block in `blocks`.
    pub mab_cfg: MabConfig,
    /// Per-block parameters, applied in order during the forward pass.
    pub blocks: Vec<MabParams>,
    /// Optional RoPE configuration. When `Some`, the forward pass builds
    /// RoPE tables for the input's sequence length and hands the same
    /// tables to every block.
    pub rope: Option<RopeConfig>,
}

impl EncoderStack {
    /// Load all `num_blocks` blocks of this stack. PyTorch `Encoder` uses
    /// `{prefix}.blocks.0.…`, `{prefix}.blocks.1.…` etc., one MAB per index.
    pub fn load_from(&mut self, sd: &StateDict, prefix: &str) -> Result<(), StateDictError> {
        self.load_from_with_ssmax(sd, prefix, crate::ssmax::SsmaxKind::None)
    }

    /// As [`Self::load_from`] but with the ssmax kind applied to every block.
    pub fn load_from_with_ssmax(
        &mut self,
        sd: &StateDict,
        prefix: &str,
        ssmax_kind: crate::ssmax::SsmaxKind,
    ) -> Result<(), StateDictError> {
        let cfg = self.mab_cfg.clone();
        for (i, block) in self.blocks.iter_mut().enumerate() {
            block.load_from_with_ssmax(sd, &format!("{prefix}.blocks.{i}"), &cfg, ssmax_kind)?;
        }
        Ok(())
    }

    pub fn new(
        num_blocks: usize,
        mab_cfg: MabConfig,
        rope: Option<RopeConfig>,
    ) -> Result<Self, EncoderError> {
        if !mab_cfg.d_model.is_multiple_of(mab_cfg.nhead) {
            return Err(EncoderError::BadDims {
                d_model: mab_cfg.d_model,
                nhead: mab_cfg.nhead,
            });
        }
        let blocks = (0..num_blocks)
            .map(|_| MabParams::zeros(&mab_cfg))
            .collect();
        Ok(Self {
            mab_cfg,
            blocks,
            rope,
        })
    }

    /// Forward through the stack. RoPE tables for a given sequence length
    /// are computed on the fly so the stack is `seq_len`-agnostic.
    pub fn forward(&self, x: ArrayView3<f32>) -> Array3<f32> {
        self.forward_train_size(x, None)
    }

    /// Forward with optional `train_size` masking applied uniformly to
    /// every block in the stack — port of Python `Encoder(src,
    /// train_size=...)`.
    pub fn forward_train_size(&self, x: ArrayView3<f32>, train_size: Option<usize>) -> Array3<f32> {
        let seq_len = x.shape()[x.ndim() - 2];
        let rope_tables = self.rope.map(|cfg| RopeTables::new(cfg, seq_len));
        let mut cur = x.to_owned();
        for block in &self.blocks {
            cur = mab_forward_train_size(
                cur.view(),
                &self.mab_cfg,
                block,
                rope_tables.as_ref(),
                train_size,
            );
        }
        cur
    }
}

/// Error function via the Abramowitz–Stegun 7.1.26 rational
/// approximation: `erf(x) ≈ 1 - (a1 t + a2 t² + … + a5 t⁵) · exp(-x²)`
/// with `t = 1 / (1 + p·|x|)`, extended to negative `x` by odd symmetry.
///
/// The published bound on the approximation's absolute error is about
/// 1.5e-7 (before f32 rounding). NaN inputs yield NaN.
fn erf(x: f32) -> f32 {
    // Coefficients a5..a1, ordered for Horner evaluation.
    const HORNER_COEFFS: [f32; 5] = [
        1.061_405_4,
        -1.453_152_1,
        1.421_413_8,
        -0.284_496_72,
        0.254_829_6,
    ];
    const P: f32 = 0.3275911;

    let magnitude = x.abs();
    let t = 1.0 / (1.0 + P * magnitude);
    // ((((a5·t + a4)·t + a3)·t + a2)·t + a1)·t
    let poly = HORNER_COEFFS
        .iter()
        .fold(0.0_f32, |acc, &coeff| (acc + coeff) * t);
    let tail = poly * (-magnitude * magnitude).exp();
    x.signum() * (1.0 - tail)
}

// ────────────────────────────────────────────────────────────────────
// Induced Self-Attention Block (ISAB) + Set Transformer stack.
// Used by `ColEmbedding`. Port of `InducedSelfAttentionBlock` and
// `SetTransformer` from `tabicl._model.layers` / `_model.encoders`.
// ────────────────────────────────────────────────────────────────────

/// Per-block ISAB parameters: two MABs + a learnable inducing-point
/// matrix `(num_inds, d_model)`. Matches the Python `__init__` exactly.
#[derive(Debug, Clone)]
pub struct IsabParams {
    /// Stage-1 MAB: inducing points (queries) attend to the input set.
    pub mab1: MabParams,
    /// Stage-2 MAB: the input set (queries) attends to the stage-1 output.
    pub mab2: MabParams,
    /// Inducing points, `(num_inds, d_model)`; replicated across the
    /// batch axis in the forward pass.
    pub ind_vectors: ndarray::Array2<f32>,
}

impl IsabParams {
    /// Zero-initialised ISAB: both MABs come from `MabParams::zeros` and
    /// the inducing-point matrix is all zeros with shape
    /// `(num_inds, cfg.d_model)`.
    pub fn zeros(cfg: &MabConfig, num_inds: usize) -> Self {
        let ind_shape = (num_inds, cfg.d_model);
        Self {
            mab1: MabParams::zeros(cfg),
            mab2: MabParams::zeros(cfg),
            ind_vectors: ndarray::Array2::<f32>::zeros(ind_shape),
        }
    }

    /// Load from a state dict without SSMax on either MAB. Python
    /// `InducedSelfAttentionBlock` keys:
    ///
    ///   - `{prefix}.multihead_attn1.…` → MAB1
    ///   - `{prefix}.multihead_attn2.…` → MAB2
    ///   - `{prefix}.ind_vectors` → `(num_inds, d_model)`
    ///
    /// Use [`Self::load_from_with_ssmax`] when MAB1 carries SSMax.
    pub fn load_from(
        &mut self,
        sd: &StateDict,
        prefix: &str,
        cfg: &MabConfig,
    ) -> Result<(), StateDictError> {
        let no_ssmax = crate::ssmax::SsmaxKind::None;
        self.load_from_with_ssmax(sd, prefix, cfg, no_ssmax)
    }

    /// As [`Self::load_from`] but with the ssmax kind for MAB1. MAB2
    /// always has `SsmaxKind::None` per the Python ISAB constructor.
    ///
    /// The expected number of inducing points is taken from the current
    /// `ind_vectors` row count, so the struct must already be sized
    /// (e.g. via [`Self::zeros`]) before loading.
    pub fn load_from_with_ssmax(
        &mut self,
        sd: &StateDict,
        prefix: &str,
        cfg: &MabConfig,
        mab1_ssmax: crate::ssmax::SsmaxKind,
    ) -> Result<(), StateDictError> {
        let mab1_prefix = format!("{prefix}.multihead_attn1");
        let mab2_prefix = format!("{prefix}.multihead_attn2");
        let ind_key = format!("{prefix}.ind_vectors");

        self.mab1
            .load_from_with_ssmax(sd, &mab1_prefix, cfg, mab1_ssmax)?;
        self.mab2.load_from(sd, &mab2_prefix, cfg)?;

        let (num_inds, _) = self.ind_vectors.dim();
        self.ind_vectors = sd.take_array2(&ind_key, num_inds, cfg.d_model)?;
        Ok(())
    }
}

/// ISAB forward over the full input set (no `train_size` restriction) —
/// port of `InducedSelfAttentionBlock.induced_attention`.
pub fn isab_forward(src: ArrayView3<f32>, cfg: &MabConfig, params: &IsabParams) -> Array3<f32> {
    let whole_set: Option<usize> = None;
    isab_forward_train_size(src, cfg, params, whole_set)
}

/// ISAB forward with optional `train_size` masking on stage 1 (the
/// inducing-points → input keys/values attention). When `Some(k)`, the
/// inducing points only attend to the first `k` rows of `src` — mirrors
/// `InducedSelfAttentionBlock.induced_attention(src, train_size=k)`.
/// Two-stage induced attention:
///
///   1. `hidden = MAB1(ind, src[:, :train_size], src[:, :train_size])`
///   2. `out    = MAB2(src, hidden, hidden)` over the full sequence.
///
/// `train_size` is clamped to the sequence length; `None` lets the
/// inducing points see every row. Neither stage uses RoPE.
///
/// # Panics
///
/// Panics if the inducing-point width differs from the last axis of
/// `src`.
pub fn isab_forward_train_size(
    src: ArrayView3<f32>,
    cfg: &MabConfig,
    params: &IsabParams,
    train_size: Option<usize>,
) -> Array3<f32> {
    let (batch, _, width) = src.dim();
    let (num_inds, ind_width) = params.ind_vectors.dim();
    assert_eq!(ind_width, width);

    // Replicate the `(num_inds, width)` inducing points for every batch item.
    let ind = Array3::<f32>::from_shape_fn((batch, num_inds, width), |(_, mi, di)| {
        params.ind_vectors[(mi, di)]
    });

    // Stage 1: inducing points attend to (a prefix of) the input set.
    let stage1_kv = match train_size {
        Some(limit) => {
            let rows = limit.min(src.shape()[1]);
            src.slice(ndarray::s![.., ..rows, ..])
        }
        None => src.view(),
    };
    let hidden = mab_forward_qkv(ind.view(), stage1_kv, stage1_kv, cfg, &params.mab1, None);

    // Stage 2: every input row attends to the induced summary.
    let summary = hidden.view();
    mab_forward_qkv(src, summary, summary, cfg, &params.mab2, None)
}

/// Set Transformer stack — `num_blocks` ISABs over `(B, n, d_model)`
/// inputs.
#[derive(Debug, Clone)]
pub struct SetTransformerStack {
    /// Configuration shared by both MABs of every ISAB block.
    pub mab_cfg: MabConfig,
    /// Number of inducing points each ISAB was created with.
    pub num_inds: usize,
    /// Per-block ISAB parameters, applied in order.
    pub blocks: Vec<IsabParams>,
}

impl SetTransformerStack {
    /// Load from a state dict. `{prefix}.blocks.0.…`, `{prefix}.blocks.1.…`, etc.
    pub fn load_from(&mut self, sd: &StateDict, prefix: &str) -> Result<(), StateDictError> {
        self.load_from_with_ssmax(sd, prefix, crate::ssmax::SsmaxKind::None)
    }

    /// As [`Self::load_from`] but applies the SSMax kind to MAB1 of each ISAB.
    pub fn load_from_with_ssmax(
        &mut self,
        sd: &StateDict,
        prefix: &str,
        mab1_ssmax: crate::ssmax::SsmaxKind,
    ) -> Result<(), StateDictError> {
        let cfg = self.mab_cfg.clone();
        for (i, block) in self.blocks.iter_mut().enumerate() {
            block.load_from_with_ssmax(sd, &format!("{prefix}.blocks.{i}"), &cfg, mab1_ssmax)?;
        }
        Ok(())
    }

    pub fn new(
        num_blocks: usize,
        mab_cfg: MabConfig,
        num_inds: usize,
    ) -> Result<Self, EncoderError> {
        if !mab_cfg.d_model.is_multiple_of(mab_cfg.nhead) {
            return Err(EncoderError::BadDims {
                d_model: mab_cfg.d_model,
                nhead: mab_cfg.nhead,
            });
        }
        let blocks = (0..num_blocks)
            .map(|_| IsabParams::zeros(&mab_cfg, num_inds))
            .collect();
        Ok(Self {
            mab_cfg,
            num_inds,
            blocks,
        })
    }

    pub fn forward(&self, src: ArrayView3<f32>) -> Array3<f32> {
        self.forward_train_size(src, None)
    }

    /// SetTransformer forward with optional train_size masking, applied
    /// to every ISAB block. Mirrors Python's
    /// `SetTransformer.forward(src, train_size=...)`.
    pub fn forward_train_size(
        &self,
        src: ArrayView3<f32>,
        train_size: Option<usize>,
    ) -> Array3<f32> {
        let mut cur = src.to_owned();
        for block in &self.blocks {
            cur = isab_forward_train_size(cur.view(), &self.mab_cfg, block, train_size);
        }
        cur
    }
}

// No-op that is never called. Its only visible effect is to mention the
// imported `Axis` type in a signature — presumably to keep that import
// from being reported as unused (hence the `dead_code` allowance).
#[allow(dead_code)]
fn _silence(_a: Axis) {}

#[cfg(test)]
mod tests {
    use super::*;
    use ndarray::Array;

    // Currently unused fixture, kept for future tests — hence `dead_code`.
    #[allow(dead_code)]
    fn ident_attn_params(d: usize) -> AttentionParams {
        // Despite the name, every projection (Q, K, V and out_proj) is
        // zero, so the attention output is 0 and a residual block built
        // on these parameters reduces to the identity.
        let in_proj_weight = ndarray::Array2::<f32>::zeros((3 * d, d));
        let out_proj_weight = ndarray::Array2::<f32>::zeros((d, d));
        AttentionParams {
            in_proj_weight,
            in_proj_bias: Some(vec![0.0; 3 * d]),
            out_proj_weight,
            out_proj_bias: Some(vec![0.0; d]),
        }
    }

    #[test]
    fn pre_norm_with_zero_attn_and_ff_is_identity() {
        let d = 4;
        let cfg = MabConfig {
            d_model: d,
            nhead: 2,
            dim_feedforward: 8,
            dropout: 0.0,
            activation: Activation::Gelu,
            norm_first: true,
            bias_free_ln: false,
        };
        // Zero `attn.out_proj` and `linear2` (the Python init) make both
        // residual updates vanish, so a pre-norm block must return its
        // input.
        let params = MabParams::zeros(&cfg);
        let x = Array::from_shape_fn((2, 3, d), |(b, t, k)| {
            (b as f32) * 0.1 + (t as f32) * 0.01 + (k as f32) * 0.001
        });

        let y = mab_forward(x.view(), &cfg, &params, None);

        assert_eq!(y.shape(), x.shape());
        x.iter()
            .zip(y.iter())
            .for_each(|(a, b)| assert!((a - b).abs() < 1e-5, "{} vs {}", a, b));
    }

    #[test]
    fn encoder_stack_with_zero_blocks_is_identity() {
        // Three zero-initialised pre-norm blocks chained together must
        // still behave as the identity.
        let d = 4;
        let cfg = MabConfig {
            d_model: d,
            nhead: 2,
            dim_feedforward: 8,
            dropout: 0.0,
            activation: Activation::Gelu,
            norm_first: true,
            bias_free_ln: false,
        };
        let stack = EncoderStack::new(3, cfg, None).unwrap();
        let x = Array::from_shape_fn((1, 5, d), |(_, t, k)| (t as f32) * 0.1 + (k as f32) * 0.01);

        let y = stack.forward(x.view());

        x.iter()
            .zip(y.iter())
            .for_each(|(a, b)| assert!((a - b).abs() < 1e-4, "{} vs {}", a, b));
    }

    #[test]
    fn stack_rejects_bad_dims() {
        // 5 is not a multiple of 2, so construction must fail.
        let cfg = MabConfig {
            d_model: 5,
            nhead: 2,
            dim_feedforward: 8,
            dropout: 0.0,
            activation: Activation::Gelu,
            norm_first: true,
            bias_free_ln: false,
        };
        let outcome = EncoderStack::new(1, cfg, None);
        assert!(matches!(outcome.unwrap_err(), EncoderError::BadDims { .. }));
    }

    #[test]
    fn stack_with_rope_runs() {
        let d = 8;
        let cfg = MabConfig {
            d_model: d,
            nhead: 2,
            dim_feedforward: 16,
            dropout: 0.0,
            activation: Activation::Gelu,
            norm_first: true,
            bias_free_ln: false,
        };
        // head_dim = d_model / nhead = 4.
        let rope = RopeConfig {
            head_dim: 4,
            base: 100_000.0,
            interleaved: false,
        };
        let stack = EncoderStack::new(2, cfg, Some(rope)).unwrap();
        let x = Array::from_shape_fn((1, 6, d), |(_, t, k)| (t * d + k) as f32 * 0.01);

        let y = stack.forward(x.view());

        assert_eq!(y.shape(), x.shape());
        // Zero-init: pre-norm with zero attn/ff stays ~identity even with
        // RoPE enabled, because RoPE only acts on Q/K and the zero
        // in_proj weights project both to zero.
        x.iter()
            .zip(y.iter())
            .for_each(|(a, b)| assert!((a - b).abs() < 1e-4, "{} vs {}", a, b));
    }

    #[test]
    fn gelu_matches_pytorch_at_known_points() {
        // (expected GELU value, tolerance) for inputs 0, 1 and -1:
        // GELU(0) = 0; GELU(1) ≈ 0.8413; GELU(-1) ≈ -0.1587.
        let expected: [(f32, f32); 3] = [(0.0, 1e-5), (0.8413, 1e-3), (-0.1587, 1e-3)];
        let mut x = Array::from_shape_vec((1, 1, 3), vec![0.0_f32, 1.0, -1.0]).unwrap();

        apply_activation(&mut x, Activation::Gelu);

        for (i, (want, tol)) in expected.iter().enumerate() {
            assert!((x[(0, 0, i)] - want).abs() < *tol);
        }
    }

    #[test]
    fn relu_clamps_negatives() {
        // Positive and zero inputs pass through; the negative one becomes 0.
        let mut x = Array::from_shape_vec((1, 1, 3), vec![1.0_f32, 0.0, -1.0]).unwrap();
        apply_activation(&mut x, Activation::Relu);
        let got: Vec<f32> = (0..3).map(|i| x[(0, 0, i)]).collect();
        assert_eq!(got, vec![1.0, 0.0, 0.0]);
    }

    #[test]
    fn isab_zero_init_preserves_input() {
        // Same reasoning as the MAB zero-init identity test: with
        // out_proj = 0 and linear2 = 0, each MAB hands its query back
        // through the residual path, so
        //   hidden = MAB1(ind, src, src)       ≈ ind
        //   out    = MAB2(src, hidden, hidden) ≈ src
        let d = 4;
        let cfg = MabConfig {
            d_model: d,
            nhead: 2,
            dim_feedforward: 8,
            dropout: 0.0,
            activation: Activation::Gelu,
            norm_first: true,
            bias_free_ln: false,
        };
        let stack = SetTransformerStack::new(2, cfg, 3).unwrap();
        let src = Array::from_shape_fn((2, 5, d), |(b, n, k)| (b * 100 + n * 10 + k) as f32 * 0.01);

        let out = stack.forward(src.view());

        src.iter().zip(out.iter()).for_each(|(a, b)| {
            assert!((a - b).abs() < 1e-4, "ISAB zero-init drift: {} vs {}", a, b)
        });
    }

    #[test]
    fn isab_output_shape_matches_input() {
        let d = 8;
        let (num_inds, seq_len) = (4, 10);
        let cfg = MabConfig {
            d_model: d,
            nhead: 2,
            dim_feedforward: 16,
            dropout: 0.0,
            activation: Activation::Gelu,
            norm_first: true,
            bias_free_ln: false,
        };
        // num_inds differs from seq_len, which exercises the bottleneck.
        let stack = SetTransformerStack::new(1, cfg, num_inds).unwrap();
        let src = Array::from_shape_fn((1, seq_len, d), |(_, n, k)| (n * d + k) as f32 * 0.001);

        let out = stack.forward(src.view());

        assert_eq!(out.shape(), src.shape());
    }

    #[test]
    fn train_size_mask_blocks_test_to_test_attention() {
        // Use non-trivial weights so the attention output really depends
        // on K/V, then compare the train-size-restricted forward against
        // the unrestricted one: a test row that may only attend to the
        // training rows must produce a different output.
        let d = 4;
        let cfg = MabConfig {
            d_model: d,
            nhead: 1,
            dim_feedforward: 8,
            dropout: 0.0,
            activation: Activation::Gelu,
            norm_first: true,
            bias_free_ln: false,
        };
        let mut params = MabParams::zeros(&cfg);
        // Identity Q/K/V projections + identity out_proj so attention is
        // just softmax(Q K^T / sqrt(D)) V.
        for i in 0..d {
            for block in 0..3 {
                params.attn.in_proj_weight[(block * d + i, i)] = 1.0;
            }
            params.attn.out_proj_weight[(i, i)] = 1.0;
        }
        // Rows must NOT be proportional to each other: LayerNorm maps
        // proportional rows to the same vector, which would wash out the
        // effect of restricting the keys.
        let raw = [
            [1.0_f32, 2.0, 3.0, 4.0],
            [4.0, 3.0, 2.0, 1.0],
            [1.0, 5.0, 1.0, 5.0],
            [-2.0, 1.0, 4.0, 0.5],
        ];
        let x = Array::from_shape_fn((1, 4, d), |(_, t, e)| raw[t][e]);

        // Unrestricted: every position attends to every position.
        let y_unmasked = mab_forward(x.view(), &cfg, &params, None);
        // train_size = 2: keys/values are rows 0 and 1 only, for every query.
        let y_masked = mab_forward_train_size(x.view(), &cfg, &params, None, Some(2));

        // Training rows (0, 1) change as well, since they no longer see
        // rows 2 and 3. The assertion below only checks a test row, whose
        // attention distribution shrinks from 4 positions to 2.
        let differs =
            (0..d).any(|e| (y_unmasked[(0, 3, e)] - y_masked[(0, 3, e)]).abs() > 1e-3);
        assert!(differs, "train-size masking did not affect test row output");
    }

    #[test]
    fn mab_cross_attention_uses_separate_kv() {
        // Q/K projections are zero and the V projection and out_proj are
        // identity, so the attention output depends only on V; q enters
        // the result solely through the residual connection.
        let d = 4;
        let cfg = MabConfig {
            d_model: d,
            nhead: 2,
            dim_feedforward: 8,
            dropout: 0.0,
            activation: Activation::Gelu,
            norm_first: true,
            bias_free_ln: false,
        };
        let mut params = MabParams::zeros(&cfg);
        for i in 0..d {
            // V projection = identity (third block of in_proj).
            params.attn.in_proj_weight[(2 * d + i, i)] = 1.0;
            // out_proj = identity.
            params.attn.out_proj_weight[(i, i)] = 1.0;
        }
        let q = Array::from_shape_vec((1, 1, d), vec![5.0_f32, 6.0, 7.0, 8.0]).unwrap();
        let k = Array::from_shape_vec((1, 1, d), vec![0.0_f32; 4]).unwrap();
        let v = Array::from_shape_vec((1, 1, d), vec![1.0_f32, 2.0, 3.0, 4.0]).unwrap();

        let out = mab_forward_qkv(q.view(), k.view(), v.view(), &cfg, &params, None);

        // Pre-norm: attn = mha(LN1(q), LN1(k), LN1(v)). With a single
        // source token the softmax weight is 1, so attn = LN1(v); zero
        // linear2 removes the FF contribution. Expected: out = q + LN1(v).
        // Only the weaker property is asserted: out differs from q, i.e.
        // V contributed something on top of the residual.
        let differs = (0..d).any(|i| (out[(0, 0, i)] - q[(0, 0, i)]).abs() > 1e-3);
        assert!(differs, "expected V to contribute to output via attn");
    }
}