tabicl-model 2.1.1

//! Column-wise distribution-aware embedding — port of
//! `tabicl._model.embedding.ColEmbedding`.
//!
//! The Python module is 890 LOC and supports six orthogonal features:
//!
//!   1. **`in_linear`** — a `SkippableLinear(1 → embed_dim)` (or
//!      `SkippableLinear(feature_group_size → embed_dim)` when grouping is
//!      enabled) projects each column into the embedding space.
//!   2. **`tf_col`** — a SetTransformer ([`crate::encoders::SetTransformerStack`])
//!      mixes information across columns.
//!   3. **`feature_group`** — column-grouping mode (`same`/`valid`/None).
//!   4. **`target_aware`** — concatenates a target-derived embedding so
//!      classification/regression labels can influence the column features.
//!   5. **`mixed_radix_ensemble`** — for many-class problems, encodes the
//!      label in multiple radix digits, embedded separately, and averaged.
//!   6. **`affine`** — an `out_w * x + out_b` head with per-side LayerNorms.
//!
//! This port currently lands:
//!
//!   - the full constructor config + parameter container,
//!   - the forward path: feature grouping → `in_linear` per-column →
//!     optional target-aware `y` embedding (single pass, or the mixed-radix
//!     ensemble for many-class problems) → `tf_col` → optional `affine`
//!     head. The `affine` head is only accepted together with
//!     `ColFeatureGroup::None`.
//!
//! The `reserve_cls_tokens` mechanism — which leaves the first `num_cls`
//! positions free for the downstream `RowInteraction` to overwrite with CLS
//! tokens — is honored: we emit `(B, T, G + reserve_cls_tokens, E)`, where
//! `G` is the number of column groups (`G = H` unless `valid` grouping is
//! used). The reserved slots are fed in as `-100.0` sentinel columns, not
//! zeros.

use ndarray::{Array2, Array3, Array4, ArrayView2, ArrayView3};
use serde::{Deserialize, Serialize};
use thiserror::Error;

use crate::encoders::{MabConfig, SetTransformerStack};
use crate::layers::{SkippableLinear, layer_norm_last};
use crate::state_dict::{StateDict, StateDictError};
use crate::tabicl::{Activation, ColFeatureGroup};

/// Errors reported by the column-embedding forward path.
///
/// NOTE(review): neither variant is constructed in the part of the file
/// visible here — the forward path handles grouping and target-aware inputs
/// itself and reports misuse by panicking. Confirm whether other modules
/// still return these before relying on them.
#[derive(Debug, Error)]
pub enum EmbeddingError {
    /// Feature grouping was requested but is unavailable.
    #[error("feature grouping is not yet ported; pass ColFeatureGroup::None")]
    GroupingNotImplemented,
    /// Target-aware embedding was requested but is unavailable.
    #[error("target-aware embedding is not yet ported")]
    TargetAwareNotImplemented,
}

/// Static config for the column embedding. Mirrors `ColEmbedding.__init__`
/// exactly (modulo the deferred Rust-side features).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ColEmbeddingConfig {
    /// Embedding width `E`; also forwarded to the set transformer as `d_model`.
    pub embed_dim: usize,
    /// First argument of `SetTransformerStack::new` (presumably the number
    /// of stacked blocks — confirm against that constructor).
    pub num_blocks: usize,
    /// Attention head count forwarded to `MabConfig`; construction panics
    /// with "d_model must be divisible by nhead" when the stack rejects it.
    pub nhead: usize,
    /// Forwarded unchanged to `MabConfig::dim_feedforward`.
    pub dim_feedforward: usize,
    /// Third argument of `SetTransformerStack::new` (presumably the number
    /// of inducing points — confirm).
    pub num_inds: usize,
    /// Forwarded unchanged to `MabConfig::dropout`.
    pub dropout: f32,
    /// Forwarded unchanged to `MabConfig::activation`.
    pub activation: Activation,
    /// Forwarded to `MabConfig`; additionally gates whether the affine head
    /// owns LayerNorm parameters (`ln_w_*` / `ln_b_*`).
    pub norm_first: bool,
    /// Forwarded to `MabConfig`; when `true`, zero-initialised affine
    /// LayerNorms get no β vector.
    pub bias_free_ln: bool,
    /// Enables the `out_w` / `out_b` affine head after the set transformer.
    pub affine: bool,
    /// Column-grouping mode applied before `in_linear`.
    pub feature_group: ColFeatureGroup,
    /// Width of each column group; only used when `feature_group` is not
    /// `ColFeatureGroup::None` (see [`ColEmbeddingConfig::in_features`]).
    pub feature_group_size: usize,
    /// Enables the target-aware path, which adds a y-embedding to the
    /// training rows before the set transformer.
    pub target_aware: bool,
    /// `0` selects the linear (regression) y-encoder; any positive value
    /// selects the one-hot (classification) y-encoder with that many columns.
    pub max_classes: usize,
    /// Number of leading sentinel columns reserved for CLS tokens.
    pub reserve_cls_tokens: usize,
    /// SSMax variant name, parsed with `SsmaxKind::parse` at load time; a
    /// value that does not parse falls back to `SsmaxKind::None`.
    pub ssmax: String,
    /// Allows the mixed-radix ensemble when the labels contain more classes
    /// than `max_classes`.
    pub mixed_radix_ensemble: bool,
    /// NOTE(review): not read anywhere in the visible part of this file;
    /// presumably kept for parity with the Python constructor — confirm.
    pub recompute: bool,
}

impl ColEmbeddingConfig {
    /// Width of the per-column input handed to `in_linear`: `1` when feature
    /// grouping is disabled, `feature_group_size` for every grouping mode.
    pub fn in_features(&self) -> usize {
        if matches!(self.feature_group, ColFeatureGroup::None) {
            1
        } else {
            self.feature_group_size
        }
    }
}

/// Parameter container for [`ColEmbedding`].
///
/// Which optional fields are populated depends on the config flags; the
/// conditions noted per field are the ones `ColEmbeddingParams::zeros` uses.
#[derive(Debug, Clone)]
pub struct ColEmbeddingParams {
    /// `SkippableLinear(in_features → embed_dim)` — per-column projection.
    pub in_linear: SkippableLinear,
    /// Affine head: `out_w` shape `(embed_dim, embed_dim)`. Present iff `affine`.
    pub out_w: Option<SkippableLinear>,
    /// Affine head: `out_b`. Same shape, same gating.
    pub out_b: Option<SkippableLinear>,
    /// LayerNorm γ/β for the W side of the affine head. γ is created only
    /// when `affine && norm_first`; β additionally requires `!bias_free_ln`.
    /// When γ is `None` the affine head skips this LayerNorm entirely.
    pub ln_w_gamma: Option<Vec<f32>>,
    pub ln_w_beta: Option<Vec<f32>>,
    /// LayerNorm γ/β for the B side of the affine head. Same gating as the
    /// W side.
    pub ln_b_gamma: Option<Vec<f32>>,
    pub ln_b_beta: Option<Vec<f32>>,
    /// y-encoder weight (Linear) for target-aware regression. Weight shape
    /// `(embed_dim, 1)` plus optional bias; present iff
    /// `target_aware && max_classes == 0`.
    pub y_linear: Option<(Array2<f32>, Option<Vec<f32>>)>,
    /// y-encoder OneHotAndLinear for target-aware classification — stored
    /// as `(weight, bias)` to avoid pulling in the full struct here.
    /// Weight shape `(embed_dim, max_classes)`; present iff
    /// `target_aware && max_classes > 0`.
    pub y_one_hot: Option<(Array2<f32>, Option<Vec<f32>>)>,
}

impl ColEmbeddingParams {
    /// Build a zero-initialised parameter set whose shapes follow `cfg`.
    ///
    /// - `in_linear` is always present with fan-in `cfg.in_features()`.
    /// - `out_w` / `out_b` exist only when `cfg.affine`.
    /// - The affine LayerNorm γ vectors (all ones) exist only when
    ///   `cfg.affine && cfg.norm_first`; the β vectors (all zeros)
    ///   additionally require `!cfg.bias_free_ln`.
    /// - Exactly one y-encoder exists when `cfg.target_aware`: the linear
    ///   one for `max_classes == 0`, the one-hot one otherwise.
    pub fn zeros(cfg: &ColEmbeddingConfig) -> Self {
        let dim = cfg.embed_dim;

        // Every SkippableLinear here maps `fan_in → dim`, starts with a zero
        // bias and is built with -100.0 as its third constructor argument.
        let skippable = |fan_in: usize| {
            SkippableLinear::new(
                Array2::<f32>::zeros((dim, fan_in)),
                Some(vec![0.0_f32; dim]),
                -100.0,
            )
        };
        // Plain `(weight, bias)` pair used by both y-encoder flavours.
        let y_encoder = |fan_in: usize| {
            (
                Array2::<f32>::zeros((dim, fan_in)),
                Some(vec![0.0_f32; dim]),
            )
        };

        let with_layer_norm = cfg.affine && cfg.norm_first;
        let ln_gamma = with_layer_norm.then(|| vec![1.0_f32; dim]);
        let ln_beta = (with_layer_norm && !cfg.bias_free_ln).then(|| vec![0.0_f32; dim]);

        let regression_target = cfg.target_aware && cfg.max_classes == 0;
        let classification_target = cfg.target_aware && cfg.max_classes > 0;

        Self {
            in_linear: skippable(cfg.in_features()),
            out_w: cfg.affine.then(|| skippable(dim)),
            out_b: cfg.affine.then(|| skippable(dim)),
            ln_w_gamma: ln_gamma.clone(),
            ln_w_beta: ln_beta.clone(),
            ln_b_gamma: ln_gamma,
            ln_b_beta: ln_beta,
            y_linear: regression_target.then(|| y_encoder(1)),
            y_one_hot: classification_target.then(|| y_encoder(cfg.max_classes)),
        }
    }
}

/// Column embedding module.
///
/// Bundles the static config, the loadable parameters and the set
/// transformer that processes each column as a sequence over rows.
#[derive(Debug, Clone)]
pub struct ColEmbedding {
    /// Hyper-parameters this module was built from.
    pub config: ColEmbeddingConfig,
    /// Input projection, affine-head and y-encoder parameters.
    pub params: ColEmbeddingParams,
    /// Set transformer; loaded from the `tf_col` sub-tree of the state dict.
    pub set_transformer: SetTransformerStack,
}

impl ColEmbedding {
    /// Populate the parameters from a Python state dict rooted at `{prefix}`.
    /// Keys read:
    ///
    ///   - `{prefix}.in_linear.weight`/`.bias`            — SkippableLinear
    ///   - `{prefix}.tf_col.blocks.i.multihead_attn{1,2}.…` — Set Transformer
    ///   - `{prefix}.out_w.weight`/`.bias`                — affine head (if affine)
    ///   - `{prefix}.out_b.weight`/`.bias`                — affine head bias side
    ///   - `{prefix}.ln_w.weight`/`.bias`                 — affine LayerNorm W
    ///   - `{prefix}.ln_b.weight`/`.bias`                 — affine LayerNorm B
    ///   - `{prefix}.y_encoder.weight`/`.bias`            — target-aware y-encoder
    ///
    /// The LayerNorm `.bias` keys and the y-encoder `.bias` key are optional:
    /// when a key is absent the current value of that bias is left untouched.
    /// An `ssmax` config string that does not parse falls back to
    /// `SsmaxKind::None`.
    ///
    /// # Panics
    /// Panics if the optional parameter slots required by the config
    /// (`out_w`, `out_b`, the matching y-encoder) are `None`.
    pub fn load_from(&mut self, sd: &StateDict, prefix: &str) -> Result<(), StateDictError> {
        let dim = self.config.embed_dim;

        // Fetch a length-`dim` vector only when the checkpoint carries `key`.
        let optional_vec = |key: String| -> Result<Option<Vec<f32>>, StateDictError> {
            if sd.tensors.contains_key(&key) {
                Ok(Some(sd.take_vec(&key, dim)?))
            } else {
                Ok(None)
            }
        };

        self.params
            .in_linear
            .load_from(sd, &format!("{prefix}.in_linear"))?;

        // Pass the configured SSMax kind through to MAB1 of every ISAB.
        let ssmax_kind = crate::ssmax::SsmaxKind::parse(&self.config.ssmax)
            .unwrap_or(crate::ssmax::SsmaxKind::None);
        self.set_transformer
            .load_from_with_ssmax(sd, &format!("{prefix}.tf_col"), ssmax_kind)?;

        if self.config.affine {
            let head_w = self
                .params
                .out_w
                .as_mut()
                .expect("affine=true should have out_w");
            head_w.load_from(sd, &format!("{prefix}.out_w"))?;

            let head_b = self
                .params
                .out_b
                .as_mut()
                .expect("affine=true should have out_b");
            head_b.load_from(sd, &format!("{prefix}.out_b"))?;

            // The affine LayerNorms only exist in the pre-norm configuration.
            if self.config.norm_first {
                self.params.ln_w_gamma =
                    Some(sd.take_vec(&format!("{prefix}.ln_w.weight"), dim)?);
                if let Some(beta) = optional_vec(format!("{prefix}.ln_w.bias"))? {
                    self.params.ln_w_beta = Some(beta);
                }
                self.params.ln_b_gamma =
                    Some(sd.take_vec(&format!("{prefix}.ln_b.weight"), dim)?);
                if let Some(beta) = optional_vec(format!("{prefix}.ln_b.bias"))? {
                    self.params.ln_b_beta = Some(beta);
                }
            }
        }

        if self.config.target_aware {
            // Both encoder flavours share the same keys; only the destination
            // slot and the weight's column count differ.
            let (slot, fan_in) = if self.config.max_classes > 0 {
                (
                    self.params.y_one_hot.as_mut().unwrap(),
                    self.config.max_classes,
                )
            } else {
                (self.params.y_linear.as_mut().unwrap(), 1)
            };
            let (weight, bias) = slot;
            *weight = sd.take_array2(&format!("{prefix}.y_encoder.weight"), dim, fan_in)?;
            if let Some(loaded) = optional_vec(format!("{prefix}.y_encoder.bias"))? {
                *bias = Some(loaded);
            }
        }
        Ok(())
    }

    /// Build a module for `config` with zero-initialised parameters and a
    /// freshly constructed set-transformer stack.
    ///
    /// # Panics
    /// Panics with "ColEmbedding: d_model must be divisible by nhead" when
    /// `SetTransformerStack::new` rejects the configuration.
    pub fn new(config: ColEmbeddingConfig) -> Self {
        let params = ColEmbeddingParams::zeros(&config);
        let set_transformer = SetTransformerStack::new(
            config.num_blocks,
            MabConfig {
                d_model: config.embed_dim,
                nhead: config.nhead,
                dim_feedforward: config.dim_feedforward,
                dropout: config.dropout,
                activation: config.activation,
                norm_first: config.norm_first,
                bias_free_ln: config.bias_free_ln,
            },
            config.num_inds,
        )
        .expect("ColEmbedding: d_model must be divisible by nhead");

        Self {
            config,
            params,
            set_transformer,
        }
    }

    /// Column embedding forward without targets — port of
    /// `ColEmbedding._train_forward_without_feature_group` +
    /// `_compute_embeddings`.
    ///
    /// Thin wrapper around `forward_with_targets` that passes no class
    /// targets, no regression targets and `train_size = 0`.
    ///
    /// Input `x` shape: `(B, T, H)`. Output: `(B, T, G + C, E)` where
    /// `C = reserve_cls_tokens` and `G` is the number of column groups
    /// produced by `feature_grouping` (`G = H` unless `Valid` grouping is
    /// configured).
    ///
    /// The set transformer attends **across rows for each column**: each
    /// of the `G + C` columns becomes its own sequence of length `T`. This
    /// makes the embedding distribution-aware — every column's encoding
    /// depends on the distribution of its values across the batch.
    ///
    /// For `reserve_cls_tokens > 0`, the leading `C` column slots are
    /// filled with `-100.0` before `in_linear` runs. The intent is that
    /// `SkippableLinear` passes them through as sentinels for
    /// `RowInteraction` to overwrite with its CLS tokens (NOTE(review): the
    /// skip behaviour lives in `SkippableLinear`, not in this file — confirm).
    ///
    /// # Panics
    /// - When `target_aware == true`: this entry point supplies no targets,
    ///   so the target-aware step panics. Use `forward_with_targets` instead.
    /// - When `affine == true` together with feature grouping (the affine
    ///   head asserts a per-column input width of 1).
    pub fn forward(&self, x: ArrayView3<f32>) -> Result<Array4<f32>, EmbeddingError> {
        self.forward_with_targets(x, None, None, 0)
    }

    /// Full forward pass, optionally conditioned on training targets.
    ///
    /// Pipeline: feature grouping → prepend `reserve_cls_tokens` sentinel
    /// columns (all `-100.0`) → `in_linear` → (target-aware only) add the
    /// y-embedding to the first `train_size` rows → set transformer →
    /// optional affine head → reshape to `(B, T, G + C, E)`.
    ///
    /// `y_train_class` / `y_train_reg`: pick one when `target_aware == true`.
    /// `train_size`: how many of the T rows are training (must match
    /// `y_train.shape[1]`). When `target_aware == false`, the targets and
    /// `train_size` are ignored.
    ///
    /// When `target_aware`, `max_classes > 0` and `mixed_radix_ensemble` are
    /// all set and the class labels contain more than `max_classes` classes
    /// (judged by `max(label) + 1`), the set transformer is run once per
    /// mixed-radix digit and the outputs are averaged.
    ///
    /// # Panics
    /// - `target_aware == true` without targets matching the configured
    ///   y-encoder.
    /// - `affine == true` combined with feature grouping.
    /// - Presumably on out-of-range indexing when `train_size` exceeds `T` or
    ///   the target array is smaller than `(B, train_size)`.
    pub fn forward_with_targets(
        &self,
        x: ArrayView3<f32>,
        y_train_class: Option<ArrayView2<usize>>,
        y_train_reg: Option<ArrayView2<f32>>,
        train_size: usize,
    ) -> Result<Array4<f32>, EmbeddingError> {
        let batch = x.shape()[0];
        let rows = x.shape()[1];
        let embed = self.config.embed_dim;
        let num_cls = self.config.reserve_cls_tokens;
        let in_dim = self.config.in_features();

        // 1. Feature grouping: (B, T, H) → (B, T, G, in_dim).
        let grouped = self.feature_grouping(x);
        let groups = grouped.shape()[2];
        let cols = groups + num_cls;

        // 2. Prepend the sentinel columns and flatten (B, G+C) into the batch
        //    axis in one go: row `bi * cols + ci` of `features` is column
        //    `ci` of sample `bi`, laid out over the T rows.
        let features =
            Array3::<f32>::from_shape_fn((batch * cols, rows, in_dim), |(flat, ti, ki)| {
                let (bi, ci) = (flat / cols, flat % cols);
                if ci < num_cls {
                    -100.0
                } else {
                    grouped[(bi, ti, ci - num_cls, ki)]
                }
            });

        // 3. Per-value projection → (B*(G+C), T, E).
        let projected = self.params.in_linear.forward(features.view());

        // 4. Decide whether the many-class (mixed-radix) path applies; when
        //    it does, remember the class count for the base computation.
        let ensemble_enabled = self.config.target_aware
            && self.config.max_classes > 0
            && self.config.mixed_radix_ensemble;
        let ensemble_classes: Option<usize> = if ensemble_enabled {
            y_train_class.as_ref().and_then(|labels| {
                let num_classes = labels.iter().copied().max().unwrap_or(0) + 1;
                (num_classes > self.config.max_classes).then(|| num_classes)
            })
        } else {
            None
        };

        // 5. Set transformer, either once or averaged over radix digits.
        let mixed = if let Some(num_classes) = ensemble_classes {
            let labels = y_train_class.expect("checked above");
            let bases = self.compute_mixed_radix_bases(num_classes);
            let mut accum = Array3::<f32>::zeros(projected.dim());
            for digit_idx in 0..bases.len() {
                // Replace every label by its `digit_idx`-th radix digit.
                let digit_labels =
                    labels.mapv(|y| Self::extract_mixed_radix_digit(y, digit_idx, &bases));
                let mut conditioned = projected.clone();
                self.apply_target_aware(
                    &mut conditioned,
                    Some(digit_labels.view()),
                    None,
                    train_size,
                    batch,
                    cols,
                )?;
                let digit_out = self
                    .set_transformer
                    .forward_train_size(conditioned.view(), Some(train_size));
                for (idx, v) in digit_out.indexed_iter() {
                    accum[idx] += *v;
                }
            }
            let digit_count = bases.len() as f32;
            accum.mapv_inplace(|v| v / digit_count);
            accum
        } else {
            let mut src = projected;
            if self.config.target_aware {
                // Add y_emb to the first `train_size` rows, then run with
                // train-size masking (matches Python's embed_with_test=False
                // semantics).
                self.apply_target_aware(
                    &mut src,
                    y_train_class,
                    y_train_reg,
                    train_size,
                    batch,
                    cols,
                )?;
                self.set_transformer
                    .forward_train_size(src.view(), Some(train_size))
            } else {
                self.set_transformer.forward(src.view())
            }
        };

        // 6. Optional affine head.
        let processed = if self.config.affine {
            self.apply_affine_with_features(mixed.view(), features.view())
        } else {
            mixed
        };

        // 7. Un-flatten: (B*(G+C), T, E) → (B, T, G+C, E).
        Ok(Array4::<f32>::from_shape_fn(
            (batch, rows, cols, embed),
            |(bi, ti, ci, ei)| processed[(bi * cols + ci, ti, ei)],
        ))
    }

    /// Port of `ColEmbedding.feature_grouping`. Returns `(B, T, G, size)`:
    ///
    ///   - `feature_group == None`  → `G = H`, `size = 1` (unsqueeze).
    ///   - `feature_group == Same`  → `G = H`, `size = feature_group_size`;
    ///     slot `k` of group `h` holds `X[(h + 2^k) % H]`, i.e. the group is
    ///     `[X[(h+1)%H], X[(h+2)%H], X[(h+4)%H], ..., X[(h+2^(size-1))%H]]`.
    ///   - `feature_group == Valid` → `G = ceil(H/size)`; the columns are
    ///     right-padded with zeros to a multiple of `size`, then chunked.
    fn feature_grouping(&self, x: ArrayView3<f32>) -> Array4<f32> {
        let (batch, rows, width) = x.dim();
        let size = self.config.feature_group_size;
        match self.config.feature_group {
            ColFeatureGroup::None => {
                Array4::<f32>::from_shape_fn((batch, rows, width, 1), |(bi, ti, hi, _)| {
                    x[(bi, ti, hi)]
                })
            }
            ColFeatureGroup::Same => {
                // Circular shifts by powers of two.
                Array4::<f32>::from_shape_fn((batch, rows, width, size), |(bi, ti, hi, k)| {
                    let shift = 1usize << k; // 2^k
                    x[(bi, ti, (hi + shift) % width)]
                })
            }
            ColFeatureGroup::Valid => {
                let pad = (size - width % size) % size;
                let groups = (width + pad) / size;
                Array4::<f32>::from_shape_fn((batch, rows, groups, size), |(bi, ti, gi, k)| {
                    let col = gi * size + k;
                    // Columns past the original width are the zero padding.
                    if col < width {
                        x[(bi, ti, col)]
                    } else {
                        0.0
                    }
                })
            }
        }
    }

    /// Compute balanced mixed-radix bases. Port of
    /// `ColEmbedding._compute_mixed_radix_bases`.
    ///
    /// Returns `[num_classes]` when the labels already fit into
    /// `max_classes`. Otherwise it uses `D = ceil(ln n / ln max_classes)`
    /// digits, each starting at `k = min(ceil(n^(1/D)), max_classes)`, and
    /// then bumps bases one at a time (never beyond `max_classes`) until
    /// their product covers `num_classes`.
    ///
    /// # Panics
    /// Panics when more than `max_classes` classes are requested while
    /// `max_classes < 2`. No such radix can represent them, and
    /// `ln(1) == 0` would otherwise turn the digit count into infinity
    /// (surfacing as an opaque capacity-overflow panic).
    fn compute_mixed_radix_bases(&self, num_classes: usize) -> Vec<usize> {
        let max_cls = self.config.max_classes;
        if num_classes <= max_cls {
            return vec![num_classes];
        }
        assert!(
            max_cls >= 2,
            "mixed-radix encoding needs max_classes >= 2 to cover {num_classes} classes, \
             got max_classes={max_cls}"
        );
        // Number of digits, computed in floating point like the Python
        // original so both implementations agree on the digit count.
        let d_levels = ((num_classes as f64).ln() / (max_cls as f64).ln()).ceil() as usize;
        // Balanced starting base, clamped so every digit fits the encoder.
        let k = ((num_classes as f64).powf(1.0 / d_levels as f64).ceil() as usize).min(max_cls);
        let mut bases = vec![k; d_levels];
        let mut product: usize = bases.iter().product();
        let mut idx = 0;
        // Safety net for rounding: grow individual bases (at most once each)
        // until the representable range covers every class.
        while product < num_classes && idx < d_levels {
            if bases[idx] < max_cls {
                product = product / bases[idx] * (bases[idx] + 1);
                bases[idx] += 1;
            }
            idx += 1;
        }
        bases
    }

    /// Return digit `digit_idx` of `y` written in the mixed-radix system
    /// `bases` (most significant digit first). Port of
    /// `_extract_mixed_radix_digit`.
    ///
    /// The digit's place value is the product of all bases to its right.
    fn extract_mixed_radix_digit(y: usize, digit_idx: usize, bases: &[usize]) -> usize {
        let place_value = bases.iter().skip(digit_idx + 1).product::<usize>();
        let radix = bases[digit_idx];
        (y / place_value) % radix
    }

    /// Add the target embedding to the training rows of every column.
    ///
    /// `src` is indexed as `(b * hc, T, E)`, with the `hc` columns of each
    /// sample stored contiguously along the first axis. A `(b, train_size,
    /// E)` embedding is computed from the targets — a one-hot lookup for
    /// class labels, a `Linear(1 → E)` for regression values — and added to
    /// rows `0..train_size` of all `hc` columns of the matching sample.
    ///
    /// The many-class regime is handled by the caller through the mixed-radix
    /// ensemble; this function always performs a single pass. It currently
    /// always returns `Ok(())`.
    ///
    /// # Panics
    /// Panics when the supplied targets do not match the available y-encoder
    /// (e.g. no targets at all, both kinds at once, or the wrong kind).
    fn apply_target_aware(
        &self,
        src: &mut Array3<f32>,
        y_train_class: Option<ArrayView2<usize>>,
        y_train_reg: Option<ArrayView2<f32>>,
        train_size: usize,
        b: usize,
        hc: usize,
    ) -> Result<(), EmbeddingError> {
        let embed = self.config.embed_dim;

        // Target embedding of shape (B, train_size, E).
        let y_emb: Array3<f32> = match (
            &self.params.y_one_hot,
            &self.params.y_linear,
            y_train_class,
            y_train_reg,
        ) {
            (Some((weight, bias)), _, Some(labels), None) => {
                // OneHotAndLinear: weight is (E, num_classes); a one-hot
                // input selects a single weight column.
                let mut emb = Array3::<f32>::zeros((b, train_size, embed));
                for bi in 0..b {
                    for ti in 0..train_size {
                        let class = labels[(bi, ti)];
                        debug_assert!(class < weight.shape()[1]);
                        for ei in 0..embed {
                            let picked = weight[(ei, class)];
                            emb[(bi, ti, ei)] = match bias {
                                Some(offsets) => picked + offsets[ei],
                                None => picked,
                            };
                        }
                    }
                }
                emb
            }
            (_, Some((weight, bias)), None, Some(values)) => {
                // Linear(1 → E) applied to the scalar target.
                let mut emb = Array3::<f32>::zeros((b, train_size, embed));
                for bi in 0..b {
                    for ti in 0..train_size {
                        let target = values[(bi, ti)];
                        for ei in 0..embed {
                            let scaled = weight[(ei, 0)] * target;
                            emb[(bi, ti, ei)] = match bias {
                                Some(offsets) => scaled + offsets[ei],
                                None => scaled,
                            };
                        }
                    }
                }
                emb
            }
            _ => panic!(
                "target_aware=true requires y_train and a matching y-encoder (one-hot for classification, linear for regression)"
            ),
        };

        // Broadcast over the columns: flattened row `flat` belongs to sample
        // `flat / hc`.
        for flat in 0..b * hc {
            let bi = flat / hc;
            for ti in 0..train_size {
                for ei in 0..embed {
                    src[(flat, ti, ei)] += y_emb[(bi, ti, ei)];
                }
            }
        }
        Ok(())
    }

    /// Affine head: `features * LN_w(out_w(x)) + LN_b(out_b(x))`.
    ///
    /// `x` is the set-transformer output `(N, T, E)` and `features` the raw
    /// per-column input `(N, T, in_dim)`. Each LayerNorm is applied only when
    /// its γ vector is present; otherwise the raw linear output is used.
    ///
    /// # Panics
    /// Panics when `in_dim != 1` (affine head combined with feature
    /// grouping), or when `out_w` / `out_b` are missing.
    fn apply_affine_with_features(
        &self,
        x: ArrayView3<f32>,
        features: ArrayView3<f32>,
    ) -> Array3<f32> {
        // With `in_dim == 1` the per-value feature broadcasts cleanly over
        // the E axis. With grouping (`in_dim > 1`) the Python operation is
        // ill-shaped and untested, so fail loudly rather than broadcast
        // something wrong.
        assert_eq!(
            features.shape()[2],
            1,
            "affine head + feature grouping is not a supported combination \
             in Python tabicl; pass affine=false or feature_group=None"
        );

        let params = &self.params;
        let head_w = params.out_w.as_ref().expect("affine=true requires out_w");
        let w_raw = head_w.forward(x);
        let head_b = params.out_b.as_ref().expect("affine=true requires out_b");
        let b_raw = head_b.forward(x);

        let weights = if let Some(gamma) = &params.ln_w_gamma {
            layer_norm_last(w_raw.view(), gamma, params.ln_w_beta.as_deref(), 1e-5)
        } else {
            w_raw
        };
        let biases = if let Some(gamma) = &params.ln_b_gamma {
            layer_norm_last(b_raw.view(), gamma, params.ln_b_beta.as_deref(), 1e-5)
        } else {
            b_raw
        };

        // embeddings[n, t, e] = features[n, t, 0] * weights[n, t, e] + biases[n, t, e]
        Array3::<f32>::from_shape_fn(x.dim(), |(ni, ti, ei)| {
            features[(ni, ti, 0)] * weights[(ni, ti, ei)] + biases[(ni, ti, ei)]
        })
    }
}

// No-op helper: accepts an `ArrayView2<f32>` and discards it.
// NOTE(review): presumably a leftover that kept the `ArrayView2` import
// referenced. The import is also used by `forward_with_targets` in this
// file, so this function may be removable — confirm before deleting.
#[allow(dead_code)]
fn _silence(_a: ArrayView2<f32>) {}

#[cfg(test)]
mod tests {
    use super::*;
    use ndarray::Array;

    /// Smallest useful config: no grouping, no affine head, no targets,
    /// two reserved CLS slots. Individual tests flip the flags they need.
    fn small_cfg() -> ColEmbeddingConfig {
        ColEmbeddingConfig {
            embed_dim: 8,
            num_blocks: 1,
            nhead: 2,
            dim_feedforward: 16,
            num_inds: 4,
            dropout: 0.0,
            activation: Activation::Gelu,
            norm_first: true,
            bias_free_ln: false,
            affine: false,
            feature_group: ColFeatureGroup::None,
            feature_group_size: 3,
            target_aware: false,
            max_classes: 10,
            reserve_cls_tokens: 2,
            ssmax: "none".into(),
            mixed_radix_ensemble: false,
            recompute: false,
        }
    }

    #[test]
    fn forward_shape_includes_reserved_cls_tokens() {
        let ce = ColEmbedding::new(small_cfg());
        let x = Array::from_shape_fn((2, 3, 5), |(b, t, h)| (b * 100 + t * 10 + h) as f32 * 0.01);
        let out = ce.forward(x.view()).unwrap();
        // (B=2, T=3, H+C=5+2=7, E=8)
        assert_eq!(out.shape(), &[2, 3, 7, 8]);
    }

    #[test]
    fn forward_reserves_sentinel_cls_slots() {
        // The reserved CLS slots are padded with -100 *before* the
        // SkippableLinear, which then no-ops them through. Downstream
        // RowInteraction overwrites these slots with learnable CLS
        // tokens, so the exact value doesn't matter as long as it's the
        // sentinel — matches the Python `F.pad(X, (C, 0), value=-100.0)`
        // contract. With zero-init params the set transformer's
        // approximately-identity pre-norm path is expected to leave the
        // sentinel unchanged; the assertions in this test check exactly that.
        let ce = ColEmbedding::new(small_cfg());
        let x = Array::from_shape_fn((1, 2, 3), |(_, _, h)| (h + 1) as f32);
        let out = ce.forward(x.view()).unwrap();
        for b in 0..1 {
            for t in 0..2 {
                for c in 0..ce.config.reserve_cls_tokens {
                    for e in 0..ce.config.embed_dim {
                        // Allow a wide tolerance — the value just needs
                        // to be near the sentinel rather than near zero.
                        assert!(
                            out[(b, t, c, e)] < -50.0,
                            "expected sentinel near -100, got {} at [{b},{t},{c},{e}]",
                            out[(b, t, c, e)]
                        );
                    }
                }
            }
        }
    }

    #[test]
    fn feature_grouping_same_circular_shifts() {
        let mut cfg = small_cfg();
        cfg.feature_group = ColFeatureGroup::Same;
        cfg.feature_group_size = 3;
        let ce = ColEmbedding::new(cfg);
        // X: B=1, T=1, H=4. Values [10, 20, 30, 40].
        let x =
            ndarray::Array::from_shape_vec((1, 1, 4), vec![10.0_f32, 20.0, 30.0, 40.0]).unwrap();
        let grouped = ce.feature_grouping(x.view());
        // G = H = 4, size = 3. group_h[k] = X[(h + 2^k) % H].
        // group 0 = [X[1], X[2], X[4%4]=X[0]] = [20, 30, 10]
        // group 1 = [X[2], X[3], X[5%4]=X[1]] = [30, 40, 20]
        // group 2 = [X[3], X[0], X[6%4]=X[2]] = [40, 10, 30]
        // group 3 = [X[0], X[1], X[7%4]=X[3]] = [10, 20, 40]
        assert_eq!(grouped.shape(), &[1, 1, 4, 3]);
        assert_eq!(grouped[(0, 0, 0, 0)], 20.0);
        assert_eq!(grouped[(0, 0, 0, 1)], 30.0);
        assert_eq!(grouped[(0, 0, 0, 2)], 10.0);
        assert_eq!(grouped[(0, 0, 3, 2)], 40.0);
    }

    #[test]
    fn feature_grouping_valid_pads_and_reshapes() {
        let mut cfg = small_cfg();
        cfg.feature_group = ColFeatureGroup::Valid;
        cfg.feature_group_size = 3;
        let ce = ColEmbedding::new(cfg);
        // H=4, size=3: 4 mod 3 = 1, so two zeros are appended (H → 6) and
        // G = 6 / 3 = 2.
        let x = ndarray::Array::from_shape_vec((1, 1, 4), vec![1.0_f32, 2.0, 3.0, 4.0]).unwrap();
        let grouped = ce.feature_grouping(x.view());
        assert_eq!(grouped.shape(), &[1, 1, 2, 3]);
        // group 0 = [1, 2, 3]
        assert_eq!(grouped[(0, 0, 0, 0)], 1.0);
        assert_eq!(grouped[(0, 0, 0, 1)], 2.0);
        assert_eq!(grouped[(0, 0, 0, 2)], 3.0);
        // group 1 = [4, 0, 0]
        assert_eq!(grouped[(0, 0, 1, 0)], 4.0);
        assert_eq!(grouped[(0, 0, 1, 1)], 0.0);
        assert_eq!(grouped[(0, 0, 1, 2)], 0.0);
    }

    #[test]
    fn forward_with_grouping_runs_end_to_end() {
        let mut cfg = small_cfg();
        cfg.feature_group = ColFeatureGroup::Same;
        cfg.feature_group_size = 3;
        let ce = ColEmbedding::new(cfg);
        let x = ndarray::Array::from_shape_fn((1, 2, 5), |(_, t, h)| (t * 5 + h) as f32 * 0.1);
        let out = ce.forward(x.view()).unwrap();
        // G == H == 5, plus 2 CLS slots → 7. E=8.
        assert_eq!(out.shape(), &[1, 2, 7, 8]);
    }

    #[test]
    fn forward_target_aware_classification_runs() {
        let mut cfg = small_cfg();
        cfg.target_aware = true;
        cfg.max_classes = 3;
        let ce = ColEmbedding::new(cfg);
        let x = Array::from_shape_fn((1, 5, 4), |(_, t, h)| (t * 4 + h) as f32 * 0.1);
        let y_train: ndarray::Array2<usize> =
            Array::from_shape_vec((1, 3), vec![0_usize, 1, 2]).unwrap();
        let out = ce
            .forward_with_targets(x.view(), Some(y_train.view()), None, 3)
            .unwrap();
        assert_eq!(out.shape(), &[1, 5, 6, 8]); // 4 features + 2 CLS
    }

    #[test]
    fn forward_target_aware_regression_runs() {
        let mut cfg = small_cfg();
        cfg.target_aware = true;
        cfg.max_classes = 0; // regression
        let ce = ColEmbedding::new(cfg);
        let x = Array::from_shape_fn((1, 4, 3), |(_, t, h)| (t * 3 + h) as f32 * 0.01);
        let y_train: ndarray::Array2<f32> =
            Array::from_shape_vec((1, 2), vec![0.5_f32, 1.5]).unwrap();
        let out = ce
            .forward_with_targets(x.view(), None, Some(y_train.view()), 2)
            .unwrap();
        assert_eq!(out.shape(), &[1, 4, 5, 8]);
    }

    #[test]
    fn forward_target_aware_many_classes_uses_mixed_radix() {
        let mut cfg = small_cfg();
        cfg.target_aware = true;
        cfg.max_classes = 3;
        cfg.mixed_radix_ensemble = true;
        let ce = ColEmbedding::new(cfg);
        let x = Array::from_shape_fn((1, 5, 2), |(_, t, h)| (t * 2 + h) as f32 * 0.1);
        // 5 classes > max_classes (3) → triggers mixed-radix path.
        let y_train: ndarray::Array2<usize> =
            Array::from_shape_vec((1, 3), vec![0_usize, 2, 4]).unwrap();
        let out = ce
            .forward_with_targets(x.view(), Some(y_train.view()), None, 3)
            .unwrap();
        assert_eq!(out.shape(), &[1, 5, 4, 8]); // 2 features + 2 CLS
    }

    #[test]
    fn mixed_radix_bases_match_python_formula() {
        let mut cfg = small_cfg();
        cfg.max_classes = 10;
        let ce = ColEmbedding::new(cfg);
        // Python docs example: 25 classes, max=10 → [5, 5].
        let bases = ce.compute_mixed_radix_bases(25);
        assert_eq!(bases, vec![5, 5]);
        // 101 classes, max=10 → D = ceil(ln 101 / ln 10) = 3 digits of base
        // ceil(101^(1/3)) = 5, i.e. [5, 5, 5] (product 125 ≥ 101).
        let bases = ce.compute_mixed_radix_bases(101);
        assert_eq!(bases, vec![5, 5, 5]);
        let product: usize = bases.iter().product();
        assert!(product >= 101);
        // Every digit indexes a one-hot encoder with `max_classes` columns,
        // so no base may exceed `max_classes`.
        assert!(bases.iter().all(|&b| b <= 10));
    }

    #[test]
    fn mixed_radix_digits_decompose_consistently() {
        // For bases=[5, 5], class y=13 → digit 0 = 13 // 5 = 2, digit 1 = 13 % 5 = 3.
        let bases = vec![5, 5];
        assert_eq!(ColEmbedding::extract_mixed_radix_digit(13, 0, &bases), 2);
        assert_eq!(ColEmbedding::extract_mixed_radix_digit(13, 1, &bases), 3);
        // Round-trip: digit_0 * 5 + digit_1 = 13.
        let d0 = ColEmbedding::extract_mixed_radix_digit(13, 0, &bases);
        let d1 = ColEmbedding::extract_mixed_radix_digit(13, 1, &bases);
        assert_eq!(d0 * 5 + d1, 13);
    }

    #[test]
    fn mixed_radix_digits_cover_every_class() {
        // Every class must decompose into digits that stay below their base
        // (they index the one-hot encoder) and recombine to the class id.
        let mut cfg = small_cfg();
        cfg.max_classes = 10;
        let ce = ColEmbedding::new(cfg);
        let bases = ce.compute_mixed_radix_bases(101);
        for y in 0..101_usize {
            let mut rebuilt = 0_usize;
            for (digit_idx, &base) in bases.iter().enumerate() {
                let digit = ColEmbedding::extract_mixed_radix_digit(y, digit_idx, &bases);
                assert!(
                    digit < base,
                    "digit {digit} out of range for base {base} (y={y})"
                );
                rebuilt = rebuilt * base + digit;
            }
            assert_eq!(rebuilt, y);
        }
    }

    #[test]
    fn affine_head_runs() {
        let mut cfg = small_cfg();
        cfg.affine = true;
        let ce = ColEmbedding::new(cfg);
        let x = Array::from_shape_fn((1, 2, 3), |(_, _, h)| (h + 1) as f32);
        let out = ce.forward(x.view()).unwrap();
        assert_eq!(out.shape(), &[1, 2, 5, 8]);
    }
}