// tokitai-operator 0.1.0

//! Dense-MLP architecture specialization.
//!
//! A `DenseArch` is the special case of `Architecture` with
//! expert count = 1 and top-k = 1 (no routing, just a single
//! MLP). Used by the simpler non-MoE training paths.
//!
// `DenseModel` — the 0.7B Dense MLP used as the "no-MoE" baseline
// in the 3-way ablation study (sheaf+padic MoE vs softmax-only MoE
// vs Dense MLP, at matched parameter budget).
//
// `QualityModel` — an enum wrapper that holds either an
// `MoEModel` (the historical model family) or a `DenseModel`
// (the new ablation baseline), and forwards the small set of
// methods both need to share (`scalar_param_count`, `forward`,
// `backward`, `parameter_count`).
//
// The Dense architecture is intentionally simple so the ablation
// isolates the contribution of the MoE + sheaf + p-adic machinery:
//
//     x                              # [B, IN_DIM=96]
//     -> InputProj(96 -> hidden)     # single Linear+LN
//     -> FFN sub-block x n_blocks    # each: Linear(h,i)+LN+GELU
//                                     #       + Linear(i,h)+LN+GELU
//     -> OutputProj(hidden -> 20)    # single Linear
//     -> logits                      # [B, OUT_DIM=20]
//
// We deliberately do NOT add residual connections, attention, or
// normalization beyond the per-Layer LN. Those are orthogonal
// choices; if Dense outperforms MoE in the ablation, the next
// question is "by how much is the gap explained by MoE-vs-Dense
// vs by the addition of attention/residuals" — and the cleanest
// way to make that question answerable is to keep the Dense
// architecture as a pure stack of FFN sub-blocks.
//
// `n_blocks` is chosen by the `tiny_dense` factory to land the
// model in the [650M, 750M] parameter range. The default of 84
// with hidden=1024 / intermediate=4096 lands at ~706M, in range.

use std::cell::RefCell;

use crate::model::layer::{GELU, Layer, LayerNorm, Linear};
use crate::model::sequential::{Model, Sequential};
use crate::moe_model::MoEModel;
use crate::object::{Shape, Tensor};
use crate::{Error, Result};

/// Per-call forward cache for [`DenseModel::forward`] ->
/// [`DenseModel::backward`]. Mirrors the role of
/// `MoEForwardCache` in the MoE family.
///
/// `forward` stores one of these on the model after every
/// successful pass and `backward` takes it back out, so its
/// presence doubles as the "forward has run" marker that
/// `backward` checks before doing any work.
pub struct DenseForwardCache {
    /// Clone of the tensor that was passed to `forward`.
    /// `backward` in this file removes the cache but never reads
    /// this field.
    pub input: Tensor<f32>,
}

/// Resolved dims for a [`DenseModel`]. Cheap to copy (the type
/// is `Copy`). [`DenseModel::new`] reads every field while
/// building the layer stack and then stores the value on the
/// model; per the original notes it is also used by the build
/// path on the arch side.
///
/// The defaults quoted on the fields are the `tiny_dense`
/// factory / schema values referenced in this file's header
/// notes. Nothing in this type enforces or validates them.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct DenseConfig {
    /// Raw feature width (default 96, the schema pinned in
    /// `docs/COMPETITIVE_BENCHMARK_PLAN.md`). Input side of the
    /// input projection's `Linear`.
    pub input_dim: usize,
    /// Hidden width carried between FFN sub-blocks (default
    /// 1024, the `tiny_dense` factory value). The model has no
    /// residual connections, so this is simply the width of each
    /// sub-block's input and output.
    pub hidden_dim: usize,
    /// FFN sub-block inner dim (default 4096, the `tiny_dense`
    /// factory value).
    pub intermediate: usize,
    /// Regression / classification head width (default 20, the
    /// schema). Output side of the output projection's `Linear`.
    pub output_dim: usize,
    /// Number of FFN sub-blocks in the body. The `tiny_dense`
    /// factory picks this to land the scalar param count in
    /// [650M, 750M].
    pub n_blocks: usize,
}

/// The 0.7B Dense MLP.
///
/// The struct is a thin orchestrator over three groups of
/// layers: an input-projection `Sequential`, a `Vec<Sequential>`
/// of FFN sub-blocks, and an output-projection `Sequential`.
/// `forward` runs the groups in that order; `backward` runs them
/// in reverse, feeding each group's input gradient into the
/// group before it and gathering every group's parameter
/// gradients into a single vector.
///
/// `forward` and `backward` both take `&self`. The only field
/// they write to directly is `last_cache`, which is why it sits
/// behind a `RefCell`.
pub struct DenseModel {
    /// Resolved dims used to construct this model. Kept for
    /// diagnostics (param-count assertions, arch roundtrip).
    pub config: DenseConfig,
    /// `Linear(input_dim, hidden_dim) + LayerNorm(hidden_dim)`.
    pub input_proj: Sequential,
    /// Stack of FFN sub-blocks, `config.n_blocks` long. Each
    /// sub-block is its own `Sequential` (Linear + LayerNorm +
    /// GELU, twice) so its `parameters_mut()` view is isolated
    /// for the per-block optimizer step.
    pub blocks: Vec<Sequential>,
    /// `Linear(hidden_dim, output_dim)`. No LayerNorm/GELU
    /// after the output projection, matching the MoE expert
    /// head shape.
    pub output_proj: Sequential,
    /// Cache written at the end of each successful `forward` and
    /// taken (leaving `None`) by `backward`. Mirrors
    /// `MoEModel::last_cache`.
    last_cache: RefCell<Option<DenseForwardCache>>,
}

impl DenseModel {
    /// Assemble a `DenseModel` for `cfg`, deriving the `u32` seed
    /// of every `Linear` layer from `seed`.
    ///
    /// Layers are constructed in forward order: the input
    /// projection, then body blocks `0..cfg.n_blocks`, then the
    /// output projection. The original notes describe the seeding
    /// as xorshift32-derived and matching `MoEModel::new`; the RNG
    /// itself lives in `Linear::new`, outside this file.
    pub fn new(cfg: DenseConfig, seed: u64) -> Self {
        // Split the 64-bit seed: the low half feeds the two
        // projections, the high half (plus a fixed offset) feeds
        // the body, so the two groups start from different values.
        let proj_seed = (seed & 0xFFFF_FFFF) as u32;
        let body_seed = ((seed >> 32) as u32).wrapping_add(0xB0B0_0001);

        // Input projection: Linear followed by LayerNorm, with no
        // activation of its own.
        let input_layers: Vec<Box<dyn Layer>> = vec![
            Box::new(Linear::new(
                cfg.input_dim,
                cfg.hidden_dim,
                proj_seed.wrapping_add(0x10),
            )),
            Box::new(LayerNorm::new(cfg.hidden_dim, 1e-5)),
        ];
        let input_proj = Sequential::new(input_layers);

        // Body: one `Sequential` per FFN sub-block. Block seeds
        // are spaced 0x1000 apart, and the two `Linear` layers of
        // a block sit at +0x20 and +0x30 from the block seed. The
        // `as u32` cast truncates the block index, which only
        // matters beyond `u32::MAX` blocks.
        let blocks: Vec<Sequential> = (0..cfg.n_blocks)
            .map(|block_idx| {
                let block_seed =
                    body_seed.wrapping_add((block_idx as u32).wrapping_mul(0x1000));
                let layers: Vec<Box<dyn Layer>> = vec![
                    // hidden -> intermediate, then LN + GELU
                    Box::new(Linear::new(
                        cfg.hidden_dim,
                        cfg.intermediate,
                        block_seed.wrapping_add(0x20),
                    )),
                    Box::new(LayerNorm::new(cfg.intermediate, 1e-5)),
                    Box::new(GELU),
                    // intermediate -> hidden, then LN + GELU
                    Box::new(Linear::new(
                        cfg.intermediate,
                        cfg.hidden_dim,
                        block_seed.wrapping_add(0x30),
                    )),
                    Box::new(LayerNorm::new(cfg.hidden_dim, 1e-5)),
                    Box::new(GELU),
                ];
                Sequential::new(layers)
            })
            .collect();

        // Output projection: a lone Linear, nothing after it.
        let output_layers: Vec<Box<dyn Layer>> = vec![Box::new(Linear::new(
            cfg.hidden_dim,
            cfg.output_dim,
            proj_seed.wrapping_add(0x40),
        ))];
        let output_proj = Sequential::new(output_layers);

        Self {
            config: cfg,
            input_proj,
            blocks,
            output_proj,
            last_cache: RefCell::new(None),
        }
    }

    /// Number of individual scalars across all trainable tensors:
    /// the sum of `numel()` over the parameters of the input
    /// projection, every body block, and the output projection.
    pub fn scalar_param_count(&self) -> usize {
        // Walk the three groups as one stream of `Sequential`s,
        // in declaration order, and total their parameter sizes.
        std::iter::once(&self.input_proj)
            .chain(self.blocks.iter())
            .chain(std::iter::once(&self.output_proj))
            .flat_map(|group| group.parameters())
            .map(|param| param.numel())
            .sum()
    }

    /// Number of trainable parameter *tensors* (not scalars):
    /// the combined length of `parameters()` over the input
    /// projection, every body block, and the output projection.
    /// `backward` documents its `param_grads` vector as having
    /// this length. Mirrors `MoEModel::parameter_count`.
    pub fn parameter_count(&self) -> usize {
        let head = self.input_proj.parameters().len();
        let body: usize = self
            .blocks
            .iter()
            .map(|block| block.parameters().len())
            .sum();
        let tail = self.output_proj.parameters().len();
        head + body + tail
    }

    /// Mutable access to the body block at position `idx`
    /// (0-based, in forward order). Mirrors
    /// `MoEModel::expert_mut`.
    ///
    /// # Panics
    ///
    /// Panics if `idx` is not less than the number of blocks.
    pub fn block_mut(&mut self, idx: usize) -> &mut Sequential {
        let body = self.blocks.as_mut_slice();
        &mut body[idx]
    }

    /// Forward pass. Returns the logits tensor.
    ///
    /// The input is pushed through the input projection, then
    /// through every body block in order, then through the output
    /// projection. On success a [`DenseForwardCache`] holding a
    /// clone of `input` is stored on the model for `backward` to
    /// consume.
    ///
    /// Note: the Dense model has no router, so the return type
    /// is just `Tensor<f32>` (the logits) — there is no
    /// `router_weights` companion tensor the way
    /// `MoEModel::forward` returns. [`QualityModel::forward`]
    /// synthesizes an all-ones `[B, 1]` tensor when the caller
    /// needs an MoE-shaped return.
    ///
    /// # Errors
    ///
    /// Returns `Error::backend` if any group's `forward` fails or
    /// returns anything other than exactly one tensor. The model's
    /// own cache slot is left untouched in that case.
    pub fn forward(&self, input: &Tensor<f32>) -> Result<Tensor<f32>> {
        // Stage 1: input projection. `from_ref` views the
        // borrowed input as a one-element slice, so no copy of
        // the input is made just to build the argument.
        let projected = self
            .input_proj
            .forward(std::slice::from_ref(input))
            .map_err(|e| Error::backend(format!("DenseModel::forward input_proj: {e}")))?;
        if projected.len() != 1 {
            return Err(Error::backend(format!(
                "DenseModel::forward input_proj returned {} tensors, expected 1",
                projected.len()
            )));
        }
        let mut current = projected.into_iter().next().expect("len=1");

        // Stage 2: body. A strict stack of FFN sub-blocks with no
        // residual connections: each block's single output is the
        // next block's single input. The previous activation is
        // not needed once a block has consumed it, so it is moved
        // into the call instead of cloned (the same way `backward`
        // hands `current_grad` along). Intermediate activations
        // are not stored on `DenseModel`; whatever the layers need
        // for backward is presumably cached inside each
        // `Sequential` (not visible from this file).
        for (bi, block) in self.blocks.iter().enumerate() {
            let outs = block
                .forward(&[current])
                .map_err(|e| Error::backend(format!("DenseModel::forward block {}: {e}", bi)))?;
            if outs.len() != 1 {
                return Err(Error::backend(format!(
                    "DenseModel::forward block {} returned {} tensors, expected 1",
                    bi,
                    outs.len()
                )));
            }
            current = outs.into_iter().next().expect("len=1");
        }

        // Stage 3: output projection.
        let logits_vec = self
            .output_proj
            .forward(&[current])
            .map_err(|e| Error::backend(format!("DenseModel::forward output_proj: {e}")))?;
        if logits_vec.len() != 1 {
            return Err(Error::backend(format!(
                "DenseModel::forward output_proj returned {} tensors, expected 1",
                logits_vec.len()
            )));
        }
        let logits = logits_vec.into_iter().next().expect("len=1");

        // Record that a forward pass completed. `backward` refuses
        // to run unless this slot is filled; it takes the cache
        // out but does not read the stored input.
        *self.last_cache.borrow_mut() = Some(DenseForwardCache {
            input: input.clone(),
        });

        Ok(logits)
    }

    /// Backward pass. Returns `(grad_input, param_grads)`.
    ///
    /// Must follow a successful [`DenseModel::forward`]. Each call
    /// consumes the cache that `forward` stored, so calling
    /// `backward` twice without a new `forward` fails.
    ///
    /// `param_grads` is grouped in declaration order — input proj,
    /// then body blocks `0..n_blocks`, then the output proj — and
    /// inside each group the gradients keep exactly the order in
    /// which that group's `Sequential::backward` returned them.
    /// This lines up with `QualityModel::all_parameters_mut`
    /// provided `Sequential::backward` reports gradients in
    /// `parameters()` order (NOTE(review): confirm against
    /// `Sequential`). The total length is expected to match
    /// `parameter_count()`.
    ///
    /// # Errors
    ///
    /// Returns `Error::backend` if no forward cache is present, if
    /// any group's `backward` fails, or if a group returns anything
    /// other than exactly one input gradient.
    pub fn backward(&self, grad_output: &Tensor<f32>) -> Result<(Tensor<f32>, Vec<Tensor<f32>>)> {
        // Taking the cache proves `forward` ran and empties the
        // slot for the next step. The cached input is not needed
        // to compute any gradient here, so it is released at once.
        let cache = self
            .last_cache
            .borrow_mut()
            .take()
            .ok_or_else(|| Error::backend("DenseModel::backward called before forward"))?;
        drop(cache);

        // One entry per `Sequential`, pushed in the order the
        // backward sweep visits them (output proj first). Keeping
        // the groups separate lets the group order be flipped at
        // the end WITHOUT disturbing the order of gradients inside
        // a group. A flat `reverse()` of the concatenated vector
        // would also flip e.g. a Linear's `[weight, bias]` pair.
        let mut grads_by_group: Vec<Vec<Tensor<f32>>> =
            Vec::with_capacity(self.blocks.len() + 2);

        // Stage 3 backward: output projection.
        let (per_input_grads, group_grads) = self
            .output_proj
            .backward(std::slice::from_ref(grad_output))
            .map_err(|e| Error::backend(format!("DenseModel::backward output_proj: {e}")))?;
        if per_input_grads.len() != 1 {
            return Err(Error::backend(format!(
                "DenseModel::backward output_proj returned {} input grads, expected 1",
                per_input_grads.len()
            )));
        }
        let mut current_grad = per_input_grads.into_iter().next().expect("len=1");
        grads_by_group.push(group_grads);

        // Stage 2 backward: body blocks, last block first. Each
        // block's input gradient becomes the upstream gradient of
        // the block before it.
        for (bi, block) in self.blocks.iter().enumerate().rev() {
            let (g, p) = block
                .backward(&[current_grad])
                .map_err(|e| Error::backend(format!("DenseModel::backward block {}: {e}", bi)))?;
            if g.len() != 1 {
                return Err(Error::backend(format!(
                    "DenseModel::backward block {} returned {} input grads, expected 1",
                    bi,
                    g.len()
                )));
            }
            current_grad = g.into_iter().next().expect("len=1");
            grads_by_group.push(p);
        }

        // Stage 1 backward: input projection.
        let (g, p) = self
            .input_proj
            .backward(&[current_grad])
            .map_err(|e| Error::backend(format!("DenseModel::backward input_proj: {e}")))?;
        if g.len() != 1 {
            return Err(Error::backend(format!(
                "DenseModel::backward input_proj returned {} input grads, expected 1",
                g.len()
            )));
        }
        let grad_input = g.into_iter().next().expect("len=1");
        grads_by_group.push(p);

        // Groups were collected as
        // [output_proj, block[n-1], ..., block[0], input_proj].
        // Emit them back-to-front to obtain
        // [input_proj, block[0], ..., block[n-1], output_proj],
        // copying each group's contents through untouched.
        let total: usize = grads_by_group.iter().map(Vec::len).sum();
        let mut param_grads: Vec<Tensor<f32>> = Vec::with_capacity(total);
        for group in grads_by_group.into_iter().rev() {
            param_grads.extend(group);
        }

        Ok((grad_input, param_grads))
    }
}

/// Scalar parameter count of `dense`, plus the scalar parameter
/// count of `maybe_moe` when one is supplied. Within this file
/// its only caller is `QualityModel::total_with`.
fn total_scalar_params(maybe_moe: Option<&MoEModel>, dense: &DenseModel) -> usize {
    let dense_scalars = dense.scalar_param_count();
    let moe_scalars = maybe_moe.map_or(0, |moe| moe.scalar_param_count());
    dense_scalars + moe_scalars
}

/// Unified model type. The training runner and the
/// `train_step_cpu` driver dispatch on this enum instead of
/// picking a concrete model type, so the same training loop
/// works for the MoE family and the Dense family without
/// duplicate code paths.
///
/// Every method on this type is a `match` over the two variants
/// that forwards to the wrapped model. The one adaptation is in
/// `forward`, where the Dense variant fabricates an all-ones
/// `router_weights` tensor so both variants return the same
/// output struct.
pub enum QualityModel {
    /// The historical MoE model (router + N_EXPERTS experts).
    MoE(MoEModel),
    /// The new Dense MLP stack (no router, no per-expert dispatch).
    Dense(DenseModel),
}

impl QualityModel {
    /// Total scalar parameter count across all trainable tensors
    /// in the active model. The MoE branch returns
    /// `MoEModel::scalar_param_count`; the Dense branch returns
    /// `DenseModel::scalar_param_count`.
    pub fn scalar_param_count(&self) -> usize {
        match self {
            QualityModel::MoE(m) => m.scalar_param_count(),
            QualityModel::Dense(m) => m.scalar_param_count(),
        }
    }

    /// Total number of trainable parameter tensors. Mirrors
    /// `scalar_param_count`'s dispatch.
    pub fn parameter_count(&self) -> usize {
        match self {
            QualityModel::MoE(m) => m.parameter_count(),
            QualityModel::Dense(m) => m.parameter_count(),
        }
    }

    /// Forward pass. The return type is a `QualityOutput` so
    /// the caller can get both the logits AND the (synthetic
    /// for Dense) router weights in a single tuple. The Dense
    /// branch synthesizes a `router_weights` tensor of shape
    /// `[B, 1]` filled with 1.0, mirroring the
    /// "single-expert, no routing" interpretation. The runner
    /// treats `N_EXPERTS=1` for the Dense path so the
    /// existing per-expert share accounting still produces
    /// a well-formed row.
    pub fn forward(&self, input: &Tensor<f32>) -> Result<QualityOutput> {
        match self {
            QualityModel::MoE(m) => {
                let out = m.forward(input)?;
                Ok(QualityOutput {
                    logits: out.logits,
                    router_weights: out.router_weights,
                })
            }
            QualityModel::Dense(m) => {
                let logits = m.forward(input)?;
                // Synthesize a [B, 1] router_weights = 1.0
                // tensor. The runner's `compute_router_stats`
                // helper does not currently special-case
                // N_EXPERTS=1, so we leave the per-expert
                // share as `[1.0]` and the router entropy
                // as `0.0` (single expert, no choice to
                // make). The shape is `[B, 1]` so the JSONL
                // row's `per_expert_share` field still
                // serializes as a 1-element array — the
                // runner's downstream code reads it as a
                // length-`N_EXPERTS` array, and the
                // `tiny_dense` arch pins N_EXPERTS=1.
                let b = match &logits.meta.shape.dims[0] {
                    crate::object::Dim::Static(v) => *v,
                    _ => {
                        return Err(Error::shape(
                            "QualityModel::Dense::forward logits batch dim must be static",
                        ));
                    }
                };
                let router_weights = Tensor::dense_cpu(
                    logits.meta.domain.clone(),
                    Shape::from(vec![b, 1]),
                    vec![1.0f32; b],
                );
                Ok(QualityOutput {
                    logits,
                    router_weights,
                })
            }
        }
    }

    /// Backward pass. Returns `(grad_input, param_grads)` in
    /// the same shape `MoEModel::backward` returns. The Dense
    /// branch delegates to `DenseModel::backward`; the MoE
    /// branch delegates to `MoEModel::backward`.
    pub fn backward(&self, grad_output: &Tensor<f32>) -> Result<(Tensor<f32>, Vec<Tensor<f32>>)> {
        match self {
            QualityModel::MoE(m) => m.backward(grad_output),
            QualityModel::Dense(m) => m.backward(grad_output),
        }
    }

    /// Mutable view over every parameter, in declaration order,
    /// suitable for handing to the optimizer. The Dense branch
    /// returns input_proj + body blocks + output_proj; the MoE
    /// branch returns router + experts in declaration order.
    pub fn all_parameters_mut(&mut self) -> Vec<&mut crate::model::parameter::Parameter> {
        match self {
            QualityModel::MoE(m) => {
                let mut out: Vec<&mut crate::model::parameter::Parameter> =
                    Vec::with_capacity(m.parameter_count());
                out.extend(m.router.parameters_mut());
                for expert in m.experts.iter_mut() {
                    out.extend(expert.parameters_mut());
                }
                out
            }
            QualityModel::Dense(m) => {
                let mut out: Vec<&mut crate::model::parameter::Parameter> =
                    Vec::with_capacity(m.parameter_count());
                out.extend(m.input_proj.parameters_mut());
                for block in m.blocks.iter_mut() {
                    out.extend(block.parameters_mut());
                }
                out.extend(m.output_proj.parameters_mut());
                out
            }
        }
    }

    /// Scalar parameter count of `dense`, plus the scalar count
    /// of an auxiliary MoE model when `maybe_moe` is `Some`.
    ///
    /// This is an associated function (it takes no `self`) and a
    /// pure pass-through to the private `total_scalar_params`
    /// helper in this module. Per the original notes it is
    /// currently only used by the infer path's compatibility
    /// check, which the Dense path doesn't need; it is kept for
    /// symmetry with the MoE path's `infer` integration.
    #[allow(dead_code)]
    pub fn total_with(maybe_moe: Option<&MoEModel>, dense: &DenseModel) -> usize {
        total_scalar_params(maybe_moe, dense)
    }
}

/// Output of [`QualityModel::forward`]. The `router_weights`
/// field is real for the MoE family and synthesized (all-ones,
/// shape `[B, 1]`) for the Dense family so the downstream
/// per-expert share / router entropy accounting produces a
/// well-formed row regardless of the model family.
pub struct QualityOutput {
    /// Final model output, shape `[B, OUT_DIM]` (20 for the
    /// current schema). Taken unchanged from the wrapped model's
    /// forward pass.
    pub logits: Tensor<f32>,
    /// Per-expert gating weights, shape `[B, N_EXPERTS]` (4 for
    /// the MoE family, 1 for the Dense family). For the Dense
    /// family every entry is 1.0 and `B` is the leading dimension
    /// of `logits`. See [`QualityModel::forward`].
    pub router_weights: Tensor<f32>,
}