// tokitai-operator 0.1.0

//! MoE topology: router + experts + head.
//!
//! The topology holds the per-expert MLP weights, the router
//! weights, and the head. It is constructed via `MoEBuilder` (see
//! `builder.rs`) so the user can specify the depth, width, and
//! the number of experts.
//!
// Phase 2.4 MoE topology constants and exact param-count formulas.
//
// Architecture (per `docs/MOE_DESIGN.md` §3-§4):
//   - Input: IN_DIM = 96 features (74 categorical one-hot + 22 numerical p-adic).
//   - Router: Linear(IN_DIM, N_EXPERTS) -> Softmax -> Top-K mask.
//   - Each expert:
//       Linear(IN_DIM, hidden) -> LayerNorm(hidden) -> GELU
//       n_hidden_layers x [Linear(hidden, hidden) -> LayerNorm(hidden) -> GELU]
//       Linear(hidden, OUT_DIM)        // no LN / GELU after
//
// Per-expert parameter count (exact):
//   (1 input + n_hidden hidden + 1 output) Linears:
//     Linear(in, out) contributes  in*out + out   parameters.
//   (1 input + n_hidden hidden) LayerNorms:
//     LayerNorm(n) contributes  2*n   parameters.

/// Number of experts in the MoE (statically fixed by the design).
///
/// This is also the output width of the router `Linear(IN_DIM, N_EXPERTS)`
/// and the factor by which [`param_count`] multiplies the per-expert total.
pub const N_EXPERTS: usize = 4;

/// Top-K routing fan-out (statically fixed by the design).
///
/// Size of the Top-K mask applied after the router softmax. It is not
/// referenced by [`param_count`].
pub const TOP_K: usize = 2;

/// Raw input feature dimension (74 categorical + 22 numerical).
///
/// Input width of the router Linear and of each expert's first Linear.
pub const IN_DIM: usize = 96;

/// Output dimension per expert (12 outcome classes + 8 aux heads).
///
/// Output width of each expert's final Linear, which is not followed by a
/// LayerNorm or GELU.
pub const OUT_DIM: usize = 20;

/// Returns `(hidden_dim, n_hidden_layers)` for `size`.
///
/// Thin free-function wrapper around [`super::MoESize::dims`]: it forwards
/// to that method and hands the pair back unchanged. It exists so that code
/// which already depends on `topology` (e.g. the arch writer) can obtain the
/// dimensions without importing the `MoESize` method surface.
pub fn params_for_size(size: super::MoESize) -> (usize, usize) {
    // Name both components so the tuple order is visible at the call site.
    let (hidden_dim, n_hidden_layers) = size.dims();
    (hidden_dim, n_hidden_layers)
}

/// Exact trainable-parameter count for the given `MoESize`.
///
/// The result is `N_EXPERTS * per_expert + router`, where:
///
/// - `per_expert` is the size of one expert MLP:
///   - Linears (`in * out + out` parameters each): one input projection
///     `IN_DIM -> hidden`, `n_hidden` hidden layers `hidden -> hidden`, and
///     one output projection `hidden -> OUT_DIM`.
///   - LayerNorms (`2 * hidden` parameters each): one after the input
///     projection and one per hidden layer. The output projection has *no*
///     LayerNorm.
/// - `router` is a single `Linear(IN_DIM, N_EXPERTS)` with bias.
///
/// `Softmax`, `GELU`, `Add` are parameter-free. Only the experts and the
/// router are counted; no other term is added.
///
/// `(hidden, n_hidden)` is taken from `size.dims()`. The computation is
/// closed-form integer arithmetic with no allocation.
///
/// Reference values of the formula (exact, not approximate). The
/// `(hidden, depth)` pairs are assumed to match what `MoESize::dims`
/// returns, which is defined elsewhere; verify there before relying on
/// these totals:
///   - `Nano`   (hidden=128,  depth=2):      195,540
///   - `Tiny`   (hidden=2048, depth=5):   84,984,276
///   - `Medium` (hidden=4096, depth=5):  337,740,244
///   - `Full`   (hidden=4096, depth=12): 807,846,356
pub fn param_count(size: super::MoESize) -> usize {
    let (hidden, n_hidden) = size.dims();
    // Router Linear: in_features=IN_DIM, out_features=N_EXPERTS, plus bias.
    let router = IN_DIM * N_EXPERTS + N_EXPERTS;
    N_EXPERTS * per_expert_param_count(hidden, n_hidden) + router
}

/// Trainable parameters of one expert MLP with `n_hidden` hidden layers of
/// width `hidden`, using the module's `IN_DIM` and `OUT_DIM`.
fn per_expert_param_count(hidden: usize, n_hidden: usize) -> usize {
    // Linear(in, out) contributes in * out + out parameters.
    let input_linear = IN_DIM * hidden + hidden;
    let hidden_linears = n_hidden * (hidden * hidden + hidden);
    let output_linear = hidden * OUT_DIM + OUT_DIM;
    // 1 input LN + n_hidden hidden LNs at 2 * hidden each; the final output
    // projection has no LN.
    let layer_norms = (1 + n_hidden) * 2 * hidden;
    input_linear + hidden_linears + output_linear + layer_norms
}