//! MoE topology: router + experts + head.
//!
//! The topology holds the per-expert MLP weights, the router
//! weights, and the head. Constructed via `MoEBuilder` (see
//! `builder.rs`) so the user can specify the depth, width, and
//! the number of experts.
//!
// Phase 2.4 MoE topology constants and exact param-count formulas.
//
// Architecture (per `docs/MOE_DESIGN.md` §3-§4):
// - Input: 96-dim (74 categorical one-hot + 22 numerical p-adic).
// - Router: Linear(96, N_EXPERTS) -> Softmax -> Top-K mask.
// - Each expert:
//     Linear(IN_DIM, hidden) -> LayerNorm(hidden) -> GELU
//     [Linear(hidden, hidden) -> LayerNorm(hidden) -> GELU]  x n_hidden_layers
//     Linear(hidden, OUT_DIM)   (no LayerNorm / GELU after the output layer)
//
// Per-expert parameter count (exact):
//   (1 input + n_hidden hidden + 1 output) Linears:
//     Linear(in, out) contributes in*out + out parameters.
//   (1 input + n_hidden hidden) LayerNorms:
//     LayerNorm(n) contributes 2*n parameters.
/// Count of experts in the MoE; the design fixes this value statically.
pub const N_EXPERTS: usize = 4;
/// Fan-out of the top-K routing step; the design fixes this value statically.
pub const TOP_K: usize = 2;
/// Width of the raw input feature vector: 74 categorical + 22 numerical.
pub const IN_DIM: usize = 74 + 22;
/// Width of each expert's output: 12 outcome classes + 8 aux heads.
pub const OUT_DIM: usize = 12 + 8;
/// `(hidden_dim, n_hidden_layers)` for the given `MoESize`.
///
/// This is a free-function re-export of [`super::MoESize::dims`]
/// kept here so callers that already depend on `topology` (e.g. the
/// arch writer) do not need to import the `MoESize` method surface.
/// Exact trainable-parameter count for the given `MoESize`.
///
/// Counts:
/// - Per-expert Linears (input + n_hidden hidden + output).
/// - Per-expert LayerNorms (input + n_hidden hidden, *no* output LN).
/// - Router Linear (IN_DIM, N_EXPERTS) with bias.
///
/// `Softmax`, `GELU`, `Add` are parameter-free.
///
/// The four sizes produce the following parameter counts
/// (closed-form, no allocation):
/// - `Nano` (hidden=128, depth=2): 195,540
/// - `Tiny` (hidden=2048, depth=5): 84,984,276
/// - `Medium` (hidden=4096, depth=5): 337,740,244
/// - `Full` (hidden=4096, depth=12): 807,846,356