use crate::model::layer::{GELU, Layer, LayerNorm, Linear, Router};
use crate::model::sequential::Sequential;
use super::topology::{IN_DIM, N_EXPERTS, OUT_DIM};
use super::{ExpertConfig, MoEModel, MoESize, RouterConfig};
impl MoEModel {
    /// Builds a model with one router and `N_EXPERTS` expert stacks for `size`.
    ///
    /// `seed` is split in two: the low 32 bits become the router seed, and the
    /// high 32 bits plus `0xA5A5_0001` (wrapping) become the base seed for the
    /// experts. Expert `ei` is seeded with `base + ei * 0x1000` (wrapping), and
    /// every `Linear` layer inside an expert adds its own offset to that value.
    ///
    /// Each expert is assembled in this order:
    /// 1. `Linear::new(IN_DIM, hidden, ..)`, `LayerNorm::new(hidden, 1e-5)`, `GELU`
    /// 2. `n_hidden` repetitions of `Linear::new(hidden, hidden, ..)`,
    ///    `LayerNorm::new(hidden, 1e-5)`, `GELU`
    /// 3. `Linear::new(hidden, OUT_DIM, ..)`
    ///
    /// `hidden` and `n_hidden` come from `size.dims()`. `last_cache` starts empty.
    ///
    /// NOTE(review): for any `seed` below 2^32 the high half is zero, so the
    /// expert base seed is the same constant for all such seeds and only the
    /// router seed varies — confirm this is intended.
    pub fn new(size: MoESize, seed: u64) -> Self {
        let (hidden, n_hidden) = size.dims();

        // Low half of the seed drives the router; the offset high half drives the experts.
        let router_seed = (seed & 0xFFFF_FFFF) as u32;
        let base_seed = ((seed >> 32) as u32).wrapping_add(0xA5A5_0001);

        // The router is constructed before any expert, as in the original ordering.
        let router = Router::new(IN_DIM, N_EXPERTS, super::topology::TOP_K, router_seed);

        // Assembles a single expert stack from its per-expert seed.
        let build_expert = |expert_seed: u32| -> Sequential {
            let mut stack: Vec<Box<dyn Layer>> = vec![
                Box::new(Linear::new(IN_DIM, hidden, expert_seed.wrapping_add(0x10))),
                Box::new(LayerNorm::new(hidden, 1e-5)),
                Box::new(GELU),
            ];

            for block in 0..n_hidden {
                // Offsets 0x20, 0x21, ... give each hidden `Linear` its own seed.
                let block_seed = expert_seed.wrapping_add(0x20 + block as u32);
                stack.push(Box::new(Linear::new(hidden, hidden, block_seed)));
                stack.push(Box::new(LayerNorm::new(hidden, 1e-5)));
                stack.push(Box::new(GELU));
            }

            // The final `Linear` uses an offset that sits above every hidden-block offset.
            let head_seed = expert_seed.wrapping_add(0x30 + n_hidden as u32);
            stack.push(Box::new(Linear::new(hidden, OUT_DIM, head_seed)));

            Sequential::new(stack)
        };

        // Experts are spaced 0x1000 apart in seed space.
        let experts: Vec<Sequential> = (0..N_EXPERTS)
            .map(|ei| build_expert(base_seed.wrapping_add((ei as u32).wrapping_mul(0x1000))))
            .collect();

        Self {
            size,
            router,
            experts,
            last_cache: std::cell::RefCell::new(None),
        }
    }

    /// Returns the router configuration, filled entirely from the topology
    /// constants (`IN_DIM`, `N_EXPERTS`, `TOP_K`); no field of `self` is read.
    pub fn router_config(&self) -> RouterConfig {
        let top_k = super::topology::TOP_K;
        RouterConfig {
            in_features: IN_DIM,
            n_experts: N_EXPERTS,
            top_k,
        }
    }

    /// Returns the configuration shared by every expert.
    ///
    /// `_idx` is ignored: the result depends only on `self.size.dims()` and the
    /// topology constants, and the index is not range-checked.
    pub fn expert_config(&self, _idx: usize) -> ExpertConfig {
        let (hidden, n_hidden_layers) = self.size.dims();
        ExpertConfig {
            in_features: IN_DIM,
            hidden,
            n_hidden_layers,
            out_features: OUT_DIM,
        }
    }
}