pub mod config;
pub mod model;
pub mod tasks;
pub use config::{ActivationType, DeepSeekV2Config, TopKMethod};
pub use model::{
apply_activation, gelu, silu, DeepSeekV2DecoderLayer, DeepSeekV2MLP, DeepSeekV2MoELayer,
DeepSeekV2Model, DeepSeekV2RmsNorm, DeepSeekV2RotaryEmbedding, ExpertRouter, MlaAttention,
};
pub use tasks::{DeepSeekV2Error, DeepSeekV2ForCausalLM};
#[cfg(test)]
mod tests {
    use super::*;
    use trustformers_core::traits::Config;

    /// Minimal configuration shared across tests so model construction stays cheap.
    ///
    /// Values are chosen so that, with `first_k_dense_replace = 1` and
    /// `moe_layer_freq = 1`, layer 0 is dense and every later layer is MoE.
    fn tiny_config() -> DeepSeekV2Config {
        DeepSeekV2Config {
            vocab_size: 64,
            hidden_size: 32,
            intermediate_size: 64,
            num_hidden_layers: 2,
            num_attention_heads: 4,
            kv_lora_rank: 16,
            q_lora_rank: 32,
            qk_rope_head_dim: 8,
            qk_nope_head_dim: 8,
            v_head_dim: 8,
            n_routed_experts: 8,
            num_experts_per_tok: 2,
            n_shared_experts: 1,
            n_group: 2,
            topk_group: 1,
            first_k_dense_replace: 1,
            moe_layer_freq: 1,
            ..DeepSeekV2Config::default()
        }
    }

    /// Default config must match the published DeepSeek-V2 (236B) hyperparameters.
    #[test]
    fn test_config_defaults() {
        let cfg = DeepSeekV2Config::default();
        assert_eq!(cfg.vocab_size, 102400);
        assert_eq!(cfg.hidden_size, 5120);
        assert_eq!(cfg.intermediate_size, 12288);
        assert_eq!(cfg.num_hidden_layers, 60);
        assert_eq!(cfg.num_attention_heads, 128);
        assert_eq!(cfg.kv_lora_rank, 512);
        assert_eq!(cfg.q_lora_rank, 1536);
        assert_eq!(cfg.qk_rope_head_dim, 64);
        assert_eq!(cfg.qk_nope_head_dim, 128);
        assert_eq!(cfg.v_head_dim, 128);
        assert_eq!(cfg.n_routed_experts, 160);
        assert_eq!(cfg.num_experts_per_tok, 6);
        assert_eq!(cfg.n_shared_experts, 2);
        assert_eq!(cfg.n_group, 8);
        assert_eq!(cfg.topk_group, 3);
        assert!((cfg.aux_loss_alpha - 0.001).abs() < 1e-7);
        assert_eq!(cfg.first_k_dense_replace, 1);
        assert_eq!(cfg.moe_layer_freq, 1);
    }

    /// MLA projection sizes derived from the config must be internally consistent:
    /// query head dim is rope + nope parts, and the output projection input is
    /// `num_heads * v_head_dim`.
    #[test]
    fn test_mla_projection_dimensions() {
        let cfg = tiny_config();
        assert_eq!(
            cfg.qk_head_dim(),
            cfg.qk_rope_head_dim + cfg.qk_nope_head_dim
        );
        let q_b_out = cfg.num_attention_heads * cfg.qk_head_dim();
        assert_eq!(q_b_out, 4 * 16);
        let o_in = cfg.num_attention_heads * cfg.v_head_dim;
        assert_eq!(o_in, 4 * 8);
    }

    /// Dense-vs-MoE schedule. The cfg2 expectations (layer 3 MoE, layer 4 dense
    /// with first_k=3, freq=2) imply the schedule is computed relative to the
    /// first MoE layer, i.e. roughly `(idx - first_k) % freq == 0` — NOTE(review):
    /// confirm against `is_dense_layer`'s implementation in config.rs.
    #[test]
    fn test_dense_vs_moe_layer_selection() {
        let cfg = tiny_config();
        assert!(cfg.is_dense_layer(0), "layer 0 should be dense");
        assert!(!cfg.is_dense_layer(1), "layer 1 should be MoE");
        // Index 2 is past num_hidden_layers (= 2); presumably is_dense_layer is a
        // pure arithmetic predicate that does not bounds-check — TODO confirm.
        assert!(!cfg.is_dense_layer(2), "layer 2 should be MoE");
        let cfg2 = DeepSeekV2Config {
            first_k_dense_replace: 3,
            moe_layer_freq: 2,
            ..tiny_config()
        };
        assert!(cfg2.is_dense_layer(0));
        assert!(cfg2.is_dense_layer(1));
        assert!(cfg2.is_dense_layer(2));
        assert!(!cfg2.is_dense_layer(3));
        assert!(cfg2.is_dense_layer(4));
    }

    /// `validate()` must accept the tiny config and reject obviously broken
    /// field combinations.
    #[test]
    fn test_config_validation() {
        let mut cfg = tiny_config();
        assert!(cfg.validate().is_ok(), "valid config should pass");
        cfg.vocab_size = 0;
        assert!(cfg.validate().is_err(), "vocab_size=0 should fail");
        cfg.vocab_size = 64;
        cfg.kv_lora_rank = 0;
        assert!(cfg.validate().is_err(), "kv_lora_rank=0 should fail");
        cfg.kv_lora_rank = 16;
        cfg.num_experts_per_tok = cfg.n_routed_experts + 1;
        assert!(
            cfg.validate().is_err(),
            "num_experts_per_tok > n_routed_experts should fail"
        );
    }

    /// Smoke test: the causal-LM wrapper accepts a tensor input end to end.
    #[test]
    fn test_causal_lm_forward_mock() {
        use trustformers_core::tensor::Tensor;
        use trustformers_core::traits::Model;
        let cfg = tiny_config();
        let model = DeepSeekV2ForCausalLM::new(cfg).expect("model creation");
        let input = Tensor::from_vec(vec![1.0f32, 2.0, 3.0], &[3]).expect("tensor");
        let result = model.forward(input);
        assert!(result.is_ok(), "forward failed: {:?}", result.err());
    }

    /// MLA's latent KV cache must be strictly smaller than a standard MHA cache,
    /// for both the default (236B) and the tiny config.
    #[test]
    fn test_kv_cache_compression() {
        let cfg = DeepSeekV2Config::default();
        let mla_size = cfg.mla_kv_cache_per_token_per_layer();
        let mha_size = cfg.mha_kv_cache_per_token_per_layer();
        assert!(
            mla_size < mha_size,
            "MLA KV cache ({mla_size}) should be smaller than MHA ({mha_size})"
        );
        let ratio = cfg.kv_cache_compression_ratio();
        assert!(
            ratio < 0.5,
            "KV cache compression ratio {ratio} should be < 0.5"
        );
        assert!(ratio > 0.0, "compression ratio must be positive");
        let tiny = tiny_config();
        let tiny_mla = tiny.mla_kv_cache_per_token_per_layer();
        let tiny_mha = tiny.mha_kv_cache_per_token_per_layer();
        assert!(
            tiny_mla < tiny_mha,
            "tiny config: MLA ({tiny_mla}) must be < MHA ({tiny_mha})"
        );
    }

    /// Display impls and serde round-trip for the config enums.
    #[test]
    fn test_topk_method_display() {
        assert_eq!(
            TopKMethod::GroupLimitedGreedy.to_string(),
            "GroupLimitedGreedy"
        );
        assert_eq!(TopKMethod::Noaux.to_string(), "Noaux");
        assert_eq!(ActivationType::SiLU.to_string(), "silu");
        assert_eq!(ActivationType::GeLU.to_string(), "gelu");
        let json = serde_json::to_string(&TopKMethod::GroupLimitedGreedy).expect("serialize");
        let back: TopKMethod = serde_json::from_str(&json).expect("deserialize");
        assert_eq!(back, TopKMethod::GroupLimitedGreedy);
    }

    /// A freshly built MoE layer must instantiate exactly the configured number
    /// of routed and shared experts.
    #[test]
    fn test_moe_layer_expert_counts() {
        use trustformers_core::device::Device;
        let cfg = tiny_config();
        let moe = DeepSeekV2MoELayer::new(&cfg, Device::CPU).expect("moe creation");
        assert_eq!(moe.num_routed_experts(), cfg.n_routed_experts);
        assert_eq!(moe.num_shared_experts(), cfg.n_shared_experts);
    }

    /// Decoder layers must report the dense/MoE split implied by the config.
    #[test]
    fn test_decoder_layer_type() {
        use trustformers_core::device::Device;
        let cfg = tiny_config();
        let layer0 = DeepSeekV2DecoderLayer::new(&cfg, 0, Device::CPU).expect("layer0 creation");
        assert!(layer0.is_dense(), "layer 0 must be dense");
        let layer1 = DeepSeekV2DecoderLayer::new(&cfg, 1, Device::CPU).expect("layer1 creation");
        assert!(!layer1.is_dense(), "layer 1 must be MoE");
    }

    #[test]
    fn test_kv_lora_rank_default() {
        let cfg = DeepSeekV2Config::default();
        assert_eq!(cfg.kv_lora_rank, 512, "default kv_lora_rank must be 512");
    }

    #[test]
    fn test_q_lora_rank_default() {
        let cfg = DeepSeekV2Config::default();
        assert_eq!(cfg.q_lora_rank, 1536, "default q_lora_rank must be 1536");
    }

    /// Decoupled RoPE: the rotary and non-rotary query/key sub-dimensions are
    /// distinct in the reference 236B configuration.
    #[test]
    fn test_qk_rope_vs_nope_head_dim() {
        let cfg = DeepSeekV2Config::default();
        assert_ne!(
            cfg.qk_rope_head_dim, cfg.qk_nope_head_dim,
            "rope and nope head dims must differ for default 236B config"
        );
        assert_eq!(cfg.qk_rope_head_dim, 64);
        assert_eq!(cfg.qk_nope_head_dim, 128);
    }

    /// `qk_head_dim()` is the sum of the rope and nope parts, for both configs.
    #[test]
    fn test_qk_head_dim_sum() {
        let cfg = DeepSeekV2Config::default();
        assert_eq!(
            cfg.qk_head_dim(),
            cfg.qk_rope_head_dim + cfg.qk_nope_head_dim,
            "qk_head_dim must equal qk_rope + qk_nope"
        );
        let tiny = tiny_config();
        assert_eq!(
            tiny.qk_head_dim(),
            tiny.qk_rope_head_dim + tiny.qk_nope_head_dim
        );
    }

    /// The latent rank must be smaller than the expanded per-token value width,
    /// otherwise MLA would not compress anything.
    #[test]
    fn test_mla_kv_latent_smaller_than_expanded_kv() {
        let cfg = DeepSeekV2Config::default();
        let expanded_v = cfg.num_attention_heads * cfg.v_head_dim;
        assert!(
            cfg.kv_lora_rank < expanded_v,
            "kv_lora_rank ({}) must be smaller than num_heads*v_head_dim ({}) — compression",
            cfg.kv_lora_rank,
            expanded_v
        );
    }

    #[test]
    fn test_moe_num_routed_experts_default() {
        let cfg = DeepSeekV2Config::default();
        assert_eq!(cfg.n_routed_experts, 160, "default has 160 routed experts");
    }

    #[test]
    fn test_moe_num_experts_per_tok_default() {
        let cfg = DeepSeekV2Config::default();
        assert_eq!(
            cfg.num_experts_per_tok, 6,
            "default routes to 6 experts per token"
        );
        assert!(
            cfg.num_experts_per_tok < cfg.n_routed_experts,
            "experts_per_tok must be less than total routed experts"
        );
    }

    /// With first_k_dense_replace >= num_hidden_layers every layer is dense.
    #[test]
    fn test_dense_model_all_layers_dense() {
        let cfg = DeepSeekV2Config {
            first_k_dense_replace: 4,
            moe_layer_freq: 1,
            num_hidden_layers: 2,
            ..tiny_config()
        };
        for i in 0..cfg.num_hidden_layers {
            assert!(
                cfg.is_dense_layer(i),
                "layer {i} should be dense in all-dense variant"
            );
        }
    }

    #[test]
    fn test_shared_vs_routed_experts() {
        let cfg = DeepSeekV2Config::default();
        assert!(
            cfg.n_shared_experts > 0,
            "must have at least one shared expert"
        );
        assert!(
            cfg.n_routed_experts > cfg.n_shared_experts,
            "more routed than shared"
        );
    }

    /// The router must return exactly `num_experts_per_tok` indices and weights
    /// for a single-token hidden state.
    #[test]
    fn test_expert_router_output_shape() {
        use trustformers_core::{device::Device, tensor::Tensor};
        let cfg = tiny_config();
        let router = ExpertRouter::new(&cfg, Device::CPU);
        let hidden =
            Tensor::from_vec(vec![0.1f32; cfg.hidden_size], &[1, cfg.hidden_size]).expect("tensor");
        let result = router.route(&hidden).expect("router route");
        let (expert_indices, weights) = result;
        assert_eq!(
            expert_indices.len(),
            cfg.num_experts_per_tok,
            "router must select num_experts_per_tok experts"
        );
        assert_eq!(
            weights.len(),
            cfg.num_experts_per_tok,
            "routing weights count must match num_experts_per_tok"
        );
    }

    #[test]
    fn test_mla_kv_cache_per_token_ratio() {
        let cfg = tiny_config();
        let mla = cfg.mla_kv_cache_per_token_per_layer();
        let mha = cfg.mha_kv_cache_per_token_per_layer();
        assert!(
            mla < mha,
            "MLA ({mla}) must be smaller than MHA ({mha}) for tiny config"
        );
    }

    /// `forward_ids` returns a flat logits buffer of length seq_len * vocab_size.
    #[test]
    fn test_causal_lm_forward_ids_output_length() {
        let cfg = tiny_config();
        let model = DeepSeekV2ForCausalLM::new(cfg.clone()).expect("model");
        let logits = model.forward_ids(&[1u32, 2, 3]).expect("forward_ids");
        assert_eq!(
            logits.len(),
            3 * cfg.vocab_size,
            "forward_ids output must be seq*vocab"
        );
    }

    /// Empty token input must be rejected with `DeepSeekV2Error::EmptyInput`.
    #[test]
    fn test_causal_lm_empty_input_error() {
        let cfg = tiny_config();
        let model = DeepSeekV2ForCausalLM::new(cfg).expect("model");
        let result = model.forward_ids(&[]);
        assert!(result.is_err(), "empty input must return an error");
        // Bug fix: the original discarded the `matches!` result as a bare
        // statement, so a wrong error variant could never fail the test.
        assert!(
            matches!(result.unwrap_err(), DeepSeekV2Error::EmptyInput),
            "error must be DeepSeekV2Error::EmptyInput"
        );
    }

    /// `generate` must produce exactly the requested number of new tokens.
    #[test]
    fn test_causal_lm_generate_token_count() {
        let cfg = tiny_config();
        let model = DeepSeekV2ForCausalLM::new(cfg).expect("model");
        let result = model.generate(&[1u32, 2], 3);
        assert!(result.is_ok(), "generate failed: {:?}", result.err());
        assert_eq!(
            result.expect("generated").len(),
            3,
            "must generate exactly 3 tokens"
        );
    }

    #[test]
    fn test_error_display_invalid_config() {
        let s = format!(
            "{}",
            DeepSeekV2Error::InvalidConfig("bad param".to_string())
        );
        assert!(
            s.contains("bad param"),
            "InvalidConfig display must include message"
        );
    }

    #[test]
    fn test_error_display_empty_input() {
        let s = format!("{}", DeepSeekV2Error::EmptyInput);
        assert!(
            s.contains("empty") || s.contains("Empty"),
            "EmptyInput must mention empty"
        );
    }

    /// ShapeMismatch's Display must surface both the expected and actual shapes.
    #[test]
    fn test_error_display_shape_mismatch() {
        let err = DeepSeekV2Error::ShapeMismatch {
            expected: vec![3, 4],
            got: vec![5, 6],
        };
        let s = format!("{err}");
        assert!(
            s.contains("3") && s.contains("5"),
            "ShapeMismatch must include both shapes"
        );
    }

    /// The auxiliary load-balancing loss coefficient must be small and positive.
    #[test]
    fn test_aux_loss_alpha_small() {
        let cfg = DeepSeekV2Config::default();
        assert!(
            cfg.aux_loss_alpha < 0.01,
            "aux_loss_alpha ({}) should be small (< 0.01)",
            cfg.aux_loss_alpha
        );
        assert!(cfg.aux_loss_alpha > 0.0, "aux_loss_alpha must be positive");
    }
}