use serde::{Deserialize, Serialize};
use trustformers_core::traits::Config;
/// Configuration for a Mixtral model.
///
/// The struct is (de)serializable through serde. Structural invariants between
/// the fields are checked by the `Config::validate` implementation, not by the
/// constructor, so a freshly deserialized value should be validated before use.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MixtralConfig {
    /// Hidden size; `head_dim()` divides this by `num_attention_heads`.
    pub hidden_size: usize,
    /// Intermediate size (presumably the expert feed-forward width — confirm
    /// against the model code; it is not used in this file).
    pub intermediate_size: usize,
    /// Number of hidden layers.
    pub num_hidden_layers: usize,
    /// Number of attention heads; `validate` requires it to divide `hidden_size`.
    pub num_attention_heads: usize,
    /// Number of key/value heads; `validate` requires it to divide
    /// `num_attention_heads`.
    pub num_key_value_heads: usize,
    /// Number of local experts; `validate` rejects zero.
    pub num_local_experts: usize,
    /// Experts per token; `validate` requires a value in
    /// `[1, num_local_experts]`.
    pub num_experts_per_tok: usize,
    /// Optional sliding-window size; `None` in the default configuration.
    pub sliding_window: Option<usize>,
    /// Vocabulary size.
    pub vocab_size: usize,
    /// Maximum number of position embeddings.
    pub max_position_embeddings: usize,
    /// RoPE theta (presumably the rotary-embedding base — confirm against the
    /// model code).
    pub rope_theta: f32,
    /// Epsilon for RMS normalization.
    pub rms_norm_eps: f64,
    /// Name of the hidden activation; `"silu"` in the default configuration.
    pub hidden_act: String,
    /// Router auxiliary-loss coefficient; `0.02` in the default configuration.
    pub router_aux_loss_coef: f32,
    /// Model type identifier; `"mixtral"` in the default configuration.
    pub model_type: String,
}
impl Default for MixtralConfig {
fn default() -> Self {
Self {
hidden_size: 4096,
intermediate_size: 14336,
num_hidden_layers: 32,
num_attention_heads: 32,
num_key_value_heads: 8,
num_local_experts: 8,
num_experts_per_tok: 2,
sliding_window: None,
vocab_size: 32000,
max_position_embeddings: 32768,
rope_theta: 1_000_000.0,
rms_norm_eps: 1e-5,
hidden_act: "silu".to_string(),
router_aux_loss_coef: 0.02,
model_type: "mixtral".to_string(),
}
}
}
impl Config for MixtralConfig {
    /// Checks the structural invariants of the configuration.
    ///
    /// # Errors
    ///
    /// Returns an `invalid_config` error naming the offending field when:
    /// - `num_attention_heads` or `num_key_value_heads` is zero,
    /// - `hidden_size` is not a multiple of `num_attention_heads`,
    /// - `num_attention_heads` is not a multiple of `num_key_value_heads`,
    /// - `num_local_experts` is zero,
    /// - `num_experts_per_tok` is outside `[1, num_local_experts]`.
    fn validate(&self) -> trustformers_core::errors::Result<()> {
        // `0.is_multiple_of(0)` is `true`, so the divisibility checks below
        // cannot catch a zero head count when the dividend is also zero.
        // Reject zero explicitly so that `head_dim()` and `num_query_groups()`
        // never divide by zero on a configuration that passed validation.
        if self.num_attention_heads == 0 {
            return Err(trustformers_core::errors::invalid_config(
                "num_attention_heads",
                "num_attention_heads must be greater than 0",
            ));
        }
        if self.num_key_value_heads == 0 {
            return Err(trustformers_core::errors::invalid_config(
                "num_key_value_heads",
                "num_key_value_heads must be greater than 0",
            ));
        }
        if !self.hidden_size.is_multiple_of(self.num_attention_heads) {
            return Err(trustformers_core::errors::invalid_config(
                "hidden_size",
                "hidden_size must be divisible by num_attention_heads",
            ));
        }
        if !self.num_attention_heads.is_multiple_of(self.num_key_value_heads) {
            return Err(trustformers_core::errors::invalid_config(
                "num_attention_heads",
                "num_attention_heads must be divisible by num_key_value_heads",
            ));
        }
        if self.num_local_experts == 0 {
            return Err(trustformers_core::errors::invalid_config(
                "num_local_experts",
                "num_local_experts must be greater than 0",
            ));
        }
        if self.num_experts_per_tok == 0 || self.num_experts_per_tok > self.num_local_experts {
            return Err(trustformers_core::errors::invalid_config(
                "num_experts_per_tok",
                "num_experts_per_tok must be in [1, num_local_experts]",
            ));
        }
        Ok(())
    }

    /// Returns the architecture label, always `"Mixtral"`.
    fn architecture(&self) -> &'static str {
        "Mixtral"
    }
}
impl MixtralConfig {
pub fn mixtral_8x7b() -> Self {
Self::default()
}
pub fn mixtral_8x22b() -> Self {
Self {
hidden_size: 6144,
intermediate_size: 16384,
num_hidden_layers: 56,
num_attention_heads: 48,
num_key_value_heads: 8,
num_local_experts: 8,
num_experts_per_tok: 2,
vocab_size: 32768,
max_position_embeddings: 65536,
..Self::default()
}
}
pub fn head_dim(&self) -> usize {
self.hidden_size / self.num_attention_heads
}
pub fn num_query_groups(&self) -> usize {
self.num_attention_heads / self.num_key_value_heads
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use trustformers_core::traits::Config;

    /// True when `validate` rejects the given configuration.
    fn is_rejected(cfg: &MixtralConfig) -> bool {
        cfg.validate().is_err()
    }

    /// True when `validate` accepts the given configuration.
    fn is_accepted(cfg: &MixtralConfig) -> bool {
        cfg.validate().is_ok()
    }

    // --- presets and defaults -------------------------------------------

    #[test]
    fn test_default_is_8x7b() {
        let MixtralConfig {
            hidden_size,
            num_local_experts,
            num_experts_per_tok,
            ..
        } = MixtralConfig::default();
        assert_eq!(hidden_size, 4096);
        assert_eq!(num_local_experts, 8);
        assert_eq!(num_experts_per_tok, 2);
    }

    #[test]
    fn test_8x7b_preset() {
        let MixtralConfig {
            hidden_size,
            intermediate_size,
            num_hidden_layers,
            num_attention_heads,
            num_key_value_heads,
            vocab_size,
            sliding_window,
            ..
        } = MixtralConfig::mixtral_8x7b();
        assert_eq!(hidden_size, 4096);
        assert_eq!(intermediate_size, 14336);
        assert_eq!(num_hidden_layers, 32);
        assert_eq!(num_attention_heads, 32);
        assert_eq!(num_key_value_heads, 8);
        assert_eq!(vocab_size, 32000);
        assert!(sliding_window.is_none());
    }

    #[test]
    fn test_8x22b_preset() {
        let MixtralConfig {
            hidden_size,
            num_hidden_layers,
            num_attention_heads,
            vocab_size,
            max_position_embeddings,
            ..
        } = MixtralConfig::mixtral_8x22b();
        assert_eq!(hidden_size, 6144);
        assert_eq!(num_hidden_layers, 56);
        assert_eq!(num_attention_heads, 48);
        assert_eq!(vocab_size, 32768);
        assert_eq!(max_position_embeddings, 65536);
    }

    // --- derived quantities ---------------------------------------------

    #[test]
    fn test_head_dim_8x7b() {
        let preset = MixtralConfig::mixtral_8x7b();
        assert_eq!(preset.head_dim(), 128);
    }

    #[test]
    fn test_head_dim_8x22b() {
        let preset = MixtralConfig::mixtral_8x22b();
        assert_eq!(preset.head_dim(), 128);
    }

    #[test]
    fn test_num_query_groups_8x7b() {
        let preset = MixtralConfig::mixtral_8x7b();
        assert_eq!(preset.num_query_groups(), 4);
    }

    // --- labels -----------------------------------------------------------

    #[test]
    fn test_model_type_is_mixtral() {
        let cfg = MixtralConfig::default();
        assert_eq!(cfg.model_type, "mixtral");
    }

    #[test]
    fn test_architecture_label() {
        let cfg = MixtralConfig::default();
        assert_eq!(cfg.architecture(), "Mixtral");
    }

    // --- validation -------------------------------------------------------

    #[test]
    fn test_validate_8x7b_ok() {
        assert!(is_accepted(&MixtralConfig::mixtral_8x7b()));
    }

    #[test]
    fn test_validate_8x22b_ok() {
        assert!(is_accepted(&MixtralConfig::mixtral_8x22b()));
    }

    #[test]
    fn test_validate_hidden_not_divisible_by_heads() {
        let mut cfg = MixtralConfig::default();
        cfg.hidden_size = 4097;
        assert!(is_rejected(&cfg));
    }

    #[test]
    fn test_validate_heads_not_divisible_by_kv_heads() {
        let mut cfg = MixtralConfig::default();
        cfg.num_key_value_heads = 7;
        assert!(is_rejected(&cfg));
    }

    #[test]
    fn test_validate_zero_local_experts() {
        let mut cfg = MixtralConfig::default();
        cfg.num_local_experts = 0;
        assert!(is_rejected(&cfg));
    }

    #[test]
    fn test_validate_experts_per_tok_exceeds_local() {
        let base = MixtralConfig::default();
        // One more than the number of available experts is out of range.
        let too_many = base.num_local_experts + 1;
        let cfg = MixtralConfig {
            num_experts_per_tok: too_many,
            ..base
        };
        assert!(is_rejected(&cfg));
    }

    #[test]
    fn test_validate_zero_experts_per_tok() {
        let mut cfg = MixtralConfig::default();
        cfg.num_experts_per_tok = 0;
        assert!(is_rejected(&cfg));
    }

    // --- miscellaneous ------------------------------------------------------

    #[test]
    fn test_sliding_window_none_by_default() {
        let MixtralConfig { sliding_window, .. } = MixtralConfig::default();
        assert!(sliding_window.is_none());
    }

    #[test]
    fn test_clone_preserves_fields() {
        let original = MixtralConfig::mixtral_8x7b();
        let copy = original.clone();
        assert_eq!(original.hidden_size, copy.hidden_size);
        assert_eq!(original.num_local_experts, copy.num_local_experts);
        assert_eq!(original.model_type, copy.model_type);
    }

    /// Drives a fixed-seed linear congruential generator to pick five
    /// in-range top-k values and checks that each one validates.
    #[test]
    fn test_lcg_varied_expert_topk() {
        const LCG_MUL: u64 = 6364136223846793005;
        const LCG_INC: u64 = 1442695040888963407;

        let n_experts = 8usize;
        let mut state = 23u64;
        for _ in 0..5 {
            state = state.wrapping_mul(LCG_MUL).wrapping_add(LCG_INC);
            // The remainder is in 0..n_experts, so topk lands in 1..=n_experts.
            let topk = (state % n_experts as u64) as usize + 1;
            let cfg = MixtralConfig {
                num_local_experts: n_experts,
                num_experts_per_tok: topk,
                ..MixtralConfig::default()
            };
            assert!(cfg.validate().is_ok(), "topk={topk} failed");
        }
    }
}