use serde::{Deserialize, Serialize};
use trustformers_core::{
errors::{invalid_config, Result},
traits::Config,
};
/// Hyper-parameter configuration for a Mamba (selective state-space) model.
///
/// Defaults correspond to the 130M-scale layout; named presets for the
/// published checkpoint sizes are provided on the inherent `impl`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MambaConfig {
/// Hidden (embedding) dimension; also the multiplier base for the
/// inner projection width (`d_inner = expand * d_model`).
pub d_model: usize,
/// State-space (SSM) state dimension.
pub d_state: usize,
/// Kernel width of the depthwise 1-D convolution.
pub d_conv: usize,
/// Expansion factor applied to `d_model` to obtain the inner width.
pub expand: usize,
/// Number of stacked Mamba blocks.
pub n_layer: usize,
/// Size of the token vocabulary.
pub vocab_size: usize,
/// Maximum sequence length the model is configured for.
pub max_position_embeddings: usize,
/// Epsilon used by RMSNorm for numerical stability.
pub rms_norm_eps: f32,
/// Std-dev of the weight-initialization distribution.
pub initializer_range: f32,
/// Whether pre-norm residual branches are rescaled at init.
pub rescale_prenorm_residual: bool,
/// Whether linear projections carry a bias term.
pub use_bias: bool,
/// Whether the depthwise convolution carries a bias term.
pub use_conv_bias: bool,
/// Rank of the Δt projection; `None` means "derive from `d_model`"
/// (see `get_dt_rank`, which uses `ceil(d_model / 16)`).
pub dt_rank: Option<usize>,
/// Lower clamp for the Δt initialization range.
pub dt_min: f32,
/// Upper clamp for the Δt initialization range.
pub dt_max: f32,
/// Δt init scheme name (e.g. "random") — consumed by model init code
/// outside this file; exact accepted values not visible here.
pub dt_init: String,
/// Scale factor applied during Δt initialization.
pub dt_scale: f32,
/// Floor applied to initialized Δt values.
pub dt_init_floor: f32,
/// Optional padding token id.
pub pad_token_id: Option<u32>,
/// Beginning-of-sequence token id.
pub bos_token_id: u32,
/// End-of-sequence token id.
pub eos_token_id: u32,
/// Whether input embeddings and the LM head share weights.
pub tie_word_embeddings: bool,
/// Architecture tag stored in serialized configs (e.g. "mamba").
pub model_type: String,
}
impl Default for MambaConfig {
fn default() -> Self {
Self {
d_model: 768,
d_state: 16,
d_conv: 4,
expand: 2,
n_layer: 24,
vocab_size: 50280,
max_position_embeddings: 2048,
rms_norm_eps: 1e-5,
initializer_range: 0.1,
rescale_prenorm_residual: true,
use_bias: false,
use_conv_bias: true,
dt_rank: None, dt_min: 0.001,
dt_max: 0.1,
dt_init: "random".to_string(),
dt_scale: 1.0,
dt_init_floor: 1e-4,
pad_token_id: Some(0),
bos_token_id: 0,
eos_token_id: 0,
tie_word_embeddings: true,
model_type: "mamba".to_string(),
}
}
}
impl MambaConfig {
    /// Shared builder for the published checkpoint sizes: only `d_model`
    /// and `n_layer` vary between them; vocabulary and context length are
    /// identical across the family.
    fn preset(d_model: usize, n_layer: usize) -> Self {
        Self {
            d_model,
            n_layer,
            vocab_size: 50280,
            max_position_embeddings: 2048,
            ..Self::default()
        }
    }

    /// Configuration for the 130M-parameter checkpoint.
    pub fn mamba_130m() -> Self {
        Self::preset(768, 24)
    }

    /// Configuration for the 370M-parameter checkpoint.
    pub fn mamba_370m() -> Self {
        Self::preset(1024, 48)
    }

    /// Configuration for the 790M-parameter checkpoint.
    pub fn mamba_790m() -> Self {
        Self::preset(1536, 48)
    }

    /// Configuration for the 1.4B-parameter checkpoint.
    pub fn mamba_1_4b() -> Self {
        Self::preset(2048, 48)
    }

    /// Configuration for the 2.8B-parameter checkpoint.
    pub fn mamba_2_8b() -> Self {
        Self::preset(2560, 64)
    }

    /// Effective Δt projection rank: the explicit `dt_rank` when set,
    /// otherwise `ceil(d_model / 16)`.
    pub fn get_dt_rank(&self) -> usize {
        match self.dt_rank {
            Some(rank) => rank,
            None => self.d_model.div_ceil(16),
        }
    }

    /// Inner (expanded) projection width: `expand * d_model`.
    pub fn get_d_inner(&self) -> usize {
        self.expand * self.d_model
    }

    /// Looks up a preset by checkpoint name. Accepts both the fully
    /// qualified hub id (`state-spaces/mamba-130m`) and the short form
    /// (`mamba-130m`); returns `None` for unrecognized names.
    pub fn from_pretrained_name(name: &str) -> Option<Self> {
        let short = name.strip_prefix("state-spaces/").unwrap_or(name);
        match short {
            "mamba-130m" => Some(Self::mamba_130m()),
            "mamba-370m" => Some(Self::mamba_370m()),
            "mamba-790m" => Some(Self::mamba_790m()),
            "mamba-1.4b" => Some(Self::mamba_1_4b()),
            "mamba-2.8b" => Some(Self::mamba_2_8b()),
            _ => None,
        }
    }
}
impl Config for MambaConfig {
    /// Architecture tag used for model-registry dispatch.
    fn architecture(&self) -> &'static str {
        "mamba"
    }

    /// Rejects configurations with zero-sized structural dimensions;
    /// returns the first violation found.
    fn validate(&self) -> Result<()> {
        let checks = [
            (self.d_model, "d_model must be greater than 0"),
            (self.n_layer, "n_layer must be greater than 0"),
            (self.vocab_size, "vocab_size must be greater than 0"),
        ];
        for (value, message) in checks {
            if value == 0 {
                return Err(invalid_config("config_field", message));
            }
        }
        Ok(())
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Convenience list of every preset constructor, used by the tests
    /// that sweep the whole checkpoint family.
    fn all_presets() -> [MambaConfig; 5] {
        [
            MambaConfig::mamba_130m(),
            MambaConfig::mamba_370m(),
            MambaConfig::mamba_790m(),
            MambaConfig::mamba_1_4b(),
            MambaConfig::mamba_2_8b(),
        ]
    }

    #[test]
    fn test_default_config() {
        let config = MambaConfig::default();
        assert_eq!(config.d_model, 768);
        assert_eq!(config.d_state, 16);
        assert_eq!(config.d_conv, 4);
        assert_eq!(config.expand, 2);
        assert_eq!(config.n_layer, 24);
    }

    #[test]
    fn test_dt_rank_computation() {
        // Auto rank: ceil(768 / 16) = 48.
        let config = MambaConfig::default();
        assert_eq!(config.get_dt_rank(), 48);
        // An explicit dt_rank must take precedence over the derived value.
        let config_with_dt_rank = MambaConfig {
            dt_rank: Some(32),
            ..Default::default()
        };
        assert_eq!(config_with_dt_rank.get_dt_rank(), 32);
    }

    #[test]
    fn test_d_inner_computation() {
        // d_inner = expand * d_model = 2 * 768.
        let config = MambaConfig::default();
        assert_eq!(config.get_d_inner(), 1536);
    }

    #[test]
    fn test_predefined_configs() {
        let config_130m = MambaConfig::mamba_130m();
        assert_eq!(config_130m.d_model, 768);
        assert_eq!(config_130m.n_layer, 24);
        let config_2_8b = MambaConfig::mamba_2_8b();
        assert_eq!(config_2_8b.d_model, 2560);
        assert_eq!(config_2_8b.n_layer, 64);
    }

    #[test]
    fn test_from_pretrained_name() {
        let config = MambaConfig::from_pretrained_name("state-spaces/mamba-130m");
        assert!(config.is_some());
        assert_eq!(config.expect("mamba-130m config").d_model, 768);
        let config = MambaConfig::from_pretrained_name("unknown-model");
        assert!(config.is_none());
    }

    #[test]
    fn test_config_trait() {
        let config = MambaConfig::default();
        assert_eq!(config.architecture(), "mamba");
        assert!(config.validate().is_ok());
    }

    #[test]
    fn test_mamba_2_8b_d_model() {
        let config = MambaConfig::mamba_2_8b();
        assert_eq!(config.d_model, 2560, "Mamba-2.8B d_model must be 2560");
    }

    #[test]
    fn test_mamba_2_8b_n_layers() {
        let config = MambaConfig::mamba_2_8b();
        assert_eq!(config.n_layer, 64, "Mamba-2.8B must have 64 layers");
    }

    #[test]
    fn test_mamba_2_8b_d_state() {
        let config = MambaConfig::mamba_2_8b();
        assert_eq!(config.d_state, 16, "Mamba-2.8B d_state must be 16");
    }

    #[test]
    fn test_mamba_2_8b_expand() {
        let config = MambaConfig::mamba_2_8b();
        assert_eq!(config.expand, 2, "Mamba-2.8B expand must be 2");
    }

    #[test]
    fn test_dt_rank_auto_130m() {
        let config = MambaConfig::mamba_130m();
        assert_eq!(config.get_dt_rank(), 48);
    }

    #[test]
    fn test_dt_rank_auto_370m() {
        let config = MambaConfig::mamba_370m();
        assert_eq!(config.get_dt_rank(), 64);
    }

    #[test]
    fn test_dt_rank_auto_790m() {
        let config = MambaConfig::mamba_790m();
        assert_eq!(config.get_dt_rank(), 96);
    }

    #[test]
    fn test_dt_rank_auto_1_4b() {
        let config = MambaConfig::mamba_1_4b();
        assert_eq!(config.get_dt_rank(), 128);
    }

    #[test]
    fn test_dt_rank_auto_2_8b() {
        let config = MambaConfig::mamba_2_8b();
        assert_eq!(config.get_dt_rank(), 160);
    }

    #[test]
    fn test_dt_rank_explicit_overrides_auto() {
        let config = MambaConfig {
            d_model: 768,
            dt_rank: Some(100),
            ..Default::default()
        };
        assert_eq!(
            config.get_dt_rank(),
            100,
            "Explicit dt_rank should override auto"
        );
    }

    #[test]
    fn test_d_inner_2_8b() {
        let config = MambaConfig::mamba_2_8b();
        assert_eq!(
            config.get_d_inner(),
            config.expand * config.d_model,
            "d_inner = expand * d_model"
        );
    }

    #[test]
    fn test_d_inner_370m() {
        let config = MambaConfig::mamba_370m();
        assert_eq!(config.get_d_inner(), 2048);
    }

    #[test]
    fn test_d_conv_default_is_4() {
        let config = MambaConfig::default();
        assert_eq!(config.d_conv, 4);
    }

    #[test]
    fn test_d_conv_preserved_in_presets() {
        // FIX: the previous version built configs from Default with only
        // d_model overridden, so it never exercised a preset constructor.
        // Iterate the actual presets instead.
        for config in all_presets() {
            assert_eq!(
                config.d_conv, 4,
                "d_conv must be 4 for d_model={}",
                config.d_model
            );
        }
    }

    #[test]
    fn test_use_conv_bias_default_true() {
        let config = MambaConfig::default();
        assert!(
            config.use_conv_bias,
            "use_conv_bias must be true by default"
        );
    }

    #[test]
    fn test_use_bias_default_false() {
        let config = MambaConfig::default();
        assert!(
            !config.use_bias,
            "use_bias must be false by default (no linear bias)"
        );
    }

    #[test]
    fn test_validation_fails_zero_d_model() {
        let config = MambaConfig {
            d_model: 0,
            ..Default::default()
        };
        assert!(config.validate().is_err());
    }

    #[test]
    fn test_validation_fails_zero_n_layer() {
        let config = MambaConfig {
            n_layer: 0,
            ..Default::default()
        };
        assert!(config.validate().is_err());
    }

    #[test]
    fn test_validation_fails_zero_vocab_size() {
        let config = MambaConfig {
            vocab_size: 0,
            ..Default::default()
        };
        assert!(config.validate().is_err());
    }

    #[test]
    fn test_all_presets_validate() {
        for config in all_presets() {
            assert!(
                config.validate().is_ok(),
                "Preset with d_model={} failed validation",
                config.d_model
            );
        }
    }

    #[test]
    fn test_from_pretrained_short_name_2_8b() {
        let config = MambaConfig::from_pretrained_name("mamba-2.8b");
        assert!(config.is_some());
        assert_eq!(config.expect("mamba-2.8b config").d_model, 2560);
    }

    #[test]
    fn test_from_pretrained_short_name_790m() {
        let config = MambaConfig::from_pretrained_name("mamba-790m");
        assert!(config.is_some());
        assert_eq!(config.expect("mamba-790m config").d_model, 1536);
    }

    #[test]
    fn test_d_model_increases_with_model_size() {
        let configs = all_presets();
        for pair in configs.windows(2) {
            assert!(
                pair[1].d_model >= pair[0].d_model,
                "d_model must be non-decreasing with model size"
            );
        }
    }

    #[test]
    fn test_config_serialization_roundtrip() {
        let config = MambaConfig::mamba_2_8b();
        let json = serde_json::to_string(&config).expect("serialize MambaConfig");
        let restored: MambaConfig = serde_json::from_str(&json).expect("deserialize MambaConfig");
        assert_eq!(config.d_model, restored.d_model);
        assert_eq!(config.n_layer, restored.n_layer);
        assert_eq!(config.d_state, restored.d_state);
    }

    #[test]
    fn test_config_clone_preserves_fields() {
        let config = MambaConfig::mamba_2_8b();
        let cloned = config.clone();
        assert_eq!(config.d_model, cloned.d_model);
        assert_eq!(config.expand, cloned.expand);
        assert_eq!(config.d_conv, cloned.d_conv);
    }
}