/// Hyper-parameters for a Jamba-style hybrid model mixing attention,
/// Mamba (SSM), and mixture-of-experts layers.
///
/// The `*_layer_offset` / `*_layer_period` fields drive the layer-pattern
/// predicates [`JambaConfig::is_attention_layer`] and
/// [`JambaConfig::is_moe_layer`].
#[derive(Debug, Clone)]
pub struct JambaConfig {
    /// Size of the token vocabulary.
    pub vocab_size: usize,
    /// Width of the residual stream / hidden representation.
    pub hidden_size: usize,
    /// Width of the feed-forward (and, presumably, per-expert) projection.
    pub intermediate_size: usize,
    /// Total number of layers in the stack.
    pub num_hidden_layers: usize,
    /// Number of attention (query) heads; `head_dim()` is
    /// `hidden_size / num_attention_heads` (integer division).
    pub num_attention_heads: usize,
    /// Number of key/value heads (grouped-query attention when smaller
    /// than `num_attention_heads`).
    pub num_key_value_heads: usize,
    /// Index of the first attention layer (see `is_attention_layer`).
    pub attn_layer_offset: usize,
    /// Spacing between attention layers after the offset.
    pub attn_layer_period: usize,
    /// Offset of the MoE pattern; applied on top of the attention check
    /// (see `is_moe_layer`).
    pub expert_layer_offset: usize,
    /// Period of the MoE pattern.
    pub expert_layer_period: usize,
    /// Number of experts available in each MoE layer.
    pub num_experts: usize,
    /// Number of experts routed to per token.
    pub num_experts_per_tok: usize,
    /// Mamba SSM state size (`d_state`).
    pub mamba_d_state: usize,
    /// Mamba convolution kernel width (`d_conv`).
    pub mamba_d_conv: usize,
    /// Expansion factor for the Mamba inner dimension
    /// (see `mamba_inner_dim`).
    pub mamba_expand: usize,
    /// Epsilon used by RMS normalization.
    pub rms_norm_eps: f64,
    /// Base frequency for rotary position embeddings.
    pub rope_theta: f64,
}

impl JambaConfig {
    /// Preset for the 1.5B-scale Jamba configuration.
    pub fn jamba_1_5b() -> Self {
        Self {
            // Embedding / width parameters.
            vocab_size: 65536,
            hidden_size: 4096,
            intermediate_size: 14336,
            num_hidden_layers: 32,
            // Attention parameters (GQA: 32 query heads share 8 KV heads).
            num_attention_heads: 32,
            num_key_value_heads: 8,
            rope_theta: 10000.0,
            // Layer pattern: attention at 3, 11, 19, 27; MoE on odd layers
            // (intersected with the attention pattern by `is_moe_layer`).
            attn_layer_offset: 3,
            attn_layer_period: 8,
            expert_layer_offset: 1,
            expert_layer_period: 2,
            // MoE routing.
            num_experts: 16,
            num_experts_per_tok: 2,
            // Mamba block.
            mamba_d_state: 16,
            mamba_d_conv: 4,
            mamba_expand: 2,
            rms_norm_eps: 1e-5,
        }
    }

    /// Tiny preset intended for unit tests: same layer pattern as the
    /// full model but with all widths shrunk.
    pub fn small_test() -> Self {
        Self {
            vocab_size: 256,
            hidden_size: 64,
            intermediate_size: 128,
            num_hidden_layers: 8,
            num_attention_heads: 4,
            num_key_value_heads: 2,
            rope_theta: 10000.0,
            attn_layer_offset: 3,
            attn_layer_period: 8,
            expert_layer_offset: 1,
            expert_layer_period: 2,
            num_experts: 4,
            num_experts_per_tok: 2,
            mamba_d_state: 8,
            mamba_d_conv: 4,
            mamba_expand: 2,
            rms_norm_eps: 1e-5,
        }
    }

    /// True when `layer_idx` falls on the attention pattern:
    /// `layer_idx >= attn_layer_offset` and the distance from the offset
    /// is a multiple of `attn_layer_period`.
    pub fn is_attention_layer(&self, layer_idx: usize) -> bool {
        match layer_idx.checked_sub(self.attn_layer_offset) {
            // `checked_sub` is `None` exactly when layer_idx < offset.
            None => false,
            Some(delta) => delta.is_multiple_of(self.attn_layer_period),
        }
    }

    /// True when `layer_idx` is an MoE layer. Note this is deliberately a
    /// subset of the attention layers: the expert pattern is intersected
    /// with `is_attention_layer`.
    pub fn is_moe_layer(&self, layer_idx: usize) -> bool {
        if !self.is_attention_layer(layer_idx) {
            return false;
        }
        layer_idx
            .checked_sub(self.expert_layer_offset)
            .is_some_and(|delta| delta.is_multiple_of(self.expert_layer_period))
    }

    /// Per-head width: `hidden_size / num_attention_heads` (integer division).
    pub fn head_dim(&self) -> usize {
        self.hidden_size / self.num_attention_heads
    }

    /// Inner width of the Mamba block: `hidden_size * mamba_expand`.
    pub fn mamba_inner_dim(&self) -> usize {
        self.hidden_size * self.mamba_expand
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for the full-size preset exercised by most tests.
    fn full() -> JambaConfig {
        JambaConfig::jamba_1_5b()
    }

    #[test]
    fn test_jamba_1_5b_vocab_size() {
        assert_eq!(full().vocab_size, 65536);
    }

    #[test]
    fn test_jamba_1_5b_hidden_size() {
        assert_eq!(full().hidden_size, 4096);
    }

    #[test]
    fn test_jamba_1_5b_num_hidden_layers() {
        assert_eq!(full().num_hidden_layers, 32);
    }

    #[test]
    fn test_jamba_1_5b_num_attention_heads() {
        assert_eq!(full().num_attention_heads, 32);
    }

    #[test]
    fn test_jamba_1_5b_num_key_value_heads() {
        assert_eq!(full().num_key_value_heads, 8);
    }

    #[test]
    fn test_jamba_1_5b_moe_params() {
        let config = full();
        assert_eq!(config.num_experts, 16);
        assert_eq!(config.num_experts_per_tok, 2);
    }

    #[test]
    fn test_jamba_1_5b_attn_layer_pattern() {
        let config = full();
        assert_eq!(config.attn_layer_offset, 3);
        assert_eq!(config.attn_layer_period, 8);
    }

    #[test]
    fn test_jamba_1_5b_expert_layer_pattern() {
        let config = full();
        assert_eq!(config.expert_layer_offset, 1);
        assert_eq!(config.expert_layer_period, 2);
    }

    #[test]
    fn test_jamba_small_test_config() {
        let config = JambaConfig::small_test();
        assert_eq!(config.vocab_size, 256);
        assert_eq!(config.hidden_size, 64);
        assert_eq!(config.num_hidden_layers, 8);
    }

    #[test]
    fn test_jamba_is_attention_layer_true() {
        // Expected attention layers: offset 3, then every 8th layer.
        let config = full();
        for idx in [3, 11, 19, 27] {
            assert!(config.is_attention_layer(idx));
        }
    }

    #[test]
    fn test_jamba_is_attention_layer_false() {
        let config = full();
        for idx in [0, 1, 4] {
            assert!(!config.is_attention_layer(idx));
        }
    }

    #[test]
    fn test_jamba_is_moe_layer() {
        assert!(full().is_moe_layer(3));
    }

    #[test]
    fn test_jamba_is_moe_layer_false_non_attn() {
        // Layer 5 matches the expert period but not the attention
        // pattern, so it must not count as MoE.
        assert!(!full().is_moe_layer(5));
    }

    #[test]
    fn test_jamba_head_dim() {
        assert_eq!(full().head_dim(), 4096 / 32);
    }

    #[test]
    fn test_jamba_mamba_inner_dim() {
        let config = full();
        assert_eq!(config.mamba_inner_dim(), config.hidden_size * config.mamba_expand);
        assert_eq!(config.mamba_inner_dim(), 8192);
    }

    #[test]
    fn test_jamba_mamba_d_state() {
        assert_eq!(full().mamba_d_state, 16);
    }

    #[test]
    fn test_jamba_mamba_d_conv() {
        assert_eq!(full().mamba_d_conv, 4);
    }

    #[test]
    fn test_jamba_mamba_expand() {
        assert_eq!(full().mamba_expand, 2);
    }

    #[test]
    fn test_jamba_rms_norm_eps() {
        assert!(full().rms_norm_eps > 0.0);
    }

    #[test]
    fn test_jamba_rope_theta() {
        assert!((full().rope_theta - 10000.0).abs() < 1.0);
    }

    #[test]
    fn test_jamba_lcg_values_in_range() {
        // One step of a 64-bit LCG seeded at 42; the reduced sample must
        // land in [0, 1).
        let state = 42u64
            .wrapping_mul(6364136223846793005)
            .wrapping_add(1442695040888963407);
        let sample = (state % 1000) as f32 / 1000.0;
        assert!(sample >= 0.0 && sample < 1.0);
    }

    #[test]
    fn test_jamba_small_head_dim() {
        assert_eq!(JambaConfig::small_test().head_dim(), 64 / 4);
    }
}