use serde::{Deserialize, Serialize};
use trustformers_core::errors::invalid_config;
use trustformers_core::traits::Config;
use crate::nemotron::tasks::NemotronError;
/// Normalization variant selected by a [`NemotronConfig`] through its `norm_type` field.
///
/// `Default` is derived and yields [`NormType::RmsNorm`], the variant marked `#[default]`.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
pub enum NormType {
    /// RMS normalization; this is the default variant.
    #[default]
    RmsNorm,
    /// Layer normalization.
    LayerNorm,
}
/// Hyper-parameters of a Nemotron model, serializable and deserializable through serde.
///
/// Constraints mentioned on individual fields are the ones enforced by the inherent
/// `NemotronConfig::validate`; fields without such a note are not checked there.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NemotronConfig {
/// Vocabulary size; must be > 0.
pub vocab_size: usize,
/// Hidden size; must be > 0.
pub hidden_size: usize,
/// Intermediate size (presumably the MLP width; it is not read by the code in this file).
pub intermediate_size: usize,
/// Number of hidden layers.
pub num_hidden_layers: usize,
/// Number of attention heads; must be > 0 and a multiple of `num_key_value_heads`.
pub num_attention_heads: usize,
/// Number of key/value heads; must be > 0.
pub num_key_value_heads: usize,
/// Per-head dimension; must be > 0. `rotary_dim` scales it by `partial_rotary_factor`.
pub head_dim: usize,
/// Maximum number of position embeddings.
pub max_position_embeddings: usize,
/// Epsilon for the normalization layers; must be positive.
pub rms_norm_eps: f64,
/// RoPE theta value; must be positive.
pub rope_theta: f64,
/// Fraction of `head_dim` reported by `rotary_dim`; must lie in `[0, 1]`.
pub partial_rotary_factor: f32,
/// Activation name; `"relu2"` in the default and in both presets.
pub hidden_act: String,
/// Presumably whether input and output embeddings share weights (TODO confirm against the model code).
pub tie_word_embeddings: bool,
/// Normalization variant to use.
pub norm_type: NormType,
/// Presumably whether attention projections carry a bias term; `false` in all presets here.
pub attention_bias: bool,
/// Presumably whether MLP projections carry a bias term; `false` in all presets here.
pub mlp_bias: bool,
}
impl Default for NemotronConfig {
    /// Baseline configuration: 32 layers, 48 attention heads of width 128 (hidden size 6144),
    /// 8 key/value heads, a 256000-entry vocabulary and RMS normalization without biases.
    fn default() -> Self {
        // The derived sizes are written as products so their relationship stays visible.
        let head_dim = 128;
        let num_attention_heads = 48;
        let hidden_size = num_attention_heads * head_dim; // 6144
        let intermediate_size = 4 * hidden_size; // 24576

        Self {
            vocab_size: 256_000,
            hidden_size,
            intermediate_size,
            num_hidden_layers: 32,
            num_attention_heads,
            num_key_value_heads: 8,
            head_dim,
            max_position_embeddings: 4096,
            rms_norm_eps: 1e-5,
            rope_theta: 10_000.0,
            partial_rotary_factor: 0.5,
            hidden_act: String::from("relu2"),
            tie_word_embeddings: false,
            norm_type: NormType::RmsNorm,
            attention_bias: false,
            mlp_bias: false,
        }
    }
}
impl NemotronConfig {
    /// Returns `head_dim * partial_rotary_factor`, truncated toward zero.
    pub fn rotary_dim(&self) -> usize {
        let scaled = self.head_dim as f32 * self.partial_rotary_factor;
        // Float-to-integer `as` casts truncate and saturate (NaN becomes 0), so this never panics.
        scaled as usize
    }

    /// Checks the configuration for values that cannot describe a usable model.
    ///
    /// Checks run in a fixed order and the first violated one is reported.
    ///
    /// # Errors
    ///
    /// Returns [`NemotronError::InvalidConfig`] with a description of the first violation when:
    /// - `vocab_size`, `hidden_size`, `num_attention_heads`, `num_key_value_heads` or `head_dim`
    ///   is zero,
    /// - `num_attention_heads` is not a multiple of `num_key_value_heads`,
    /// - `partial_rotary_factor` is outside `[0, 1]` (NaN included),
    /// - `rms_norm_eps` or `rope_theta` is zero, negative or NaN.
    pub fn validate(&self) -> std::result::Result<(), NemotronError> {
        // Every condition is side-effect free and cannot panic (`is_multiple_of` tolerates a zero
        // divisor), so evaluating all of them up front is equivalent to checking one by one.
        let checks = [
            (self.vocab_size == 0, "vocab_size must be > 0"),
            (self.hidden_size == 0, "hidden_size must be > 0"),
            (self.num_attention_heads == 0, "num_attention_heads must be > 0"),
            (self.num_key_value_heads == 0, "num_key_value_heads must be > 0"),
            (
                !self.num_attention_heads.is_multiple_of(self.num_key_value_heads),
                "num_attention_heads must be divisible by num_key_value_heads",
            ),
            (self.head_dim == 0, "head_dim must be > 0"),
            (
                // A range `contains` is false for NaN, so NaN is rejected here as well.
                !(0.0..=1.0).contains(&self.partial_rotary_factor),
                "partial_rotary_factor must be in [0, 1]",
            ),
            (
                // `NaN <= 0.0` is false, so NaN has to be rejected explicitly.
                self.rms_norm_eps.is_nan() || self.rms_norm_eps <= 0.0,
                "rms_norm_eps must be positive",
            ),
            (
                self.rope_theta.is_nan() || self.rope_theta <= 0.0,
                "rope_theta must be positive",
            ),
        ];

        for (violated, message) in checks {
            if violated {
                return Err(NemotronError::InvalidConfig(message.to_string()));
            }
        }
        Ok(())
    }

    /// Preset with 96 layers, hidden size 18432 and 96 attention heads of width 192.
    pub fn nemotron_4_340b() -> Self {
        Self {
            vocab_size: 256_000,
            hidden_size: 18_432,
            intermediate_size: 73_728,
            num_hidden_layers: 96,
            num_attention_heads: 96,
            num_key_value_heads: 8,
            head_dim: 192,
            max_position_embeddings: 4096,
            rms_norm_eps: 1e-5,
            rope_theta: 10_000.0,
            partial_rotary_factor: 0.5,
            hidden_act: String::from("relu2"),
            tie_word_embeddings: false,
            norm_type: NormType::RmsNorm,
            attention_bias: false,
            mlp_bias: false,
        }
    }

    /// Preset with 40 layers, hidden size 6144 and 48 attention heads of width 128.
    pub fn nemotron_4_22b() -> Self {
        Self {
            vocab_size: 256_000,
            hidden_size: 6144,
            intermediate_size: 24_576,
            num_hidden_layers: 40,
            num_attention_heads: 48,
            num_key_value_heads: 8,
            head_dim: 128,
            max_position_embeddings: 4096,
            rms_norm_eps: 1e-5,
            rope_theta: 10_000.0,
            partial_rotary_factor: 0.5,
            hidden_act: String::from("relu2"),
            tie_word_embeddings: false,
            norm_type: NormType::RmsNorm,
            attention_bias: false,
            mlp_bias: false,
        }
    }
}
impl Config for NemotronConfig {
    /// Trait-level validation, reporting the first violated constraint through `invalid_config`.
    ///
    /// The first three checks keep their original order, so a configuration that was already
    /// rejected is still rejected with the same message. The remaining checks close gaps that
    /// previously let degenerate configurations through, for example zero attention heads
    /// together with zero key/value heads (`0.is_multiple_of(0)` is `true`), a zero `head_dim`,
    /// an out-of-range rotary factor, or a non-positive or NaN `rms_norm_eps` / `rope_theta`.
    ///
    /// # Errors
    ///
    /// Returns the error built by `invalid_config("config_field", ..)` for the first violation.
    fn validate(&self) -> trustformers_core::errors::Result<()> {
        // All conditions are side-effect free and cannot panic, so they can be evaluated eagerly.
        let checks = [
            (self.hidden_size == 0, "hidden_size must be > 0"),
            (
                !self.num_attention_heads.is_multiple_of(self.num_key_value_heads),
                "num_attention_heads must be divisible by num_key_value_heads",
            ),
            (self.vocab_size == 0, "vocab_size must be > 0"),
            // Once the divisibility check has passed, a zero `num_key_value_heads` implies a zero
            // `num_attention_heads`, so this one check covers both head counts.
            (self.num_attention_heads == 0, "num_attention_heads must be > 0"),
            (self.head_dim == 0, "head_dim must be > 0"),
            (
                // A range `contains` is false for NaN, so NaN is rejected too.
                !(0.0..=1.0).contains(&self.partial_rotary_factor),
                "partial_rotary_factor must be in [0, 1]",
            ),
            (
                // `NaN <= 0.0` is false, hence the explicit NaN test.
                self.rms_norm_eps.is_nan() || self.rms_norm_eps <= 0.0,
                "rms_norm_eps must be positive",
            ),
            (
                self.rope_theta.is_nan() || self.rope_theta <= 0.0,
                "rope_theta must be positive",
            ),
        ];

        for (violated, message) in checks {
            if violated {
                return Err(invalid_config("config_field", message.to_string()));
            }
        }
        Ok(())
    }

    /// Architecture identifier reported for this configuration.
    fn architecture(&self) -> &'static str {
        "Nemotron"
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Note: `cfg.validate()` resolves to the inherent `NemotronConfig::validate`; the trait
    // implementation is exercised explicitly through `Config::validate(&cfg)`.

    // ---- Default values ----

    #[test]
    fn test_nemotron_default_vocab_size() {
        let cfg = NemotronConfig::default();
        assert_eq!(cfg.vocab_size, 256000);
    }

    #[test]
    fn test_nemotron_default_hidden_size() {
        let cfg = NemotronConfig::default();
        assert_eq!(cfg.hidden_size, 6144);
    }

    #[test]
    fn test_nemotron_default_num_hidden_layers() {
        let cfg = NemotronConfig::default();
        assert_eq!(cfg.num_hidden_layers, 32);
    }

    #[test]
    fn test_nemotron_default_num_attention_heads() {
        let cfg = NemotronConfig::default();
        assert_eq!(cfg.num_attention_heads, 48);
    }

    #[test]
    fn test_nemotron_default_num_key_value_heads() {
        let cfg = NemotronConfig::default();
        assert_eq!(cfg.num_key_value_heads, 8);
    }

    #[test]
    fn test_nemotron_default_head_dim() {
        let cfg = NemotronConfig::default();
        assert_eq!(cfg.head_dim, 128);
    }

    #[test]
    fn test_nemotron_default_hidden_act() {
        let cfg = NemotronConfig::default();
        assert_eq!(cfg.hidden_act, "relu2");
    }

    #[test]
    fn test_nemotron_default_partial_rotary_factor() {
        let cfg = NemotronConfig::default();
        assert!((cfg.partial_rotary_factor - 0.5).abs() < 1e-6);
    }

    #[test]
    fn test_nemotron_default_norm_type() {
        let cfg = NemotronConfig::default();
        assert_eq!(cfg.norm_type, NormType::RmsNorm);
    }

    #[test]
    fn test_nemotron_norm_type_default_is_rms_norm() {
        assert_eq!(NormType::default(), NormType::RmsNorm);
    }

    #[test]
    fn test_nemotron_attention_bias_default_false() {
        let cfg = NemotronConfig::default();
        assert!(!cfg.attention_bias);
        assert!(!cfg.mlp_bias);
    }

    #[test]
    fn test_nemotron_tie_word_embeddings_default_false() {
        let cfg = NemotronConfig::default();
        assert!(!cfg.tie_word_embeddings);
    }

    // ---- Inherent validation ----

    #[test]
    fn test_nemotron_validate_passes_default() {
        let cfg = NemotronConfig::default();
        assert!(cfg.validate().is_ok());
    }

    #[test]
    fn test_nemotron_validate_fails_zero_vocab_size() {
        let cfg = NemotronConfig {
            vocab_size: 0,
            ..NemotronConfig::default()
        };
        assert!(cfg.validate().is_err());
    }

    #[test]
    fn test_nemotron_validate_fails_zero_hidden_size() {
        let cfg = NemotronConfig {
            hidden_size: 0,
            ..NemotronConfig::default()
        };
        assert!(cfg.validate().is_err());
    }

    #[test]
    fn test_nemotron_validate_fails_zero_attention_heads() {
        let cfg = NemotronConfig {
            num_attention_heads: 0,
            ..NemotronConfig::default()
        };
        assert!(cfg.validate().is_err());
    }

    #[test]
    fn test_nemotron_validate_fails_zero_key_value_heads() {
        let cfg = NemotronConfig {
            num_key_value_heads: 0,
            ..NemotronConfig::default()
        };
        assert!(cfg.validate().is_err());
    }

    #[test]
    fn test_nemotron_validate_fails_zero_head_dim() {
        let cfg = NemotronConfig {
            head_dim: 0,
            ..NemotronConfig::default()
        };
        assert!(cfg.validate().is_err());
    }

    #[test]
    fn test_nemotron_validate_fails_invalid_rotary_factor() {
        let cfg = NemotronConfig {
            partial_rotary_factor: 1.5,
            ..NemotronConfig::default()
        };
        assert!(cfg.validate().is_err());
    }

    #[test]
    fn test_nemotron_validate_fails_heads_not_divisible() {
        let cfg = NemotronConfig {
            num_attention_heads: 48,
            num_key_value_heads: 7,
            ..NemotronConfig::default()
        };
        assert!(cfg.validate().is_err());
    }

    #[test]
    fn test_nemotron_validate_fails_negative_rms_norm_eps() {
        // Covers both the zero boundary and a genuinely negative value.
        for eps in [0.0, -1e-5] {
            let cfg = NemotronConfig {
                rms_norm_eps: eps,
                ..NemotronConfig::default()
            };
            assert!(
                cfg.validate().is_err(),
                "rms_norm_eps = {} should be rejected",
                eps
            );
        }
    }

    #[test]
    fn test_nemotron_validate_fails_non_positive_rope_theta() {
        for theta in [0.0, -10000.0] {
            let cfg = NemotronConfig {
                rope_theta: theta,
                ..NemotronConfig::default()
            };
            assert!(
                cfg.validate().is_err(),
                "rope_theta = {} should be rejected",
                theta
            );
        }
    }

    // ---- Trait (`Config`) implementation ----

    #[test]
    fn test_nemotron_trait_validate_passes_default() {
        let cfg = NemotronConfig::default();
        assert!(Config::validate(&cfg).is_ok());
    }

    #[test]
    fn test_nemotron_trait_validate_fails_zero_hidden_size() {
        let cfg = NemotronConfig {
            hidden_size: 0,
            ..NemotronConfig::default()
        };
        assert!(Config::validate(&cfg).is_err());
    }

    #[test]
    fn test_nemotron_architecture_name() {
        let cfg = NemotronConfig::default();
        assert_eq!(cfg.architecture(), "Nemotron");
    }

    // ---- Derived values and presets ----

    #[test]
    fn test_nemotron_rotary_dim() {
        let cfg = NemotronConfig::default();
        assert_eq!(cfg.rotary_dim(), 64);
    }

    #[test]
    fn test_nemotron_rotary_dim_22b() {
        let cfg = NemotronConfig::nemotron_4_22b();
        assert_eq!(cfg.rotary_dim(), 64);
    }

    #[test]
    fn test_nemotron_rotary_dim_340b() {
        let cfg = NemotronConfig::nemotron_4_340b();
        assert_eq!(cfg.rotary_dim(), 96);
    }

    #[test]
    fn test_nemotron_4_340b_preset() {
        let cfg = NemotronConfig::nemotron_4_340b();
        assert_eq!(cfg.hidden_size, 18432);
        assert_eq!(cfg.num_hidden_layers, 96);
        assert_eq!(cfg.head_dim, 192);
        assert!(cfg.validate().is_ok());
    }

    #[test]
    fn test_nemotron_4_22b_preset() {
        let cfg = NemotronConfig::nemotron_4_22b();
        assert_eq!(cfg.hidden_size, 6144);
        assert_eq!(cfg.num_hidden_layers, 40);
        assert!(cfg.validate().is_ok());
    }

    // NOTE(review): this test only exercises a local LCG step and never touches
    // `NemotronConfig`; kept for now, but it is a candidate for removal.
    #[test]
    fn test_nemotron_lcg_values_in_range() {
        let mut s = 42u64;
        s = s.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407);
        let v = (s % 1000) as f32 / 1000.0;
        assert!((0.0..1.0).contains(&v));
    }
}