use serde::{Deserialize, Serialize};
use trustformers_core::traits::Config;
/// Configuration for LLaMA-family decoder models.
///
/// Field names mirror the Hugging Face `config.json` layout so the struct
/// can be (de)serialized directly with serde.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LlamaConfig {
/// Size of the token vocabulary (embedding rows / LM-head columns).
pub vocab_size: usize,
/// Width of the residual stream (model dimension).
pub hidden_size: usize,
/// Inner width of the feed-forward (MLP) blocks.
pub intermediate_size: usize,
/// Number of transformer decoder layers.
pub num_hidden_layers: usize,
/// Number of query attention heads; must divide `hidden_size` evenly.
pub num_attention_heads: usize,
/// KV heads for grouped-query attention (`None` = same as query heads,
/// i.e. plain multi-head attention); `hidden_act` names the MLP activation.
pub num_key_value_heads: Option<usize>, pub hidden_act: String,
/// Maximum sequence length the position encoding supports.
pub max_position_embeddings: usize,
/// Std-dev used for weight initialization.
pub initializer_range: f32,
/// Epsilon for RMSNorm layers.
pub rms_norm_eps: f32,
/// Whether to keep a KV cache during generation.
pub use_cache: bool,
/// Optional padding token id (many LLaMA checkpoints define none).
pub pad_token_id: Option<u32>,
/// Beginning-of-sequence token id.
pub bos_token_id: u32,
/// End-of-sequence token id.
pub eos_token_id: u32,
/// RoPE base frequency; `rope_scaling` optionally extends the context window.
pub rope_theta: f32, pub rope_scaling: Option<RopeScaling>,
/// Whether attention projections carry bias terms.
pub attention_bias: bool,
/// Whether MLP projections carry bias terms.
pub mlp_bias: bool,
/// Free-form model family tag (e.g. "llama", "llama3-instruct").
pub model_type: String,
}
/// RoPE context-window scaling parameters.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RopeScaling {
/// Scaling strategy name (e.g. "linear") and the multiplier applied
/// to extend the trained context length.
pub scaling_type: String, pub scaling_factor: f32,
}
impl Default for LlamaConfig {
    /// Baseline configuration matching the original LLaMA-1 7B layout.
    fn default() -> Self {
        Self {
            // Model geometry.
            hidden_size: 4096,
            intermediate_size: 11008,
            num_hidden_layers: 32,
            num_attention_heads: 32,
            num_key_value_heads: None,
            max_position_embeddings: 2048,
            // Tokenizer / vocabulary.
            vocab_size: 32000,
            pad_token_id: None,
            bos_token_id: 1,
            eos_token_id: 2,
            // Numerics and activations.
            hidden_act: "silu".to_string(),
            initializer_range: 0.02,
            rms_norm_eps: 1e-6,
            rope_theta: 10000.0,
            rope_scaling: None,
            // Structural flags.
            use_cache: true,
            attention_bias: false,
            mlp_bias: false,
            model_type: "llama".to_string(),
        }
    }
}
impl Config for LlamaConfig {
    /// Validates the structural invariants of this configuration.
    ///
    /// # Errors
    /// Returns an `invalid_config` error when:
    /// - `hidden_size` or `num_attention_heads` is zero,
    /// - `hidden_size` is not divisible by `num_attention_heads`,
    /// - `num_key_value_heads` is `Some(0)` or does not divide
    ///   `num_attention_heads` evenly.
    fn validate(&self) -> trustformers_core::errors::Result<()> {
        // Reject zero-sized dimensions up front. `n.is_multiple_of(0)` is
        // true only when n == 0, so a config with both fields zero would
        // otherwise slip through the divisibility checks below and later
        // panic with a divide-by-zero in `head_dim()`.
        if self.hidden_size == 0 || self.num_attention_heads == 0 {
            return Err(
                trustformers_core::errors::TrustformersError::invalid_config(
                    "hidden_size and num_attention_heads must be non-zero".to_string(),
                ),
            );
        }
        if !self.hidden_size.is_multiple_of(self.num_attention_heads) {
            return Err(
                trustformers_core::errors::TrustformersError::invalid_config(
                    "hidden_size must be divisible by num_attention_heads".to_string(),
                ),
            );
        }
        if let Some(num_kv_heads) = self.num_key_value_heads {
            // Zero KV heads would break `num_query_groups()`; it is also
            // rejected by the divisibility test since heads != 0 here.
            if num_kv_heads == 0 || !self.num_attention_heads.is_multiple_of(num_kv_heads) {
                return Err(
                    trustformers_core::errors::TrustformersError::invalid_config(
                        "num_attention_heads must be divisible by num_key_value_heads".to_string(),
                    ),
                );
            }
        }
        Ok(())
    }
    /// Human-readable architecture name for this config family.
    fn architecture(&self) -> &'static str {
        "LLaMA"
    }
}
impl LlamaConfig {
    // ---- LLaMA 1 family (MHA, 2k context) ------------------------------

    /// Shared layout for the original LLaMA-1 checkpoints: 32k vocab,
    /// 2k context, no GQA; only the core geometry varies per size.
    fn llama1_base(
        hidden_size: usize,
        intermediate_size: usize,
        num_hidden_layers: usize,
        num_attention_heads: usize,
    ) -> Self {
        Self {
            vocab_size: 32000,
            hidden_size,
            intermediate_size,
            num_hidden_layers,
            num_attention_heads,
            max_position_embeddings: 2048,
            ..Self::default()
        }
    }

    /// LLaMA-1 7B preset.
    pub fn llama_7b() -> Self {
        Self::llama1_base(4096, 11008, 32, 32)
    }

    /// LLaMA-1 13B preset.
    pub fn llama_13b() -> Self {
        Self::llama1_base(5120, 13824, 40, 40)
    }

    /// LLaMA-1 30B preset.
    pub fn llama_30b() -> Self {
        Self::llama1_base(6656, 17920, 60, 52)
    }

    /// LLaMA-1 65B preset.
    pub fn llama_65b() -> Self {
        Self::llama1_base(8192, 22016, 80, 64)
    }

    // ---- LLaMA 2 family (4k context; GQA on 70B) -----------------------

    /// Llama-2 7B preset (LLaMA-1 7B geometry with a 4k window).
    pub fn llama2_7b() -> Self {
        Self {
            max_position_embeddings: 4096,
            ..Self::llama1_base(4096, 11008, 32, 32)
        }
    }

    /// Llama-2 13B preset.
    pub fn llama2_13b() -> Self {
        Self {
            max_position_embeddings: 4096,
            ..Self::llama1_base(5120, 13824, 40, 40)
        }
    }

    /// Llama-2 70B preset; first of the family to use grouped-query
    /// attention (8 KV heads).
    pub fn llama2_70b() -> Self {
        Self {
            num_key_value_heads: Some(8),
            max_position_embeddings: 4096,
            ..Self::llama1_base(8192, 28672, 80, 64)
        }
    }

    /// Code Llama 7B: Llama-2 7B body with an extended code tokenizer
    /// (32016 tokens) and a 16k window.
    pub fn code_llama_7b() -> Self {
        Self {
            vocab_size: 32016,
            max_position_embeddings: 16384,
            ..Self::llama2_7b()
        }
    }

    // ---- Derived attention geometry ------------------------------------

    /// Embedding width per attention head.
    pub fn head_dim(&self) -> usize {
        self.hidden_size / self.num_attention_heads
    }

    /// Effective number of key/value heads; falls back to the query-head
    /// count when no GQA override is configured.
    pub fn num_kv_heads(&self) -> usize {
        self.num_key_value_heads.unwrap_or(self.num_attention_heads)
    }

    /// Query heads served by each KV head (1 for plain multi-head attention).
    pub fn num_query_groups(&self) -> usize {
        self.num_attention_heads / self.num_kv_heads()
    }

    // ---- LLaMA 3 family (128k vocab, GQA, 8k context) ------------------

    /// Shared layout for the Llama-3 checkpoints.
    fn llama3_base(
        hidden_size: usize,
        intermediate_size: usize,
        num_hidden_layers: usize,
        num_attention_heads: usize,
    ) -> Self {
        Self {
            vocab_size: 128256,
            hidden_size,
            intermediate_size,
            num_hidden_layers,
            num_attention_heads,
            num_key_value_heads: Some(8),
            max_position_embeddings: 8192,
            rope_theta: 500000.0,
            rms_norm_eps: 1e-5,
            ..Self::default()
        }
    }

    /// Llama-3 8B preset.
    pub fn llama3_8b() -> Self {
        Self::llama3_base(4096, 14336, 32, 32)
    }

    /// Llama-3 70B preset.
    pub fn llama3_70b() -> Self {
        Self::llama3_base(8192, 28672, 80, 64)
    }

    /// Llama-3 405B preset.
    pub fn llama3_405b() -> Self {
        Self::llama3_base(16384, 53248, 126, 128)
    }

    /// Instruction-tuned variant of [`Self::llama3_8b`].
    pub fn llama3_8b_instruct() -> Self {
        Self {
            model_type: "llama3-instruct".to_string(),
            ..Self::llama3_8b()
        }
    }

    /// Instruction-tuned variant of [`Self::llama3_70b`].
    pub fn llama3_70b_instruct() -> Self {
        Self {
            model_type: "llama3-instruct".to_string(),
            ..Self::llama3_70b()
        }
    }

    /// Instruction-tuned variant of [`Self::llama3_405b`].
    pub fn llama3_405b_instruct() -> Self {
        Self {
            model_type: "llama3-instruct".to_string(),
            ..Self::llama3_405b()
        }
    }

    // ---- Llama 3.1 long-context variants (128k window) -----------------

    /// Extends a Llama-3 base to a 128k window via linear RoPE scaling.
    fn with_128k_window(base: Self) -> Self {
        Self {
            max_position_embeddings: 131072,
            rope_scaling: Some(RopeScaling {
                scaling_type: "linear".to_string(),
                scaling_factor: 16.0,
            }),
            model_type: "llama3.1".to_string(),
            ..base
        }
    }

    /// Llama-3.1 8B with a 128k context window.
    pub fn llama3_1_8b_128k() -> Self {
        Self::with_128k_window(Self::llama3_8b())
    }

    /// Llama-3.1 70B with a 128k context window.
    pub fn llama3_1_70b_128k() -> Self {
        Self::with_128k_window(Self::llama3_70b())
    }

    /// Llama-3.1 405B with a 128k context window.
    pub fn llama3_1_405b_128k() -> Self {
        Self::with_128k_window(Self::llama3_405b())
    }

    // ---- Domain-specialized 7B variants --------------------------------

    /// Shared 7B-scale layout for the domain-specialized presets; only
    /// the vocabulary, context window, and model tag differ.
    fn domain_7b(vocab_size: usize, max_position_embeddings: usize, model_type: &str) -> Self {
        Self {
            vocab_size,
            hidden_size: 4096,
            intermediate_size: 14336,
            num_hidden_layers: 32,
            num_attention_heads: 32,
            num_key_value_heads: Some(8),
            max_position_embeddings,
            rope_theta: 500000.0,
            rms_norm_eps: 1e-5,
            model_type: model_type.to_string(),
            ..Self::default()
        }
    }

    /// Multilingual 7B preset (large 250k vocabulary).
    pub fn llama_multilingual_7b() -> Self {
        Self::domain_7b(250000, 16384, "llama-multilingual")
    }

    /// Scientific-text 7B preset.
    pub fn llama_scientific_7b() -> Self {
        Self::domain_7b(50000, 32768, "llama-scientific")
    }

    /// Legal-text 7B preset; uses RoPE scaling for its 64k window.
    pub fn llama_legal_7b() -> Self {
        Self {
            rope_scaling: Some(RopeScaling {
                scaling_type: "linear".to_string(),
                scaling_factor: 8.0,
            }),
            ..Self::domain_7b(40000, 65536, "llama-legal")
        }
    }

    /// Medical-text 7B preset.
    pub fn llama_medical_7b() -> Self {
        Self::domain_7b(45000, 32768, "llama-medical")
    }

    /// Creative-writing 7B preset.
    pub fn llama_creative_7b() -> Self {
        Self::domain_7b(35000, 16384, "llama-creative")
    }

    // ---- Small research-scale presets ----------------------------------

    /// Compact 1B-parameter-scale preset.
    pub fn llama_1b() -> Self {
        Self {
            vocab_size: 32000,
            hidden_size: 2048,
            intermediate_size: 5504,
            num_hidden_layers: 16,
            num_attention_heads: 16,
            num_key_value_heads: Some(4),
            max_position_embeddings: 4096,
            rope_theta: 500000.0,
            rms_norm_eps: 1e-5,
            model_type: "llama-1b".to_string(),
            ..Self::default()
        }
    }

    /// Compact 3B-parameter-scale preset.
    pub fn llama_3b() -> Self {
        Self {
            vocab_size: 32000,
            hidden_size: 2560,
            intermediate_size: 6912,
            num_hidden_layers: 20,
            num_attention_heads: 20,
            num_key_value_heads: Some(4),
            max_position_embeddings: 4096,
            rope_theta: 500000.0,
            rms_norm_eps: 1e-5,
            model_type: "llama-3b".to_string(),
            ..Self::default()
        }
    }

    // ---- Name-based lookup ---------------------------------------------

    /// Resolves a preset from a short alias or Hugging Face hub id.
    /// Returns `None` for unrecognized names.
    pub fn from_pretrained_name(name: &str) -> Option<Self> {
        let config = match name {
            "llama-7b" => Self::llama_7b(),
            "llama-13b" => Self::llama_13b(),
            "llama-30b" => Self::llama_30b(),
            "llama-65b" => Self::llama_65b(),
            "meta-llama/Llama-2-7b-hf" | "llama2-7b" => Self::llama2_7b(),
            "meta-llama/Llama-2-13b-hf" | "llama2-13b" => Self::llama2_13b(),
            "meta-llama/Llama-2-70b-hf" | "llama2-70b" => Self::llama2_70b(),
            "codellama/CodeLlama-7b-hf" | "code-llama-7b" => Self::code_llama_7b(),
            "meta-llama/Meta-Llama-3-8B" | "llama3-8b" => Self::llama3_8b(),
            "meta-llama/Meta-Llama-3-70B" | "llama3-70b" => Self::llama3_70b(),
            "meta-llama/Meta-Llama-3-405B" | "llama3-405b" => Self::llama3_405b(),
            "meta-llama/Meta-Llama-3-8B-Instruct" | "llama3-8b-instruct" => {
                Self::llama3_8b_instruct()
            },
            "meta-llama/Meta-Llama-3-70B-Instruct" | "llama3-70b-instruct" => {
                Self::llama3_70b_instruct()
            },
            "meta-llama/Meta-Llama-3-405B-Instruct" | "llama3-405b-instruct" => {
                Self::llama3_405b_instruct()
            },
            "meta-llama/Meta-Llama-3.1-8B" | "llama3.1-8b-128k" => Self::llama3_1_8b_128k(),
            "meta-llama/Meta-Llama-3.1-70B" | "llama3.1-70b-128k" => Self::llama3_1_70b_128k(),
            "meta-llama/Meta-Llama-3.1-405B" | "llama3.1-405b-128k" => Self::llama3_1_405b_128k(),
            "llama-multilingual-7b" => Self::llama_multilingual_7b(),
            "llama-scientific-7b" => Self::llama_scientific_7b(),
            "llama-legal-7b" => Self::llama_legal_7b(),
            "llama-medical-7b" => Self::llama_medical_7b(),
            "llama-creative-7b" => Self::llama_creative_7b(),
            "llama-1b" => Self::llama_1b(),
            "llama-3b" => Self::llama_3b(),
            _ => return None,
        };
        Some(config)
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use trustformers_core::traits::Config;
    /// Minimal deterministic LCG (PCG multiplier/increment constants) used
    /// to generate reproducible pseudo-random values in tests.
    struct Lcg {
        state: u64,
    }
    impl Lcg {
        fn new(seed: u64) -> Self {
            Lcg { state: seed }
        }
        fn next(&mut self) -> u64 {
            self.state = self
                .state
                .wrapping_mul(6364136223846793005u64)
                .wrapping_add(1442695040888963407u64);
            self.state
        }
        /// Uniform value in [0, 1).
        ///
        /// Uses the top 24 bits of the state: f32 has a 24-bit significand,
        /// so a 24-bit integer divided by 2^24 is exact and strictly < 1.0.
        /// (The previous 53-bit variant was an f64 recipe; integers near
        /// 2^53 round UP when cast to f32, so it could return exactly 1.0.)
        fn next_f32(&mut self) -> f32 {
            (self.next() >> 40) as f32 / (1u64 << 24) as f32
        }
    }
    #[test]
    fn test_default_config_fields() {
        let cfg = LlamaConfig::default();
        assert_eq!(cfg.vocab_size, 32000);
        assert_eq!(cfg.hidden_size, 4096);
        assert_eq!(cfg.num_attention_heads, 32);
        assert_eq!(cfg.model_type, "llama");
        assert!(cfg.use_cache);
        assert!(!cfg.attention_bias);
        assert!(!cfg.mlp_bias);
    }
    #[test]
    fn test_default_validate_passes() {
        let cfg = LlamaConfig::default();
        assert!(cfg.validate().is_ok());
    }
    #[test]
    fn test_architecture_name() {
        let cfg = LlamaConfig::default();
        assert_eq!(cfg.architecture(), "LLaMA");
    }
    #[test]
    fn test_hidden_size_not_divisible_fails_validation() {
        let cfg = LlamaConfig {
            hidden_size: 100,
            num_attention_heads: 32,
            ..LlamaConfig::default()
        };
        assert!(cfg.validate().is_err());
    }
    #[test]
    fn test_kv_heads_not_divisible_fails_validation() {
        let cfg = LlamaConfig {
            num_attention_heads: 32,
            num_key_value_heads: Some(7),
            ..LlamaConfig::default()
        };
        assert!(cfg.validate().is_err());
    }
    #[test]
    fn test_valid_gqa_config() {
        let cfg = LlamaConfig {
            num_attention_heads: 32,
            num_key_value_heads: Some(8),
            ..LlamaConfig::default()
        };
        assert!(cfg.validate().is_ok());
    }
    #[test]
    fn test_head_dim_computation() {
        let cfg = LlamaConfig::default();
        assert_eq!(cfg.head_dim(), 4096 / 32);
    }
    #[test]
    fn test_num_kv_heads_default() {
        let cfg = LlamaConfig::default();
        assert_eq!(cfg.num_kv_heads(), cfg.num_attention_heads);
    }
    #[test]
    fn test_num_kv_heads_gqa() {
        let cfg = LlamaConfig {
            num_key_value_heads: Some(8),
            ..LlamaConfig::default()
        };
        assert_eq!(cfg.num_kv_heads(), 8);
    }
    #[test]
    fn test_num_query_groups() {
        let cfg = LlamaConfig {
            num_attention_heads: 32,
            num_key_value_heads: Some(8),
            ..LlamaConfig::default()
        };
        assert_eq!(cfg.num_query_groups(), 4);
    }
    #[test]
    fn test_llama_7b_config() {
        let cfg = LlamaConfig::llama_7b();
        assert_eq!(cfg.vocab_size, 32000);
        assert_eq!(cfg.hidden_size, 4096);
        assert_eq!(cfg.num_hidden_layers, 32);
        assert!(cfg.validate().is_ok());
    }
    #[test]
    fn test_llama_13b_config() {
        let cfg = LlamaConfig::llama_13b();
        assert_eq!(cfg.hidden_size, 5120);
        assert_eq!(cfg.num_hidden_layers, 40);
        assert!(cfg.validate().is_ok());
    }
    #[test]
    fn test_llama2_70b_has_gqa() {
        let cfg = LlamaConfig::llama2_70b();
        assert_eq!(cfg.num_key_value_heads, Some(8));
        assert_eq!(cfg.max_position_embeddings, 4096);
        assert!(cfg.validate().is_ok());
    }
    #[test]
    fn test_llama3_8b_config() {
        let cfg = LlamaConfig::llama3_8b();
        assert_eq!(cfg.vocab_size, 128256);
        assert_eq!(cfg.num_key_value_heads, Some(8));
        assert_eq!(cfg.rope_theta, 500000.0);
        assert!(cfg.validate().is_ok());
    }
    #[test]
    fn test_llama3_1_8b_128k_has_rope_scaling() {
        let cfg = LlamaConfig::llama3_1_8b_128k();
        assert_eq!(cfg.max_position_embeddings, 131072);
        if let Some(scaling) = &cfg.rope_scaling {
            assert_eq!(scaling.scaling_type, "linear");
            assert_eq!(scaling.scaling_factor, 16.0);
        } else {
            panic!("expected rope_scaling to be Some");
        }
    }
    #[test]
    fn test_code_llama_7b_config() {
        let cfg = LlamaConfig::code_llama_7b();
        assert_eq!(cfg.vocab_size, 32016);
        assert_eq!(cfg.max_position_embeddings, 16384);
        assert!(cfg.validate().is_ok());
    }
    #[test]
    fn test_from_pretrained_name_llama2_7b() {
        let result = LlamaConfig::from_pretrained_name("llama2-7b");
        assert!(result.is_some());
        if let Some(cfg) = result {
            assert_eq!(cfg.max_position_embeddings, 4096);
        }
    }
    #[test]
    fn test_from_pretrained_name_unknown_returns_none() {
        let result = LlamaConfig::from_pretrained_name("nonexistent-model-xyz");
        assert!(result.is_none());
    }
    #[test]
    fn test_from_pretrained_name_llama3_70b_instruct() {
        let result = LlamaConfig::from_pretrained_name("llama3-70b-instruct");
        assert!(result.is_some());
        if let Some(cfg) = result {
            assert!(cfg.model_type.contains("instruct"));
        }
    }
    #[test]
    fn test_llama_1b_config() {
        let cfg = LlamaConfig::llama_1b();
        assert_eq!(cfg.hidden_size, 2048);
        assert_eq!(cfg.num_hidden_layers, 16);
        assert_eq!(cfg.num_key_value_heads, Some(4));
        assert!(cfg.validate().is_ok());
    }
    #[test]
    fn test_llama_specialized_configs_validate() {
        let configs = vec![
            LlamaConfig::llama_multilingual_7b(),
            LlamaConfig::llama_scientific_7b(),
            LlamaConfig::llama_legal_7b(),
            LlamaConfig::llama_medical_7b(),
            LlamaConfig::llama_creative_7b(),
        ];
        for cfg in configs {
            assert!(
                cfg.validate().is_ok(),
                "config {} failed validation",
                cfg.model_type
            );
        }
    }
    #[test]
    fn test_lcg_produces_values_in_range() {
        let mut rng = Lcg::new(12345);
        for _ in 0..100 {
            let v = rng.next_f32();
            assert!((0.0..1.0).contains(&v), "LCG value {} out of range", v);
        }
    }
    #[test]
    fn test_rope_scaling_fields() {
        let scaling = RopeScaling {
            scaling_type: "dynamic".to_string(),
            scaling_factor: 4.0,
        };
        assert_eq!(scaling.scaling_type, "dynamic");
        assert_eq!(scaling.scaling_factor, 4.0);
    }
    #[test]
    fn test_llama3_405b_architecture() {
        let cfg = LlamaConfig::llama3_405b();
        assert_eq!(cfg.num_hidden_layers, 126);
        assert_eq!(cfg.hidden_size, 16384);
        assert_eq!(cfg.num_key_value_heads, Some(8));
        assert!(cfg.validate().is_ok());
    }
    #[test]
    fn test_llama_3b_config() {
        let cfg = LlamaConfig::llama_3b();
        assert_eq!(cfg.hidden_size, 2560);
        assert_eq!(cfg.num_hidden_layers, 20);
        assert_eq!(cfg.num_key_value_heads, Some(4));
        assert!(cfg.validate().is_ok());
    }
    #[test]
    fn test_head_dim_varies_by_model() {
        let small = LlamaConfig::llama_1b();
        let medium = LlamaConfig::llama3_8b();
        let small_hd = small.head_dim();
        let medium_hd = medium.head_dim();
        assert!(small_hd > 0);
        assert!(medium_hd > 0);
    }
    #[test]
    fn test_llama3_instruct_model_type() {
        let cfg = LlamaConfig::llama3_8b_instruct();
        assert_eq!(cfg.model_type, "llama3-instruct");
    }
    #[test]
    fn test_legal_config_has_rope_scaling() {
        let cfg = LlamaConfig::llama_legal_7b();
        assert!(cfg.rope_scaling.is_some());
        assert_eq!(cfg.max_position_embeddings, 65536);
    }
    #[test]
    fn test_query_groups_mha_equals_one() {
        let cfg = LlamaConfig::default();
        assert_eq!(cfg.num_query_groups(), 1);
    }
}