use serde::{Deserialize, Serialize};
use trustformers_core::traits::Config;
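
/// Configuration for LLaMA-2 models.
///
/// Field names follow the Hugging Face `transformers` LLaMA `config.json`
/// schema, so values can be carried over directly from published checkpoints.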
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LLaMA2Config {
    pub hidden_size: usize,
    pub intermediate_size: usize,
    pub num_hidden_layers: usize,
    pub num_attention_heads: usize,
    pub num_key_value_heads: usize,
    pub vocab_size: usize,
    pub max_position_embeddings: usize,
    pub rope_theta: f32,
    pub rms_norm_eps: f64,
    pub hidden_act: String,
    pub pretraining_tp: usize,
    pub attention_bias: bool,
    pub mlp_bias: bool,
    pub bos_token_id: u32,
    pub eos_token_id: u32,
    pub use_cache: bool,
    pub pad_token_id: Option<u32>,
}
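
// Minimal serde round-trip sketch. This assumes `serde_json` is available as
// a (dev-)dependency; it is not otherwise used in this module:
//
//     let cfg = LLaMA2Config::default();
//     let json = serde_json::to_string_pretty(&cfg)?;
//     let back: LLaMA2Config = serde_json::from_str(&json)?;
//     assert_eq!(back.hidden_size, cfg.hidden_size);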

// The defaults correspond to the 7B variant.
impl Default for LLaMA2Config {
    fn default() -> Self {
        Self {
            hidden_size: 4096,
            intermediate_size: 11008,
            num_hidden_layers: 32,
            num_attention_heads: 32,
            num_key_value_heads: 32,
            vocab_size: 32000,
            max_position_embeddings: 4096,
            rope_theta: 10000.0,
            rms_norm_eps: 1e-5,
            hidden_act: "silu".to_string(),
            pretraining_tp: 1,
            attention_bias: false,
            mlp_bias: false,
            bos_token_id: 1,
            eos_token_id: 2,
            use_cache: true,
            pad_token_id: None,
        }
    }
}

impl Config for LLaMA2Config {
    fn validate(&self) -> trustformers_core::errors::Result<()> {
        use trustformers_core::errors::TrustformersError;

        // `is_multiple_of` does not panic on a zero divisor, but it reports
        // `0.is_multiple_of(0) == true`, so reject zero sizes explicitly;
        // `head_dim()` must never be 0.
        if self.hidden_size == 0
            || self.num_attention_heads == 0
            || self.num_key_value_heads == 0
        {
            return Err(TrustformersError::invalid_config(
                "hidden_size, num_attention_heads, and num_key_value_heads must be non-zero"
                    .to_string(),
            ));
        }
        if !self.hidden_size.is_multiple_of(self.num_attention_heads) {
            return Err(TrustformersError::invalid_config(
                "hidden_size must be divisible by num_attention_heads".to_string(),
            ));
        }
        if !self.num_attention_heads.is_multiple_of(self.num_key_value_heads) {
            return Err(TrustformersError::invalid_config(
                "num_attention_heads must be divisible by num_key_value_heads".to_string(),
            ));
        }
        if self.vocab_size == 0 {
            return Err(TrustformersError::invalid_config(
                "vocab_size must be greater than 0".to_string(),
            ));
        }
        if self.num_hidden_layers == 0 {
            return Err(TrustformersError::invalid_config(
                "num_hidden_layers must be greater than 0".to_string(),
            ));
        }
        if self.pretraining_tp == 0 {
            return Err(TrustformersError::invalid_config(
                "pretraining_tp must be at least 1".to_string(),
            ));
        }
        Ok(())
    }

    fn architecture(&self) -> &'static str {
        "LLaMA-2"
    }
}

impl LLaMA2Config {
    /// Dimension of each attention head (`hidden_size / num_attention_heads`).
    pub fn head_dim(&self) -> usize {
        self.hidden_size / self.num_attention_heads
    }

    /// Number of query heads that share each key/value head under
    /// grouped-query attention (GQA).
    pub fn num_query_groups(&self) -> usize {
        self.num_attention_heads / self.num_key_value_heads
    }

    /// Whether this configuration uses grouped-query attention, i.e. fewer
    /// key/value heads than query heads.
    pub fn uses_gqa(&self) -> bool {
        self.num_key_value_heads < self.num_attention_heads
    }

    /// LLaMA-2 7B: standard multi-head attention (32 query and 32 KV heads).
    pub fn llama2_7b() -> Self {
        Self {
            hidden_size: 4096,
            intermediate_size: 11008,
            num_hidden_layers: 32,
            num_attention_heads: 32,
            num_key_value_heads: 32,
            vocab_size: 32000,
            max_position_embeddings: 4096,
            ..Self::default()
        }
    }

    /// LLaMA-2 13B.
    pub fn llama2_13b() -> Self {
        Self {
            hidden_size: 5120,
            intermediate_size: 13824,
            num_hidden_layers: 40,
            num_attention_heads: 40,
            num_key_value_heads: 40,
            vocab_size: 32000,
            max_position_embeddings: 4096,
            ..Self::default()
        }
    }

    /// LLaMA-2 70B: grouped-query attention (64 query heads, 8 KV heads).
    pub fn llama2_70b() -> Self {
        Self {
            hidden_size: 8192,
            intermediate_size: 28672,
            num_hidden_layers: 80,
            num_attention_heads: 64,
            num_key_value_heads: 8,
            vocab_size: 32000,
            max_position_embeddings: 4096,
            ..Self::default()
        }
    }

    /// Chat variants share the base architecture; only the fine-tuned
    /// weights differ.
    pub fn llama2_7b_chat() -> Self {
        Self::llama2_7b()
    }

    pub fn llama2_13b_chat() -> Self {
        Self::llama2_13b()
    }

    pub fn llama2_70b_chat() -> Self {
        Self::llama2_70b()
    }

    /// Look up a preset by its Hugging Face Hub model id or short alias.
    pub fn from_pretrained_name(name: &str) -> Option<Self> {
        match name {
            "meta-llama/Llama-2-7b-hf" | "llama2-7b" => Some(Self::llama2_7b()),
            "meta-llama/Llama-2-13b-hf" | "llama2-13b" => Some(Self::llama2_13b()),
            "meta-llama/Llama-2-70b-hf" | "llama2-70b" => Some(Self::llama2_70b()),
            "meta-llama/Llama-2-7b-chat-hf" | "llama2-7b-chat" => Some(Self::llama2_7b_chat()),
            "meta-llama/Llama-2-13b-chat-hf" | "llama2-13b-chat" => Some(Self::llama2_13b_chat()),
            "meta-llama/Llama-2-70b-chat-hf" | "llama2-70b-chat" => Some(Self::llama2_70b_chat()),
            _ => None,
        }
    }
}
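
// Illustrative usage sketch; everything referenced here is defined above,
// nothing is additional API:
//
//     let cfg = LLaMA2Config::from_pretrained_name("meta-llama/Llama-2-70b-hf")
//         .expect("known model name");
//     cfg.validate()?;
//     assert!(cfg.uses_gqa());
//     assert_eq!(cfg.head_dim(), 128); // 8192 / 64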

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_llama2_default_hidden_size() {
        let cfg = LLaMA2Config::default();
        assert_eq!(cfg.hidden_size, 4096);
    }

    #[test]
    fn test_llama2_default_intermediate_size() {
        let cfg = LLaMA2Config::default();
        assert_eq!(cfg.intermediate_size, 11008);
    }

    #[test]
    fn test_llama2_default_num_hidden_layers() {
        let cfg = LLaMA2Config::default();
        assert_eq!(cfg.num_hidden_layers, 32);
    }

    #[test]
    fn test_llama2_default_num_attention_heads() {
        let cfg = LLaMA2Config::default();
        assert_eq!(cfg.num_attention_heads, 32);
    }

    #[test]
    fn test_llama2_default_num_key_value_heads() {
        let cfg = LLaMA2Config::default();
        assert_eq!(cfg.num_key_value_heads, 32);
    }

    #[test]
    fn test_llama2_default_vocab_size() {
        let cfg = LLaMA2Config::default();
        assert_eq!(cfg.vocab_size, 32000);
    }

    #[test]
    fn test_llama2_default_max_position_embeddings() {
        let cfg = LLaMA2Config::default();
        assert_eq!(cfg.max_position_embeddings, 4096);
    }

    #[test]
    fn test_llama2_default_hidden_act() {
        let cfg = LLaMA2Config::default();
        assert_eq!(cfg.hidden_act, "silu");
    }

    #[test]
    fn test_llama2_default_use_cache() {
        let cfg = LLaMA2Config::default();
        assert!(cfg.use_cache);
    }

    #[test]
    fn test_llama2_default_pad_token_id_none() {
        let cfg = LLaMA2Config::default();
        assert!(cfg.pad_token_id.is_none());
    }

    #[test]
    fn test_llama2_validate_passes_default() {
        let cfg = LLaMA2Config::default();
        assert!(cfg.validate().is_ok());
    }

    #[test]
    fn test_llama2_validate_fails_zero_vocab_size() {
        let cfg = LLaMA2Config {
            vocab_size: 0,
            ..LLaMA2Config::default()
        };
        assert!(cfg.validate().is_err());
    }

    #[test]
    fn test_llama2_validate_fails_zero_num_hidden_layers() {
        let cfg = LLaMA2Config {
            num_hidden_layers: 0,
            ..LLaMA2Config::default()
        };
        assert!(cfg.validate().is_err());
    }

    #[test]
    fn test_llama2_validate_fails_hidden_not_divisible_by_heads() {
        let cfg = LLaMA2Config {
            hidden_size: 4096,
            num_attention_heads: 7,
            num_key_value_heads: 7,
            ..LLaMA2Config::default()
        };
        assert!(cfg.validate().is_err());
    }

    #[test]
    fn test_llama2_validate_fails_heads_not_divisible_by_kv_heads() {
        let cfg = LLaMA2Config {
            num_attention_heads: 32,
            num_key_value_heads: 7,
            ..LLaMA2Config::default()
        };
        assert!(cfg.validate().is_err());
    }

    #[test]
    fn test_llama2_validate_fails_zero_pretraining_tp() {
        let cfg = LLaMA2Config {
            pretraining_tp: 0,
            ..LLaMA2Config::default()
        };
        assert!(cfg.validate().is_err());
    }

    #[test]
    fn test_llama2_7b_preset() {
        let cfg = LLaMA2Config::llama2_7b();
        assert_eq!(cfg.hidden_size, 4096);
        assert_eq!(cfg.num_hidden_layers, 32);
        assert_eq!(cfg.num_attention_heads, 32);
        assert_eq!(cfg.num_key_value_heads, 32);
        assert!(cfg.validate().is_ok());
    }

    #[test]
    fn test_llama2_13b_preset() {
        let cfg = LLaMA2Config::llama2_13b();
        assert_eq!(cfg.hidden_size, 5120);
        assert_eq!(cfg.num_hidden_layers, 40);
        assert_eq!(cfg.num_attention_heads, 40);
        assert!(cfg.validate().is_ok());
    }

    #[test]
    fn test_llama2_70b_preset_gqa() {
        let cfg = LLaMA2Config::llama2_70b();
        assert_eq!(cfg.hidden_size, 8192);
        assert_eq!(cfg.num_attention_heads, 64);
        assert_eq!(cfg.num_key_value_heads, 8);
        assert!(cfg.uses_gqa());
        assert!(cfg.validate().is_ok());
    }

    #[test]
    fn test_llama2_head_dim() {
        let cfg = LLaMA2Config::llama2_7b();
        assert_eq!(cfg.head_dim(), 4096 / 32);
    }

    #[test]
    fn test_llama2_num_query_groups_no_gqa() {
        let cfg = LLaMA2Config::llama2_7b();
        assert_eq!(cfg.num_query_groups(), 1);
    }

    #[test]
    fn test_llama2_num_query_groups_with_gqa() {
        let cfg = LLaMA2Config::llama2_70b();
        assert_eq!(cfg.num_query_groups(), 64 / 8);
    }

    #[test]
    fn test_llama2_uses_gqa_false_for_7b() {
        let cfg = LLaMA2Config::llama2_7b();
        assert!(!cfg.uses_gqa());
    }

    #[test]
    fn test_llama2_from_pretrained_name_7b() {
        let cfg = LLaMA2Config::from_pretrained_name("llama2-7b")
            .expect("llama2-7b is a registered preset name");
        assert_eq!(cfg.hidden_size, 4096);
    }

    #[test]
    fn test_llama2_from_pretrained_name_unknown() {
        let cfg = LLaMA2Config::from_pretrained_name("unknown-model");
        assert!(cfg.is_none());
    }

    #[test]
    fn test_llama2_chat_variant_same_arch_as_base() {
        let base = LLaMA2Config::llama2_7b();
        let chat = LLaMA2Config::llama2_7b_chat();
        assert_eq!(base.hidden_size, chat.hidden_size);
        assert_eq!(base.num_hidden_layers, chat.num_hidden_layers);
    }

    #[test]
    fn test_llama2_bos_eos_token_ids() {
        let cfg = LLaMA2Config::default();
        assert_eq!(cfg.bos_token_id, 1);
        assert_eq!(cfg.eos_token_id, 2);
    }

    #[test]
    fn test_llama2_architecture_name() {
        let cfg = LLaMA2Config::default();
        assert_eq!(cfg.architecture(), "LLaMA-2");
    }

    #[test]
    fn test_llama2_pretraining_tp_default() {
        let cfg = LLaMA2Config::default();
        assert_eq!(cfg.pretraining_tp, 1);
    }

    #[test]
    fn test_llama2_attention_and_mlp_bias_default_false() {
        let cfg = LLaMA2Config::default();
        assert!(!cfg.attention_bias);
        assert!(!cfg.mlp_bias);
    }
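
    // Sketch of the GQA memory argument using only values defined above:
    // the KV cache stores num_key_value_heads * head_dim values per layer
    // per token, so the 70B layout (64 query heads, 8 KV heads) needs
    // num_query_groups() times less KV width than a full multi-head layout
    // with the same hidden size would.
    #[test]
    fn test_llama2_70b_gqa_kv_width_sketch() {
        let cfg = LLaMA2Config::llama2_70b();
        let kv_width_gqa = cfg.num_key_value_heads * cfg.head_dim(); // 8 * 128
        let kv_width_mha = cfg.num_attention_heads * cfg.head_dim(); // 64 * 128
        assert_eq!(kv_width_mha / kv_width_gqa, cfg.num_query_groups());
    }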
}