use serde::{Deserialize, Serialize};
use trustformers_core::traits::Config;
/// Configuration values for a Claude-style model.
///
/// Instances are usually obtained from [`Default`] or from one of the preset
/// constructors on this type, and can be checked with `Config::validate`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ClaudeConfig {
    /// Vocabulary size.
    pub vocab_size: usize,
    /// Hidden dimension; `validate` requires it to be a multiple of
    /// `num_attention_heads`.
    pub hidden_size: usize,
    /// Intermediate (feed-forward) dimension.
    pub intermediate_size: usize,
    /// Number of hidden layers.
    pub num_hidden_layers: usize,
    /// Number of attention heads.
    pub num_attention_heads: usize,
    /// Optional number of key/value heads. When `None`, `num_kv_heads()`
    /// falls back to `num_attention_heads`.
    pub num_key_value_heads: Option<usize>,
    /// Name of the hidden activation (the default config uses `"swiglu"`).
    pub hidden_act: String,
    /// Maximum number of position embeddings.
    pub max_position_embeddings: usize,
    /// Initializer range (default `0.02`).
    pub initializer_range: f32,
    /// Layer-norm epsilon (default `1e-5`).
    pub layer_norm_eps: f32,
    /// Cache flag (default `true`); how it is consumed is not visible here.
    pub use_cache: bool,
    /// Optional padding token id.
    pub pad_token_id: Option<u32>,
    /// Beginning-of-sequence token id.
    pub bos_token_id: u32,
    /// End-of-sequence token id.
    pub eos_token_id: u32,
    /// RoPE theta value (default `10000.0`).
    pub rope_theta: f32,
    /// Optional RoPE scaling settings.
    pub rope_scaling: Option<RopeScaling>,
    /// Attention dropout value (default `0.0`).
    pub attention_dropout: f32,
    /// Feed-forward dropout value (default `0.0`).
    pub ffn_dropout: f32,
    /// Constitutional-AI flag. While it is `true`, `validate` checks the
    /// three weights below.
    pub constitutional_ai: bool,
    /// Harmlessness weight (default `1.0`).
    pub harmlessness_weight: f32,
    /// Helpfulness weight (default `1.0`).
    pub helpfulness_weight: f32,
    /// Honesty weight (default `1.0`).
    pub honesty_weight: f32,
    /// Model type label, e.g. `"claude"` for the default config.
    pub model_type: String,
}
/// RoPE scaling settings, stored in `ClaudeConfig::rope_scaling`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RopeScaling {
    /// Name of the scaling scheme. This is a free-form string; the set of
    /// accepted values is not defined in this file.
    pub scaling_type: String,
    /// Scaling factor associated with `scaling_type`.
    pub scaling_factor: f32,
}
impl Default for ClaudeConfig {
    /// Builds the baseline configuration labelled `"claude"`.
    ///
    /// The preset constructors on `ClaudeConfig` start from these values and
    /// override only the fields that differ.
    fn default() -> Self {
        Self {
            // Model dimensions.
            vocab_size: 100_352,
            hidden_size: 4_096,
            intermediate_size: 11_008,
            num_hidden_layers: 32,
            num_attention_heads: 32,
            // `None` means `num_kv_heads()` reports `num_attention_heads`.
            num_key_value_heads: None,
            hidden_act: String::from("swiglu"),
            max_position_embeddings: 8_192,

            // Numerical settings.
            initializer_range: 0.02,
            layer_norm_eps: 1e-5,
            use_cache: true,

            // Special tokens.
            pad_token_id: None,
            bos_token_id: 1,
            eos_token_id: 2,

            // Positional encoding.
            rope_theta: 10_000.0,
            rope_scaling: None,

            // Dropout values.
            attention_dropout: 0.0,
            ffn_dropout: 0.0,

            // Constitutional-AI flag and weights.
            constitutional_ai: true,
            harmlessness_weight: 1.0,
            helpfulness_weight: 1.0,
            honesty_weight: 1.0,

            model_type: String::from("claude"),
        }
    }
}
impl Config for ClaudeConfig {
    /// Checks the invariants the rest of this type relies on.
    ///
    /// # Errors
    ///
    /// Returns a configuration error when any of the following holds:
    /// - `num_attention_heads` is zero (`head_dim()` would divide by zero);
    /// - `hidden_size` is not a multiple of `num_attention_heads`;
    /// - `num_key_value_heads` is `Some(0)` (`num_query_groups()` would divide
    ///   by zero) or does not evenly divide `num_attention_heads`;
    /// - `constitutional_ai` is enabled and any of the three weights is
    ///   negative or NaN.
    fn validate(&self) -> trustformers_core::errors::Result<()> {
        use trustformers_core::errors::TrustformersError;

        // `0.is_multiple_of(0)` is true, so a config whose sizes are all zero
        // would slip through the divisibility test below; reject it up front.
        if self.num_attention_heads == 0 {
            return Err(TrustformersError::config_error(
                "num_attention_heads must be greater than zero",
                "ClaudeConfig::validate",
            ));
        }

        if !self.hidden_size.is_multiple_of(self.num_attention_heads) {
            return Err(TrustformersError::config_error(
                "hidden_size must be divisible by num_attention_heads",
                "ClaudeConfig::validate",
            ));
        }

        if let Some(num_kv_heads) = self.num_key_value_heads {
            if num_kv_heads == 0 {
                return Err(TrustformersError::config_error(
                    "num_key_value_heads must be greater than zero",
                    "ClaudeConfig::validate",
                ));
            }
            if !self.num_attention_heads.is_multiple_of(num_kv_heads) {
                return Err(TrustformersError::config_error(
                    "num_attention_heads must be divisible by num_key_value_heads",
                    "ClaudeConfig::validate",
                ));
            }
        }

        if self.constitutional_ai {
            // A plain `w < 0.0` comparison is false for NaN, which would let
            // NaN weights pass as "non-negative"; test for it explicitly.
            let is_invalid = |weight: f32| weight.is_nan() || weight < 0.0;
            let weights = [
                self.harmlessness_weight,
                self.helpfulness_weight,
                self.honesty_weight,
            ];
            if weights.into_iter().any(is_invalid) {
                return Err(TrustformersError::config_error(
                    "Constitutional AI weights must be non-negative",
                    "ClaudeConfig::validate",
                ));
            }
        }

        Ok(())
    }

    /// Returns the architecture label, `"Claude"`.
    fn architecture(&self) -> &'static str {
        "Claude"
    }
}
/// Preset constructors and derived-value helpers.
///
/// The presets only record the numbers hard-coded in this file; every field
/// they do not list comes from [`Default`].
impl ClaudeConfig {
    /// Preset labelled `"claude-1"`.
    pub fn claude_1() -> Self {
        Self {
            vocab_size: 100352,
            hidden_size: 4096,
            intermediate_size: 11008,
            num_hidden_layers: 32,
            num_attention_heads: 32,
            max_position_embeddings: 8192,
            model_type: "claude-1".to_string(),
            ..Self::default()
        }
    }

    /// Preset labelled `"claude-2"`.
    pub fn claude_2() -> Self {
        Self {
            vocab_size: 100352,
            hidden_size: 5120,
            intermediate_size: 13824,
            num_hidden_layers: 40,
            num_attention_heads: 40,
            max_position_embeddings: 100000,
            model_type: "claude-2".to_string(),
            ..Self::default()
        }
    }

    /// Preset labelled `"claude-2.1"`; differs from [`Self::claude_2`] in
    /// `max_position_embeddings` and `model_type` only.
    pub fn claude_2_1() -> Self {
        Self {
            vocab_size: 100352,
            hidden_size: 5120,
            intermediate_size: 13824,
            num_hidden_layers: 40,
            num_attention_heads: 40,
            max_position_embeddings: 200000,
            model_type: "claude-2.1".to_string(),
            ..Self::default()
        }
    }

    /// Preset labelled `"claude-3-haiku"` (24 attention heads, 8 key/value heads).
    pub fn claude_3_haiku() -> Self {
        Self {
            vocab_size: 100352,
            hidden_size: 3072,
            intermediate_size: 8192,
            num_hidden_layers: 24,
            num_attention_heads: 24,
            num_key_value_heads: Some(8),
            max_position_embeddings: 200000,
            model_type: "claude-3-haiku".to_string(),
            ..Self::default()
        }
    }

    /// Preset labelled `"claude-3-sonnet"` (32 attention heads, 8 key/value heads).
    pub fn claude_3_sonnet() -> Self {
        Self {
            vocab_size: 100352,
            hidden_size: 4096,
            intermediate_size: 11008,
            num_hidden_layers: 32,
            num_attention_heads: 32,
            num_key_value_heads: Some(8),
            max_position_embeddings: 200000,
            model_type: "claude-3-sonnet".to_string(),
            ..Self::default()
        }
    }

    /// Preset labelled `"claude-3-opus"` (64 attention heads, 8 key/value heads).
    pub fn claude_3_opus() -> Self {
        Self {
            vocab_size: 100352,
            hidden_size: 8192,
            intermediate_size: 22016,
            num_hidden_layers: 64,
            num_attention_heads: 64,
            num_key_value_heads: Some(8),
            max_position_embeddings: 200000,
            model_type: "claude-3-opus".to_string(),
            ..Self::default()
        }
    }

    /// Preset labelled `"claude-3.5-sonnet"`; the only preset that overrides
    /// `rope_theta` (to `500000.0`).
    pub fn claude_3_5_sonnet() -> Self {
        Self {
            vocab_size: 100352,
            hidden_size: 4096,
            intermediate_size: 11008,
            num_hidden_layers: 32,
            num_attention_heads: 32,
            num_key_value_heads: Some(8),
            max_position_embeddings: 200000,
            rope_theta: 500000.0,
            model_type: "claude-3.5-sonnet".to_string(),
            ..Self::default()
        }
    }

    /// Size of one attention head: `hidden_size / num_attention_heads`
    /// (integer division).
    ///
    /// # Panics
    ///
    /// Panics if `num_attention_heads` is zero.
    pub fn head_dim(&self) -> usize {
        self.hidden_size / self.num_attention_heads
    }

    /// Number of key/value heads, falling back to `num_attention_heads` when
    /// `num_key_value_heads` is `None`.
    pub fn num_kv_heads(&self) -> usize {
        self.num_key_value_heads.unwrap_or(self.num_attention_heads)
    }

    /// Number of attention heads per key/value head:
    /// `num_attention_heads / num_kv_heads()` (integer division).
    ///
    /// # Panics
    ///
    /// Panics if [`Self::num_kv_heads`] is zero.
    pub fn num_query_groups(&self) -> usize {
        self.num_attention_heads / self.num_kv_heads()
    }

    /// Looks up a preset by name.
    ///
    /// Matching is exact and case-sensitive. Dated aliases are accepted for
    /// the `claude-3*` presets, and the 3.5 Sonnet preset is reachable under
    /// both the hyphenated spelling and the dotted `"claude-3.5-sonnet"`
    /// spelling that [`Self::claude_3_5_sonnet`] stores in `model_type`, so a
    /// preset's own `model_type` always resolves back to that preset.
    ///
    /// Returns `None` for any other name.
    pub fn from_pretrained_name(name: &str) -> Option<Self> {
        match name {
            "claude-1" => Some(Self::claude_1()),
            "claude-2" => Some(Self::claude_2()),
            "claude-2.1" => Some(Self::claude_2_1()),
            "claude-3-haiku" | "claude-3-haiku-20240307" => Some(Self::claude_3_haiku()),
            "claude-3-sonnet" | "claude-3-sonnet-20240229" => Some(Self::claude_3_sonnet()),
            "claude-3-opus" | "claude-3-opus-20240229" => Some(Self::claude_3_opus()),
            "claude-3-5-sonnet" | "claude-3.5-sonnet" | "claude-3-5-sonnet-20240620" => {
                Some(Self::claude_3_5_sonnet())
            }
            _ => None,
        }
    }

    /// Sets the `constitutional_ai` flag in place and returns `self` so calls
    /// can be chained.
    pub fn with_constitutional_ai(&mut self, enabled: bool) -> &mut Self {
        self.constitutional_ai = enabled;
        self
    }

    /// Overwrites the three constitutional weights in place and returns
    /// `self` so calls can be chained.
    ///
    /// The values are stored as given; nothing is validated here.
    pub fn with_constitutional_weights(
        &mut self,
        harmlessness: f32,
        helpfulness: f32,
        honesty: f32,
    ) -> &mut Self {
        self.harmlessness_weight = harmlessness;
        self.helpfulness_weight = helpfulness;
        self.honesty_weight = honesty;
        self
    }

    /// Tiny configuration labelled `"claude-test"` (2 layers, hidden size 64,
    /// 4 attention heads, 2 key/value heads).
    pub fn small_test_config() -> Self {
        Self {
            vocab_size: 1000,
            hidden_size: 64,
            intermediate_size: 128,
            num_hidden_layers: 2,
            num_attention_heads: 4,
            num_key_value_heads: Some(2),
            max_position_embeddings: 512,
            model_type: "claude-test".to_string(),
            ..Self::default()
        }
    }
}