use serde::{Deserialize, Serialize};
use std::fmt;
use trustformers_core::traits::Config;
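/// Errors produced while configuring or running a Llama 3.2 model. Every
/// variant renders a human-readable message through the `Display`
/// implementation below.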
#[derive(Debug)]
pub enum Llama32Error {
    /// The configuration failed validation; the message names the offending field.
    InvalidConfig(String),
    /// A vision tensor had a shape other than the expected one.
    VisionShapeMismatch { expected: usize, got: usize },
    /// A raw pixel buffer had an unexpected length.
    PixelBufferSize { expected: usize, got: usize },
    /// A cross-attention layer index pointed past the last decoder layer.
    CrossAttentionIndexOutOfRange { index: usize, num_layers: usize },
    /// A low-level tensor operation failed.
    TensorOp(String),
    /// A requested code path has not been implemented yet.
    NotImplemented(String),
}
impl fmt::Display for Llama32Error {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Self::InvalidConfig(msg) => write!(f, "Llama32 invalid config: {msg}"),
Self::VisionShapeMismatch { expected, got } => {
write!(f, "vision shape mismatch: expected {expected}, got {got}")
},
Self::PixelBufferSize { expected, got } => {
write!(f, "pixel buffer size: expected {expected}, got {got}")
},
Self::CrossAttentionIndexOutOfRange { index, num_layers } => {
write!(
f,
"cross-attention layer index {index} out of range for {num_layers} layers"
)
},
Self::TensorOp(msg) => write!(f, "tensor op error: {msg}"),
Self::NotImplemented(msg) => write!(f, "not implemented: {msg}"),
}
}
}
impl std::error::Error for Llama32Error {}
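/// Configuration for the Llama 3.2 model family, covering the text decoder,
/// the vision tower, and the cross-attention layers that fuse them.
///
/// A minimal usage sketch (paths assumed; adjust to wherever this module is
/// re-exported):
///
/// ```ignore
/// let cfg = Llama32Config::llama32_3b();
/// assert!(cfg.validate_internal().is_ok());
/// // Head geometry is self-consistent in every preset:
/// assert_eq!(cfg.num_attention_heads * cfg.head_dim, cfg.hidden_size);
/// ```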
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Llama32Config {
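    // Text decoder geometry and RoPE settings.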
pub vocab_size: usize,
pub hidden_size: usize,
pub intermediate_size: usize,
pub num_hidden_layers: usize,
pub num_attention_heads: usize,
pub num_key_value_heads: usize,
pub head_dim: usize,
pub max_position_embeddings: usize,
pub rms_norm_eps: f64,
pub rope_theta: f64,
pub rope_scaling_factor: f32,
pub use_scaled_rope: bool,
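    // Vision tower geometry.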
pub vision_hidden_size: usize,
pub vision_num_attention_heads: usize,
pub vision_num_hidden_layers: usize,
pub vision_intermediate_size: usize,
pub image_size: usize,
pub patch_size: usize,
pub num_patches: usize,
pub vision_output_dim: usize,
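    // Fusion between text and vision (which decoder layers cross-attend, and
    // the number of global vision layers).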
pub cross_attention_layers: Vec<usize>,
pub num_global_layers: usize,
}
impl Default for Llama32Config {
fn default() -> Self {
Self::llama32_3b()
}
}
impl Config for Llama32Config {
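    /// Delegates to `validate_internal`, converting the model-local
    /// `Llama32Error` into the crate-wide error type.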
fn validate(&self) -> trustformers_core::errors::Result<()> {
self.validate_internal().map_err(|e| {
trustformers_core::errors::TrustformersError::invalid_config(e.to_string())
})
}
fn architecture(&self) -> &'static str {
"Llama-3.2"
}
}
impl Llama32Config {
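    /// Preset mirroring the Llama 3.2 3B text geometry: 28 layers, a hidden
    /// size of 3072 split across 24 query heads and 8 KV heads (3:1
    /// grouped-query attention), and a 128K-token context window.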
pub fn llama32_3b() -> Self {
let num_hidden_layers = 28;
let image_size = 560;
let patch_size = 14;
let num_patches = (image_size / patch_size) * (image_size / patch_size);
let hidden_size = 3072_usize;
let cross_attention_layers = Self::default_cross_attention_layers(num_hidden_layers);
Self {
vocab_size: 128256,
hidden_size,
intermediate_size: 8192,
num_hidden_layers,
num_attention_heads: 24,
num_key_value_heads: 8,
head_dim: 128,
max_position_embeddings: 131072,
rms_norm_eps: 1e-5,
rope_theta: 500000.0,
rope_scaling_factor: 32.0,
use_scaled_rope: true,
vision_hidden_size: 1280,
vision_num_attention_heads: 16,
vision_num_hidden_layers: 32,
vision_intermediate_size: 5120,
image_size,
patch_size,
num_patches,
            vision_output_dim: 6 * hidden_size,
            cross_attention_layers,
num_global_layers: 8,
}
}
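    /// Preset mirroring the Llama 3.2 11B (Vision) text geometry: 32 layers,
    /// a hidden size of 4096 split across 32 query heads and 8 KV heads.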
pub fn llama32_11b() -> Self {
let num_hidden_layers = 32;
let image_size = 560;
let patch_size = 14;
let num_patches = (image_size / patch_size) * (image_size / patch_size);
let hidden_size = 4096_usize;
let cross_attention_layers = Self::default_cross_attention_layers(num_hidden_layers);
Self {
vocab_size: 128256,
hidden_size,
intermediate_size: 14336,
num_hidden_layers,
num_attention_heads: 32,
num_key_value_heads: 8,
head_dim: 128,
max_position_embeddings: 131072,
rms_norm_eps: 1e-5,
rope_theta: 500000.0,
rope_scaling_factor: 32.0,
use_scaled_rope: true,
vision_hidden_size: 1280,
vision_num_attention_heads: 16,
vision_num_hidden_layers: 32,
vision_intermediate_size: 5120,
image_size,
patch_size,
num_patches,
            vision_output_dim: 6 * hidden_size,
            cross_attention_layers,
num_global_layers: 8,
}
}
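    /// Deliberately tiny configuration for unit tests: every dimension is
    /// shrunk, but the divisibility invariants enforced by
    /// `validate_internal` still hold.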
pub fn small_test() -> Self {
let num_hidden_layers = 4;
let image_size = 28;
let patch_size = 14;
let num_patches = (image_size / patch_size) * (image_size / patch_size);
let hidden_size = 64_usize;
let cross_attention_layers = Self::default_cross_attention_layers(num_hidden_layers);
Self {
vocab_size: 256,
hidden_size,
intermediate_size: 128,
num_hidden_layers,
num_attention_heads: 4,
num_key_value_heads: 2,
head_dim: 16,
max_position_embeddings: 64,
rms_norm_eps: 1e-5,
rope_theta: 500000.0,
rope_scaling_factor: 32.0,
use_scaled_rope: true,
vision_hidden_size: 32,
vision_num_attention_heads: 2,
vision_num_hidden_layers: 2,
vision_intermediate_size: 64,
image_size,
patch_size,
num_patches,
            vision_output_dim: 6 * hidden_size,
            cross_attention_layers,
num_global_layers: 1,
}
}
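    /// Number of non-overlapping `patch_size` x `patch_size` patches tiling a
    /// square image of side `image_size` (integer division truncates any
    /// remainder). For example, `num_patches(560, 14)` is `40 * 40 = 1600`.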
pub fn num_patches(image_size: usize, patch_size: usize) -> usize {
let side = image_size / patch_size;
side * side
}
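    /// Inserts a cross-attention layer after every fourth decoder layer: for
    /// `num_layers = 28` this yields `[3, 7, 11, 15, 19, 23, 27]`.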
pub fn default_cross_attention_layers(num_layers: usize) -> Vec<usize> {
(0..num_layers).filter(|&i| (i + 1) % 4 == 0).collect()
}
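    /// Checks the structural invariants of the configuration: non-zero core
    /// dimensions, the head-count divisibility required by grouped-query
    /// attention, a patch grid that tiles the image exactly, and in-range
    /// cross-attention indices. Returns the first violation encountered.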
pub fn validate_internal(&self) -> Result<(), Llama32Error> {
if self.hidden_size == 0 {
return Err(Llama32Error::InvalidConfig(
"hidden_size must be > 0".to_string(),
));
}
if !self.hidden_size.is_multiple_of(self.num_attention_heads) {
return Err(Llama32Error::InvalidConfig(format!(
"hidden_size ({}) must be divisible by num_attention_heads ({})",
self.hidden_size, self.num_attention_heads
)));
}
if !self.num_attention_heads.is_multiple_of(self.num_key_value_heads) {
return Err(Llama32Error::InvalidConfig(format!(
"num_attention_heads ({}) must be divisible by num_key_value_heads ({})",
self.num_attention_heads, self.num_key_value_heads
)));
}
if self.vocab_size == 0 {
return Err(Llama32Error::InvalidConfig(
"vocab_size must be > 0".to_string(),
));
}
if self.num_hidden_layers == 0 {
return Err(Llama32Error::InvalidConfig(
"num_hidden_layers must be > 0".to_string(),
));
}
if self.intermediate_size == 0 {
return Err(Llama32Error::InvalidConfig(
"intermediate_size must be > 0".to_string(),
));
}
if self.vision_hidden_size == 0 {
return Err(Llama32Error::InvalidConfig(
"vision_hidden_size must be > 0".to_string(),
));
}
if self.patch_size == 0 {
return Err(Llama32Error::InvalidConfig(
"patch_size must be > 0".to_string(),
));
}
if self.image_size == 0 {
return Err(Llama32Error::InvalidConfig(
"image_size must be > 0".to_string(),
));
}
if !self.image_size.is_multiple_of(self.patch_size) {
return Err(Llama32Error::InvalidConfig(format!(
"image_size ({}) must be divisible by patch_size ({})",
self.image_size, self.patch_size
)));
}
if self.vision_num_attention_heads == 0 {
return Err(Llama32Error::InvalidConfig(
"vision_num_attention_heads must be > 0".to_string(),
));
}
if !self.vision_hidden_size.is_multiple_of(self.vision_num_attention_heads) {
return Err(Llama32Error::InvalidConfig(format!(
"vision_hidden_size ({}) must be divisible by vision_num_attention_heads ({})",
self.vision_hidden_size, self.vision_num_attention_heads
)));
}
for &idx in &self.cross_attention_layers {
if idx >= self.num_hidden_layers {
return Err(Llama32Error::CrossAttentionIndexOutOfRange {
index: idx,
num_layers: self.num_hidden_layers,
});
}
}
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_llama32_default_is_3b() {
let cfg = Llama32Config::default();
assert_eq!(cfg.vocab_size, 128256);
assert_eq!(cfg.hidden_size, 3072);
}
#[test]
fn test_llama32_3b_preset_fields() {
let cfg = Llama32Config::llama32_3b();
assert_eq!(cfg.num_hidden_layers, 28);
assert_eq!(cfg.num_attention_heads, 24);
assert_eq!(cfg.num_key_value_heads, 8);
assert_eq!(cfg.head_dim, 128);
}
#[test]
fn test_llama32_11b_preset_fields() {
let cfg = Llama32Config::llama32_11b();
assert_eq!(cfg.hidden_size, 4096);
assert_eq!(cfg.num_hidden_layers, 32);
assert_eq!(cfg.num_attention_heads, 32);
}
#[test]
fn test_llama32_small_test_config() {
let cfg = Llama32Config::small_test();
assert_eq!(cfg.vocab_size, 256);
assert_eq!(cfg.hidden_size, 64);
assert_eq!(cfg.num_hidden_layers, 4);
}
#[test]
fn test_llama32_validate_passes_3b() {
let cfg = Llama32Config::llama32_3b();
assert!(cfg.validate_internal().is_ok());
}
#[test]
fn test_llama32_validate_passes_11b() {
let cfg = Llama32Config::llama32_11b();
assert!(cfg.validate_internal().is_ok());
}
#[test]
fn test_llama32_validate_fails_zero_hidden_size() {
let cfg = Llama32Config {
hidden_size: 0,
..Llama32Config::small_test()
};
assert!(cfg.validate_internal().is_err());
}
#[test]
fn test_llama32_validate_fails_zero_vocab_size() {
let cfg = Llama32Config {
vocab_size: 0,
..Llama32Config::small_test()
};
assert!(cfg.validate_internal().is_err());
}
#[test]
fn test_llama32_validate_fails_hidden_not_divisible_by_heads() {
let cfg = Llama32Config {
hidden_size: 63,
num_attention_heads: 4,
..Llama32Config::small_test()
};
assert!(cfg.validate_internal().is_err());
}
#[test]
fn test_llama32_validate_fails_heads_not_divisible_by_kv_heads() {
let cfg = Llama32Config {
num_attention_heads: 4,
num_key_value_heads: 3,
..Llama32Config::small_test()
};
assert!(cfg.validate_internal().is_err());
}
#[test]
fn test_llama32_validate_fails_image_not_divisible_by_patch() {
let cfg = Llama32Config {
image_size: 30,
patch_size: 14,
..Llama32Config::small_test()
};
assert!(cfg.validate_internal().is_err());
}
#[test]
fn test_llama32_patch_calculation() {
let n = Llama32Config::num_patches(560, 14);
assert_eq!(n, 40 * 40);
}
#[test]
fn test_llama32_cross_attention_layers_28() {
let layers = Llama32Config::default_cross_attention_layers(28);
assert!(layers.contains(&3));
assert!(layers.contains(&7));
assert!(layers.contains(&27));
assert!(!layers.contains(&0));
}
#[test]
fn test_llama32_max_position_embeddings_3b() {
let cfg = Llama32Config::llama32_3b();
assert_eq!(cfg.max_position_embeddings, 131072);
}
#[test]
fn test_llama32_use_scaled_rope_3b() {
let cfg = Llama32Config::llama32_3b();
assert!(cfg.use_scaled_rope);
assert!((cfg.rope_scaling_factor - 32.0).abs() < 1e-6);
}
#[test]
fn test_llama32_vision_output_dim_3b() {
let cfg = Llama32Config::llama32_3b();
assert_eq!(cfg.vision_output_dim, 6 * cfg.hidden_size);
}
#[test]
fn test_llama32_architecture_name() {
let cfg = Llama32Config::default();
assert_eq!(cfg.architecture(), "Llama-3.2");
}
#[test]
fn test_llama32_num_global_layers_3b() {
let cfg = Llama32Config::llama32_3b();
assert_eq!(cfg.num_global_layers, 8);
}
#[test]
fn test_llama32_vision_heads_3b() {
let cfg = Llama32Config::llama32_3b();
assert_eq!(cfg.vision_num_attention_heads, 16);
assert_eq!(cfg.vision_num_hidden_layers, 32);
}
#[test]
fn test_llama32_validate_fails_cross_attn_out_of_range() {
let mut cfg = Llama32Config::small_test();
cfg.cross_attention_layers = vec![100];
assert!(cfg.validate_internal().is_err());
}
#[test]
fn test_llama32_small_test_validate_passes() {
let cfg = Llama32Config::small_test();
assert!(cfg.validate_internal().is_ok());
}
#[test]
fn test_llama32_lcg_values_in_range() {
        // One step of a 64-bit LCG using Knuth's MMIX constants, with the
        // state mapped into [0, 1) by taking it modulo 1000.
        let mut s = 42u64;
        s = s.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407);
        let v = (s % 1000) as f32 / 1000.0;
assert!((0.0..1.0).contains(&v));
}
#[test]
fn test_llama32_11b_vision_hidden_size() {
let cfg = Llama32Config::llama32_11b();
assert_eq!(cfg.vision_hidden_size, 1280);
assert_eq!(cfg.vision_intermediate_size, 5120);
}
}