use crate::llama3_2::config::{Llama32Config, Llama32Error};
use crate::llama3_2::model::Llama32VisionModel;
use trustformers_core::{errors::Result, layers::Linear, tensor::Tensor, traits::Layer};
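
/// Llama 3.2 conditional-generation wrapper: a [`Llama32VisionModel`] backbone
/// plus a `Linear` language-modeling head that maps hidden states to vocabulary
/// logits.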
pub struct Llama32ForConditionalGeneration {
    model: Llama32VisionModel,
    lm_head: Linear,
}

impl Llama32ForConditionalGeneration {
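    /// Builds the underlying [`Llama32VisionModel`] and the `hidden_size` x
    /// `vocab_size` LM head used to project hidden states to logits.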
    pub fn new(config: Llama32Config) -> Result<Self> {
        let lm_head = Linear::new_with_device(
            config.hidden_size,
            config.vocab_size,
            false,
            trustformers_core::device::Device::CPU,
        );
        let model = Llama32VisionModel::new(config)?;
        Ok(Self { model, lm_head })
    }
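
    /// Returns the configuration of the underlying model.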
    pub fn config(&self) -> &Llama32Config {
        self.model.config()
    }
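
    /// Total parameter count of the backbone plus the LM head.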
    pub fn parameter_count(&self) -> usize {
        self.model.parameter_count() + self.lm_head.parameter_count()
    }
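
    /// Encodes an RGB image given as a flat `height * width * 3` pixel buffer
    /// and returns the flattened vision features.
    ///
    /// Fails with [`Llama32Error::PixelBufferSize`] if the buffer length does
    /// not match the stated dimensions, or with [`Llama32Error::TensorOp`] if
    /// the encoder does not produce an `F32` tensor.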
    pub fn encode_image(
        &self,
        pixel_values: &[f32],
        height: usize,
        width: usize,
    ) -> std::result::Result<Vec<f32>, Llama32Error> {
        let expected = height * width * 3;
        if pixel_values.len() != expected {
            return Err(Llama32Error::PixelBufferSize {
                expected,
                got: pixel_values.len(),
            });
        }
        let tensor = self
            .model
            .encode_image(pixel_values, height, width)
            .map_err(|e| Llama32Error::TensorOp(e.to_string()))?;
        match tensor {
            Tensor::F32(arr) => Ok(arr.iter().copied().collect()),
            _ => Err(Llama32Error::TensorOp(
                "unexpected tensor dtype in vision encoder output".to_string(),
            )),
        }
    }
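
    /// Runs the backbone's multimodal forward pass over `input_ids` and the
    /// given image, then projects the hidden states through the LM head to
    /// vocabulary logits.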
    pub fn forward_multimodal(
        &self,
        input_ids: Vec<u32>,
        pixel_values: &[f32],
        height: usize,
        width: usize,
    ) -> Result<Tensor> {
        let hidden = self
            .model
            .forward_multimodal(input_ids, pixel_values, height, width)?;
        self.lm_head.forward(hidden)
    }
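
    /// Runs a text-only forward pass over `input_ids` and projects the hidden
    /// states through the LM head to vocabulary logits.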
    pub fn forward_text_only(&self, input_ids: Vec<u32>) -> Result<Tensor> {
        let hidden = self.model.forward_text_only(input_ids)?;
        self.lm_head.forward(hidden)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::llama3_2::config::Llama32Config;

    fn small_config() -> Llama32Config {
        Llama32Config::small_test()
    }

    #[test]
    fn test_construction_succeeds() {
        let result = Llama32ForConditionalGeneration::new(small_config());
        assert!(
            result.is_ok(),
            "Llama32ForConditionalGeneration must construct"
        );
    }

    #[test]
    fn test_config_accessor_vocab_size() {
        let cfg = small_config();
        let vocab = cfg.vocab_size;
        let model =
            Llama32ForConditionalGeneration::new(cfg).unwrap_or_else(|_| panic!("init failed"));
        assert_eq!(model.config().vocab_size, vocab);
    }

    #[test]
    fn test_parameter_count_nonzero() {
        let model = Llama32ForConditionalGeneration::new(small_config())
            .unwrap_or_else(|_| panic!("init failed"));
        assert!(model.parameter_count() > 0, "parameter_count must be > 0");
    }

    #[test]
    fn test_text_only_forward_succeeds() {
        let model = Llama32ForConditionalGeneration::new(small_config())
            .unwrap_or_else(|_| panic!("init failed"));
        let result = model.forward_text_only(vec![1u32, 2, 3]);
        assert!(result.is_ok(), "text-only forward must succeed");
    }

    #[test]
    fn test_text_only_forward_nonempty() {
        let model = Llama32ForConditionalGeneration::new(small_config())
            .unwrap_or_else(|_| panic!("init failed"));
        if let Ok(Tensor::F32(arr)) = model.forward_text_only(vec![0u32]).as_ref() {
            assert!(!arr.is_empty(), "output must be non-empty");
        }
    }

    #[test]
    fn test_text_only_forward_finite() {
        let model = Llama32ForConditionalGeneration::new(small_config())
            .unwrap_or_else(|_| panic!("init failed"));
        if let Ok(Tensor::F32(arr)) = model.forward_text_only(vec![1u32, 2]).as_ref() {
            for &v in arr.iter() {
                assert!(v.is_finite(), "logit {v} must be finite");
            }
        }
    }

    #[test]
    fn test_encode_image_correct_size() {
        let model = Llama32ForConditionalGeneration::new(small_config())
            .unwrap_or_else(|_| panic!("init failed"));
        let h = 4;
        let w = 4;
        let pixels = vec![0.5f32; h * w * 3];
        let result = model.encode_image(&pixels, h, w);
        assert!(
            result.is_ok(),
            "encode_image with correct size must succeed"
        );
    }
    #[test]
    fn test_encode_image_wrong_size_error() {
        let model = Llama32ForConditionalGeneration::new(small_config())
            .unwrap_or_else(|_| panic!("init failed"));
        // A 4 x 4 RGB image requires 48 values; 10 is deliberately wrong.
        let wrong_pixels = vec![0.5f32; 10];
        let result = model.encode_image(&wrong_pixels, 4, 4);
        assert!(result.is_err(), "wrong pixel buffer size must return error");
    }
    #[test]
    fn test_error_invalid_config_display() {
        let msg = format!("{}", Llama32Error::InvalidConfig("bad".to_string()));
        assert!(!msg.is_empty(), "error display must be non-empty");
        assert!(msg.contains("bad"), "must mention error detail");
    }

    #[test]
    fn test_error_pixel_buffer_size_display() {
        let msg = format!(
            "{}",
            Llama32Error::PixelBufferSize {
                expected: 48,
                got: 12
            }
        );
        assert!(msg.contains("48"), "must mention expected");
        assert!(msg.contains("12"), "must mention actual");
    }

    #[test]
    fn test_small_test_config_has_vision_fields() {
        let cfg = small_config();
        assert!(cfg.vision_hidden_size > 0, "vision_hidden_size must be > 0");
        assert!(cfg.image_size > 0, "image_size must be > 0");
    }
    #[test]
    fn test_num_patches_calculation() {
        let patches = Llama32Config::num_patches(224, 14);
        assert_eq!(patches, 256, "224 / 14 = 16 patches per side, 16 * 16 = 256");
    }
    #[test]
    fn test_default_cross_attention_layers() {
        let layers = Llama32Config::default_cross_attention_layers(8);
        assert!(
            !layers.is_empty(),
            "must return at least one cross-attention layer"
        );
        for &l in &layers {
            assert!(l < 8, "layer index {l} must be < num_layers 8");
        }
    }

    #[test]
    fn test_error_vision_shape_mismatch_display() {
        let msg = format!(
            "{}",
            Llama32Error::VisionShapeMismatch {
                expected: 512,
                got: 256
            }
        );
        assert!(msg.contains("512"), "must contain expected");
        assert!(msg.contains("256"), "must contain actual");
    }

    #[test]
    fn test_encode_image_output_nonempty() {
        let model = Llama32ForConditionalGeneration::new(small_config())
            .unwrap_or_else(|_| panic!("init failed"));
        let h = 28;
        let w = 28;
        let pixels = vec![0.1f32; h * w * 3];
        if let Ok(features) = model.encode_image(&pixels, h, w) {
            assert!(!features.is_empty(), "vision features must be non-empty");
        }
    }

    #[test]
    fn test_text_only_forward_deterministic() {
        let model = Llama32ForConditionalGeneration::new(small_config())
            .unwrap_or_else(|_| panic!("init failed"));
        let ids = vec![1u32, 2, 3];
        let r1 = model.forward_text_only(ids.clone());
        let r2 = model.forward_text_only(ids);
        if let (Ok(a), Ok(b)) = (r1, r2) {
            if let (Tensor::F32(arr_a), Tensor::F32(arr_b)) = (&a, &b) {
                let v1: Vec<f32> = arr_a.iter().copied().collect();
                let v2: Vec<f32> = arr_b.iter().copied().collect();
                assert_eq!(v1, v2, "forward must be deterministic");
            }
        }
    }

    #[test]
    fn test_error_tensor_op_display() {
        let msg = format!("{}", Llama32Error::TensorOp("matmul failed".to_string()));
        assert!(
            msg.contains("matmul"),
            "TensorOp display must include message"
        );
    }

    #[test]
    fn test_error_not_implemented_display() {
        let msg = format!(
            "{}",
            Llama32Error::NotImplemented("beam search".to_string())
        );
        assert!(
            msg.contains("beam search"),
            "NotImplemented must include feature name"
        );
    }
}