use super::*;
use trustformers_core::{
layers::Embedding,
tensor::Tensor,
traits::{Config, Layer, Model},
};
#[test]
fn test_cogvlm_config_creation() {
    // The default config should carry the published 17B dimensions and validate.
    let cfg = CogVlmConfig::default();
    assert_eq!(cfg.hidden_size, 4096);
    assert_eq!(cfg.num_attention_heads, 32);
    assert_eq!(cfg.vision_config.hidden_size, 1792);
    assert!(cfg.validate().is_ok());
}
#[test]
fn test_cogvlm_chat_config() {
    // Chat preset: stage 2 with the "chat" prompt template.
    let cfg = CogVlmConfig::cogvlm_chat_17b();
    assert_eq!(cfg.model_type, "cogvlm-chat-17b");
    assert_eq!(cfg.cogvlm_stage, 2);
    assert_eq!(cfg.template_version, "chat");
    assert!(cfg.validate().is_ok());
}
#[test]
fn test_cogvlm_base_config() {
    // Base preset: stage 1 with the "base" prompt template.
    let cfg = CogVlmConfig::cogvlm_base_17b();
    assert_eq!(cfg.model_type, "cogvlm-base-17b");
    assert_eq!(cfg.cogvlm_stage, 1);
    assert_eq!(cfg.template_version, "base");
    assert!(cfg.validate().is_ok());
}
#[test]
fn test_cogvlm_grounding_config() {
    // Grounding preset uses its own prompt template.
    let cfg = CogVlmConfig::cogvlm_grounding_17b();
    assert_eq!(cfg.model_type, "cogvlm-grounding-17b");
    assert_eq!(cfg.template_version, "grounding");
    assert!(cfg.validate().is_ok());
}
#[test]
fn test_cogvideo_config() {
    // The CogVideo preset of the base config: video template plus a longer
    // context and a larger vision-token budget.
    let cfg = CogVlmConfig::cogvideo();
    assert_eq!(cfg.model_type, "cogvideo");
    assert_eq!(cfg.template_version, "video");
    assert_eq!(cfg.max_position_embeddings, 4096);
    assert_eq!(cfg.vision_token_num, 1024);
    assert!(cfg.validate().is_ok());
}
#[test]
fn test_cogvideo_full_config() {
    // Defaults of the full CogVideo config (frame count/stride, temporal depth).
    let cfg = CogVideoConfig::default();
    assert_eq!(cfg.video_frames, 16);
    assert_eq!(cfg.frame_stride, 2);
    assert_eq!(cfg.temporal_num_layers, 4);
    assert!(cfg.validate().is_ok());
}
#[test]
#[allow(clippy::field_reassign_with_default)]
fn test_config_validation() {
    let mut cfg = CogVlmConfig::default();

    // An undersized hidden dimension must be rejected.
    cfg.hidden_size = 100;
    assert!(cfg.validate().is_err());

    // Restoring the canonical value makes the config valid again.
    cfg.hidden_size = 4096;
    assert!(cfg.validate().is_ok());

    // Stage 3 is out of range; stage 2 is accepted.
    cfg.cogvlm_stage = 3;
    assert!(cfg.validate().is_err());
    cfg.cogvlm_stage = 2;
    assert!(cfg.validate().is_ok());
}
#[test]
fn test_from_pretrained_name() {
    // Every supported alias — including the HuggingFace-style path — resolves.
    for name in [
        "cogvlm-chat-17b",
        "THUDM/cogvlm-chat-hf",
        "cogvlm-base-17b",
        "cogvlm-grounding-17b",
        "cogvideo",
    ] {
        assert!(CogVlmConfig::from_pretrained_name(name).is_some());
    }
    // Unknown names yield None instead of panicking.
    assert!(CogVlmConfig::from_pretrained_name("invalid-model").is_none());
}
#[test]
fn test_config_head_dimensions() {
    let cfg = CogVlmConfig::default();
    // 4096 hidden / 32 attention heads = 128 per language head.
    assert_eq!(cfg.head_dim(), 128);
    // Per-head width of the vision tower (1792 hidden — head count set by the
    // default vision config; 112 is the pinned expectation).
    assert_eq!(cfg.vision_head_dim(), 112);
    // Default config uses no grouped-query sharing: KV heads == query heads.
    assert_eq!(cfg.num_kv_heads(), 32);
}
#[test]
fn test_config_num_patches() {
    // Verifies `num_patches()` = (image_size / patch_size)^2 for the default
    // config. The original test computed the expectation in `i32` with a lossy
    // `as usize` cast and used 490/14 as free-floating magic numbers; here the
    // defaults are pinned explicitly first, so a change to either default makes
    // the failure obvious, and the arithmetic is done in `usize` directly.
    let config = CogVlmConfig::default();
    assert_eq!(config.vision_config.image_size, 490);
    assert_eq!(config.vision_config.patch_size, 14);
    let expected_patches = (490_usize / 14).pow(2); // 35^2 = 1225
    assert_eq!(config.num_patches(), expected_patches);
}
#[test]
fn test_config_with_lora() {
    let mut cfg = CogVlmConfig::default();
    // Enabling LoRA should flip the flag and record the requested rank.
    cfg.with_lora(true, Some(16));
    assert!(cfg.use_lora);
    assert_eq!(cfg.lora_rank, Some(16));
}
#[test]
fn test_config_with_vision_tokens() {
    let mut cfg = CogVlmConfig::default();
    // The setter overrides the default vision-token budget in place.
    cfg.with_vision_tokens(512);
    assert_eq!(cfg.vision_token_num, 512);
}
#[test]
fn test_config_with_stage() {
    let mut cfg = CogVlmConfig::default();
    // Set stage 1 together with the "base" prompt template.
    cfg.with_stage(1, "base");
    assert_eq!(cfg.cogvlm_stage, 1);
    assert_eq!(cfg.template_version, "base");
}
#[test]
fn test_vision_config_validation() {
    let cfg = CogVlmVisionConfig::default();
    // Attention heads must evenly divide the hidden size.
    assert_eq!(cfg.hidden_size % cfg.num_attention_heads, 0);
    // At least one patch must fit in the image, and the tower needs depth.
    assert!(cfg.image_size >= cfg.patch_size);
    assert!(cfg.num_hidden_layers > 0);
}
#[test]
fn test_cogvlm_model_creation() {
    // Construction with the small test config must succeed.
    let cfg = CogVlmConfig::small_test_config();
    let built = CogVlmModel::new(cfg);
    assert!(
        built.is_ok(),
        "Failed to create CogVLM model: {:?}",
        built.err()
    );
}
#[test]
fn test_cogvideo_model_creation() {
    // Shrink the temporal tower so construction stays cheap in tests.
    let cfg = CogVideoConfig {
        base_config: CogVlmConfig::small_test_config(),
        temporal_hidden_size: 64,
        temporal_num_layers: 1,
        ..CogVideoConfig::default()
    };
    let built = CogVideoModel::new(cfg);
    assert!(
        built.is_ok(),
        "Failed to create CogVideo model: {:?}",
        built.err()
    );
}
#[test]
fn test_vision_transformer_creation() {
    // A deliberately tiny tower keeps construction fast.
    let cfg = CogVlmVisionConfig {
        hidden_size: 64,
        num_hidden_layers: 1,
        num_attention_heads: 4,
        image_size: 56,
        ..CogVlmVisionConfig::default()
    };
    let built = CogVlmVisionTransformer::new(cfg);
    assert!(
        built.is_ok(),
        "Failed to create vision transformer: {:?}",
        built.err()
    );
}
#[test]
fn test_visual_expert_creation() {
    // Constructing the visual-expert module from the test config must succeed.
    let cfg = CogVlmConfig::small_test_config();
    let built = VisualExpert::new(cfg);
    assert!(
        built.is_ok(),
        "Failed to create visual expert: {:?}",
        built.err()
    );
}
#[test]
fn test_temporal_encoder_creation() {
    // Minimal temporal stack: one layer, 64-dim hidden state.
    let cfg = CogVideoConfig {
        temporal_hidden_size: 64,
        temporal_num_layers: 1,
        ..CogVideoConfig::default()
    };
    let built = TemporalEncoder::new(cfg);
    assert!(
        built.is_ok(),
        "Failed to create temporal encoder: {:?}",
        built.err()
    );
}
#[test]
fn test_cogvlm_forward_text_only() {
    // Text-only forward pass: all vision-related inputs left as None.
    let mut cfg = CogVlmConfig::small_test_config();
    cfg.hidden_size = 128;
    cfg.num_attention_heads = 8;
    let model = CogVlmModel::new(cfg).expect("operation failed");

    let input_ids = Tensor::zeros(&[1, 10]).expect("operation failed");
    let input = CogVlmInput {
        pixel_values: None,
        input_ids,
        attention_mask: None,
        position_ids: None,
        token_type_ids: None,
        images_seq_mask: None,
        images_emb_mask: None,
    };

    let result = model.forward(input);
    assert!(result.is_ok(), "Forward pass failed: {:?}", result.err());

    // Expected output shape: [batch=1, seq=10, hidden_size].
    let output = result.expect("operation failed");
    assert_eq!(output.last_hidden_state.shape()[0], 1);
    assert_eq!(output.last_hidden_state.shape()[1], 10);
    assert_eq!(
        output.last_hidden_state.shape()[2],
        model.get_config().hidden_size
    );
}
#[test]
fn test_cogvlm_forward_with_vision() {
    // Forward pass with pixel values and an image-embedding mask supplied.
    let mut config = CogVlmConfig::small_test_config();
    config.hidden_size = 128;
    config.num_attention_heads = 8;
    config.cross_hidden_size = 128;
    config.vision_config.hidden_size = 128;
    config.vision_config.num_attention_heads = 8;
    config.vision_config.image_size = 56;
    // BUGFIX: apply the patch-size override BEFORE the model is constructed.
    // The original test assigned it after `CogVlmModel::new(config.clone())`,
    // so the model's copy of the config never carried the override and only
    // the local token-count arithmetic below saw it.
    config.vision_config.patch_size = 14;
    let model = CogVlmModel::new(config.clone()).expect("operation failed");

    // One token per 14x14 patch of the 56x56 image, plus the class token.
    let vision_tokens =
        (config.vision_config.image_size / config.vision_config.patch_size).pow(2) + 1;
    let seq_len = vision_tokens;
    let input_ids = Tensor::zeros(&[1, seq_len]).expect("operation failed");
    let pixel_values = Tensor::zeros(&[1, 3, 56, 56]).expect("operation failed");
    let images_emb_mask = Tensor::zeros(&[1, seq_len]).expect("operation failed");
    let input = CogVlmInput {
        pixel_values: Some(pixel_values),
        input_ids,
        attention_mask: None,
        position_ids: None,
        token_type_ids: None,
        images_seq_mask: None,
        images_emb_mask: Some(images_emb_mask),
    };

    let result = model.forward(input);
    assert!(
        result.is_ok(),
        "Forward pass with vision failed: {:?}",
        result.err()
    );

    // Output keeps [batch, seq_len] and projects into the language hidden size.
    let output = result.expect("operation failed");
    assert_eq!(output.last_hidden_state.shape()[0], 1);
    assert_eq!(output.last_hidden_state.shape()[1], seq_len);
    assert_eq!(output.last_hidden_state.shape()[2], config.hidden_size);
}
#[test]
fn test_cogvideo_forward() {
    let cfg = CogVideoConfig {
        base_config: CogVlmConfig::small_test_config(),
        temporal_hidden_size: 64,
        temporal_num_layers: 1,
        ..CogVideoConfig::default()
    };
    let model = CogVideoModel::new(cfg.clone()).expect("operation failed");

    // Two 56x56 RGB frames in a single-element batch, four text tokens.
    let input_ids = Tensor::zeros(&[1, 4]).expect("operation failed");
    let video_frames = Tensor::zeros(&[1, 2, 3, 56, 56]).expect("operation failed");
    let input = CogVideoInput {
        video_frames,
        input_ids,
        attention_mask: None,
        position_ids: None,
        token_type_ids: None,
    };

    let result = model.forward(input);
    assert!(
        result.is_ok(),
        "CogVideo forward pass failed: {:?}",
        result.err()
    );

    // Batch dimension is preserved and logits span the full vocabulary.
    let output = result.expect("operation failed");
    assert_eq!(output.last_hidden_state.shape()[0], 1);
    assert_eq!(output.logits.shape()[2], cfg.base_config.vocab_size);
}
#[test]
fn test_vision_transformer_forward() {
    let cfg = CogVlmVisionConfig {
        hidden_size: 128,
        num_attention_heads: 8,
        image_size: 56,
        patch_size: 14,
        ..CogVlmVisionConfig::default()
    };
    let vision_model = CogVlmVisionTransformer::new(cfg.clone()).expect("operation failed");

    let pixel_values = Tensor::zeros(&[1, 3, 56, 56]).expect("operation failed");
    let result = vision_model.forward(pixel_values);
    assert!(
        result.is_ok(),
        "Vision transformer forward failed: {:?}",
        result.err()
    );

    // (56 / 14)^2 patch tokens plus one class token.
    let output = result.expect("operation failed");
    let expected_seq_len = (cfg.image_size / cfg.patch_size).pow(2) + 1;
    assert_eq!(output.shape()[0], 1);
    assert_eq!(output.shape()[1], expected_seq_len);
    assert_eq!(output.shape()[2], cfg.hidden_size);
}
#[test]
fn test_visual_expert_forward() {
    let cfg = CogVlmConfig::small_test_config();
    let expert = VisualExpert::new(cfg.clone()).expect("operation failed");

    // Language stream: 10 tokens; vision stream: 256 tokens.
    let lang_hidden = Tensor::zeros(&[1, 10, cfg.hidden_size]).expect("operation failed");
    let vision_hidden =
        Tensor::zeros(&[1, 256, cfg.vision_config.hidden_size]).expect("operation failed");

    let result = expert.forward((lang_hidden, vision_hidden));
    assert!(
        result.is_ok(),
        "Visual expert forward failed: {:?}",
        result.err()
    );

    // Output keeps the language-stream sequence length and hidden size.
    let output = result.expect("operation failed");
    assert_eq!(output.shape()[0], 1);
    assert_eq!(output.shape()[1], 10);
    assert_eq!(output.shape()[2], cfg.hidden_size);
}
#[test]
fn test_temporal_encoder_forward() {
    let cfg = CogVideoConfig {
        temporal_hidden_size: 64,
        temporal_num_layers: 1,
        ..CogVideoConfig::default()
    };
    let encoder = TemporalEncoder::new(cfg.clone()).expect("operation failed");

    // One batch of two 56x56 RGB frames.
    let video_frames = Tensor::zeros(&[1, 2, 3, 56, 56]).expect("operation failed");
    let result = encoder.forward(video_frames);
    assert!(
        result.is_ok(),
        "Temporal encoder forward failed: {:?}",
        result.err()
    );

    // One embedding per frame: [batch=1, frames=2, temporal_hidden_size].
    let output = result.expect("operation failed");
    assert_eq!(output.shape()[0], 1);
    assert_eq!(output.shape()[1], 2);
    assert_eq!(output.shape()[2], cfg.temporal_hidden_size);
}
#[test]
fn test_model_info() {
    // Image-only model: correct display name, no video support, 17B params.
    let info = model_info("cogvlm-chat-17b").expect("operation failed");
    assert_eq!(info.name, "CogVLM-Chat-17B");
    assert!(!info.supports_video);
    assert_eq!(info.parameters, "17B");

    // Video model: supports video and reports its context window.
    let video_info = model_info("cogvideo").expect("operation failed");
    assert!(video_info.supports_video);
    assert_eq!(video_info.context_length, 4096);
}
#[test]
fn test_available_models() {
    let models = available_models();
    // Both short names and the HF-style alias must be listed.
    assert!(models.contains(&"cogvlm-chat-17b"));
    assert!(models.contains(&"cogvideo"));
    assert!(models.contains(&"THUDM/cogvlm-chat-hf"));
    // The registry is expected to expose at least eight entries.
    assert!(models.len() >= 8);
}
#[test]
// NOTE(review): kept `#[ignore]` — presumably `from_pretrained` needs real
// weights or network access; confirm before enabling in normal runs.
#[ignore]
fn test_convenience_functions() {
    // Known names load; unknown names surface an error.
    assert!(from_pretrained("cogvlm-chat-17b").is_ok());
    assert!(from_pretrained("invalid-model").is_err());
    assert!(cogvideo_from_pretrained("cogvideo").is_ok());
    assert!(cogvideo_from_pretrained("invalid-model").is_err());
}
#[test]
fn test_vision_encoder_standalone() {
    // The free-standing `vision_encoder` helper builds a usable encoder.
    let cfg = CogVlmVisionConfig {
        hidden_size: 64,
        num_hidden_layers: 1,
        num_attention_heads: 4,
        image_size: 56,
        ..CogVlmVisionConfig::default()
    };
    let encoder = vision_encoder(cfg).expect("operation failed");
    let pixel_values = Tensor::zeros(&[1, 3, 56, 56]).expect("operation failed");
    assert!(encoder.forward(pixel_values).is_ok());
}
#[test]
fn test_visual_expert_standalone() {
    // The free-standing `visual_expert` helper accepts paired hidden states.
    let cfg = CogVlmConfig::small_test_config();
    let expert = visual_expert(cfg.clone()).expect("operation failed");
    let lang_hidden = Tensor::zeros(&[1, 10, cfg.hidden_size]).expect("operation failed");
    let vision_hidden =
        Tensor::zeros(&[1, 256, cfg.vision_config.hidden_size]).expect("operation failed");
    assert!(expert.forward((lang_hidden, vision_hidden)).is_ok());
}
#[test]
fn test_empty_vision_input() {
    let mut cfg = CogVlmConfig::small_test_config();
    cfg.hidden_size = 128;
    cfg.num_attention_heads = 8;
    cfg.vision_config.hidden_size = 128;
    cfg.vision_config.num_attention_heads = 8;

    // Sanity-check the raw embedding layer on a handful of token ids first.
    let embeddings =
        Embedding::new(cfg.vocab_size, cfg.hidden_size, None).expect("operation failed");
    let test_tokens = vec![1u32, 2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32];
    let embedding_result = embeddings.forward(test_tokens);
    assert!(
        embedding_result.is_ok(),
        "Embedding failed: {:?}",
        embedding_result.err()
    );

    // Then run the full model with every vision-related field left empty.
    let model = CogVlmModel::new(cfg.clone()).expect("operation failed");
    let input_ids = Tensor::zeros(&[1, 8]).expect("operation failed");
    let input = CogVlmInput {
        pixel_values: None,
        input_ids,
        attention_mask: None,
        position_ids: None,
        token_type_ids: None,
        images_seq_mask: None,
        images_emb_mask: None,
    };
    let result = model.forward(input);
    assert!(
        result.is_ok(),
        "Empty vision input forward pass failed: {:?}",
        result.err()
    );
}
#[test]
fn test_batch_processing() {
    let mut cfg = CogVlmConfig::small_test_config();
    cfg.hidden_size = 128;
    cfg.num_attention_heads = 8;
    cfg.vision_config.hidden_size = 128;
    cfg.vision_config.num_attention_heads = 8;
    cfg.vision_config.image_size = 56;
    let model = CogVlmModel::new(cfg).expect("operation failed");

    let batch_size = 1;
    let seq_len = 4;
    let input_ids = Tensor::zeros(&[batch_size, seq_len]).expect("operation failed");
    let input = CogVlmInput {
        pixel_values: None,
        input_ids,
        attention_mask: None,
        position_ids: None,
        token_type_ids: None,
        images_seq_mask: None,
        images_emb_mask: None,
    };

    let result = model.forward(input);
    assert!(result.is_ok());

    // Batch and sequence dimensions must survive the forward pass.
    let output = result.expect("operation failed");
    assert_eq!(output.last_hidden_state.shape()[0], batch_size);
    assert_eq!(output.last_hidden_state.shape()[1], seq_len);
}
#[test]
fn test_config_architecture_name() {
    // Each config type reports its own architecture identifier.
    assert_eq!(CogVlmConfig::default().architecture(), "CogVLM");
    assert_eq!(CogVideoConfig::default().architecture(), "CogVideo");
}