pub mod config;
pub mod model;
#[cfg(test)]
mod model_tests;
#[cfg(test)]
mod tests;
pub use config::{CogVideoConfig, CogVlmConfig, CogVlmVisionConfig};
pub use model::{
CogVideoInput, CogVideoModel, CogVlmInput, CogVlmModel, CogVlmOutput, CogVlmVisionTransformer,
TemporalEncoder, VisualExpert,
};
use trustformers_core::errors::Result;
/// Builds a [`CogVlmModel`] from the `CogVlmConfig::cogvlm_chat_17b()` preset.
///
/// # Errors
///
/// Returns whatever error `CogVlmModel::new` reports for that configuration.
pub fn cogvlm_chat_17b() -> Result<CogVlmModel> {
    CogVlmModel::new(CogVlmConfig::cogvlm_chat_17b())
}
/// Builds a [`CogVlmModel`] from the `CogVlmConfig::cogvlm_base_17b()` preset.
///
/// # Errors
///
/// Returns whatever error `CogVlmModel::new` reports for that configuration.
pub fn cogvlm_base_17b() -> Result<CogVlmModel> {
    CogVlmModel::new(CogVlmConfig::cogvlm_base_17b())
}
/// Builds a [`CogVlmModel`] from the `CogVlmConfig::cogvlm_grounding_17b()` preset.
///
/// # Errors
///
/// Returns whatever error `CogVlmModel::new` reports for that configuration.
pub fn cogvlm_grounding_17b() -> Result<CogVlmModel> {
    CogVlmModel::new(CogVlmConfig::cogvlm_grounding_17b())
}
/// Builds a [`CogVideoModel`] from `CogVideoConfig::default()`.
///
/// # Errors
///
/// Returns whatever error `CogVideoModel::new` reports for the default configuration.
pub fn cogvideo() -> Result<CogVideoModel> {
    CogVideoModel::new(CogVideoConfig::default())
}
/// Builds a [`CogVlmModel`] for the configuration registered under `model_name`.
///
/// The name is resolved through `CogVlmConfig::from_pretrained_name`; the resulting
/// configuration is then handed to `CogVlmModel::new`.
///
/// # Errors
///
/// * An `invalid_config` error when `CogVlmConfig::from_pretrained_name` does not
///   recognise `model_name`.
/// * Any error returned by `CogVlmModel::new`.
pub fn from_pretrained(model_name: &str) -> Result<CogVlmModel> {
    match CogVlmConfig::from_pretrained_name(model_name) {
        Some(config) => CogVlmModel::new(config),
        None => Err(
            trustformers_core::errors::TrustformersError::invalid_config(format!(
                "Unknown model name: {}",
                model_name
            )),
        ),
    }
}
/// Builds a [`CogVideoModel`] when `model_name` refers to a CogVideo model.
///
/// A name is accepted when it contains the (case-sensitive) substring `"cogvideo"`.
/// Every accepted name yields the same model as [`cogvideo`]; the name is not used
/// to select a configuration.
///
/// NOTE(review): the substring check also accepts names such as `"my-cogvideo-fork"`;
/// confirm whether that looseness is intended.
///
/// # Errors
///
/// * An `invalid_config` error when `model_name` does not contain `"cogvideo"`.
/// * Any error returned by [`cogvideo`].
pub fn cogvideo_from_pretrained(model_name: &str) -> Result<CogVideoModel> {
    // Guard clause: reject anything that does not look like a CogVideo name.
    if !model_name.contains("cogvideo") {
        return Err(
            trustformers_core::errors::TrustformersError::invalid_config(format!(
                "Unknown CogVideo model name: {}",
                model_name
            )),
        );
    }

    cogvideo()
}
/// Builds a standalone [`CogVlmVisionTransformer`] from the given vision configuration.
///
/// # Errors
///
/// Returns whatever error `CogVlmVisionTransformer::new` reports for `config`.
pub fn vision_encoder(config: CogVlmVisionConfig) -> Result<CogVlmVisionTransformer> {
    let encoder = CogVlmVisionTransformer::new(config)?;
    Ok(encoder)
}
/// Builds a standalone [`VisualExpert`] from the given model configuration.
///
/// # Errors
///
/// Returns whatever error `VisualExpert::new` reports for `config`.
pub fn visual_expert(config: CogVlmConfig) -> Result<VisualExpert> {
    let expert = VisualExpert::new(config)?;
    Ok(expert)
}
/// Lists the model identifiers advertised by this module.
///
/// The list contains the four short names followed by the four `THUDM/...`
/// identifiers, in a fixed order.
pub fn available_models() -> Vec<&'static str> {
    // Short names first, then the `THUDM/...` identifiers.
    const MODEL_NAMES: [&str; 8] = [
        "cogvlm-chat-17b",
        "cogvlm-base-17b",
        "cogvlm-grounding-17b",
        "cogvideo",
        "THUDM/cogvlm-chat-hf",
        "THUDM/cogvlm-base-hf",
        "THUDM/cogvlm-grounding-generalist-hf",
        "THUDM/cogvideo-chat",
    ];

    MODEL_NAMES.to_vec()
}
/// Looks up the static metadata record for `model_name`.
///
/// Both the short names (e.g. `"cogvlm-chat-17b"`) and the corresponding
/// `THUDM/...` identifiers are recognised; matching is exact and case-sensitive.
///
/// Returns `None` for any other name.
pub fn model_info(model_name: &str) -> Option<ModelInfo> {
    let info = match model_name {
        "cogvlm-chat-17b" | "THUDM/cogvlm-chat-hf" => ModelInfo {
            name: "CogVLM-Chat-17B",
            description: "Conversational multimodal model for visual Q&A and chat",
            use_cases: vec!["Visual Q&A", "Image captioning", "Visual chat"],
            parameters: "17B",
            context_length: 2048,
            supports_video: false,
        },
        "cogvlm-base-17b" | "THUDM/cogvlm-base-hf" => ModelInfo {
            name: "CogVLM-Base-17B",
            description: "Base multimodal model for fine-tuning",
            use_cases: vec!["Fine-tuning", "Research", "Custom applications"],
            parameters: "17B",
            context_length: 2048,
            supports_video: false,
        },
        "cogvlm-grounding-17b" | "THUDM/cogvlm-grounding-generalist-hf" => ModelInfo {
            name: "CogVLM-Grounding-17B",
            description: "Specialized for visual grounding and object localization",
            use_cases: vec!["Visual grounding", "Object detection", "Spatial reasoning"],
            parameters: "17B",
            context_length: 2048,
            supports_video: false,
        },
        "cogvideo" | "THUDM/cogvideo-chat" => ModelInfo {
            name: "CogVideo",
            description: "Video understanding and generation model",
            use_cases: vec!["Video Q&A", "Video captioning", "Temporal reasoning"],
            parameters: "17B",
            context_length: 4096,
            supports_video: true,
        },
        // Unrecognised identifier: no metadata available.
        _ => return None,
    };

    Some(info)
}
/// Static, human-readable metadata describing one model variant.
///
/// Derives `PartialEq`/`Eq` so that records can be compared directly
/// (for example in tests or when de-duplicating lookups).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ModelInfo {
    /// Display name of the model.
    pub name: &'static str,
    /// One-line description of what the model is for.
    pub description: &'static str,
    /// Short labels for typical use cases.
    pub use_cases: Vec<&'static str>,
    /// Parameter count as a display string (e.g. `"17B"`).
    pub parameters: &'static str,
    /// Context length for the model (presumably in tokens; confirm against the configs).
    pub context_length: usize,
    /// Whether the model is flagged as supporting video.
    pub supports_video: bool,
}