#![allow(clippy::all)]
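//! TensorRT backend demo for TrustformeRS.
//!
//! Walks through the TensorRT pipeline API: the preset configurations
//! (latency-, throughput-, and memory-optimized, plus production), a
//! fully custom configuration with profiling, text generation,
//! precision-mode comparison, the pipeline manager, the unified
//! pipeline API, and benchmarking.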
use std::path::PathBuf;
use std::time::Instant;
use trustformers::pipeline::tensorrt_backend::{
DeviceType, ExecutionStrategy, MemoryPoolType, OptimizationLevel, PrecisionMode,
};
use trustformers::pipeline::{
pipeline, tensorrt_text_classification_pipeline, tensorrt_text_generation_pipeline, Backend,
Device, Pipeline, PipelineOptions, TensorRTBackendConfig, TensorRTPipelineManager,
TensorRTPipelineOptions,
};
use trustformers::AutoTokenizer;
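/// Entry point: runs each configuration demo in sequence against the
/// local ONNX model paths hard-coded below.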
fn main() -> Result<(), Box<dyn std::error::Error>> {
println!("🚀 TrustformeRS TensorRT Backend Demo");
println!("====================================");
let model_path = PathBuf::from("./models/bert-base-uncased.onnx");
let generation_model_path = PathBuf::from("./models/gpt2.onnx");
let test_inputs = vec![
("This is fantastic!", "Simple positive sentiment"),
("I absolutely love this amazing product with its incredible features and outstanding quality.", "Complex positive sentiment"),
("The service was okay, not terrible but not great either.", "Neutral sentiment"),
("This product is terrible and I hate it completely.", "Negative sentiment"),
("The performance is excellent and the interface is very user-friendly.", "Positive technical review"),
];
println!("\n📊 Testing Different TensorRT Configurations");
println!("===========================================");
println!("\n1. 🏃 Latency-Optimized Configuration (FP16)");
demo_latency_optimized(&model_path, &test_inputs)?;
println!("\n2. 🚀 Throughput-Optimized Configuration (Batch Size 8)");
demo_throughput_optimized(&model_path, &test_inputs)?;
println!("\n3. 💾 Memory-Optimized Configuration (INT8)");
demo_memory_optimized(&model_path, &test_inputs)?;
println!("\n4. 🏭 Production Configuration");
demo_production_config(&model_path, &test_inputs)?;
println!("\n5. 🔧 Custom Configuration with Profiling");
demo_custom_config(&model_path, &test_inputs)?;
println!("\n6. 📝 Text Generation with TensorRT");
demo_text_generation(&generation_model_path)?;
println!("\n7. 🎯 Precision Mode Comparison");
demo_precision_comparison(&model_path, &test_inputs)?;
println!("\n8. 🎛️ Pipeline Manager Demo");
demo_pipeline_manager(&model_path, &generation_model_path)?;
println!("\n9. 🔗 Unified Pipeline API with TensorRT Backend");
demo_unified_api(&model_path)?;
println!("\n10. 📈 TensorRT Benchmarking");
demo_benchmarking(&model_path, &test_inputs)?;
println!("\n✅ TensorRT Backend Demo Complete!");
println!("================================");
Ok(())
}
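/// Demo 1: single-input inference with the latency-optimized preset
/// (FP16), printing per-call timing, performance, and memory metrics.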
fn demo_latency_optimized(
model_path: &PathBuf,
test_inputs: &[(&str, &str)],
) -> Result<(), Box<dyn std::error::Error>> {
println!(" Creating latency-optimized TensorRT pipeline...");
let config = TensorRTBackendConfig::latency_optimized(model_path.clone());
let tokenizer = AutoTokenizer::from_pretrained("bert-base-uncased")?;
let pipeline = tensorrt_text_classification_pipeline(model_path, tokenizer, Some(config))?;
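// `__call__` mirrors the Hugging Face Python pipeline invocation style.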
for (input_text, description) in test_inputs {
let start = Instant::now();
let result = pipeline.__call__(input_text.to_string())?;
let duration = start.elapsed();
println!(" 📝 {}: {}", description, input_text);
println!(" ⏱️ Time: {:.2}ms", duration.as_millis());
println!(" 📊 Result: {:?}", result);
let performance = pipeline.performance_metrics()?;
println!(" 🚀 Performance: {:?}", performance);
let memory_info = pipeline.memory_info()?;
println!(" 💾 Memory: {:?}", memory_info);
println!();
}
Ok(())
}
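/// Demo 2: batch inference with the throughput-optimized preset
/// (batch size 8), reporting total and per-item latency.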
fn demo_throughput_optimized(
model_path: &PathBuf,
test_inputs: &[(&str, &str)],
) -> Result<(), Box<dyn std::error::Error>> {
println!(" Creating throughput-optimized TensorRT pipeline...");
let config = TensorRTBackendConfig::throughput_optimized(model_path.clone(), 8);
let tokenizer = AutoTokenizer::from_pretrained("bert-base-uncased")?;
let pipeline = tensorrt_text_classification_pipeline(model_path, tokenizer, Some(config))?;
let batch_inputs: Vec<String> =
test_inputs.iter().map(|(input, _)| input.to_string()).collect();
let start = Instant::now();
let results = pipeline.batch(batch_inputs)?;
let duration = start.elapsed();
println!(" 📦 Batch processing {} inputs", test_inputs.len());
println!(" ⏱️ Total time: {:.2}ms", duration.as_millis());
println!(
" 📊 Average time per item: {:.2}ms",
duration.as_secs_f64() * 1000.0 / test_inputs.len() as f64
);
for (i, result) in results.iter().enumerate() {
println!(" 📝 Input {}: {:?}", i + 1, result);
}
Ok(())
}
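/// Demo 3: single-input inference with the memory-optimized preset
/// (INT8), printing the resulting memory usage.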
fn demo_memory_optimized(
model_path: &PathBuf,
test_inputs: &[(&str, &str)],
) -> Result<(), Box<dyn std::error::Error>> {
println!(" Creating memory-optimized TensorRT pipeline...");
let config = TensorRTBackendConfig::memory_optimized(model_path.clone());
let tokenizer = AutoTokenizer::from_pretrained("bert-base-uncased")?;
let pipeline = tensorrt_text_classification_pipeline(model_path, tokenizer, Some(config))?;
let test_input = test_inputs[0].0;
let start = Instant::now();
let result = pipeline.__call__(test_input.to_string())?;
let duration = start.elapsed();
println!(" 📝 Input: {}", test_input);
println!(" ⏱️ Time: {:.2}ms", duration.as_millis());
println!(" 📊 Result: {:?}", result);
let memory_info = pipeline.memory_info()?;
println!(" 💾 Memory Usage: {:?}", memory_info);
println!(" 🎯 Precision: INT8 (Memory Optimized)");
Ok(())
}
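/// Demo 4: batch inference with the production preset (FP16,
/// batch size 4), including device information.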
fn demo_production_config(
model_path: &PathBuf,
test_inputs: &[(&str, &str)],
) -> Result<(), Box<dyn std::error::Error>> {
println!(" Creating production-ready TensorRT pipeline...");
let config = TensorRTBackendConfig::production(model_path.clone(), 4);
let tokenizer = AutoTokenizer::from_pretrained("bert-base-uncased")?;
let pipeline = tensorrt_text_classification_pipeline(model_path, tokenizer, Some(config))?;
let batch_inputs: Vec<String> =
test_inputs.iter().take(4).map(|(input, _)| input.to_string()).collect();
let start = Instant::now();
let results = pipeline.batch(batch_inputs)?;
let duration = start.elapsed();
println!(" 📦 Production batch processing: {} items", results.len());
println!(" ⏱️ Time: {:.2}ms", duration.as_millis());
println!(" 🎯 Precision: FP16 (Production Optimized)");
let device_info = pipeline.device_info()?;
println!(" 🖥️ Device Info: {:?}", device_info);
Ok(())
}
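/// Demo 5: a hand-built `TensorRTBackendConfig` with FP16, a 512 MB
/// workspace, profiling, and an engine cache path.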
fn demo_custom_config(
model_path: &PathBuf,
test_inputs: &[(&str, &str)],
) -> Result<(), Box<dyn std::error::Error>> {
println!(" Creating custom TensorRT configuration...");
let profile_path = PathBuf::from("/tmp/tensorrt_profile.json");
let cache_path = PathBuf::from("/tmp/tensorrt_cache");
let config = TensorRTBackendConfig {
model_path: model_path.clone(),
device_type: DeviceType::GPU,
device_id: 0,
precision_mode: PrecisionMode::FP16,
max_batch_size: 2,
workspace_size: 512 * 1024 * 1024,
memory_pool_type: MemoryPoolType::Workspace,
execution_strategy: ExecutionStrategy::Balanced,
optimization_level: OptimizationLevel::O3,
enable_fp16: true,
enable_profiling: true,
profile_output_path: Some(profile_path),
engine_cache_path: Some(cache_path),
..Default::default()
};
let tokenizer = AutoTokenizer::from_pretrained("bert-base-uncased")?;
let pipeline = tensorrt_text_classification_pipeline(model_path, tokenizer, Some(config))?;
let test_input = test_inputs[0].0;
let start = Instant::now();
let result = pipeline.__call__(test_input.to_string())?;
let duration = start.elapsed();
println!(" 📝 Input: {}", test_input);
println!(" ⏱️ Time: {:.2}ms", duration.as_millis());
println!(" 📊 Result: {:?}", result);
println!(" 🔧 Custom Config: FP16, 512MB workspace, profiling enabled");
Ok(())
}
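/// Demo 6: GPT-2 text generation through a TensorRT pipeline with
/// sampling parameters, plus a small per-prompt benchmark.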
fn demo_text_generation(model_path: &PathBuf) -> Result<(), Box<dyn std::error::Error>> {
println!(" Creating TensorRT text generation pipeline...");
let config = TensorRTBackendConfig::latency_optimized(model_path.clone());
let tokenizer = AutoTokenizer::from_pretrained("gpt2")?;
let pipeline = tensorrt_text_generation_pipeline(model_path, tokenizer, Some(config))?
.with_max_new_tokens(50)
.with_temperature(0.8)
.with_top_p(0.9)
.with_do_sample(true);
let prompts = vec![
"The future of artificial intelligence is",
"In a world where technology advances rapidly,",
"The most important thing about machine learning is",
];
for prompt in prompts {
let start = Instant::now();
let result = pipeline.__call__(prompt.to_string())?;
let duration = start.elapsed();
println!(" 📝 Prompt: {}", prompt);
println!(" ⏱️ Time: {:.2}ms", duration.as_millis());
println!(" 📊 Generated: {:?}", result);
let benchmark = pipeline.benchmark(prompt, 5)?;
println!(" 📈 Benchmark: {:?}", benchmark);
println!();
}
Ok(())
}
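/// Demo 7: runs the same input under FP32, FP16, and INT8 to compare
/// latency and memory across precision modes.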
fn demo_precision_comparison(
model_path: &PathBuf,
test_inputs: &[(&str, &str)],
) -> Result<(), Box<dyn std::error::Error>> {
println!(" Comparing different precision modes...");
let precision_modes = vec![
(PrecisionMode::FP32, "FP32 (Full Precision)"),
(PrecisionMode::FP16, "FP16 (Half Precision)"),
(PrecisionMode::INT8, "INT8 (8-bit Quantization)"),
];
let test_input = test_inputs[0].0;
let tokenizer = AutoTokenizer::from_pretrained("bert-base-uncased")?;
for (precision_mode, description) in precision_modes {
println!(" 📏 Testing {}", description);
let config = TensorRTBackendConfig {
model_path: model_path.clone(),
precision_mode: precision_mode.clone(),
enable_fp16: matches!(precision_mode, PrecisionMode::FP16),
enable_int8: matches!(precision_mode, PrecisionMode::INT8),
..TensorRTBackendConfig::latency_optimized(model_path.clone())
};
let pipeline =
tensorrt_text_classification_pipeline(model_path, tokenizer.clone(), Some(config))?;
let start = Instant::now();
let result = pipeline.__call__(test_input.to_string())?;
let duration = start.elapsed();
println!(" ⏱️ Time: {:.2}ms", duration.as_millis());
println!(" 📊 Result: {:?}", result);
let memory_info = pipeline.memory_info()?;
println!(" 💾 Memory: {:?}", memory_info);
println!();
}
Ok(())
}
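/// Demo 8: registers multiple models with `TensorRTPipelineManager`,
/// queries per-model settings, and reports the engine cache size.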
fn demo_pipeline_manager(
model_path: &PathBuf,
generation_model_path: &PathBuf,
) -> Result<(), Box<dyn std::error::Error>> {
println!(" Creating TensorRT pipeline manager...");
let default_config = TensorRTBackendConfig::production(model_path.clone(), 4);
let mut manager = TensorRTPipelineManager::new(default_config)
.with_engine_cache(PathBuf::from("/tmp/tensorrt_cache"));
manager.load_model("classifier".to_string(), model_path)?;
let gen_config = TensorRTBackendConfig::throughput_optimized(generation_model_path.clone(), 8);
manager.load_model_with_config("generator".to_string(), generation_model_path, gen_config)?;
println!(" 📋 Registered models: {:?}", manager.list_models());
if let Some(classifier) = manager.get_model("classifier") {
println!(
" 📊 Classifier precision: {:?}",
classifier.precision_mode()
);
println!(
" 📦 Classifier batch size: {}",
classifier.max_batch_size()
);
}
let cache_size = manager.cache_size()?;
println!(" 🗂️ Cache size: {} bytes", cache_size);
Ok(())
}
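/// Demo 9: selects the TensorRT backend through the generic
/// `pipeline()` entry point via `PipelineOptions`.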
fn demo_unified_api(model_path: &PathBuf) -> Result<(), Box<dyn std::error::Error>> {
println!(" Testing unified pipeline API with TensorRT backend...");
let tensorrt_config = TensorRTBackendConfig::latency_optimized(model_path.clone());
let options = PipelineOptions {
model: Some("bert-base-uncased".to_string()),
device: Some(Device::Gpu(0)),
batch_size: Some(1),
max_length: Some(512),
backend: Some(Backend::TensorRT {
model_path: model_path.clone(),
}),
tensorrt_config: Some(tensorrt_config),
..Default::default()
};
let pipeline = pipeline("sentiment-analysis", None, Some(options))?;
let test_input = "This TensorRT integration is amazing!";
let start = Instant::now();
let result = pipeline.__call__(test_input.to_string())?;
let duration = start.elapsed();
println!(" 📝 Input: {}", test_input);
println!(" ⏱️ Time: {:.2}ms", duration.as_millis());
println!(" 📊 Result: {:?}", result);
println!(" 🔧 Backend: TensorRT through unified API");
Ok(())
}
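/// Demo 10: benchmarks the production configuration at several run
/// counts and derives average latency and throughput.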
fn demo_benchmarking(
model_path: &PathBuf,
test_inputs: &[(&str, &str)],
) -> Result<(), Box<dyn std::error::Error>> {
println!(" Running comprehensive TensorRT benchmarks...");
let config = TensorRTBackendConfig::production(model_path.clone(), 4);
let tokenizer = AutoTokenizer::from_pretrained("bert-base-uncased")?;
let pipeline = tensorrt_text_classification_pipeline(model_path, tokenizer, Some(config))?;
let test_input = test_inputs[0].0;
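// Increasing run counts help separate one-off warmup cost (e.g. engine
// load) from steady-state latency.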
let run_counts = vec![1, 10, 50, 100];
for &num_runs in &run_counts {
println!(" 🔄 Benchmarking {} runs...", num_runs);
let start = Instant::now();
let benchmark = pipeline.benchmark(test_input, num_runs)?;
let duration = start.elapsed();
println!(" ⏱️ Benchmark time: {:.2}ms", duration.as_millis());
println!(" 📊 Results: {:?}", benchmark);
let avg_time = duration.as_secs_f64() * 1000.0 / num_runs as f64;
let throughput = 1000.0 / avg_time;
println!(" 📈 Average time per run: {:.2}ms", avg_time);
println!(" 🚀 Throughput: {:.1} requests/second", throughput);
println!();
}
Ok(())
}
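/// Prints the preset `TensorRTPipelineOptions` variants and their key
/// fields; kept for reference and not called from `main`.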
#[allow(dead_code)]
fn demonstrate_tensorrt_options() -> Result<(), Box<dyn std::error::Error>> {
println!("\n🎛️ TensorRT Pipeline Options Demo");
println!("=================================");
let model_path = PathBuf::from("./models/bert-base-uncased.onnx");
let options = vec![
(
"Latency Optimized",
TensorRTPipelineOptions::latency_optimized(model_path.clone()),
),
(
"Throughput Optimized",
TensorRTPipelineOptions::throughput_optimized(model_path.clone(), 8),
),
(
"Memory Optimized",
TensorRTPipelineOptions::memory_optimized(model_path.clone()),
),
(
"Production",
TensorRTPipelineOptions::production(model_path.clone(), 4),
),
];
for (name, option) in options {
println!(" 🔧 {}: {:?}", name, option);
println!(" - Enable profiling: {}", option.enable_profiling);
println!(" - Warmup runs: {}", option.warmup_runs);
println!(" - Engine cache: {}", option.enable_engine_cache);
println!(
" - Precision: {:?}",
option.tensorrt_config.precision_mode
);
println!(
" - Batch size: {}",
option.tensorrt_config.max_batch_size
);
println!(
" - Workspace: {}MB",
option.tensorrt_config.workspace_size / (1024 * 1024)
);
println!();
}
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
use tempfile::tempdir;
#[test]
fn test_tensorrt_config_creation() {
let temp_dir = tempdir().expect("Failed to create temp dir for test");
let model_path = temp_dir.path().join("model.onnx");
let config = TensorRTBackendConfig::latency_optimized(model_path.clone());
assert_eq!(config.model_path, model_path);
assert!(matches!(config.precision_mode, PrecisionMode::FP16));
assert!(config.enable_fp16);
assert_eq!(config.max_batch_size, 1);
}
#[test]
fn test_tensorrt_pipeline_options() {
let temp_dir = tempdir().expect("Failed to create temp directory");
let model_path = temp_dir.path().join("model.onnx");
let options = TensorRTPipelineOptions::latency_optimized(model_path.clone());
assert_eq!(options.tensorrt_config.model_path, model_path);
assert_eq!(options.warmup_runs, 5);
assert!(options.enable_engine_cache);
}
#[test]
fn test_tensorrt_manager_creation() {
let temp_dir = tempdir().expect("Failed to create temp directory");
let model_path = temp_dir.path().join("model.onnx");
let config = TensorRTBackendConfig::production(model_path, 4);
let manager = TensorRTPipelineManager::new(config);
assert_eq!(manager.list_models().len(), 0);
assert_eq!(
manager.cache_size().expect("Cache size should be retrievable"),
0
);
}
}