use crate::{QuantConfig, TorshResult};
use std::collections::HashMap;
use std::time::{Duration, Instant};
use torsh_core::{device::DeviceType, TorshError};
use torsh_tensor::Tensor;
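/// Micro-benchmark harness for this crate's quantization kernels: it times
/// repeated quantize passes over a grid of configs and input sizes, then
/// summarizes throughput, memory, and accuracy.
///
/// A minimal usage sketch; the crate/module path in the example is an
/// assumption (adjust to wherever this file lives), so the doc test is
/// marked `ignore`:
///
/// ```ignore
/// use torsh_quant::benchmarks::{BenchmarkConfig, QuantizationBenchmarkSuite};
///
/// let mut suite = QuantizationBenchmarkSuite::new(BenchmarkConfig::default());
/// let summary = suite.run_benchmarks().expect("benchmark run failed");
/// println!("{}", summary.generate_report());
/// ```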
#[derive(Debug, Clone)]
pub struct QuantizationBenchmarkSuite {
config: BenchmarkConfig,
results: Vec<BenchmarkResult>,
#[allow(dead_code)]
baselines: HashMap<String, BaselineMetrics>,
}
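/// Tunable knobs for a benchmark run. `iterations` timed passes follow
/// `warmup_iterations` untimed ones; each combination of `test_sizes` and
/// `quantization_configs` becomes one benchmark case.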
#[derive(Debug, Clone)]
pub struct BenchmarkConfig {
pub iterations: usize,
pub warmup_iterations: usize,
pub test_sizes: Vec<usize>,
pub quantization_configs: Vec<QuantConfig>,
pub enable_memory_profiling: bool,
pub enable_accuracy_testing: bool,
pub benchmark_timeout_s: u64,
pub enable_framework_comparison: bool,
}
impl Default for BenchmarkConfig {
fn default() -> Self {
Self {
iterations: 100,
warmup_iterations: 10,
            test_sizes: vec![1024, 10_000, 100_000, 1_000_000],
quantization_configs: vec![
QuantConfig::int8(),
],
enable_memory_profiling: true,
enable_accuracy_testing: true,
benchmark_timeout_s: 30,
            enable_framework_comparison: false,
        }
}
}
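/// One measured benchmark case. Times are wall-clock milliseconds;
/// `throughput_eps` is elements processed per second.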
#[derive(Debug, Clone)]
pub struct BenchmarkResult {
pub name: String,
pub config_name: String,
pub data_size: usize,
pub avg_time_ms: f64,
pub min_time_ms: f64,
pub max_time_ms: f64,
pub std_dev_ms: f64,
pub throughput_eps: f64,
pub memory_usage_bytes: usize,
pub peak_memory_bytes: usize,
pub accuracy_metrics: Option<AccuracyMetrics>,
pub hardware_info: HardwareInfo,
}
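/// Reconstruction-error metrics comparing the dequantized tensor against the
/// original. `psnr` and `snr` are in dB; `cosine_similarity` is in [-1, 1].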
#[derive(Debug, Clone)]
pub struct AccuracyMetrics {
pub mse: f64,
pub psnr: f64,
pub snr: f64,
pub cosine_similarity: f64,
pub max_abs_error: f64,
}
#[derive(Debug, Clone)]
pub struct HardwareInfo {
pub cpu_model: String,
pub cpu_cores: usize,
pub memory_bytes: usize,
pub gpu_info: Option<String>,
pub os_info: String,
}
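/// Reference numbers from another framework, reserved for the (currently
/// disabled) `enable_framework_comparison` path.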
#[derive(Debug, Clone)]
pub struct BaselineMetrics {
pub framework_name: String,
pub version: String,
pub avg_throughput_eps: f64,
pub memory_efficiency: f64,
pub accuracy_score: f64,
}
impl QuantizationBenchmarkSuite {
pub fn new(config: BenchmarkConfig) -> Self {
Self {
config,
results: Vec::new(),
baselines: HashMap::new(),
}
}
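    /// Runs the full grid of config x size cases, then the memory-efficiency,
    /// scalability, and accuracy-vs-speed passes. Individual case failures are
    /// logged and skipped rather than aborting the whole run.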
pub fn run_benchmarks(&mut self) -> TorshResult<BenchmarkSummary> {
println!("Starting comprehensive quantization benchmark suite...");
let start_time = Instant::now();
let mut total_tests = 0;
let mut successful_tests = 0;
for (config_idx, quant_config) in self.config.quantization_configs.iter().enumerate() {
for &size in &self.config.test_sizes {
total_tests += 1;
let config_name = format!("config_{}", config_idx);
println!("Benchmarking {} with {} elements...", config_name, size);
match self.benchmark_single_config(quant_config, &config_name, size) {
Ok(result) => {
self.results.push(result);
successful_tests += 1;
}
Err(e) => {
eprintln!("Benchmark failed for {}, size {}: {}", config_name, size, e);
}
}
}
}
self.benchmark_memory_efficiency()?;
self.benchmark_scalability()?;
self.benchmark_accuracy_vs_speed()?;
let total_time = start_time.elapsed();
Ok(BenchmarkSummary {
total_tests,
successful_tests,
total_duration: total_time,
best_throughput: self.find_best_throughput(),
best_accuracy: self.find_best_accuracy(),
most_memory_efficient: self.find_most_memory_efficient(),
recommendations: self.generate_recommendations(),
})
}
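    /// Times one (config, size) case: warmup passes first, then `iterations`
    /// timed passes from which mean/min/max/std-dev are derived.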
fn benchmark_single_config(
&self,
quant_config: &QuantConfig,
config_name: &str,
size: usize,
) -> TorshResult<BenchmarkResult> {
let test_data = self.generate_test_data(size);
let tensor = Tensor::from_data(test_data.clone(), vec![size], DeviceType::Cpu)
.map_err(|e| TorshError::InvalidArgument(e.to_string()))?;
for _ in 0..self.config.warmup_iterations {
let _ = crate::quantize_with_config(&tensor, quant_config)?;
}
let mut execution_times = Vec::with_capacity(self.config.iterations);
let memory_start = self.measure_memory_usage();
for _ in 0..self.config.iterations {
let start = Instant::now();
let _result = crate::quantize_with_config(&tensor, quant_config)?;
            execution_times.push(start.elapsed().as_secs_f64() * 1000.0);
        }
let memory_end = self.measure_memory_usage();
let avg_time_ms = execution_times.iter().sum::<f64>() / execution_times.len() as f64;
let min_time_ms = execution_times
.iter()
.cloned()
.fold(f64::INFINITY, f64::min);
let max_time_ms = execution_times.iter().cloned().fold(0.0, f64::max);
let variance = execution_times
.iter()
.map(|t| (t - avg_time_ms).powi(2))
.sum::<f64>()
/ execution_times.len() as f64;
let std_dev_ms = variance.sqrt();
let throughput_eps = if avg_time_ms > 0.0 {
            (size as f64) / (avg_time_ms / 1000.0)
        } else {
0.0
};
let accuracy_metrics = if self.config.enable_accuracy_testing {
Some(self.measure_accuracy(&tensor, quant_config)?)
} else {
None
};
Ok(BenchmarkResult {
name: format!("quantization_benchmark_{}", config_name),
config_name: config_name.to_string(),
data_size: size,
avg_time_ms,
min_time_ms,
max_time_ms,
std_dev_ms,
throughput_eps,
            memory_usage_bytes: memory_end.saturating_sub(memory_start),
peak_memory_bytes: memory_end,
accuracy_metrics,
hardware_info: self.get_hardware_info(),
})
}
    fn generate_test_data(&self, size: usize) -> Vec<f32> {
        use scirs2_core::random::thread_rng;
        // Reuse one RNG instance rather than re-acquiring it per element.
        let mut rng = thread_rng();
        (0..size).map(|_| rng.gen_range(-3.0..3.0)).collect()
    }
    fn measure_memory_usage(&self) -> usize {
        // Placeholder estimate: a fixed-size heuristic, not a real probe of
        // process memory. Replace with an allocator hook or OS query for
        // genuine memory profiling.
        std::mem::size_of::<QuantizationBenchmarkSuite>()
            + std::mem::size_of::<BenchmarkConfig>() * 10
            + 1024 * 1024
    }
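    /// Quantize-dequantize round trip against `original`, producing MSE, PSNR,
    /// SNR, cosine similarity, and max absolute error.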
fn measure_accuracy(
&self,
original: &Tensor,
config: &QuantConfig,
) -> TorshResult<AccuracyMetrics> {
let (quantized, scale, zero_point) = crate::quantize_with_config(original, config)?;
let dequantized = crate::dequantize_per_tensor_affine(&quantized, scale, zero_point)?;
let original_data = original.data()?;
let dequantized_data = dequantized.data()?;
        // Accumulate the squared error in f64 to avoid f32 precision loss on
        // large tensors.
        let mse = original_data
            .iter()
            .zip(dequantized_data.iter())
            .map(|(a, b)| ((a - b) as f64).powi(2))
            .sum::<f64>()
            / original_data.len() as f64;
let max_val = original_data
.iter()
.fold(0.0f32, |acc, &x| acc.max(x.abs()));
let psnr = if mse > 0.0 {
20.0 * (max_val as f64).log10() - 10.0 * mse.log10()
} else {
f64::INFINITY
};
let signal_power: f64 = original_data.iter().map(|&x| (x * x) as f64).sum();
let noise_power = mse * original_data.len() as f64;
let snr = if noise_power > 0.0 {
10.0 * (signal_power / noise_power).log10()
} else {
f64::INFINITY
};
let dot_product: f64 = original_data
.iter()
.zip(dequantized_data.iter())
.map(|(a, b)| (*a * *b) as f64)
.sum();
let norm_a: f64 = original_data
.iter()
.map(|&x| (x * x) as f64)
.sum::<f64>()
.sqrt();
let norm_b: f64 = dequantized_data
.iter()
.map(|&x| (x * x) as f64)
.sum::<f64>()
.sqrt();
let cosine_similarity = if norm_a > 0.0 && norm_b > 0.0 {
dot_product / (norm_a * norm_b)
} else {
0.0
};
let max_abs_error = original_data
.iter()
.zip(dequantized_data.iter())
.map(|(a, b)| (a - b).abs())
.fold(0.0f32, f32::max) as f64;
Ok(AccuracyMetrics {
mse,
psnr,
snr,
cosine_similarity,
max_abs_error,
})
}
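    /// Best-effort hardware description: CPU model comes from environment
    /// variables when available, and memory is estimated from the core count.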
fn get_hardware_info(&self) -> HardwareInfo {
let cpu_cores = num_cpus::get();
        // Rough heuristic (2 GiB per core, clamped to 8..=64 GiB); this is an
        // estimate, not a query of actual system memory.
        let estimated_memory_gb = (cpu_cores.max(4) * 2).min(64);
        let memory_bytes = estimated_memory_gb * 1024 * 1024 * 1024;
HardwareInfo {
cpu_model: std::env::var("PROCESSOR_IDENTIFIER")
.or_else(|_| std::env::var("CPU_MODEL"))
.unwrap_or_else(|_| format!("{} CPU", std::env::consts::ARCH)),
cpu_cores,
memory_bytes: memory_bytes as usize,
            gpu_info: None,
            os_info: format!("{} {}", std::env::consts::OS, std::env::consts::ARCH),
}
}
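    /// One large quantize pass per config, recording the estimated memory
    /// delta reported by `measure_memory_usage`.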
fn benchmark_memory_efficiency(&mut self) -> TorshResult<()> {
println!("Running memory efficiency benchmarks...");
        let large_size = 100_000;
        let test_data = self.generate_test_data(large_size);
let tensor = Tensor::from_data(test_data, vec![large_size], DeviceType::Cpu)
.map_err(|e| TorshError::InvalidArgument(e.to_string()))?;
for (i, config) in self.config.quantization_configs.iter().enumerate() {
let memory_before = self.measure_memory_usage();
            let start = Instant::now();
            // Skip configs that fail to quantize instead of aborting the pass.
            if let Err(e) = crate::quantize_with_config(&tensor, config) {
                eprintln!("Skipping memory benchmark for config {} due to error: {}", i, e);
                continue;
            }
            let duration = start.elapsed();
let memory_after = self.measure_memory_usage();
self.results.push(BenchmarkResult {
name: "memory_efficiency".to_string(),
config_name: format!("memory_test_{}", i),
data_size: large_size,
avg_time_ms: duration.as_secs_f64() * 1000.0,
min_time_ms: duration.as_secs_f64() * 1000.0,
max_time_ms: duration.as_secs_f64() * 1000.0,
std_dev_ms: 0.0,
throughput_eps: large_size as f64 / duration.as_secs_f64(),
                memory_usage_bytes: memory_after.saturating_sub(memory_before),
peak_memory_bytes: memory_after,
accuracy_metrics: None,
hardware_info: self.get_hardware_info(),
});
}
Ok(())
}
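    /// Single timed pass per size using the first quantization config, to show
    /// how throughput scales with input length.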
fn benchmark_scalability(&mut self) -> TorshResult<()> {
println!("Running scalability benchmarks...");
        let scalability_sizes = vec![1_000, 10_000, 100_000, 1_000_000, 5_000_000];
        // Avoid indexing into a possibly empty config list.
        let Some(config) = self.config.quantization_configs.first() else {
            return Ok(());
        };
for &size in &scalability_sizes {
let test_data = self.generate_test_data(size);
let tensor = Tensor::from_data(test_data, vec![size], DeviceType::Cpu)
.map_err(|e| TorshError::InvalidArgument(e.to_string()))?;
let start = Instant::now();
let _result = crate::quantize_with_config(&tensor, config)?;
let duration = start.elapsed();
self.results.push(BenchmarkResult {
name: "scalability".to_string(),
config_name: "scalability_test".to_string(),
data_size: size,
avg_time_ms: duration.as_secs_f64() * 1000.0,
min_time_ms: duration.as_secs_f64() * 1000.0,
max_time_ms: duration.as_secs_f64() * 1000.0,
std_dev_ms: 0.0,
throughput_eps: size as f64 / duration.as_secs_f64(),
                memory_usage_bytes: 0,
                peak_memory_bytes: 0,
accuracy_metrics: None,
hardware_info: self.get_hardware_info(),
});
}
Ok(())
}
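    /// Pairs a timed quantize pass with a full accuracy measurement for each
    /// config, exposing the speed/accuracy trade-off.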
fn benchmark_accuracy_vs_speed(&mut self) -> TorshResult<()> {
println!("Running accuracy vs speed trade-off benchmarks...");
let test_size = 50_000;
let test_data = self.generate_test_data(test_size);
let tensor = Tensor::from_data(test_data, vec![test_size], DeviceType::Cpu)
.map_err(|e| TorshError::InvalidArgument(e.to_string()))?;
for (i, config) in self.config.quantization_configs.iter().enumerate() {
            // Time one quantization pass for the speed side of the trade-off;
            // the timer must bracket actual work to be meaningful.
            let start = Instant::now();
            let _result = crate::quantize_with_config(&tensor, config)?;
            let duration = start.elapsed();
            let accuracy = self.measure_accuracy(&tensor, config)?;
self.results.push(BenchmarkResult {
name: "accuracy_vs_speed".to_string(),
config_name: format!("accuracy_speed_{}", i),
data_size: test_size,
avg_time_ms: duration.as_secs_f64() * 1000.0,
min_time_ms: duration.as_secs_f64() * 1000.0,
max_time_ms: duration.as_secs_f64() * 1000.0,
std_dev_ms: 0.0,
throughput_eps: test_size as f64 / duration.as_secs_f64(),
memory_usage_bytes: 0,
peak_memory_bytes: 0,
accuracy_metrics: Some(accuracy),
hardware_info: self.get_hardware_info(),
});
}
Ok(())
}
fn find_best_throughput(&self) -> Option<BenchmarkResult> {
self.results
.iter()
            // total_cmp gives a total order over f64, avoiding NaN panics.
            .max_by(|a, b| a.throughput_eps.total_cmp(&b.throughput_eps))
.cloned()
}
fn find_best_accuracy(&self) -> Option<BenchmarkResult> {
self.results
.iter()
.filter(|r| r.accuracy_metrics.is_some())
            .max_by(|a, b| {
                let psnr_a = a.accuracy_metrics.as_ref().map_or(f64::NEG_INFINITY, |m| m.psnr);
                let psnr_b = b.accuracy_metrics.as_ref().map_or(f64::NEG_INFINITY, |m| m.psnr);
                psnr_a.total_cmp(&psnr_b)
            })
.cloned()
}
fn find_most_memory_efficient(&self) -> Option<BenchmarkResult> {
self.results
.iter()
.filter(|r| r.memory_usage_bytes > 0)
.min_by(|a, b| {
let eff_a = a.memory_usage_bytes as f64 / a.data_size as f64;
let eff_b = b.memory_usage_bytes as f64 / b.data_size as f64;
                eff_a.total_cmp(&eff_b)
})
.cloned()
}
fn generate_recommendations(&self) -> Vec<String> {
let mut recommendations = Vec::new();
if let Some(best) = self.find_best_throughput() {
recommendations.push(format!(
"For maximum throughput, use {} (achieved {:.0} elements/sec)",
best.config_name, best.throughput_eps
));
}
if let Some(best_acc) = self.find_best_accuracy() {
if let Some(ref metrics) = best_acc.accuracy_metrics {
recommendations.push(format!(
"For best accuracy, use {} (PSNR: {:.2} dB)",
best_acc.config_name, metrics.psnr
));
}
}
if let Some(mem_eff) = self.find_most_memory_efficient() {
let efficiency = mem_eff.memory_usage_bytes as f64 / mem_eff.data_size as f64;
recommendations.push(format!(
"For memory efficiency, use {} ({:.2} bytes per element)",
mem_eff.config_name, efficiency
));
}
recommendations
}
pub fn get_results(&self) -> &[BenchmarkResult] {
&self.results
}
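    /// Serializes results as CSV. Rows without accuracy metrics report 0.0
    /// for the `psnr` and `cosine_similarity` columns.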
pub fn export_to_csv(&self) -> String {
let mut csv = String::from("name,config,data_size,avg_time_ms,throughput_eps,memory_bytes,psnr,cosine_similarity\n");
for result in &self.results {
let psnr = result
.accuracy_metrics
.as_ref()
.map(|m| m.psnr)
.unwrap_or(0.0);
let cosine = result
.accuracy_metrics
.as_ref()
.map(|m| m.cosine_similarity)
.unwrap_or(0.0);
csv.push_str(&format!(
"{},{},{},{:.3},{:.0},{},{:.2},{:.4}\n",
result.name,
result.config_name,
result.data_size,
result.avg_time_ms,
result.throughput_eps,
result.memory_usage_bytes,
psnr,
cosine
));
}
csv
}
}
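/// Aggregate outcome of a full benchmark run, including per-category winners
/// and human-readable recommendations.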
#[derive(Debug, Clone)]
pub struct BenchmarkSummary {
pub total_tests: usize,
pub successful_tests: usize,
pub total_duration: Duration,
pub best_throughput: Option<BenchmarkResult>,
pub best_accuracy: Option<BenchmarkResult>,
pub most_memory_efficient: Option<BenchmarkResult>,
pub recommendations: Vec<String>,
}
impl BenchmarkSummary {
pub fn generate_report(&self) -> String {
let mut report = String::new();
report.push_str("=== ToRSh Quantization Benchmark Report ===\n\n");
report.push_str(&format!(
"Tests completed: {}/{}\n",
self.successful_tests, self.total_tests
));
report.push_str(&format!("Total duration: {:.2?}\n\n", self.total_duration));
if let Some(ref best) = self.best_throughput {
report.push_str(&format!(
"🚀 Best Throughput: {:.0} elements/sec ({})\n",
best.throughput_eps, best.config_name
));
}
if let Some(ref best) = self.best_accuracy {
if let Some(ref metrics) = best.accuracy_metrics {
report.push_str(&format!(
"🎯 Best Accuracy: PSNR {:.2} dB ({})\n",
metrics.psnr, best.config_name
));
}
}
if let Some(ref best) = self.most_memory_efficient {
let efficiency = best.memory_usage_bytes as f64 / best.data_size as f64;
report.push_str(&format!(
"💾 Most Memory Efficient: {:.2} bytes/element ({})\n\n",
efficiency, best.config_name
));
}
if !self.recommendations.is_empty() {
report.push_str("📋 Recommendations:\n");
for rec in &self.recommendations {
report.push_str(&format!(" • {}\n", rec));
}
}
report
}
}
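/// Runs a reduced pass (10 iterations, small sizes) suitable for smoke tests
/// rather than publishable numbers.
///
/// Usage sketch, with the same assumed module path as the suite docs:
///
/// ```ignore
/// let summary = torsh_quant::benchmarks::run_quick_benchmark().unwrap();
/// assert!(summary.successful_tests <= summary.total_tests);
/// ```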
pub fn run_quick_benchmark() -> TorshResult<BenchmarkSummary> {
let config = BenchmarkConfig {
iterations: 10,
test_sizes: vec![1000, 10000],
enable_framework_comparison: false,
..Default::default()
};
let mut suite = QuantizationBenchmarkSuite::new(config);
suite.run_benchmarks()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_benchmark_config_default() {
let config = BenchmarkConfig::default();
assert!(config.iterations > 0);
assert!(!config.test_sizes.is_empty());
assert!(!config.quantization_configs.is_empty());
}
#[test]
fn test_quick_benchmark() {
let result = run_quick_benchmark();
match result {
Ok(summary) => {
assert!(summary.total_tests > 0);
assert!(summary.successful_tests <= summary.total_tests);
println!(
"Benchmark completed: {}/{} tests successful",
summary.successful_tests, summary.total_tests
);
}
Err(e) => {
eprintln!("Benchmark encountered errors (acceptable in test): {}", e);
}
}
}
#[test]
fn test_benchmark_suite_creation() {
let config = BenchmarkConfig::default();
let suite = QuantizationBenchmarkSuite::new(config);
assert!(suite.results.is_empty());
assert!(suite.baselines.is_empty());
}
#[test]
fn test_csv_export() {
let suite = QuantizationBenchmarkSuite::new(BenchmarkConfig::default());
let csv = suite.export_to_csv();
assert!(csv.contains("name,config,data_size"));
}
#[test]
fn test_hardware_info() {
let config = BenchmarkConfig::default();
let suite = QuantizationBenchmarkSuite::new(config);
let hw_info = suite.get_hardware_info();
assert!(hw_info.cpu_cores > 0);
assert!(!hw_info.os_info.is_empty());
}
#[test]
fn test_test_data_generation() {
let config = BenchmarkConfig::default();
let suite = QuantizationBenchmarkSuite::new(config);
let data = suite.generate_test_data(1000);
assert_eq!(data.len(), 1000);
        for &val in &data {
            // generate_test_data samples uniformly from [-3.0, 3.0).
            assert!((-3.0..3.0).contains(&val));
        }
}
}