1use crate::{QuantConfig, TorshResult};
15use std::collections::HashMap;
16use std::time::{Duration, Instant};
17use torsh_core::{device::DeviceType, TorshError};
18use torsh_tensor::Tensor;
19
/// Benchmark suite for quantization routines: runs timed quantization passes
/// across configurations and data sizes and accumulates per-run results.
#[derive(Debug, Clone)]
pub struct QuantizationBenchmarkSuite {
    // Suite-wide settings: iteration counts, test sizes, feature toggles.
    config: BenchmarkConfig,
    // Results accumulated by the benchmark runs, in execution order.
    results: Vec<BenchmarkResult>,
    // Reference metrics keyed by framework name; currently unused
    // (reserved for framework comparison), hence the dead_code allow.
    #[allow(dead_code)]
    baselines: HashMap<String, BaselineMetrics>,
}
31
/// Tunable parameters controlling how the benchmark suite runs.
#[derive(Debug, Clone)]
pub struct BenchmarkConfig {
    /// Number of timed iterations per (config, size) combination.
    pub iterations: usize,
    /// Untimed warmup passes executed before timing starts.
    pub warmup_iterations: usize,
    /// Element counts of the test tensors to benchmark.
    pub test_sizes: Vec<usize>,
    /// Quantization configurations under test.
    pub quantization_configs: Vec<QuantConfig>,
    /// Enables memory profiling. NOTE(review): not read within this
    /// file — confirm it is consulted elsewhere.
    pub enable_memory_profiling: bool,
    /// When true, each benchmark also measures quantization accuracy
    /// (MSE/PSNR/SNR/etc.) against the dequantized output.
    pub enable_accuracy_testing: bool,
    /// Per-benchmark timeout budget in seconds. NOTE(review): not
    /// enforced within this file — confirm enforcement elsewhere.
    pub benchmark_timeout_s: u64,
    /// Enables comparison against external-framework baselines.
    /// NOTE(review): not read within this file.
    pub enable_framework_comparison: bool,
}
52
53impl Default for BenchmarkConfig {
54 fn default() -> Self {
55 Self {
56 iterations: 100,
57 warmup_iterations: 10,
58 test_sizes: vec![
59 1024, 10_000, 100_000, 1_000_000, ],
64 quantization_configs: vec![
65 QuantConfig::int8(),
66 ],
70 enable_memory_profiling: true,
71 enable_accuracy_testing: true,
72 benchmark_timeout_s: 30,
73 enable_framework_comparison: false, }
75 }
76}
77
/// Outcome of one benchmark run (timing, throughput, memory, and
/// optionally accuracy) for a single configuration and data size.
#[derive(Debug, Clone)]
pub struct BenchmarkResult {
    /// Benchmark identifier, e.g. "quantization_benchmark_config_0".
    pub name: String,
    /// Name of the quantization configuration that was benchmarked.
    pub config_name: String,
    /// Number of elements in the benchmarked tensor.
    pub data_size: usize,
    /// Mean per-iteration execution time, in milliseconds.
    pub avg_time_ms: f64,
    /// Fastest observed iteration, in milliseconds.
    pub min_time_ms: f64,
    /// Slowest observed iteration, in milliseconds.
    pub max_time_ms: f64,
    /// Population standard deviation of iteration times, in milliseconds.
    pub std_dev_ms: f64,
    /// Throughput in elements processed per second.
    pub throughput_eps: f64,
    /// Estimated memory delta across the run, in bytes (coarse estimate).
    pub memory_usage_bytes: usize,
    /// Estimated peak memory during the run, in bytes (coarse estimate).
    pub peak_memory_bytes: usize,
    /// Accuracy metrics; `None` when accuracy testing is disabled.
    pub accuracy_metrics: Option<AccuracyMetrics>,
    /// Snapshot of the hardware the benchmark ran on.
    pub hardware_info: HardwareInfo,
}
106
/// Accuracy of quantize→dequantize round-tripping versus the original data.
#[derive(Debug, Clone)]
pub struct AccuracyMetrics {
    /// Mean squared error between original and dequantized values.
    pub mse: f64,
    /// Peak signal-to-noise ratio in dB (infinite when MSE is zero).
    pub psnr: f64,
    /// Signal-to-noise ratio in dB (infinite when noise power is zero).
    pub snr: f64,
    /// Cosine similarity between the original and dequantized vectors.
    pub cosine_similarity: f64,
    /// Largest absolute element-wise error.
    pub max_abs_error: f64,
}
121
/// Best-effort description of the machine the benchmarks ran on.
#[derive(Debug, Clone)]
pub struct HardwareInfo {
    /// CPU model string (from env vars, with an architecture fallback).
    pub cpu_model: String,
    /// Number of logical CPU cores reported by `num_cpus`.
    pub cpu_cores: usize,
    /// Total system memory in bytes. NOTE: populated from a heuristic
    /// estimate in this file, not a real probe.
    pub memory_bytes: usize,
    /// GPU description; always `None` here (detection not implemented).
    pub gpu_info: Option<String>,
    /// Operating system and architecture, e.g. "linux x86_64".
    pub os_info: String,
}
136
/// Reference metrics from an external framework, for comparison against
/// this suite's results. Currently stored but not consumed in this file.
#[derive(Debug, Clone)]
pub struct BaselineMetrics {
    /// Name of the external framework the baseline came from.
    pub framework_name: String,
    /// Version string of that framework.
    pub version: String,
    /// Average throughput in elements per second.
    pub avg_throughput_eps: f64,
    /// Memory-efficiency figure. NOTE(review): units not defined in this
    /// file — confirm with the producer of these baselines.
    pub memory_efficiency: f64,
    /// Accuracy score. NOTE(review): scale not defined in this file.
    pub accuracy_score: f64,
}
151
152impl QuantizationBenchmarkSuite {
153 pub fn new(config: BenchmarkConfig) -> Self {
155 Self {
156 config,
157 results: Vec::new(),
158 baselines: HashMap::new(),
159 }
160 }
161
162 pub fn run_benchmarks(&mut self) -> TorshResult<BenchmarkSummary> {
164 println!("Starting comprehensive quantization benchmark suite...");
165
166 let start_time = Instant::now();
167 let mut total_tests = 0;
168 let mut successful_tests = 0;
169
170 for (config_idx, quant_config) in self.config.quantization_configs.iter().enumerate() {
172 for &size in &self.config.test_sizes {
173 total_tests += 1;
174
175 let config_name = format!("config_{}", config_idx);
176 println!("Benchmarking {} with {} elements...", config_name, size);
177
178 match self.benchmark_single_config(quant_config, &config_name, size) {
179 Ok(result) => {
180 self.results.push(result);
181 successful_tests += 1;
182 }
183 Err(e) => {
184 eprintln!("Benchmark failed for {}, size {}: {}", config_name, size, e);
185 }
186 }
187 }
188 }
189
190 self.benchmark_memory_efficiency()?;
192 self.benchmark_scalability()?;
193 self.benchmark_accuracy_vs_speed()?;
194
195 let total_time = start_time.elapsed();
196
197 Ok(BenchmarkSummary {
198 total_tests,
199 successful_tests,
200 total_duration: total_time,
201 best_throughput: self.find_best_throughput(),
202 best_accuracy: self.find_best_accuracy(),
203 most_memory_efficient: self.find_most_memory_efficient(),
204 recommendations: self.generate_recommendations(),
205 })
206 }
207
208 fn benchmark_single_config(
210 &self,
211 quant_config: &QuantConfig,
212 config_name: &str,
213 size: usize,
214 ) -> TorshResult<BenchmarkResult> {
215 let test_data = self.generate_test_data(size);
217 let tensor = Tensor::from_data(test_data.clone(), vec![size], DeviceType::Cpu)
218 .map_err(|e| TorshError::InvalidArgument(e.to_string()))?;
219
220 for _ in 0..self.config.warmup_iterations {
222 let _ = crate::quantize_with_config(&tensor, quant_config)?;
223 }
224
225 let mut execution_times = Vec::with_capacity(self.config.iterations);
227 let memory_start = self.measure_memory_usage();
228
229 for _ in 0..self.config.iterations {
230 let start = Instant::now();
231 let _result = crate::quantize_with_config(&tensor, quant_config)?;
232 execution_times.push(start.elapsed().as_secs_f64() * 1000.0); }
234
235 let memory_end = self.measure_memory_usage();
236
237 let avg_time_ms = execution_times.iter().sum::<f64>() / execution_times.len() as f64;
239 let min_time_ms = execution_times
240 .iter()
241 .cloned()
242 .fold(f64::INFINITY, f64::min);
243 let max_time_ms = execution_times.iter().cloned().fold(0.0, f64::max);
244
245 let variance = execution_times
246 .iter()
247 .map(|t| (t - avg_time_ms).powi(2))
248 .sum::<f64>()
249 / execution_times.len() as f64;
250 let std_dev_ms = variance.sqrt();
251
252 let throughput_eps = if avg_time_ms > 0.0 {
253 (size as f64) / (avg_time_ms / 1000.0) } else {
255 0.0
256 };
257
258 let accuracy_metrics = if self.config.enable_accuracy_testing {
260 Some(self.measure_accuracy(&tensor, quant_config)?)
261 } else {
262 None
263 };
264
265 Ok(BenchmarkResult {
266 name: format!("quantization_benchmark_{}", config_name),
267 config_name: config_name.to_string(),
268 data_size: size,
269 avg_time_ms,
270 min_time_ms,
271 max_time_ms,
272 std_dev_ms,
273 throughput_eps,
274 memory_usage_bytes: memory_end - memory_start,
275 peak_memory_bytes: memory_end,
276 accuracy_metrics,
277 hardware_info: self.get_hardware_info(),
278 })
279 }
280
281 fn generate_test_data(&self, size: usize) -> Vec<f32> {
283 use scirs2_core::random::thread_rng;
284
285 (0..size)
287 .map(|_| thread_rng().gen_range(-3.0..3.0))
288 .collect()
289 }
290
291 fn measure_memory_usage(&self) -> usize {
293 std::mem::size_of::<QuantizationBenchmarkSuite>() +
304 std::mem::size_of::<BenchmarkConfig>() * 10 + 1024 * 1024 }
307
308 fn measure_accuracy(
310 &self,
311 original: &Tensor,
312 config: &QuantConfig,
313 ) -> TorshResult<AccuracyMetrics> {
314 let (quantized, scale, zero_point) = crate::quantize_with_config(original, config)?;
315 let dequantized = crate::dequantize_per_tensor_affine(&quantized, scale, zero_point)?;
316
317 let original_data = original.data()?;
318 let dequantized_data = dequantized.data()?;
319
320 let mse = original_data
322 .iter()
323 .zip(dequantized_data.iter())
324 .map(|(a, b)| (a - b).powi(2))
325 .sum::<f32>() as f64
326 / original_data.len() as f64;
327
328 let max_val = original_data
329 .iter()
330 .fold(0.0f32, |acc, &x| acc.max(x.abs()));
331 let psnr = if mse > 0.0 {
332 20.0 * (max_val as f64).log10() - 10.0 * mse.log10()
333 } else {
334 f64::INFINITY
335 };
336
337 let signal_power: f64 = original_data.iter().map(|&x| (x * x) as f64).sum();
338 let noise_power = mse * original_data.len() as f64;
339 let snr = if noise_power > 0.0 {
340 10.0 * (signal_power / noise_power).log10()
341 } else {
342 f64::INFINITY
343 };
344
345 let dot_product: f64 = original_data
347 .iter()
348 .zip(dequantized_data.iter())
349 .map(|(a, b)| (*a * *b) as f64)
350 .sum();
351 let norm_a: f64 = original_data
352 .iter()
353 .map(|&x| (x * x) as f64)
354 .sum::<f64>()
355 .sqrt();
356 let norm_b: f64 = dequantized_data
357 .iter()
358 .map(|&x| (x * x) as f64)
359 .sum::<f64>()
360 .sqrt();
361 let cosine_similarity = if norm_a > 0.0 && norm_b > 0.0 {
362 dot_product / (norm_a * norm_b)
363 } else {
364 0.0
365 };
366
367 let max_abs_error = original_data
368 .iter()
369 .zip(dequantized_data.iter())
370 .map(|(a, b)| (a - b).abs())
371 .fold(0.0f32, f32::max) as f64;
372
373 Ok(AccuracyMetrics {
374 mse,
375 psnr,
376 snr,
377 cosine_similarity,
378 max_abs_error,
379 })
380 }
381
382 fn get_hardware_info(&self) -> HardwareInfo {
384 let cpu_cores = num_cpus::get();
390 let estimated_memory_gb = (cpu_cores.max(4) * 2).min(64); let memory_bytes = estimated_memory_gb * 1024 * 1024 * 1024;
392
393 HardwareInfo {
394 cpu_model: std::env::var("PROCESSOR_IDENTIFIER")
395 .or_else(|_| std::env::var("CPU_MODEL"))
396 .unwrap_or_else(|_| format!("{} CPU", std::env::consts::ARCH)),
397 cpu_cores,
398 memory_bytes: memory_bytes as usize,
399 gpu_info: None, os_info: format!("{} {}", std::env::consts::OS, std::env::consts::ARCH),
401 }
402 }
403
404 fn benchmark_memory_efficiency(&mut self) -> TorshResult<()> {
406 println!("Running memory efficiency benchmarks...");
407
408 let large_size = 100_000; let test_data = self.generate_test_data(large_size);
411 let tensor = Tensor::from_data(test_data, vec![large_size], DeviceType::Cpu)
412 .map_err(|e| TorshError::InvalidArgument(e.to_string()))?;
413
414 for (i, config) in self.config.quantization_configs.iter().enumerate() {
415 let memory_before = self.measure_memory_usage();
416 let start = Instant::now();
417
418 let result = crate::quantize_with_config(&tensor, config);
420 if result.is_err() {
421 eprintln!("Skipping memory benchmark for config {} due to error", i);
422 continue;
423 }
424 let _result = result?;
425
426 let duration = start.elapsed();
427 let memory_after = self.measure_memory_usage();
428
429 self.results.push(BenchmarkResult {
430 name: "memory_efficiency".to_string(),
431 config_name: format!("memory_test_{}", i),
432 data_size: large_size,
433 avg_time_ms: duration.as_secs_f64() * 1000.0,
434 min_time_ms: duration.as_secs_f64() * 1000.0,
435 max_time_ms: duration.as_secs_f64() * 1000.0,
436 std_dev_ms: 0.0,
437 throughput_eps: large_size as f64 / duration.as_secs_f64(),
438 memory_usage_bytes: memory_after - memory_before,
439 peak_memory_bytes: memory_after,
440 accuracy_metrics: None,
441 hardware_info: self.get_hardware_info(),
442 });
443 }
444
445 Ok(())
446 }
447
448 fn benchmark_scalability(&mut self) -> TorshResult<()> {
450 println!("Running scalability benchmarks...");
451
452 let scalability_sizes = vec![1000, 10000, 100000, 1000000, 5000000];
453 let config = &self.config.quantization_configs[0]; for &size in &scalability_sizes {
456 let test_data = self.generate_test_data(size);
457 let tensor = Tensor::from_data(test_data, vec![size], DeviceType::Cpu)
458 .map_err(|e| TorshError::InvalidArgument(e.to_string()))?;
459
460 let start = Instant::now();
461 let _result = crate::quantize_with_config(&tensor, config)?;
462 let duration = start.elapsed();
463
464 self.results.push(BenchmarkResult {
465 name: "scalability".to_string(),
466 config_name: "scalability_test".to_string(),
467 data_size: size,
468 avg_time_ms: duration.as_secs_f64() * 1000.0,
469 min_time_ms: duration.as_secs_f64() * 1000.0,
470 max_time_ms: duration.as_secs_f64() * 1000.0,
471 std_dev_ms: 0.0,
472 throughput_eps: size as f64 / duration.as_secs_f64(),
473 memory_usage_bytes: 0, peak_memory_bytes: 0,
475 accuracy_metrics: None,
476 hardware_info: self.get_hardware_info(),
477 });
478 }
479
480 Ok(())
481 }
482
483 fn benchmark_accuracy_vs_speed(&mut self) -> TorshResult<()> {
485 println!("Running accuracy vs speed trade-off benchmarks...");
486
487 let test_size = 50_000;
488 let test_data = self.generate_test_data(test_size);
489 let tensor = Tensor::from_data(test_data, vec![test_size], DeviceType::Cpu)
490 .map_err(|e| TorshError::InvalidArgument(e.to_string()))?;
491
492 for (i, config) in self.config.quantization_configs.iter().enumerate() {
493 let start = Instant::now();
494 let duration = start.elapsed();
495
496 let accuracy = self.measure_accuracy(&tensor, config)?;
497
498 self.results.push(BenchmarkResult {
499 name: "accuracy_vs_speed".to_string(),
500 config_name: format!("accuracy_speed_{}", i),
501 data_size: test_size,
502 avg_time_ms: duration.as_secs_f64() * 1000.0,
503 min_time_ms: duration.as_secs_f64() * 1000.0,
504 max_time_ms: duration.as_secs_f64() * 1000.0,
505 std_dev_ms: 0.0,
506 throughput_eps: test_size as f64 / duration.as_secs_f64(),
507 memory_usage_bytes: 0,
508 peak_memory_bytes: 0,
509 accuracy_metrics: Some(accuracy),
510 hardware_info: self.get_hardware_info(),
511 });
512 }
513
514 Ok(())
515 }
516
517 fn find_best_throughput(&self) -> Option<BenchmarkResult> {
519 self.results
520 .iter()
521 .max_by(|a, b| {
522 a.throughput_eps
523 .partial_cmp(&b.throughput_eps)
524 .expect("throughput values should be comparable")
525 })
526 .cloned()
527 }
528
529 fn find_best_accuracy(&self) -> Option<BenchmarkResult> {
531 self.results
532 .iter()
533 .filter(|r| r.accuracy_metrics.is_some())
534 .max_by(|a, b| {
535 a.accuracy_metrics
536 .as_ref()
537 .expect("accuracy metrics should exist")
538 .psnr
539 .partial_cmp(
540 &b.accuracy_metrics
541 .as_ref()
542 .expect("accuracy metrics should exist")
543 .psnr,
544 )
545 .expect("psnr values should be comparable")
546 })
547 .cloned()
548 }
549
550 fn find_most_memory_efficient(&self) -> Option<BenchmarkResult> {
552 self.results
553 .iter()
554 .filter(|r| r.memory_usage_bytes > 0)
555 .min_by(|a, b| {
556 let eff_a = a.memory_usage_bytes as f64 / a.data_size as f64;
557 let eff_b = b.memory_usage_bytes as f64 / b.data_size as f64;
558 eff_a
559 .partial_cmp(&eff_b)
560 .expect("memory efficiency values should be comparable")
561 })
562 .cloned()
563 }
564
565 fn generate_recommendations(&self) -> Vec<String> {
567 let mut recommendations = Vec::new();
568
569 if let Some(best) = self.find_best_throughput() {
570 recommendations.push(format!(
571 "For maximum throughput, use {} (achieved {:.0} elements/sec)",
572 best.config_name, best.throughput_eps
573 ));
574 }
575
576 if let Some(best_acc) = self.find_best_accuracy() {
577 if let Some(ref metrics) = best_acc.accuracy_metrics {
578 recommendations.push(format!(
579 "For best accuracy, use {} (PSNR: {:.2} dB)",
580 best_acc.config_name, metrics.psnr
581 ));
582 }
583 }
584
585 if let Some(mem_eff) = self.find_most_memory_efficient() {
586 let efficiency = mem_eff.memory_usage_bytes as f64 / mem_eff.data_size as f64;
587 recommendations.push(format!(
588 "For memory efficiency, use {} ({:.2} bytes per element)",
589 mem_eff.config_name, efficiency
590 ));
591 }
592
593 recommendations
594 }
595
596 pub fn get_results(&self) -> &[BenchmarkResult] {
598 &self.results
599 }
600
601 pub fn export_to_csv(&self) -> String {
603 let mut csv = String::from("name,config,data_size,avg_time_ms,throughput_eps,memory_bytes,psnr,cosine_similarity\n");
604
605 for result in &self.results {
606 let psnr = result
607 .accuracy_metrics
608 .as_ref()
609 .map(|m| m.psnr)
610 .unwrap_or(0.0);
611 let cosine = result
612 .accuracy_metrics
613 .as_ref()
614 .map(|m| m.cosine_similarity)
615 .unwrap_or(0.0);
616
617 csv.push_str(&format!(
618 "{},{},{},{:.3},{:.0},{},{:.2},{:.4}\n",
619 result.name,
620 result.config_name,
621 result.data_size,
622 result.avg_time_ms,
623 result.throughput_eps,
624 result.memory_usage_bytes,
625 psnr,
626 cosine
627 ));
628 }
629
630 csv
631 }
632}
633
/// Aggregated outcome of a full benchmark run: counters, total wall time,
/// the standout results, and textual recommendations.
#[derive(Debug, Clone)]
pub struct BenchmarkSummary {
    /// Number of (config, size) combinations attempted in the main grid.
    pub total_tests: usize,
    /// Number of those combinations that completed successfully.
    pub successful_tests: usize,
    /// Wall-clock duration of the entire suite.
    pub total_duration: Duration,
    /// Result with the highest throughput, if any results exist.
    pub best_throughput: Option<BenchmarkResult>,
    /// Result with the highest PSNR among accuracy-tested runs.
    pub best_accuracy: Option<BenchmarkResult>,
    /// Result with the fewest bytes per element among memory-tracked runs.
    pub most_memory_efficient: Option<BenchmarkResult>,
    /// Human-readable configuration recommendations.
    pub recommendations: Vec<String>,
}
645
646impl BenchmarkSummary {
647 pub fn generate_report(&self) -> String {
649 let mut report = String::new();
650
651 report.push_str("=== ToRSh Quantization Benchmark Report ===\n\n");
652 report.push_str(&format!(
653 "Tests completed: {}/{}\n",
654 self.successful_tests, self.total_tests
655 ));
656 report.push_str(&format!("Total duration: {:.2?}\n\n", self.total_duration));
657
658 if let Some(ref best) = self.best_throughput {
659 report.push_str(&format!(
660 "🚀 Best Throughput: {:.0} elements/sec ({})\n",
661 best.throughput_eps, best.config_name
662 ));
663 }
664
665 if let Some(ref best) = self.best_accuracy {
666 if let Some(ref metrics) = best.accuracy_metrics {
667 report.push_str(&format!(
668 "🎯 Best Accuracy: PSNR {:.2} dB ({})\n",
669 metrics.psnr, best.config_name
670 ));
671 }
672 }
673
674 if let Some(ref best) = self.most_memory_efficient {
675 let efficiency = best.memory_usage_bytes as f64 / best.data_size as f64;
676 report.push_str(&format!(
677 "💾 Most Memory Efficient: {:.2} bytes/element ({})\n\n",
678 efficiency, best.config_name
679 ));
680 }
681
682 if !self.recommendations.is_empty() {
683 report.push_str("📋 Recommendations:\n");
684 for rec in &self.recommendations {
685 report.push_str(&format!(" • {}\n", rec));
686 }
687 }
688
689 report
690 }
691}
692
693pub fn run_quick_benchmark() -> TorshResult<BenchmarkSummary> {
695 let config = BenchmarkConfig {
696 iterations: 10,
697 test_sizes: vec![1000, 10000],
698 enable_framework_comparison: false,
699 ..Default::default()
700 };
701
702 let mut suite = QuantizationBenchmarkSuite::new(config);
703 suite.run_benchmarks()
704}
705
#[cfg(test)]
mod tests {
    use super::*;

    /// Default config must describe a non-trivial benchmark run.
    #[test]
    fn test_benchmark_config_default() {
        let config = BenchmarkConfig::default();
        assert!(config.iterations > 0);
        assert!(!config.test_sizes.is_empty());
        assert!(!config.quantization_configs.is_empty());
    }

    /// Smoke test of the reduced benchmark path; errors are tolerated
    /// because quantization backends may be unavailable in CI.
    #[test]
    fn test_quick_benchmark() {
        let result = run_quick_benchmark();
        match result {
            Ok(summary) => {
                assert!(summary.total_tests > 0);
                assert!(summary.successful_tests <= summary.total_tests);
                println!(
                    "Benchmark completed: {}/{} tests successful",
                    summary.successful_tests, summary.total_tests
                );
            }
            Err(e) => {
                eprintln!("Benchmark encountered errors (acceptable in test): {}", e);
            }
        }
    }

    /// A fresh suite starts with no results or baselines.
    #[test]
    fn test_benchmark_suite_creation() {
        let config = BenchmarkConfig::default();
        let suite = QuantizationBenchmarkSuite::new(config);
        assert!(suite.results.is_empty());
        assert!(suite.baselines.is_empty());
    }

    /// CSV export always begins with the header row.
    #[test]
    fn test_csv_export() {
        let suite = QuantizationBenchmarkSuite::new(BenchmarkConfig::default());
        let csv = suite.export_to_csv();
        assert!(csv.contains("name,config,data_size"));
    }

    /// Hardware probing yields at least minimally plausible values.
    #[test]
    fn test_hardware_info() {
        let config = BenchmarkConfig::default();
        let suite = QuantizationBenchmarkSuite::new(config);
        let hw_info = suite.get_hardware_info();

        assert!(hw_info.cpu_cores > 0);
        assert!(!hw_info.os_info.is_empty());
    }

    /// Generated samples honor both the requested length and the
    /// generator's actual [-3.0, 3.0) output range. The previous ±10.0
    /// bound was far looser than what `generate_test_data` produces.
    #[test]
    fn test_test_data_generation() {
        let config = BenchmarkConfig::default();
        let suite = QuantizationBenchmarkSuite::new(config);
        let data = suite.generate_test_data(1000);

        assert_eq!(data.len(), 1000);
        for &val in &data {
            assert!((-3.0..3.0).contains(&val), "sample {} out of range", val);
        }
    }
}