1#![allow(dead_code, unused_variables, unused_assignments)]
12
13use anyhow::Result;
14use serde::{Deserialize, Serialize};
15use std::collections::HashMap;
16use std::path::Path;
17use std::time::{Duration, Instant};
18use tracing::{debug, info};
19
20use crate::config::Config;
21use crate::utils::progress;
22
23use scirs2_core::ndarray::{Array2, Array4};
25use scirs2_core::random::thread_rng;
26
/// User-supplied settings describing the benchmark matrix to run.
///
/// The cartesian product of `devices` × `batch_sizes` × `input_shapes`
/// defines one benchmarked configuration per combination.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[allow(dead_code)]
pub struct BenchmarkConfig {
    /// Path to the model file; its file stem becomes the report's model name.
    pub model_path: String,
    /// Per-sample input shapes to benchmark (without the batch dimension).
    pub input_shapes: Vec<Vec<usize>>,
    /// Batch sizes to sweep.
    pub batch_sizes: Vec<usize>,
    /// Device identifiers, e.g. "cpu", "cuda:0", "metal".
    pub devices: Vec<String>,
    /// Untimed iterations run before measurement begins.
    pub warmup_iterations: usize,
    /// Timed iterations per configuration.
    pub benchmark_iterations: usize,
    /// Whether to sample memory usage around each timed iteration.
    pub profile_memory: bool,
    /// Whether to estimate FLOPs and device utilization.
    pub profile_compute: bool,
    /// Export format hint ("json", "csv" or "html").
    pub output_format: String,
}
50
/// Top-level benchmark report covering every configuration that was run.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[allow(dead_code)]
pub struct BenchmarkResults {
    /// Model name derived from the configured model path.
    pub model_name: String,
    /// Wall-clock duration of the whole benchmark run, in seconds.
    pub total_duration: f64,
    /// One entry per (device, batch size, input shape) combination.
    pub per_config_results: Vec<ConfigBenchmark>,
    /// Best-throughput / best-latency picks and per-device comparison.
    pub summary: BenchmarkSummary,
    /// Host CPU/memory/GPU details captured at run time.
    pub system_info: SystemInfo,
    /// RFC 3339 UTC timestamp recorded when the report is assembled.
    pub timestamp: String,
}
68
/// Results for a single (device, batch size, input shape) combination.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[allow(dead_code)]
pub struct ConfigBenchmark {
    /// Device identifier this configuration ran on.
    pub device: String,
    /// Batch size used for every iteration.
    pub batch_size: usize,
    /// Per-sample input shape (without the batch dimension).
    pub input_shape: Vec<usize>,
    /// Throughput figures derived from mean latency.
    pub throughput: ThroughputMetrics,
    /// Latency distribution over the timed iterations.
    pub latency: LatencyMetrics,
    /// Memory statistics; present only when memory profiling was enabled.
    pub memory: Option<MemoryMetrics>,
    /// Compute statistics; present only when compute profiling was enabled.
    pub compute: Option<ComputeMetrics>,
}
88
/// Throughput derived from the measured mean latency and batch size.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ThroughputMetrics {
    /// Samples processed per second (batches/s × batch size).
    pub samples_per_second: f64,
    /// Batches processed per second (1000 / mean latency in ms).
    pub batches_per_second: f64,
    /// Tokens per second; always `None` in this pipeline (no tokenized model).
    pub tokens_per_second: Option<f64>,
}
98
/// Latency distribution statistics; all values are in milliseconds.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LatencyMetrics {
    /// Arithmetic mean of all timed iterations.
    pub mean_ms: f64,
    /// Median latency.
    pub median_ms: f64,
    /// 50th percentile.
    pub p50_ms: f64,
    /// 90th percentile.
    pub p90_ms: f64,
    /// 95th percentile.
    pub p95_ms: f64,
    /// 99th percentile (tail latency).
    pub p99_ms: f64,
    /// Fastest observed iteration.
    pub min_ms: f64,
    /// Slowest observed iteration.
    pub max_ms: f64,
    /// Population standard deviation of the latencies.
    pub std_dev_ms: f64,
}
120
/// Memory statistics gathered while profiling; sizes are in MB unless noted.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryMetrics {
    /// Largest per-iteration memory delta observed.
    pub peak_memory_mb: f64,
    /// Mean per-iteration memory delta.
    pub avg_memory_mb: f64,
    /// Estimated model weight footprint.
    pub model_memory_mb: f64,
    /// Estimated activation footprint for one batch (f32 elements).
    pub activation_memory_mb: f64,
    /// Estimated memory bandwidth in GB/s.
    pub memory_bandwidth_gbs: f64,
}
134
/// Compute-side statistics estimated during profiling.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComputeMetrics {
    /// GPU utilization percentage; `None` for devices without a GPU.
    pub gpu_utilization: Option<f64>,
    /// CPU utilization percentage.
    pub cpu_utilization: f64,
    /// Achieved FLOP/s estimated from mean latency.
    pub flops: f64,
    /// Theoretical peak FLOP/s assumed for the device class.
    pub peak_flops: f64,
    /// Achieved / peak, as a percentage capped at 100.
    pub flops_utilization: f64,
    /// Heuristic classification: "memory_bound" or "compute_bound".
    pub bottleneck: String,
}
150
/// Headline picks and cross-device comparison derived from all results.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkSummary {
    /// Configuration with the highest samples/second.
    pub best_throughput: ConfigSummary,
    /// Configuration with the lowest mean latency.
    pub best_latency: ConfigSummary,
    /// Configuration with the best throughput/latency ratio.
    pub most_efficient: ConfigSummary,
    /// Aggregate performance per device, keyed by device identifier.
    pub device_comparison: HashMap<String, DevicePerformance>,
}
162
/// Identifies one benchmarked configuration together with the metric value
/// that made it a headline pick (units depend on the metric).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConfigSummary {
    /// Device identifier of the winning configuration.
    pub device: String,
    /// Batch size of the winning configuration.
    pub batch_size: usize,
    /// Input shape of the winning configuration.
    pub input_shape: Vec<usize>,
    /// The metric that selected this configuration (samples/s, ms, or ratio).
    pub metric_value: f64,
}
170
/// Average performance of one device across all of its configurations.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DevicePerformance {
    /// Mean samples/second over this device's configurations.
    pub average_throughput: f64,
    /// Mean latency (ms) over this device's configurations.
    pub average_latency: f64,
    /// Throughput relative to the best performer, as a percentage.
    pub relative_performance: f64,
}
177
/// Host hardware details captured at benchmark time.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemInfo {
    /// CPU brand string as reported by the OS.
    pub cpu_model: String,
    /// Number of logical CPUs.
    pub cpu_cores: usize,
    /// Total system memory in GiB.
    pub total_memory_gb: f64,
    /// Detected GPUs; empty when none were found.
    pub gpu_info: Vec<GpuInfo>,
}
185
/// One detected GPU.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GpuInfo {
    /// Device name, e.g. as reported by `nvidia-smi`.
    pub name: String,
    /// Device memory in GiB (0.0 when the tool output could not be parsed).
    pub memory_gb: f64,
    /// Compute capability / API tag when known, e.g. "Metal".
    pub compute_capability: Option<String>,
}
192
193pub async fn execute_benchmark(
195 config: BenchmarkConfig,
196 _cli_config: &Config,
197) -> Result<BenchmarkResults> {
198 info!("Starting benchmark with configuration: {:?}", config);
199
200 let benchmark_start = Instant::now();
201
202 let system_info = gather_system_info().await?;
204 info!(
205 "System: {} with {} cores",
206 system_info.cpu_model, system_info.cpu_cores
207 );
208
209 let mut per_config_results = Vec::new();
210
211 let total_configs = config.devices.len() * config.batch_sizes.len() * config.input_shapes.len();
213 let pb = progress::create_progress_bar(total_configs as u64, "Benchmarking configurations");
214
215 let mut iteration = 0;
216
217 for device in &config.devices {
219 for &batch_size in &config.batch_sizes {
220 for input_shape in &config.input_shapes {
221 info!(
222 "Benchmarking: device={}, batch_size={}, input_shape={:?}",
223 device, batch_size, input_shape
224 );
225
226 let config_result =
227 benchmark_configuration(&config, device, batch_size, input_shape).await?;
228
229 per_config_results.push(config_result);
230
231 iteration += 1;
232 pb.set_position(iteration);
233 }
234 }
235 }
236
237 pb.finish_with_message("Benchmarking completed");
238
239 let summary = analyze_results(&per_config_results)?;
241
242 let total_duration = benchmark_start.elapsed().as_secs_f64();
243
244 let results = BenchmarkResults {
245 model_name: extract_model_name(&config.model_path),
246 total_duration,
247 per_config_results,
248 summary,
249 system_info,
250 timestamp: chrono::Utc::now().to_rfc3339(),
251 };
252
253 info!("Benchmark completed in {:.2}s", total_duration);
254
255 Ok(results)
256}
257
258async fn benchmark_configuration(
260 config: &BenchmarkConfig,
261 device: &str,
262 batch_size: usize,
263 input_shape: &[usize],
264) -> Result<ConfigBenchmark> {
265 debug!(
266 "Running benchmark: device={}, batch_size={}, input_shape={:?}",
267 device, batch_size, input_shape
268 );
269
270 let mut rng = thread_rng();
272 let total_elements: usize = input_shape.iter().product::<usize>() * batch_size;
273 let input_data: Vec<f32> = (0..total_elements).map(|_| rng.random::<f32>()).collect();
274
275 let input_tensor = match input_shape.len() {
277 1 => {
278 let arr = Array2::from_shape_vec((batch_size, input_shape[0]), input_data)?;
280 TensorData::Array2(arr)
281 }
282 3 => {
283 let c = input_shape[0];
285 let h = input_shape[1];
286 let w = input_shape[2];
287 let arr = Array4::from_shape_vec((batch_size, c, h, w), input_data)?;
288 TensorData::Array4(arr)
289 }
290 _ => {
291 let arr =
293 Array2::from_shape_vec((batch_size, input_shape.iter().product()), input_data)?;
294 TensorData::Array2(arr)
295 }
296 };
297
298 debug!("Running {} warmup iterations", config.warmup_iterations);
300 for _ in 0..config.warmup_iterations {
301 let _ = run_inference(&input_tensor, device).await?;
302 tokio::time::sleep(Duration::from_micros(100)).await;
304 }
305
306 debug!(
308 "Running {} benchmark iterations",
309 config.benchmark_iterations
310 );
311 let mut latencies = Vec::with_capacity(config.benchmark_iterations);
312 let mut memory_samples = Vec::new();
313
314 for _ in 0..config.benchmark_iterations {
315 let start = Instant::now();
316 let memory_before = if config.profile_memory {
317 Some(measure_memory_usage(device).await?)
318 } else {
319 None
320 };
321
322 let _ = run_inference(&input_tensor, device).await?;
323
324 let latency = start.elapsed();
325 latencies.push(latency.as_secs_f64() * 1000.0); if let Some(mem_before) = memory_before {
328 let mem_after = measure_memory_usage(device).await?;
329 memory_samples.push(mem_after - mem_before);
330 }
331
332 tokio::time::sleep(Duration::from_micros(50)).await;
334 }
335
336 let latency_metrics = calculate_latency_metrics(&latencies);
338
339 let throughput_metrics = calculate_throughput_metrics(&latency_metrics, batch_size);
341
342 let memory_metrics = if config.profile_memory {
344 Some(calculate_memory_metrics(
345 &memory_samples,
346 batch_size,
347 input_shape,
348 ))
349 } else {
350 None
351 };
352
353 let compute_metrics = if config.profile_compute {
355 Some(calculate_compute_metrics(device, &latency_metrics, input_shape).await?)
356 } else {
357 None
358 };
359
360 Ok(ConfigBenchmark {
361 device: device.to_string(),
362 batch_size,
363 input_shape: input_shape.to_vec(),
364 throughput: throughput_metrics,
365 latency: latency_metrics,
366 memory: memory_metrics,
367 compute: compute_metrics,
368 })
369}
370
/// Input tensor wrapper so one code path can carry either a 2-D
/// (batch, features) batch or a 4-D NCHW image batch.
#[allow(dead_code)]
enum TensorData {
    /// (batch, features) — used for 1-D and flattened input shapes.
    Array2(Array2<f32>),
    /// (batch, channels, height, width) — used for 3-D input shapes.
    Array4(Array4<f32>),
}
377
378async fn run_inference(_input: &TensorData, device: &str) -> Result<Array2<f32>> {
380 let inference_time_us = match device {
382 "cpu" => 1000, "cuda" | "cuda:0" => 200, "metal" | "metal:0" => 300, _ => 500,
386 };
387
388 tokio::time::sleep(Duration::from_micros(inference_time_us)).await;
389
390 let mut rng = thread_rng();
392 let output_data: Vec<f32> = (0..1000).map(|_| rng.random::<f32>()).collect();
393 Ok(Array2::from_shape_vec((10, 100), output_data)?)
394}
395
396async fn measure_memory_usage(device: &str) -> Result<f64> {
398 let base_memory = match device {
400 "cuda" | "cuda:0" => 512.0, "metal" | "metal:0" => 384.0,
402 _ => 256.0,
403 };
404
405 let mut rng = thread_rng();
406 let variation = rng.gen_range(-50.0..50.0);
407
408 Ok(base_memory + variation)
409}
410
411fn calculate_latency_metrics(latencies: &[f64]) -> LatencyMetrics {
413 let mut sorted = latencies.to_vec();
414 sorted.sort_by(|a, b| {
415 a.partial_cmp(b)
416 .expect("latency values should be comparable")
417 });
418
419 let mean = sorted.iter().sum::<f64>() / sorted.len() as f64;
420 let median = sorted[sorted.len() / 2];
421
422 let variance = sorted.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / sorted.len() as f64;
423 let std_dev = variance.sqrt();
424
425 LatencyMetrics {
426 mean_ms: mean,
427 median_ms: median,
428 p50_ms: percentile(&sorted, 50.0),
429 p90_ms: percentile(&sorted, 90.0),
430 p95_ms: percentile(&sorted, 95.0),
431 p99_ms: percentile(&sorted, 99.0),
432 min_ms: sorted[0],
433 max_ms: sorted[sorted.len() - 1],
434 std_dev_ms: std_dev,
435 }
436}
437
/// Returns the `p`-th percentile (0–100) of `sorted_data`, which must be
/// sorted ascending, using linear interpolation between the two nearest ranks.
///
/// Returns 0.0 for an empty slice instead of panicking on the `len() - 1`
/// underflow the previous implementation had. Values of `p` outside 0–100
/// are clamped to the data range.
fn percentile(sorted_data: &[f64], p: f64) -> f64 {
    if sorted_data.is_empty() {
        return 0.0;
    }

    let max_rank = (sorted_data.len() - 1) as f64;
    // Fractional rank into the sorted samples; clamp guards p outside 0–100.
    let rank = (p / 100.0 * max_rank).clamp(0.0, max_rank);
    let lo = rank.floor() as usize;
    let hi = rank.ceil() as usize;

    if lo == hi {
        sorted_data[lo]
    } else {
        // BUGFIX: interpolate instead of truncating the rank with `as usize`,
        // which biased percentiles (notably p99 on small sample counts) low.
        let frac = rank - lo as f64;
        sorted_data[lo] + frac * (sorted_data[hi] - sorted_data[lo])
    }
}
443
444fn calculate_throughput_metrics(latency: &LatencyMetrics, batch_size: usize) -> ThroughputMetrics {
446 let samples_per_second = 1000.0 / latency.mean_ms * batch_size as f64;
447 let batches_per_second = 1000.0 / latency.mean_ms;
448
449 ThroughputMetrics {
450 samples_per_second,
451 batches_per_second,
452 tokens_per_second: None, }
454}
455
456fn calculate_memory_metrics(
458 memory_samples: &[f64],
459 batch_size: usize,
460 input_shape: &[usize],
461) -> MemoryMetrics {
462 let peak_memory = memory_samples
463 .iter()
464 .fold(f64::NEG_INFINITY, |a, &b| a.max(b));
465 let avg_memory = memory_samples.iter().sum::<f64>() / memory_samples.len() as f64;
466
467 let model_memory = 256.0; let activation_elements: usize = input_shape.iter().product::<usize>() * batch_size;
472 let activation_memory = (activation_elements * 4) as f64 / (1024.0 * 1024.0); let memory_bandwidth = avg_memory * 1000.0 / 1024.0; MemoryMetrics {
478 peak_memory_mb: peak_memory,
479 avg_memory_mb: avg_memory,
480 model_memory_mb: model_memory,
481 activation_memory_mb: activation_memory,
482 memory_bandwidth_gbs: memory_bandwidth,
483 }
484}
485
486async fn calculate_compute_metrics(
488 device: &str,
489 latency: &LatencyMetrics,
490 input_shape: &[usize],
491) -> Result<ComputeMetrics> {
492 let input_elements: usize = input_shape.iter().product();
494 let estimated_flops = (input_elements * 1000 * 2) as f64; let peak_flops = match device {
498 "cuda" | "cuda:0" => 35_000_000_000_000.0, "metal" | "metal:0" => 10_000_000_000_000.0, _ => 1_000_000_000_000.0, };
502
503 let achieved_flops = estimated_flops / (latency.mean_ms / 1000.0);
504 let flops_utilization = (achieved_flops / peak_flops * 100.0).min(100.0);
505
506 let bottleneck = if flops_utilization < 30.0 {
508 "memory_bound".to_string()
509 } else {
510 "compute_bound".to_string()
511 };
512
513 let (cpu_util, gpu_util) = measure_device_utilization(device).await?;
515
516 Ok(ComputeMetrics {
517 gpu_utilization: gpu_util,
518 cpu_utilization: cpu_util,
519 flops: achieved_flops,
520 peak_flops,
521 flops_utilization,
522 bottleneck,
523 })
524}
525
526async fn measure_device_utilization(device: &str) -> Result<(f64, Option<f64>)> {
528 let cpu_util = 45.0 + thread_rng().gen_range(-10.0..10.0);
529
530 let gpu_util = if device.starts_with("cuda") || device.starts_with("metal") {
531 Some(75.0 + thread_rng().gen_range(-15.0..15.0))
532 } else {
533 None
534 };
535
536 Ok((cpu_util, gpu_util))
537}
538
539fn analyze_results(results: &[ConfigBenchmark]) -> Result<BenchmarkSummary> {
541 let best_throughput = results
543 .iter()
544 .max_by(|a, b| {
545 a.throughput
546 .samples_per_second
547 .partial_cmp(&b.throughput.samples_per_second)
548 .expect("throughput values should be comparable")
549 })
550 .expect("results should not be empty for throughput analysis");
551
552 let best_latency = results
553 .iter()
554 .min_by(|a, b| {
555 a.latency
556 .mean_ms
557 .partial_cmp(&b.latency.mean_ms)
558 .expect("latency values should be comparable")
559 })
560 .expect("results should not be empty for latency analysis");
561
562 let most_efficient = results
564 .iter()
565 .max_by(|a, b| {
566 let score_a = a.throughput.samples_per_second / a.latency.mean_ms;
567 let score_b = b.throughput.samples_per_second / b.latency.mean_ms;
568 score_a
569 .partial_cmp(&score_b)
570 .expect("efficiency scores should be comparable")
571 })
572 .expect("results should not be empty for efficiency analysis");
573
574 let mut device_comparison = HashMap::new();
576 let devices: std::collections::HashSet<_> = results.iter().map(|r| r.device.clone()).collect();
577
578 for device in devices {
579 let device_results: Vec<_> = results.iter().filter(|r| r.device == device).collect();
580
581 let avg_throughput = device_results
582 .iter()
583 .map(|r| r.throughput.samples_per_second)
584 .sum::<f64>()
585 / device_results.len() as f64;
586
587 let avg_latency = device_results
588 .iter()
589 .map(|r| r.latency.mean_ms)
590 .sum::<f64>()
591 / device_results.len() as f64;
592
593 let best_avg_throughput = results
595 .iter()
596 .map(|r| r.throughput.samples_per_second)
597 .fold(f64::NEG_INFINITY, f64::max);
598
599 let relative_performance = (avg_throughput / best_avg_throughput * 100.0).min(100.0);
600
601 device_comparison.insert(
602 device.clone(),
603 DevicePerformance {
604 average_throughput: avg_throughput,
605 average_latency: avg_latency,
606 relative_performance,
607 },
608 );
609 }
610
611 Ok(BenchmarkSummary {
612 best_throughput: ConfigSummary {
613 device: best_throughput.device.clone(),
614 batch_size: best_throughput.batch_size,
615 input_shape: best_throughput.input_shape.clone(),
616 metric_value: best_throughput.throughput.samples_per_second,
617 },
618 best_latency: ConfigSummary {
619 device: best_latency.device.clone(),
620 batch_size: best_latency.batch_size,
621 input_shape: best_latency.input_shape.clone(),
622 metric_value: best_latency.latency.mean_ms,
623 },
624 most_efficient: ConfigSummary {
625 device: most_efficient.device.clone(),
626 batch_size: most_efficient.batch_size,
627 input_shape: most_efficient.input_shape.clone(),
628 metric_value: most_efficient.throughput.samples_per_second
629 / most_efficient.latency.mean_ms,
630 },
631 device_comparison,
632 })
633}
634
635async fn gather_system_info() -> Result<SystemInfo> {
637 use sysinfo::System;
638
639 let mut sys = System::new_all();
640 sys.refresh_all();
641
642 let cpu_model = sys
643 .cpus()
644 .first()
645 .map(|cpu| cpu.brand())
646 .unwrap_or("Unknown")
647 .to_string();
648
649 let cpu_cores = sys.cpus().len();
650 let total_memory_gb = sys.total_memory() as f64 / (1024.0 * 1024.0 * 1024.0);
651
652 let gpu_info = detect_gpus().await?;
654
655 Ok(SystemInfo {
656 cpu_model,
657 cpu_cores,
658 total_memory_gb,
659 gpu_info,
660 })
661}
662
663async fn detect_gpus() -> Result<Vec<GpuInfo>> {
665 let mut gpus = Vec::new();
666
667 if let Ok(output) = std::process::Command::new("nvidia-smi")
669 .arg("--query-gpu=name,memory.total")
670 .arg("--format=csv,noheader,nounits")
671 .output()
672 {
673 if output.status.success() {
674 let info = String::from_utf8_lossy(&output.stdout);
675 for line in info.lines() {
676 let parts: Vec<&str> = line.split(',').collect();
677 if parts.len() >= 2 {
678 gpus.push(GpuInfo {
679 name: parts[0].trim().to_string(),
680 memory_gb: parts[1].trim().parse::<f64>().unwrap_or(0.0) / 1024.0,
681 compute_capability: None,
682 });
683 }
684 }
685 }
686 }
687
688 #[cfg(target_os = "macos")]
690 {
691 if let Ok(output) = std::process::Command::new("system_profiler")
692 .arg("SPDisplaysDataType")
693 .output()
694 {
695 if output.status.success() {
696 let info = String::from_utf8_lossy(&output.stdout);
697 if info.contains("Metal") {
698 gpus.push(GpuInfo {
699 name: "Apple Metal GPU".to_string(),
700 memory_gb: 16.0, compute_capability: Some("Metal".to_string()),
702 });
703 }
704 }
705 }
706 }
707
708 Ok(gpus)
709}
710
/// Derives a display name for a model from its filesystem path by taking the
/// file stem (file name without its last extension). Falls back to
/// "unknown_model" when the path has no usable stem (e.g. an empty string).
#[allow(dead_code)]
fn extract_model_name(path: &str) -> String {
    let stem = std::path::Path::new(path)
        .file_stem()
        .and_then(std::ffi::OsStr::to_str);
    match stem {
        Some(name) => name.to_string(),
        None => "unknown_model".to_string(),
    }
}
720
721#[allow(dead_code)]
723pub async fn export_results(
724 results: &BenchmarkResults,
725 output_path: &Path,
726 format: &str,
727) -> Result<()> {
728 match format {
729 "json" => {
730 let json = serde_json::to_string_pretty(results)?;
731 tokio::fs::write(output_path, json).await?;
732 }
733 "csv" => {
734 let csv = results_to_csv(results)?;
735 tokio::fs::write(output_path, csv).await?;
736 }
737 "html" => {
738 let html = results_to_html(results)?;
739 tokio::fs::write(output_path, html).await?;
740 }
741 _ => {
742 anyhow::bail!("Unsupported export format: {}", format);
743 }
744 }
745
746 info!("Results exported to: {}", output_path.display());
747 Ok(())
748}
749
750#[allow(dead_code)]
752fn results_to_csv(results: &BenchmarkResults) -> Result<String> {
753 let mut csv = String::new();
754 csv.push_str("Device,Batch Size,Input Shape,Throughput (samples/s),Mean Latency (ms),P99 Latency (ms),Peak Memory (MB)\n");
755
756 for config in &results.per_config_results {
757 csv.push_str(&format!(
758 "{},{},{:?},{:.2},{:.2},{:.2},{}\n",
759 config.device,
760 config.batch_size,
761 config.input_shape,
762 config.throughput.samples_per_second,
763 config.latency.mean_ms,
764 config.latency.p99_ms,
765 config
766 .memory
767 .as_ref()
768 .map(|m| format!("{:.2}", m.peak_memory_mb))
769 .unwrap_or_else(|| "N/A".to_string())
770 ));
771 }
772
773 Ok(csv)
774}
775
/// Renders the benchmark results as a self-contained HTML report: a header,
/// a highlighted summary box (best throughput / best latency), and a table
/// with one row per benchmarked configuration.
#[allow(dead_code)]
fn results_to_html(results: &BenchmarkResults) -> Result<String> {
    // NOTE: the format! arguments are positional — they must stay in the
    // exact order the `{}` placeholders appear in the template below.
    // Literal braces in the CSS are escaped as `{{`/`}}`.
    let html = format!(
        r#"<!DOCTYPE html>
<html>
<head>
    <title>Benchmark Results - {}</title>
    <style>
        body {{ font-family: Arial, sans-serif; margin: 20px; }}
        h1 {{ color: #333; }}
        table {{ border-collapse: collapse; width: 100%; margin-top: 20px; }}
        th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
        th {{ background-color: #4CAF50; color: white; }}
        tr:nth-child(even) {{ background-color: #f2f2f2; }}
        .summary {{ background-color: #e7f3fe; padding: 15px; border-left: 6px solid #2196F3; margin: 20px 0; }}
    </style>
</head>
<body>
    <h1>Benchmark Results: {}</h1>
    <p>Total Duration: {:.2}s</p>
    <p>Timestamp: {}</p>

    <div class="summary">
        <h2>Summary</h2>
        <p><strong>Best Throughput:</strong> {} on {} (batch size: {})</p>
        <p><strong>Best Latency:</strong> {:.2}ms on {} (batch size: {})</p>
    </div>

    <h2>Detailed Results</h2>
    <table>
        <tr>
            <th>Device</th>
            <th>Batch Size</th>
            <th>Input Shape</th>
            <th>Throughput (samples/s)</th>
            <th>Mean Latency (ms)</th>
            <th>P99 Latency (ms)</th>
            <th>Peak Memory (MB)</th>
        </tr>
        {}
    </table>
</body>
</html>"#,
        // <title> and <h1> both show the model name.
        results.model_name,
        results.model_name,
        results.total_duration,
        results.timestamp,
        // Summary box: best throughput (value, device, batch size).
        results.summary.best_throughput.metric_value,
        results.summary.best_throughput.device,
        results.summary.best_throughput.batch_size,
        // Summary box: best latency (value, device, batch size).
        results.summary.best_latency.metric_value,
        results.summary.best_latency.device,
        results.summary.best_latency.batch_size,
        // Table body: one <tr> per benchmarked configuration; peak memory is
        // "N/A" when memory profiling was disabled for that row.
        results.per_config_results.iter().map(|config| {
            format!(
                "<tr><td>{}</td><td>{}</td><td>{:?}</td><td>{:.2}</td><td>{:.2}</td><td>{:.2}</td><td>{}</td></tr>",
                config.device,
                config.batch_size,
                config.input_shape,
                config.throughput.samples_per_second,
                config.latency.mean_ms,
                config.latency.p99_ms,
                config.memory.as_ref().map(|m| format!("{:.2}", m.peak_memory_mb)).unwrap_or_else(|| "N/A".to_string())
            )
        }).collect::<Vec<_>>().join("\n")
    );

    Ok(html)
}