#![allow(dead_code, unused_variables, unused_assignments)]
use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::Path;
use std::time::{Duration, Instant};
use tracing::{debug, info};
use crate::config::Config;
use crate::utils::progress;
use scirs2_core::ndarray::{Array2, Array4};
use scirs2_core::random::thread_rng;
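// Model benchmarking: sweeps every device x batch-size x input-shape
// combination, measures latency and throughput (plus optional memory and
// compute profiles), and exports the aggregated results as JSON, CSV, or
// HTML. Inference and device sampling are currently simulated placeholders
// until a real runtime backend is wired in.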
#[derive(Debug, Clone, Serialize, Deserialize)]
#[allow(dead_code)]
pub struct BenchmarkConfig {
pub model_path: String,
pub input_shapes: Vec<Vec<usize>>,
pub batch_sizes: Vec<usize>,
pub devices: Vec<String>,
pub warmup_iterations: usize,
pub benchmark_iterations: usize,
pub profile_memory: bool,
pub profile_compute: bool,
pub output_format: String,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[allow(dead_code)]
pub struct BenchmarkResults {
pub model_name: String,
pub total_duration: f64,
pub per_config_results: Vec<ConfigBenchmark>,
pub summary: BenchmarkSummary,
pub system_info: SystemInfo,
pub timestamp: String,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[allow(dead_code)]
pub struct ConfigBenchmark {
pub device: String,
pub batch_size: usize,
pub input_shape: Vec<usize>,
pub throughput: ThroughputMetrics,
pub latency: LatencyMetrics,
pub memory: Option<MemoryMetrics>,
pub compute: Option<ComputeMetrics>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ThroughputMetrics {
pub samples_per_second: f64,
pub batches_per_second: f64,
pub tokens_per_second: Option<f64>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LatencyMetrics {
pub mean_ms: f64,
pub median_ms: f64,
pub p50_ms: f64,
pub p90_ms: f64,
pub p95_ms: f64,
pub p99_ms: f64,
pub min_ms: f64,
pub max_ms: f64,
pub std_dev_ms: f64,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryMetrics {
pub peak_memory_mb: f64,
pub avg_memory_mb: f64,
pub model_memory_mb: f64,
pub activation_memory_mb: f64,
pub memory_bandwidth_gbs: f64,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComputeMetrics {
pub gpu_utilization: Option<f64>,
pub cpu_utilization: f64,
pub flops: f64,
pub peak_flops: f64,
pub flops_utilization: f64,
pub bottleneck: String,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkSummary {
pub best_throughput: ConfigSummary,
pub best_latency: ConfigSummary,
pub most_efficient: ConfigSummary,
pub device_comparison: HashMap<String, DevicePerformance>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConfigSummary {
pub device: String,
pub batch_size: usize,
pub input_shape: Vec<usize>,
pub metric_value: f64,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DevicePerformance {
pub average_throughput: f64,
pub average_latency: f64,
pub relative_performance: f64,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemInfo {
pub cpu_model: String,
pub cpu_cores: usize,
pub total_memory_gb: f64,
pub gpu_info: Vec<GpuInfo>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GpuInfo {
pub name: String,
pub memory_gb: f64,
pub compute_capability: Option<String>,
}
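/// Runs the full benchmark sweep described by `config`: gathers system info,
/// measures every (device, batch size, input shape) combination, and
/// summarizes the results.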
pub async fn execute_benchmark(
config: BenchmarkConfig,
_cli_config: &Config,
) -> Result<BenchmarkResults> {
info!("Starting benchmark with configuration: {:?}", config);
let benchmark_start = Instant::now();
let system_info = gather_system_info().await?;
info!(
"System: {} with {} cores",
system_info.cpu_model, system_info.cpu_cores
);
let mut per_config_results = Vec::new();
let total_configs = config.devices.len() * config.batch_sizes.len() * config.input_shapes.len();
let pb = progress::create_progress_bar(total_configs as u64, "Benchmarking configurations");
let mut iteration = 0;
for device in &config.devices {
for &batch_size in &config.batch_sizes {
for input_shape in &config.input_shapes {
info!(
"Benchmarking: device={}, batch_size={}, input_shape={:?}",
device, batch_size, input_shape
);
let config_result =
benchmark_configuration(&config, device, batch_size, input_shape).await?;
per_config_results.push(config_result);
iteration += 1;
pb.set_position(iteration);
}
}
}
pb.finish_with_message("Benchmarking completed");
let summary = analyze_results(&per_config_results)?;
let total_duration = benchmark_start.elapsed().as_secs_f64();
let results = BenchmarkResults {
model_name: extract_model_name(&config.model_path),
total_duration,
per_config_results,
summary,
system_info,
timestamp: chrono::Utc::now().to_rfc3339(),
};
info!("Benchmark completed in {:.2}s", total_duration);
Ok(results)
}
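/// Benchmarks a single (device, batch size, input shape) configuration:
/// builds a random input tensor, runs warmup and timed iterations, then
/// derives latency, throughput, and optional memory/compute metrics.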
async fn benchmark_configuration(
config: &BenchmarkConfig,
device: &str,
batch_size: usize,
input_shape: &[usize],
) -> Result<ConfigBenchmark> {
debug!(
"Running benchmark: device={}, batch_size={}, input_shape={:?}",
device, batch_size, input_shape
);
let mut rng = thread_rng();
let total_elements: usize = input_shape.iter().product::<usize>() * batch_size;
let input_data: Vec<f32> = (0..total_elements).map(|_| rng.random::<f32>()).collect();
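    // Map the per-sample shape onto a batched tensor: 1-D inputs become
    // (batch, features), 3-D (C, H, W) inputs become NCHW, and anything else
    // is flattened into a 2-D matrix.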
let input_tensor = match input_shape.len() {
1 => {
let arr = Array2::from_shape_vec((batch_size, input_shape[0]), input_data)?;
TensorData::Array2(arr)
}
3 => {
let c = input_shape[0];
let h = input_shape[1];
let w = input_shape[2];
let arr = Array4::from_shape_vec((batch_size, c, h, w), input_data)?;
TensorData::Array4(arr)
}
_ => {
let arr =
Array2::from_shape_vec((batch_size, input_shape.iter().product()), input_data)?;
TensorData::Array2(arr)
}
};
debug!("Running {} warmup iterations", config.warmup_iterations);
for _ in 0..config.warmup_iterations {
let _ = run_inference(&input_tensor, device).await?;
tokio::time::sleep(Duration::from_micros(100)).await;
}
debug!(
"Running {} benchmark iterations",
config.benchmark_iterations
);
let mut latencies = Vec::with_capacity(config.benchmark_iterations);
let mut memory_samples = Vec::new();
for _ in 0..config.benchmark_iterations {
let start = Instant::now();
let memory_before = if config.profile_memory {
Some(measure_memory_usage(device).await?)
} else {
None
};
let _ = run_inference(&input_tensor, device).await?;
let latency = start.elapsed();
latencies.push(latency.as_secs_f64() * 1000.0);
if let Some(mem_before) = memory_before {
let mem_after = measure_memory_usage(device).await?;
memory_samples.push(mem_after - mem_before);
}
tokio::time::sleep(Duration::from_micros(50)).await;
}
let latency_metrics = calculate_latency_metrics(&latencies);
let throughput_metrics = calculate_throughput_metrics(&latency_metrics, batch_size);
let memory_metrics = if config.profile_memory {
Some(calculate_memory_metrics(
&memory_samples,
batch_size,
input_shape,
))
} else {
None
};
let compute_metrics = if config.profile_compute {
Some(calculate_compute_metrics(device, &latency_metrics, input_shape).await?)
} else {
None
};
Ok(ConfigBenchmark {
device: device.to_string(),
batch_size,
input_shape: input_shape.to_vec(),
throughput: throughput_metrics,
latency: latency_metrics,
memory: memory_metrics,
compute: compute_metrics,
})
}
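/// Batched input tensor: 2-D for flat inputs, 4-D (NCHW) for image-like ones.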
#[allow(dead_code)]
enum TensorData {
Array2(Array2<f32>),
Array4(Array4<f32>),
}
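/// Simulated inference: sleeps for a device-dependent interval and returns a
/// random output tensor. Stands in for a real model runtime.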
async fn run_inference(_input: &TensorData, device: &str) -> Result<Array2<f32>> {
    let inference_time_us = match device {
        "cpu" => 1000,
        "cuda" | "cuda:0" => 200,
        "metal" | "metal:0" => 300,
        _ => 500,
    };
tokio::time::sleep(Duration::from_micros(inference_time_us)).await;
let mut rng = thread_rng();
let output_data: Vec<f32> = (0..1000).map(|_| rng.random::<f32>()).collect();
Ok(Array2::from_shape_vec((10, 100), output_data)?)
}
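/// Simulated memory probe: returns a per-device baseline plus random jitter,
/// in MB. Replace with real device queries for meaningful numbers.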
async fn measure_memory_usage(device: &str) -> Result<f64> {
    let base_memory = match device {
        "cuda" | "cuda:0" => 512.0,
        "metal" | "metal:0" => 384.0,
        _ => 256.0,
    };
let mut rng = thread_rng();
    let variation = rng.random_range(-50.0..50.0);
Ok(base_memory + variation)
}
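/// Computes summary statistics over per-iteration latencies, in milliseconds.
/// Expects at least one sample.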
fn calculate_latency_metrics(latencies: &[f64]) -> LatencyMetrics {
    let mut sorted = latencies.to_vec();
    sorted.sort_by(|a, b| {
        a.partial_cmp(b)
            .expect("latency values should be comparable")
    });
    let n = sorted.len();
    let mean = sorted.iter().sum::<f64>() / n as f64;
    // True median: with an even count, average the two middle samples.
    let median = if n % 2 == 0 {
        (sorted[n / 2 - 1] + sorted[n / 2]) / 2.0
    } else {
        sorted[n / 2]
    };
    // Population variance over the measured iterations.
    let variance = sorted.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / n as f64;
    let std_dev = variance.sqrt();
    LatencyMetrics {
        mean_ms: mean,
        median_ms: median,
        p50_ms: percentile(&sorted, 50.0),
        p90_ms: percentile(&sorted, 90.0),
        p95_ms: percentile(&sorted, 95.0),
        p99_ms: percentile(&sorted, 99.0),
        min_ms: sorted[0],
        max_ms: sorted[n - 1],
        std_dev_ms: std_dev,
    }
}
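/// Returns the `p`-th percentile (0..=100) of an ascending-sorted slice using
/// linear interpolation between the two closest ranks; e.g. the 50th
/// percentile of `[1.0, 2.0, 3.0, 4.0]` is 2.5. `sorted_data` must be
/// non-empty.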
fn percentile(sorted_data: &[f64], p: f64) -> f64 {
    // Linear interpolation between the two nearest ranks.
    let rank = p / 100.0 * (sorted_data.len() - 1) as f64;
    let lo = rank.floor() as usize;
    let hi = rank.ceil() as usize;
    let frac = rank - lo as f64;
    sorted_data[lo] + (sorted_data[hi] - sorted_data[lo]) * frac
}
fn calculate_throughput_metrics(latency: &LatencyMetrics, batch_size: usize) -> ThroughputMetrics {
    let batches_per_second = 1000.0 / latency.mean_ms;
    let samples_per_second = batches_per_second * batch_size as f64;
    ThroughputMetrics {
        samples_per_second,
        batches_per_second,
        // Token throughput needs tokenizer information we don't have here.
        tokens_per_second: None,
    }
}
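/// Aggregates per-iteration memory deltas and estimates activation size from
/// the input shape (f32 elements). Several fields are rough placeholders.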
fn calculate_memory_metrics(
memory_samples: &[f64],
batch_size: usize,
input_shape: &[usize],
) -> MemoryMetrics {
    // `memory_samples` holds per-iteration deltas (after - before), so these
    // are growth figures rather than absolute footprints.
    let peak_memory = memory_samples
        .iter()
        .fold(f64::NEG_INFINITY, |a, &b| a.max(b));
    let avg_memory = memory_samples.iter().sum::<f64>() / memory_samples.len() as f64;
    // Placeholder constant until real model introspection is wired in.
    let model_memory = 256.0;
    let activation_elements: usize = input_shape.iter().product::<usize>() * batch_size;
    // f32 activations: 4 bytes per element, converted to MB.
    let activation_memory = (activation_elements * 4) as f64 / (1024.0 * 1024.0);
    // Rough bandwidth proxy (GB/s), not a measured value.
    let memory_bandwidth = avg_memory * 1000.0 / 1024.0;
MemoryMetrics {
peak_memory_mb: peak_memory,
avg_memory_mb: avg_memory,
model_memory_mb: model_memory,
activation_memory_mb: activation_memory,
memory_bandwidth_gbs: memory_bandwidth,
}
}
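/// Estimates achieved FLOPs against a nominal per-device peak and labels the
/// likely bottleneck. The FLOP count is a coarse, model-independent estimate.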
async fn calculate_compute_metrics(
device: &str,
latency: &LatencyMetrics,
input_shape: &[usize],
) -> Result<ComputeMetrics> {
    let input_elements: usize = input_shape.iter().product();
    // Coarse estimate: one hypothetical dense layer of 1000 units, counting
    // two ops (multiply + add) per weight.
    let estimated_flops = (input_elements * 1000 * 2) as f64;
    // Nominal peak throughput per device class; actual peaks depend on the
    // exact hardware and precision.
    let peak_flops = match device {
        "cuda" | "cuda:0" => 35_000_000_000_000.0,
        "metal" | "metal:0" => 10_000_000_000_000.0,
        _ => 1_000_000_000_000.0,
    };
    let achieved_flops = estimated_flops / (latency.mean_ms / 1000.0);
    let flops_utilization = (achieved_flops / peak_flops * 100.0).min(100.0);
    // Heuristic: low FLOP utilization usually means time is spent waiting on
    // memory rather than on compute.
    let bottleneck = if flops_utilization < 30.0 {
        "memory_bound".to_string()
    } else {
        "compute_bound".to_string()
    };
let (cpu_util, gpu_util) = measure_device_utilization(device).await?;
Ok(ComputeMetrics {
gpu_utilization: gpu_util,
cpu_utilization: cpu_util,
flops: achieved_flops,
peak_flops,
flops_utilization,
bottleneck,
})
}
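/// Simulated utilization sampling; GPU utilization is only reported for
/// CUDA/Metal devices.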
async fn measure_device_utilization(device: &str) -> Result<(f64, Option<f64>)> {
    let mut rng = thread_rng();
    let cpu_util = 45.0 + rng.random_range(-10.0..10.0);
    let gpu_util = if device.starts_with("cuda") || device.starts_with("metal") {
        Some(75.0 + rng.random_range(-15.0..15.0))
    } else {
        None
    };
Ok((cpu_util, gpu_util))
}
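/// Picks the best-throughput, best-latency, and most-efficient configurations
/// and compares per-device averages. Panics if `results` is empty.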
fn analyze_results(results: &[ConfigBenchmark]) -> Result<BenchmarkSummary> {
let best_throughput = results
.iter()
.max_by(|a, b| {
a.throughput
.samples_per_second
.partial_cmp(&b.throughput.samples_per_second)
.expect("throughput values should be comparable")
})
.expect("results should not be empty for throughput analysis");
let best_latency = results
.iter()
.min_by(|a, b| {
a.latency
.mean_ms
.partial_cmp(&b.latency.mean_ms)
.expect("latency values should be comparable")
})
.expect("results should not be empty for latency analysis");
let most_efficient = results
.iter()
.max_by(|a, b| {
let score_a = a.throughput.samples_per_second / a.latency.mean_ms;
let score_b = b.throughput.samples_per_second / b.latency.mean_ms;
score_a
.partial_cmp(&score_b)
.expect("efficiency scores should be comparable")
})
.expect("results should not be empty for efficiency analysis");
    let mut device_comparison = HashMap::new();
    let devices: std::collections::HashSet<_> = results.iter().map(|r| r.device.clone()).collect();
    // First pass: average throughput and latency per device.
    let mut device_averages: Vec<(String, f64, f64)> = Vec::new();
    for device in devices {
        let device_results: Vec<_> = results.iter().filter(|r| r.device == device).collect();
        let avg_throughput = device_results
            .iter()
            .map(|r| r.throughput.samples_per_second)
            .sum::<f64>()
            / device_results.len() as f64;
        let avg_latency = device_results
            .iter()
            .map(|r| r.latency.mean_ms)
            .sum::<f64>()
            / device_results.len() as f64;
        device_averages.push((device, avg_throughput, avg_latency));
    }
    // Second pass: normalize against the best device average so the fastest
    // device scores 100.
    let best_avg_throughput = device_averages
        .iter()
        .map(|(_, t, _)| *t)
        .fold(f64::NEG_INFINITY, f64::max);
    for (device, avg_throughput, avg_latency) in device_averages {
        let relative_performance = (avg_throughput / best_avg_throughput * 100.0).min(100.0);
        device_comparison.insert(
            device,
            DevicePerformance {
                average_throughput: avg_throughput,
                average_latency: avg_latency,
                relative_performance,
            },
        );
    }
Ok(BenchmarkSummary {
best_throughput: ConfigSummary {
device: best_throughput.device.clone(),
batch_size: best_throughput.batch_size,
input_shape: best_throughput.input_shape.clone(),
metric_value: best_throughput.throughput.samples_per_second,
},
best_latency: ConfigSummary {
device: best_latency.device.clone(),
batch_size: best_latency.batch_size,
input_shape: best_latency.input_shape.clone(),
metric_value: best_latency.latency.mean_ms,
},
most_efficient: ConfigSummary {
device: most_efficient.device.clone(),
batch_size: most_efficient.batch_size,
input_shape: most_efficient.input_shape.clone(),
metric_value: most_efficient.throughput.samples_per_second
/ most_efficient.latency.mean_ms,
},
device_comparison,
})
}
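/// Collects CPU model, core count, total memory, and any detected GPUs.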
async fn gather_system_info() -> Result<SystemInfo> {
use sysinfo::System;
let mut sys = System::new_all();
sys.refresh_all();
let cpu_model = sys
.cpus()
.first()
.map(|cpu| cpu.brand())
.unwrap_or("Unknown")
.to_string();
let cpu_cores = sys.cpus().len();
let total_memory_gb = sys.total_memory() as f64 / (1024.0 * 1024.0 * 1024.0);
let gpu_info = detect_gpus().await?;
Ok(SystemInfo {
cpu_model,
cpu_cores,
total_memory_gb,
gpu_info,
})
}
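/// Best-effort GPU detection: parses `nvidia-smi` output when available and,
/// on macOS, reports a Metal GPU if `system_profiler` lists Metal support.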
async fn detect_gpus() -> Result<Vec<GpuInfo>> {
let mut gpus = Vec::new();
if let Ok(output) = std::process::Command::new("nvidia-smi")
.arg("--query-gpu=name,memory.total")
.arg("--format=csv,noheader,nounits")
.output()
{
if output.status.success() {
let info = String::from_utf8_lossy(&output.stdout);
for line in info.lines() {
let parts: Vec<&str> = line.split(',').collect();
if parts.len() >= 2 {
gpus.push(GpuInfo {
name: parts[0].trim().to_string(),
                        // memory.total is reported in MiB when nounits is set.
                        memory_gb: parts[1].trim().parse::<f64>().unwrap_or(0.0) / 1024.0,
compute_capability: None,
});
}
}
}
}
#[cfg(target_os = "macos")]
{
if let Ok(output) = std::process::Command::new("system_profiler")
.arg("SPDisplaysDataType")
.output()
{
if output.status.success() {
let info = String::from_utf8_lossy(&output.stdout);
if info.contains("Metal") {
gpus.push(GpuInfo {
name: "Apple Metal GPU".to_string(),
                        // Placeholder: unified-memory size isn't parsed here.
                        memory_gb: 16.0,
                        compute_capability: Some("Metal".to_string()),
});
}
}
}
}
Ok(gpus)
}
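/// Derives a display name from the model file's stem, e.g.
/// "/models/resnet50.onnx" -> "resnet50".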
#[allow(dead_code)]
fn extract_model_name(path: &str) -> String {
    Path::new(path)
.file_stem()
.and_then(|s| s.to_str())
.unwrap_or("unknown_model")
.to_string()
}
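/// Writes `results` to `output_path` as "json", "csv", or "html".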
#[allow(dead_code)]
pub async fn export_results(
results: &BenchmarkResults,
output_path: &Path,
format: &str,
) -> Result<()> {
match format {
"json" => {
let json = serde_json::to_string_pretty(results)?;
tokio::fs::write(output_path, json).await?;
}
"csv" => {
let csv = results_to_csv(results)?;
tokio::fs::write(output_path, csv).await?;
}
"html" => {
let html = results_to_html(results)?;
tokio::fs::write(output_path, html).await?;
}
_ => {
anyhow::bail!("Unsupported export format: {}", format);
}
}
info!("Results exported to: {}", output_path.display());
Ok(())
}
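/// Renders per-configuration results as a flat CSV table.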
#[allow(dead_code)]
fn results_to_csv(results: &BenchmarkResults) -> Result<String> {
let mut csv = String::new();
csv.push_str("Device,Batch Size,Input Shape,Throughput (samples/s),Mean Latency (ms),P99 Latency (ms),Peak Memory (MB)\n");
for config in &results.per_config_results {
        // Join shape dims with 'x' so the field doesn't embed CSV commas.
        let shape = config
            .input_shape
            .iter()
            .map(|d| d.to_string())
            .collect::<Vec<_>>()
            .join("x");
        csv.push_str(&format!(
            "{},{},{},{:.2},{:.2},{:.2},{}\n",
            config.device,
            config.batch_size,
            shape,
            config.throughput.samples_per_second,
            config.latency.mean_ms,
            config.latency.p99_ms,
            config
                .memory
                .as_ref()
                .map(|m| format!("{:.2}", m.peak_memory_mb))
                .unwrap_or_else(|| "N/A".to_string())
        ));
}
Ok(csv)
}
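/// Renders results as a standalone HTML report with a summary block and a
/// per-configuration table.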
#[allow(dead_code)]
fn results_to_html(results: &BenchmarkResults) -> Result<String> {
let html = format!(
r#"<!DOCTYPE html>
<html>
<head>
<title>Benchmark Results - {}</title>
<style>
body {{ font-family: Arial, sans-serif; margin: 20px; }}
h1 {{ color: #333; }}
table {{ border-collapse: collapse; width: 100%; margin-top: 20px; }}
th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
th {{ background-color: #4CAF50; color: white; }}
tr:nth-child(even) {{ background-color: #f2f2f2; }}
.summary {{ background-color: #e7f3fe; padding: 15px; border-left: 6px solid #2196F3; margin: 20px 0; }}
</style>
</head>
<body>
<h1>Benchmark Results: {}</h1>
<p>Total Duration: {:.2}s</p>
<p>Timestamp: {}</p>
<div class="summary">
<h2>Summary</h2>
<p><strong>Best Throughput:</strong> {:.2} samples/s on {} (batch size: {})</p>
<p><strong>Best Latency:</strong> {:.2}ms on {} (batch size: {})</p>
</div>
<h2>Detailed Results</h2>
<table>
<tr>
<th>Device</th>
<th>Batch Size</th>
<th>Input Shape</th>
<th>Throughput (samples/s)</th>
<th>Mean Latency (ms)</th>
<th>P99 Latency (ms)</th>
<th>Peak Memory (MB)</th>
</tr>
{}
</table>
</body>
</html>"#,
results.model_name,
results.model_name,
results.total_duration,
results.timestamp,
results.summary.best_throughput.metric_value,
results.summary.best_throughput.device,
results.summary.best_throughput.batch_size,
results.summary.best_latency.metric_value,
results.summary.best_latency.device,
results.summary.best_latency.batch_size,
results.per_config_results.iter().map(|config| {
format!(
"<tr><td>{}</td><td>{}</td><td>{:?}</td><td>{:.2}</td><td>{:.2}</td><td>{:.2}</td><td>{}</td></tr>",
config.device,
config.batch_size,
config.input_shape,
config.throughput.samples_per_second,
config.latency.mean_ms,
config.latency.p99_ms,
config.memory.as_ref().map(|m| format!("{:.2}", m.peak_memory_mb)).unwrap_or_else(|| "N/A".to_string())
)
}).collect::<Vec<_>>().join("\n")
);
Ok(html)
}
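
// Sanity tests for the pure helper functions. These only cover the statistics
// and naming helpers; the async paths are simulated and are better exercised
// by integration tests against a real runtime.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn percentile_interpolates_between_ranks() {
        let data = [1.0, 2.0, 3.0, 4.0];
        assert!((percentile(&data, 0.0) - 1.0).abs() < 1e-9);
        assert!((percentile(&data, 100.0) - 4.0).abs() < 1e-9);
        // The 50th percentile of four points falls between ranks 1 and 2.
        assert!((percentile(&data, 50.0) - 2.5).abs() < 1e-9);
    }

    #[test]
    fn latency_metrics_basic_stats() {
        let m = calculate_latency_metrics(&[10.0, 20.0, 30.0, 40.0]);
        assert!((m.mean_ms - 25.0).abs() < 1e-9);
        assert!((m.median_ms - 25.0).abs() < 1e-9);
        assert!((m.min_ms - 10.0).abs() < 1e-9);
        assert!((m.max_ms - 40.0).abs() < 1e-9);
    }

    #[test]
    fn throughput_scales_with_batch_size() {
        let m = calculate_latency_metrics(&[10.0; 8]);
        let t = calculate_throughput_metrics(&m, 4);
        // 10 ms per batch => 100 batches/s => 400 samples/s at batch size 4.
        assert!((t.batches_per_second - 100.0).abs() < 1e-9);
        assert!((t.samples_per_second - 400.0).abs() < 1e-9);
    }

    #[test]
    fn model_name_is_file_stem() {
        assert_eq!(extract_model_name("/models/resnet50.onnx"), "resnet50");
        assert_eq!(extract_model_name(""), "unknown_model");
    }
}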